In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import chart_studio.plotly as py
import cufflinks as cf
from plotly.offline import download_plotlyjs,init_notebook_mode,plot,iplot

from sklearn.model_selection import train_test_split


init_notebook_mode(connected=True)

# Preprocessing

## Loading and visualizing the data

In [2]:
train_df = pd.read_csv('mnist_train.csv')
test_df = pd.read_csv('mnist_test.csv')

## Preparing the data

In [3]:
x = train_df[train_df.columns[1:]] 
y = train_df['label']

## Split data into a train / test dataset

In [4]:
x_train, x_validation, y_train, y_validation = train_test_split(x, y, test_size = 0.2)

# Logistic Regression

In [5]:
from sklearn.linear_model import LogisticRegression

## Train logistic regression model

In [6]:
logmodel = LogisticRegression()
logmodel.fit(x_train,y_train)


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



LogisticRegression()

## Use the model to predict

In [8]:
predictions = logmodel.predict(x_validation)

## Evaluate the accuracy of the model

In [9]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [10]:
print(classification_report(y_validation,predictions))
print('---------------')
print(confusion_matrix(y_validation,predictions))

              precision    recall  f1-score   support

           0       0.94      0.96      0.95      1181
           1       0.95      0.97      0.96      1325
           2       0.91      0.90      0.90      1204
           3       0.91      0.88      0.89      1234
           4       0.93      0.92      0.93      1181
           5       0.89      0.87      0.88      1093
           6       0.92      0.96      0.94      1173
           7       0.94      0.92      0.93      1230
           8       0.88      0.89      0.88      1184
           9       0.90      0.90      0.90      1195

    accuracy                           0.92     12000
   macro avg       0.92      0.92      0.92     12000
weighted avg       0.92      0.92      0.92     12000

---------------
[[1137    0    5    1    1   14   13    2    6    2]
 [   0 1282    9    4    1    4    2    3   15    5]
 [   9   12 1080   23   15    4   15    8   32    6]
 [   4    6   36 1083    1   47    7   14   23   13]
 [   7    8  

# K-Nearest Neighbors (KNN)

## Standarizing data

In [11]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [12]:
scaler = StandardScaler()
scaler.fit(train_df.drop('label', axis = 1))
scaled_features = scaler.transform(train_df.drop('label', axis = 1))
df_feat = pd.DataFrame(scaled_features, columns = train_df.columns[1:])
x = df_feat[train_df.columns[1:]]

In [13]:
x_train, x_validation, y_train, y_validation = train_test_split(x, y, test_size = 0.2)

## Train KNN model

In [14]:
from sklearn.neighbors import KNeighborsClassifier

In [15]:
knn = KNeighborsClassifier(n_neighbors = 1)
knn.fit(x_train, y_train)

KNeighborsClassifier(n_neighbors=1)

## Evaluate the model

In [16]:
predictions = knn.predict(x_validation)

In [17]:
print(classification_report(y_validation,predictions))
print('---------')
print(confusion_matrix(y_validation,predictions))

              precision    recall  f1-score   support

           0       0.97      0.98      0.97      1158
           1       0.96      0.99      0.98      1339
           2       0.95      0.94      0.95      1185
           3       0.94      0.93      0.94      1258
           4       0.96      0.92      0.94      1229
           5       0.92      0.93      0.93      1081
           6       0.96      0.97      0.96      1156
           7       0.93      0.93      0.93      1231
           8       0.96      0.91      0.93      1186
           9       0.89      0.93      0.91      1177

    accuracy                           0.94     12000
   macro avg       0.94      0.94      0.94     12000
weighted avg       0.94      0.94      0.94     12000

---------
[[1136    0    1    2    1    4   11    1    1    1]
 [   0 1327    5    0    1    1    0    4    0    1]
 [  10    6 1117   14    6    2    8    8   10    4]
 [   2    0   21 1173    0   23    1   22   12    4]
 [   1   15    7   

# Decision Tree

## Train Decision Tree model

In [18]:
from sklearn.tree import DecisionTreeClassifier

In [19]:
dtree = DecisionTreeClassifier()
dtree.fit(x_train, y_train)

DecisionTreeClassifier()

##  Evaluate the model

In [20]:
predictions = dtree.predict(x_validation)

In [21]:
print(classification_report(y_validation,predictions))
print('---------')
print(confusion_matrix(y_validation,predictions))

              precision    recall  f1-score   support

           0       0.91      0.91      0.91      1158
           1       0.94      0.95      0.94      1339
           2       0.87      0.85      0.86      1185
           3       0.83      0.84      0.84      1258
           4       0.87      0.88      0.87      1229
           5       0.81      0.81      0.81      1081
           6       0.90      0.90      0.90      1156
           7       0.90      0.88      0.89      1231
           8       0.82      0.82      0.82      1186
           9       0.83      0.84      0.83      1177

    accuracy                           0.87     12000
   macro avg       0.87      0.87      0.87     12000
weighted avg       0.87      0.87      0.87     12000

---------
[[1051    0   13   15   10   16   21    4   15   13]
 [   0 1266   14   12    1    9    3   11   13   10]
 [  12   18 1009   29   15   17   20   21   36    8]
 [  12   11   29 1060    8   56    4   18   41   19]
 [   9    6   15   

# Random Forest

## Train Random Forests model

In [22]:
from sklearn.ensemble import RandomForestClassifier

In [23]:
rfc = RandomForestClassifier(n_estimators=200,n_jobs=-1)
rfc.fit(x_train, y_train)

RandomForestClassifier(n_estimators=200, n_jobs=-1)

##  Evaluate the model

In [24]:
predictions = rfc.predict(x_validation)

In [25]:
print(classification_report(y_validation,predictions))
print('---------')
print(confusion_matrix(y_validation,predictions))

              precision    recall  f1-score   support

           0       0.98      0.99      0.98      1158
           1       0.99      0.98      0.99      1339
           2       0.96      0.97      0.97      1185
           3       0.97      0.95      0.96      1258
           4       0.97      0.97      0.97      1229
           5       0.97      0.96      0.97      1081
           6       0.98      0.98      0.98      1156
           7       0.97      0.96      0.97      1231
           8       0.95      0.96      0.95      1186
           9       0.95      0.96      0.95      1177

    accuracy                           0.97     12000
   macro avg       0.97      0.97      0.97     12000
weighted avg       0.97      0.97      0.97     12000

---------
[[1143    0    1    0    1    2    6    0    5    0]
 [   0 1318    7    4    4    1    0    2    1    2]
 [   3    3 1150    1    5    0    5    6   12    0]
 [   0    0   16 1199    0   11    1   11   15    5]
 [   2    2    1   

# Support Vector Machine (SVM)

## Train SVM model

In [26]:
from sklearn.svm import SVC

In [27]:
svc = SVC()
svc.fit(x_train, y_train)

SVC()

In [28]:
predictions = svc.predict(x_validation)

In [30]:
print(classification_report(y_validation,predictions))
print('---------')
print(confusion_matrix(y_validation,predictions))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98      1158
           1       0.98      0.99      0.98      1339
           2       0.95      0.97      0.96      1185
           3       0.96      0.95      0.96      1258
           4       0.97      0.97      0.97      1229
           5       0.96      0.96      0.96      1081
           6       0.98      0.97      0.98      1156
           7       0.94      0.97      0.95      1231
           8       0.96      0.95      0.96      1186
           9       0.96      0.95      0.95      1177

    accuracy                           0.97     12000
   macro avg       0.97      0.97      0.97     12000
weighted avg       0.97      0.97      0.97     12000

---------
[[1140    0    3    0    1    4    9    0    1    0]
 [   0 1319    6    1    2    0    0    7    0    4]
 [   2    4 1152    4    2    0    2    7   10    2]
 [   0    2   16 1197    0    8    1   17   14    3]
 [   2    2    6   

# Results

Random Forest и SVC показали наилучшие результаты (97%). 

Задачу также ещё можно решить, например, методом кластеризации, для определния оптимального колличества кластеров можно использовать, например TSNE. Также модели можно улучшить с помощь PCA , определив максимальное колличество самых информативных компонент, которые будут отражать 90% дисперсии. 