In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [29]:
def show_img_and_label(img,label,pred_label):
    img = np.array(img)
    plt.title("Label: "+str(label)+",predicted: "+str(pred_label))
    plt.imshow(img.reshape(28,28),cmap='gray',vmin=0, vmax=255)

In [30]:
def show_image(array):
    array = np.array(array)
    label = array[0]
    img = np.delete(array,0)
    plt.imshow(img.reshape(28,28),cmap='gray',vmin=0, vmax=255)

In [31]:
df = pd.read_csv('train.csv');

## Analytics of data

In [32]:
df.info() #get info from dataframe

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42000 entries, 0 to 41999
Columns: 785 entries, label to pixel783
dtypes: int64(785)
memory usage: 251.5 MB


In [33]:
df.describe() # get info about every colons

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
count,42000.0,42000.0,42000.0,42000.0,42000.0,42000.0,42000.0,42000.0,42000.0,42000.0,...,42000.0,42000.0,42000.0,42000.0,42000.0,42000.0,42000.0,42000.0,42000.0,42000.0
mean,4.456643,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.219286,0.117095,0.059024,0.02019,0.017238,0.002857,0.0,0.0,0.0,0.0
std,2.88773,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,6.31289,4.633819,3.274488,1.75987,1.894498,0.414264,0.0,0.0,0.0,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,254.0,254.0,253.0,253.0,254.0,62.0,0.0,0.0,0.0,0.0


In [34]:
corr_matrix = df.corr() #get matrix of correlation between colons
corr_matrix['label'].sort_values(ascending = False)

KeyboardInterrupt: KeyboardInterrupt: 

## Clasification method from sklearning

In [9]:
X = df.drop(['label'], axis=1) 

In [10]:
y = df['label']

In [11]:
scaler = StandardScaler() # main idea is normalize and standardize value before using some train algorithms 

In [12]:
scaler.fit(X) # compute the mean and std to be used for later scaling.

X=scaler.transform(X) # perform standardization by centering and scaling

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) # split data for training and test sets

In [14]:
X_train.shape #get our size of train set

(37800, 784)

In [15]:
X_test.shape #get our size of test set

(4200, 784)

In [17]:
clf = LogisticRegression(max_iter =1000).fit(X_train, y_train) #Logistic Regression from sklearning and train on train data

In [18]:
clf.coef_

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [35]:
predicted = clf.predict(X_test) # get prediction on test data

In [36]:
delta =(predicted == y_test) # get a successful index of prediction value

In [37]:
unique, counts = np.unique(y_test[~delta], return_counts=True) # get a failed number where we wrong predict a value
print("failed predict value:",dict(zip(unique, counts)))

failed predict value: {0: 17, 1: 15, 2: 55, 3: 67, 4: 38, 5: 47, 6: 28, 7: 41, 8: 51, 9: 48}


In [38]:
clf.score(X_train,y_train) # return the mean accuracy on the given train data and labels. 

0.9551058201058201

In [39]:
clf.score(X_test,y_test) # return the mean accuracy on the given test data and labels. 

0.9030952380952381

In [40]:
accuracy_score(y_test,predicted) # accuracy classification score.

0.9030952380952381

## Method to predict result with custom logistic regression

In [41]:
def normalize(X):
    mean = np.mean(X,axis=0)
    std = np.std(X,axis=0)
    std[std < 0.001] =1
    X_new = (X-mean)/std
    return X_new, mean, std

In [42]:
def normalize_(X,mean,std):
    X_new = (X-mean)/std;
    return X_new

In [43]:
def prepare_X(X):
    m = X.shape[0]
    ones = np.ones((m, 1))
    X_new = np.column_stack((ones, X))
    return X_new

In [44]:
def sigmoid( valueToActivate ):

    g = 1/(1+np.exp(valueToActivate*-1)) #activation function

    return g

In [45]:
def h(X, theta):
    z = X.dot(theta)
    return sigmoid(z)

In [46]:
def cost_function(X, y, theta):
    m = X.shape[0]
    if m ==0:
        return None
    temp =h(X,theta)
    J = sum(-1*y*np.log(temp) + (1-y)*np.log(1-temp) )/m
    return J

In [47]:
def derivative_theta(X, y, theta):
    m = X.shape[0]
    if m == 0:
        return None

    d_theta = X_new.T.dot(h(X_new,theta) -y_new)/m;
    
    return d_theta

In [48]:
def gradient_descent(X, y, theta, alpha, epsilon, num_iters, print_J = True):
    m = X.shape[0]
    J_history = []
    J = cost_function(X,y,theta)
    if print_J == True:
        print(J)
    J_history.append(J)
    for i in range(num_iters):
        theta -= alpha* derivative_theta(X,y,theta)
        J = cost_function(X,y,theta)
        J_history.append(J)
        if abs(J-J_history[-2]) < epsilon:
            break
    return theta, J_history

In [49]:

def predict(X, mean, std, models,labels):
    
    X_new =normalize_(X,mean,std);
    X_new = prepare_X(X_new);
    predictions = [];
    for theta in models:
        predictions.append(h(X_new,theta))
    
    predict =[];
    for p in range(0,len(predictions[0])):
        max = abs(predictions[0][p]);
        max_label  =labels[0];
        for l in range(1,len(labels)):
            if(abs(predictions[l][p]) > max):
                max = abs(predictions[l][p]);
                max_label  =labels[l];
        predict.append(max_label);
    return predict

In [50]:
X = df.drop(['label'], axis=1)

In [51]:
y = df['label']

In [52]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [53]:
from sklearn.preprocessing import OneHotEncoder

In [54]:
enc = OneHotEncoder(handle_unknown='error') # transforms lists of feature-value mappings to vectors.

In [55]:
enc.fit(np.array(y).reshape(-1,1))

OneHotEncoder()

In [56]:
enc.categories_ #get all categoricals feachers

[array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])]

In [57]:
y_train_encoded  = enc.transform(np.array(y_train).reshape(-1,1)).toarray()

In [58]:
models =[] 

In [59]:
X_new,mean,std = normalize(X_train) #normalize a train data

In [60]:
X_new = prepare_X(X_new) #prepare data for train model

In [114]:
for m in range(0,10):
    theta = np.zeros((X_new.shape[1], 1))
    y_new =y_train_encoded[:,m].reshape((X_train.shape[0], 1))
    new_theta, Js = gradient_descent(X_new, y_new, theta, 0.01, 1e-5, 1000, False)
    models.append(theta) #train our model

KeyboardInterrupt: KeyboardInterrupt: 

In [86]:
y_train_pred=predict(X_train,mean,std,models,[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) # predict value in train set

ValueError: ValueError: shapes (37800,785) and (784,1) not aligned: 785 (dim 1) != 784 (dim 0)

In [71]:
accuracy_score(y_train, y_train_pred) # our accuracy score in train set

NameError: NameError: name 'y_train_pred' is not defined

In [61]:
y_test_pred =predict(X_test,mean,std,models,[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) #try to predict value in test set

ValueError: ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 4200 and the array at index 1 has size 42000

In [50]:
accuracy_score(y_test, y_test_pred) # accuracy score for test data

NameError: NameError: name 'y_test' is not defined