In [1]:
import pandas as pd
import numpy as np
from numpy import argmax
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV


In [2]:
# Read the full CSV and keep only the rows from position 72000 onward.
# Per the original author's note, this tail is the "numbers" subset and is used
# to keep training fast -- TODO confirm against the row layout of data.csv.
data_df = pd.read_csv('data.csv').iloc[72000:]


In [3]:
# Separate the feature columns from the 'character' label column and scale
# every feature by 1/255 (presumably 8-bit pixel intensities, giving a 0-1
# range -- TODO confirm against data.csv).
X_df = data_df.drop(['character'], axis=1) / 255
# Convert the features to a NumPy array.
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in pandas 1.0;
# .values yields the same array and works on both old and current pandas.
X = X_df.values

In [4]:
# Label vector as a NumPy array. Series.as_matrix() was deprecated in pandas 0.23
# and removed in pandas 1.0; .values is the drop-in replacement.
y = data_df['character'].values

In [5]:
# Baseline classifier: one-vs-rest logistic regression using the L-BFGS solver;
# all other hyperparameters are left at the library defaults.
lr = LogisticRegression(
    solver='lbfgs',
    multi_class='ovr',
)

In [6]:
# Hold out 33% of the samples as a test set; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=123,
)

In [7]:
# Train the baseline model on the training split; the fitted estimator is the
# cell's displayed value.
lr.fit(X=X_train, y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='lbfgs', tol=0.0001,
          verbose=0, warm_start=False)

In [8]:
# Accuracy of the baseline model on the held-out test split.
lr.score(X=X_test, y=y_test)

0.9381818181818182

In [9]:
# The simple LR model reaches about 93.8% test accuracy, which is okay.
# Next, tune the hyperparameter C (inverse regularization strength).
# The search runs 5-fold cross-validation (cv=5) on the training split, so each
# of the five candidate values is fitted five times; on the full dataset this
# takes a long time.
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10]}
clf = GridSearchCV(
    estimator=LogisticRegression(solver='lbfgs', multi_class='ovr', penalty='l2'),
    param_grid=param_grid,
    cv=5,
)

In [10]:
# Run the cross-validated grid search on the training split only, so the test
# split stays unseen during tuning.
clf.fit(X=X_train, y=y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='lbfgs', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [0.001, 0.01, 0.1, 1, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [11]:
# Report the winning hyperparameters and their mean cross-validated score.
best_params_msg = "Tuned Logistic Regression Parameters: {}".format(clf.best_params_)
best_score_msg = "Best score is {}".format(clf.best_score_)
print(best_params_msg)
print(best_score_msg)

Tuned Logistic Regression Parameters: {'C': 0.1}
Best score is 0.9415671641791045


In [12]:
# Rebuild the classifier with the C chosen by the grid search (0.1 in the
# recorded run). Reading it from clf.best_params_ instead of hard-coding the
# value keeps this cell consistent with the search if the data or grid changes.
lr = LogisticRegression(multi_class='ovr', solver='lbfgs', C=clf.best_params_['C'])

In [13]:
# Refit on the training split using the tuned regularization setting.
lr.fit(X=X_train, y=y_train)

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='lbfgs', tol=0.0001,
          verbose=0, warm_start=False)

In [14]:
# Accuracy of the tuned model on the held-out test split, for comparison with
# the baseline score above.
lr.score(X=X_test, y=y_test)

0.941969696969697

In [15]:
# As you can see, tuning the regularization parameter C (from the default 1.0 to 0.1) raises the test accuracy from about 93.8% to about 94.2%.