# Logistic Regression via Randomized Search CV # 

In [2]:
import numpy as np
import pandas
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score
import time

Run Logistic Regression on TWO datasets: One for the 28x28 images, one for the 64x64 images. 

## 28 x 28 images ##

In [3]:
data = pandas.read_csv('data28.csv')
y = data['label'].values
X = data[data.columns[1:]].values

# 70-30 train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.7, random_state = 1)

In [4]:
grid={"C" : np.append(np.logspace(-3,3,7), 0), 'penalty' : ["l2"], "max_iter": [100,150,200,250,300,350,400]} # l1: lasso, l2: ridge
logreg=LogisticRegression()
logreg_cv_=RandomizedSearchCV(logreg,grid,cv=5,n_jobs=-1,verbose=4,random_state=1,n_iter=10) # run all jobs in parallel (n_jobs=-1), and print the CV accuracy per fold on the terminal (verbose = 4)
t0 = time.time()
logreg_cv_.fit(X_train,y_train)
t1 = time.time()
print('Randomized search (10 iterations) for 28 x 28 images took', t1 - t0, 'seconds')

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Randomized search (10 iterations) for 28 x 28 images took 23.3782320022583 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [5]:
print("tuned hyperparameters: (best parameters) ",logreg_cv_.best_params_)
print("CV accuracy :",logreg_cv_.best_score_)

tuned hyperparameters: (best parameters)  {'penalty': 'l2', 'max_iter': 200, 'C': 0.001}
CV accuracy : 0.8386666666666667


In [6]:
C = logreg_cv_.best_params_['C']
max_iter = logreg_cv_.best_params_['max_iter']
penalty = logreg_cv_.best_params_['penalty']

lr = LogisticRegression(max_iter=max_iter, C=C, penalty=penalty)
t0 = time.time()
lr.fit(X_train, y_train.flatten())
t1 = time.time()
print('Training for 28 x 28 images took', t1 - t0, 'seconds')

Training for 28 x 28 images took 0.7330210208892822 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [7]:
# training accuracy
train_acc = accuracy_score(y_train, lr.predict(X_train)) 
train_acc

0.9713333333333334

In [8]:
# test accuracy
test_acc = accuracy_score(y_test, lr.predict(X_test)) 
test_acc

0.8517142857142858

In [9]:
# write cv results (such as mean fit times for each hyperparam configuration) to file.
with open('logreg_28_randomcv.txt', 'w') as file:
    file.write(str(logreg_cv_.cv_results_))
    file.close()

In [10]:
# Note: The 'mean_fit_time' property (in files like above) is particularly useful for our report,
# yet the name is misleading, as this is the TOTAL fitting time it takes 
# for performing ALL the folds for one set of hyperparameters.
# The 'mean_fit_time' is a 1-D array, placed in the same corresponding order as the 'params' field.

## 64 x 64 images ##

In [11]:
data = pandas.read_csv('data64.csv')
y = data['label'].values
X = data[data.columns[1:]].values

# 70-30 train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.7, random_state = 1)

In [12]:
grid={"C" : np.append(np.logspace(-3,3,7), 0), 'penalty' : ["l2"], "max_iter": [100,150,200,250,300,350,400]} # l1: lasso, l2: ridge
logreg=LogisticRegression()
logreg_cv=RandomizedSearchCV(logreg,grid,cv=5,n_jobs=-1,verbose=4,random_state=1,n_iter=10) # run all jobs in parallel (n_jobs=-1), and print the CV accuracy per fold on the terminal (verbose = 4)
t0 = time.time()
logreg_cv.fit(X_train,y_train)
t1 = time.time()
print('Randomized search (10 iterations) for 64 x 64 images took', t1 - t0, 'seconds')

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Randomized search (10 iterations) for 64 x 64 images took 123.62241625785828 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [13]:
print("tuned hyperparameters: (best parameters) ",logreg_cv.best_params_)
print("CV accuracy :",logreg_cv.best_score_)

tuned hyperparameters: (best parameters)  {'penalty': 'l2', 'max_iter': 200, 'C': 0.001}
CV accuracy : 0.8306666666666667


In [14]:
C = logreg_cv.best_params_['C']
max_iter = logreg_cv.best_params_['max_iter']
penalty = logreg_cv.best_params_['penalty']

lr = LogisticRegression(max_iter=max_iter, C=C, penalty=penalty)
t0 = time.time()
lr.fit(X_train, y_train.flatten())
t1 = time.time()
print('Training for 64 x 64 images took', t1 - t0, 'seconds')

Training for 64 x 64 images took 3.9675979614257812 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [15]:
# training accuracy
train_acc = accuracy_score(y_train, lr.predict(X_train)) 
train_acc

0.9946666666666667

In [16]:
# test accuracy
test_acc = accuracy_score(y_test, lr.predict(X_test)) 
test_acc

0.8425714285714285

In [17]:
# write cv results (such as mean fit times for each hyperparam configuration) to file
with open('logreg_64_randomcv.txt', 'w') as file:
    file.write(str(logreg_cv.cv_results_))
    file.close()

### Resources (cite for report):
* https://www.kaggle.com/enespolat/grid-search-with-logistic-regression 