## Logistic regressions

In [1]:
import numpy as np
import os
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt

# Load the numpy .npz file
with np.load(os.path.join('data', 'cifar4-train.npz'), allow_pickle=False) as data:
    cifar4_data = dict(data.items())
    
print('Data loaded')
print('It is a dictionary with keys:', list(cifar4_data.keys()))

Data loaded
It is a dictionary with keys: ['pixels', 'overfeat', 'labels', 'names', 'allow_pickle']


In [2]:
X = cifar4_data['overfeat']
y = cifar4_data['labels']

print('X shape:',X.shape)
print('y shape:', y.shape)

X shape: (5000, 4096)
y shape: (5000,)


From the data exploration we use 164 features and we apply it to the data set.

In [3]:
# Splitting the data
from sklearn.model_selection import train_test_split

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y,train_size=4000,test_size=1000,
                                          random_state=0, stratify=y)

# checking the shapes
print('X_train:', X_train.shape, X_train.dtype)
print('y_train:', y_train.shape, y_train.dtype)
print('X_test:', X_test.shape, X_test.dtype)
print('y_test:', y_test.shape, y_test.dtype)

X_train: (4000, 4096) float32
y_train: (4000,) int64
X_test: (1000, 4096) float32
y_test: (1000,) int64


In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

# Create pipeline
pipe = Pipeline([
    ('pca',None),
    ('logreg',LogisticRegression())
])

# Create cross-validation object
grid_cv=GridSearchCV(pipe,[{
    'logreg__multi_class':['ovr'],
    'logreg__C':[0.1,1,10],
    'logreg__solver':['liblinear']
},{
    'pca':[PCA(n_components=164)],
    'logreg__multi_class':['ovr'],
    'logreg__C':[0.1,1,10],
    'logreg__solver':['liblinear']
}],cv=5)

# Fit estimator
grid_cv.fit(X_train,y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('pca', None), ('logreg', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'logreg__multi_class': ['ovr'], 'logreg__C': [0.1, 1, 10], 'logreg__solver': ['liblinear']}, {'pca': [PCA(copy=True, iterated_power='auto', n_components=164, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)], 'logreg__multi_class': ['ovr'], 'logreg__C': [0.1, 1, 10], 'logreg__solver': ['liblinear']}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [25]:
df = pd.DataFrame.from_items([
    ('alpha',grid_cv.cv_results_['param_logreg__C']),
    ('mean accuracy',grid_cv.cv_results_['mean_test_score']),
    ('standard deviation',grid_cv.cv_results_['std_test_score'])
])
df.sort_values(by='mean accuracy',ascending=False)

Unnamed: 0,alpha,mean accuracy,standard deviation
3,0.1,0.82975,0.015839
5,10.0,0.82875,0.017428
4,1.0,0.82775,0.017738
0,0.1,0.827,0.012614
1,1.0,0.82,0.013784
2,10.0,0.815,0.015871


In [26]:
# Best combination
idx_max = df.loc[df['mean accuracy'].idxmax()]

print('Top accuracy across folds:',idx_max[1],
      ' (std:',idx_max[2],
      ') with 164 components and alpha:',idx_max[0])

Top accuracy across folds: 0.82975  (std: 0.015839034061457154 ) with 164 components and alpha: 0.1


In [27]:
# Compute predictions with the "best_estimator_" attribute
grid_cv.best_estimator_.predict(X_test)

array([2, 2, 2, 0, 3, 0, 2, 0, 0, 1, 3, 1, 2, 2, 3, 2, 3, 0, 1, 0, 0, 1,
       3, 3, 0, 1, 1, 3, 1, 0, 0, 2, 1, 3, 1, 1, 3, 2, 3, 3, 2, 0, 0, 1,
       2, 0, 3, 1, 1, 2, 0, 0, 1, 2, 2, 3, 3, 2, 2, 2, 3, 2, 1, 0, 3, 3,
       3, 0, 1, 3, 2, 3, 1, 2, 3, 0, 3, 3, 2, 1, 3, 2, 2, 0, 2, 1, 2, 0,
       1, 0, 0, 2, 1, 3, 1, 3, 1, 3, 0, 1, 3, 2, 2, 1, 1, 2, 1, 1, 0, 0,
       3, 1, 1, 1, 3, 2, 1, 1, 1, 1, 1, 2, 1, 0, 1, 2, 3, 3, 2, 2, 2, 3,
       0, 1, 1, 1, 2, 3, 0, 2, 1, 3, 0, 3, 1, 1, 2, 2, 2, 1, 2, 2, 2, 3,
       2, 3, 3, 2, 2, 2, 2, 2, 1, 2, 1, 1, 2, 1, 0, 1, 0, 0, 2, 3, 3, 1,
       2, 0, 0, 2, 1, 0, 2, 1, 2, 1, 3, 1, 1, 3, 3, 3, 1, 0, 1, 3, 0, 0,
       2, 2, 0, 0, 1, 3, 0, 3, 1, 2, 3, 3, 2, 3, 3, 1, 3, 2, 0, 3, 0, 3,
       2, 0, 1, 1, 0, 3, 3, 1, 0, 2, 1, 2, 1, 1, 2, 2, 3, 1, 2, 2, 1, 3,
       3, 2, 0, 0, 2, 0, 1, 3, 0, 2, 3, 3, 2, 0, 3, 1, 3, 3, 3, 3, 0, 3,
       1, 3, 3, 1, 0, 2, 1, 2, 1, 2, 0, 3, 1, 3, 1, 3, 2, 0, 3, 2, 3, 0,
       3, 1, 1, 0, 0, 2, 3, 2, 0, 2, 3, 2, 2, 2, 2,

In [32]:
accuracy = grid_cv.score(X_test,y_test)

In [33]:
print('Logistic regression accuracy (test set):',accuracy)

Logistic regression accuracy (test set): 0.813


## Second task
#### pick ten images from the test set and compute probabilities

In [34]:
import random

## 10 random indices 
ten_imgs_idx = random.choices(range(1000), k=10)

preds = grid_cv.predict_proba(X_test[ten_imgs_idx])

prob_preds = pd.DataFrame(np.around(preds, decimals=2),
                          columns= cifar4_data['names'],
                          index=['image {}'.format(i) for i in range(1, 11)])

In [35]:
prob_preds

Unnamed: 0,truck,car,airplane,ship
image 1,0.0,0.0,1.0,0.0
image 2,0.67,0.23,0.01,0.08
image 3,0.0,0.0,0.99,0.01
image 4,0.01,0.89,0.1,0.01
image 5,0.0,0.04,0.56,0.4
image 6,0.58,0.42,0.0,0.0
image 7,0.02,0.01,0.97,0.0
image 8,0.24,0.53,0.01,0.22
image 9,0.92,0.0,0.01,0.06
image 10,0.31,0.68,0.01,0.0


### Try with different regularization strengths. Write your observations.

In [36]:
# with best alpha/1000

# Create cross-validation object
grid_cv=GridSearchCV(pipe,[{
    'logreg__multi_class':['ovr'],
    'logreg__C':[0.001],
    'logreg__solver':['liblinear']
},{
    'pca':[PCA(n_components=164)],
    'logreg__multi_class':['ovr'],
    'logreg__C':[0.001],
    'logreg__solver':['liblinear']
}],cv=5)

# Fit estimator
grid_cv.fit(X_train,y_train)

# Create dataframe
df = pd.DataFrame.from_items([
    ('alpha',grid_cv.cv_results_['param_logreg__C']),
    ('mean accuracy',grid_cv.cv_results_['mean_test_score']),
    ('standard deviation',grid_cv.cv_results_['std_test_score'])
])
df.sort_values(by='mean accuracy',ascending=False)

Unnamed: 0,alpha,mean accuracy,standard deviation
0,0.001,0.841,0.015195
1,0.001,0.826,0.015029


In [37]:
preds = grid_cv.predict_proba(X_test[ten_imgs_idx])

prob_preds_low_alpha = pd.DataFrame(np.around(preds, decimals=2),
                          columns= cifar4_data['names'],
                          index=['image {}'.format(i) for i in range(1, 11)])

prob_preds_low_alpha

Unnamed: 0,truck,car,airplane,ship
image 1,0.0,0.0,0.98,0.01
image 2,0.64,0.18,0.02,0.17
image 3,0.0,0.0,0.95,0.05
image 4,0.02,0.82,0.15,0.02
image 5,0.0,0.13,0.61,0.26
image 6,0.58,0.41,0.0,0.01
image 7,0.02,0.01,0.96,0.01
image 8,0.26,0.47,0.04,0.22
image 9,0.87,0.02,0.02,0.09
image 10,0.23,0.75,0.01,0.0


In [38]:
# with best alpha*1000

# Create cross-validation object
grid_cv=GridSearchCV(pipe,[{
    'logreg__multi_class':['ovr'],
    'logreg__C':[1000],
    'logreg__solver':['liblinear']
},{
    'pca':[PCA(n_components=164)],
    'logreg__multi_class':['ovr'],
    'logreg__C':[1000],
    'logreg__solver':['liblinear']
}],cv=5)

# Fit estimator
grid_cv.fit(X_train,y_train)

# Create dataframe
df = pd.DataFrame.from_items([
    ('alpha',grid_cv.cv_results_['param_logreg__C']),
    ('mean accuracy',grid_cv.cv_results_['mean_test_score']),
    ('standard deviation',grid_cv.cv_results_['std_test_score'])
])
df.sort_values(by='mean accuracy',ascending=False)

Unnamed: 0,alpha,mean accuracy,standard deviation
1,1000,0.8305,0.01495
0,1000,0.811,0.014904


In [39]:
preds = grid_cv.predict_proba(X_test[ten_imgs_idx])

prob_preds_high_alpha = pd.DataFrame(np.around(preds, decimals=2),
                          columns= cifar4_data['names'],
                          index=['image {}'.format(i) for i in range(1, 11)])

prob_preds_high_alpha

Unnamed: 0,truck,car,airplane,ship
image 1,0.0,0.0,1.0,0.0
image 2,0.72,0.19,0.0,0.09
image 3,0.0,0.0,1.0,0.0
image 4,0.0,0.9,0.09,0.0
image 5,0.0,0.03,0.6,0.37
image 6,0.59,0.41,0.0,0.0
image 7,0.01,0.0,0.99,0.0
image 8,0.25,0.53,0.01,0.22
image 9,0.92,0.0,0.01,0.06
image 10,0.28,0.71,0.01,0.0


It seems with a lower regularization we can obtain better results in the train set. Let's use 0.001 as our tuned regularization strength.

In [55]:
prob_preds_low_alpha

Unnamed: 0,truck,car,airplane,ship
image 1,0.0,0.0,0.98,0.01
image 2,0.64,0.18,0.02,0.17
image 3,0.0,0.0,0.95,0.05
image 4,0.02,0.82,0.15,0.02
image 5,0.0,0.13,0.61,0.26
image 6,0.58,0.41,0.0,0.01
image 7,0.02,0.01,0.96,0.01
image 8,0.26,0.47,0.04,0.22
image 9,0.87,0.02,0.02,0.09
image 10,0.23,0.75,0.01,0.0


The higher the alpha the more strong opinionated is the algorithm. For instance, in images 1 and 3 with a higher alpha our model has 100% probability of identifying the image as an airplane. However, with a low alpha our model recognizes that there is a small probability that images 1 and 3 could be a ship instead.

In [40]:
# Create cross-validation object
grid_cv=GridSearchCV(pipe,[{
    'logreg__multi_class':['ovr'],
    'logreg__C':[0.001],
    'logreg__solver':['liblinear']
},{
    'pca':[PCA(n_components=164)],
    'logreg__multi_class':['ovr'],
    'logreg__C':[0.001],
    'logreg__solver':['liblinear']
}],cv=5)

# Fit estimator
grid_cv.fit(X_train,y_train)

# Create dataframe
df = pd.DataFrame.from_items([
    ('alpha',grid_cv.cv_results_['param_logreg__C']),
    ('mean accuracy',grid_cv.cv_results_['mean_test_score']),
    ('standard deviation',grid_cv.cv_results_['std_test_score'])
])
df.sort_values(by='mean accuracy',ascending=False)

Unnamed: 0,alpha,mean accuracy,standard deviation
0,0.001,0.841,0.015195
1,0.001,0.826,0.014904


In [41]:
# Compute predictions with the "best_estimator_" attribute
grid_cv.best_estimator_.predict(X_test)

array([2, 2, 2, 0, 2, 0, 2, 0, 0, 1, 3, 1, 2, 2, 2, 2, 3, 1, 1, 0, 0, 1,
       3, 3, 0, 1, 1, 3, 1, 0, 0, 2, 1, 3, 1, 1, 3, 2, 3, 3, 2, 0, 0, 1,
       2, 0, 3, 1, 1, 2, 0, 0, 0, 2, 2, 3, 3, 2, 2, 2, 3, 2, 1, 0, 3, 3,
       3, 0, 1, 1, 2, 3, 1, 2, 3, 3, 3, 3, 2, 1, 3, 2, 2, 0, 2, 1, 2, 0,
       1, 0, 0, 2, 1, 3, 1, 3, 1, 3, 0, 1, 3, 2, 2, 1, 1, 2, 1, 1, 0, 1,
       3, 1, 1, 1, 3, 2, 1, 1, 1, 1, 1, 2, 1, 0, 1, 2, 3, 3, 3, 2, 2, 3,
       0, 0, 1, 1, 2, 1, 0, 2, 1, 3, 0, 3, 1, 1, 2, 3, 2, 1, 2, 2, 2, 3,
       2, 3, 3, 2, 2, 2, 2, 2, 1, 2, 1, 1, 2, 0, 0, 1, 0, 0, 2, 3, 3, 1,
       3, 0, 0, 2, 1, 0, 2, 1, 2, 1, 3, 1, 1, 3, 3, 3, 1, 0, 1, 3, 0, 0,
       2, 2, 0, 0, 1, 3, 0, 3, 0, 2, 3, 3, 2, 3, 3, 1, 3, 1, 0, 3, 0, 3,
       0, 0, 1, 1, 0, 3, 2, 1, 0, 2, 1, 2, 1, 1, 2, 3, 3, 1, 3, 2, 1, 3,
       3, 2, 0, 0, 2, 0, 1, 3, 0, 2, 3, 3, 2, 0, 3, 1, 3, 3, 3, 3, 0, 3,
       1, 3, 0, 1, 0, 2, 1, 2, 1, 2, 0, 3, 1, 3, 1, 3, 2, 0, 3, 2, 3, 1,
       3, 1, 1, 0, 0, 2, 3, 3, 0, 2, 3, 2, 2, 2, 2,

In [42]:
accuracy = grid_cv.score(X_test,y_test)

In [43]:
print('Logistic regression accuracy (test set):',accuracy)

Logistic regression accuracy (test set): 0.844


# Predictions

In [20]:
# Load the numpy .npz file
with np.load(os.path.join('data', 'cifar4-test.npz'), allow_pickle=False) as data:
    cifar4_test = dict(data.items())
    
print('Data loaded')
print('It is a dictionary with keys:', list(cifar4_test.keys()))

X = cifar4_test['overfeat']

print('X shape:',X.shape)

Data loaded
It is a dictionary with keys: ['pixels', 'overfeat', 'allow_pickle']
X shape: (1000, 4096)


In [24]:
# Compute predictions
y_predictions = grid_cv.best_estimator_.predict(X)
print(y_predictions.shape)

(1000,)


In [27]:
# Save as npy
np.save('test-predictions.npy',y_predictions)