In [1]:
# Import packages
import pandas as pd
import numpy as np
import itertools

# Seed NumPy's global random generator so repeated runs give the same output
SEED = 18
np.random.seed(SEED)

# Load the dataset, using the first CSV column as the row index
data = pd.read_csv(
    "data/dummy_data.csv",
    index_col=0,
)

In [2]:
from sklearn.model_selection import train_test_split

# Target is the `outcome` column; every remaining column goes into X
y = data["outcome"]
X = data.drop(columns="outcome")

# Hold out a quarter of the rows as the test set
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
)
print(f"Train {y_train.size}, Test {y_test.size}")

Train 37, Test 13


In [3]:
from sklearn.linear_model import LogisticRegression

# Baseline model: fit on the training split in one step (`fit` returns the
# estimator itself), then show it as the cell's output
lr = LogisticRegression(solver='lbfgs').fit(X_train, y_train)
lr

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [13]:
# Hard class labels for every test row. This assignment was missing from the
# notebook, so the prints below (and the later cell that loops over
# `predictions`) raised NameError on a fresh kernel.
predictions = lr.predict(X_test)

# Per-class probabilities for the same rows, from the same fitted model
prediction_probs = lr.predict_proba(X_test)

# View the prediction type, shape, and print out a sample prediction
print(f"predictions is of type: {type(predictions)}")
print(f"predictions has shape: {predictions.shape}")
print(f"predicted class for 10th element in test set: {predictions[9]}")

predictions is of type: <class 'numpy.ndarray'>
predictions has shape: (13,)
predicted class for 10th element in test set: 0


In [14]:
# Show the class labels held by the fitted model. The following cell reads
# probability column 0 as class 0 and column 1 as class 1, so this is the
# order to check against.
lr.classes_

array([0, 1])

In [15]:
# For the first five test rows, show the predicted label next to the two class
# probabilities. One print call per row; the trailing "\n" on the last field
# leaves a blank line between rows.
for i in range(5):
    print(
        f"Element number: {i}",
        f"Predicted class: {predictions[i]}",
        f"Probability of predicting class 0: {prediction_probs[i][0]}",
        f"Probability of predicting class 1: {prediction_probs[i][1]}\n",
        sep="\n",
    )

Element number: 0
Predicted class: 1
Probability of predicting class 0: 0.4234829673784545
Probability of predicting class 1: 0.5765170326215455

Element number: 1
Predicted class: 1
Probability of predicting class 0: 0.4914968703166641
Probability of predicting class 1: 0.5085031296833359

Element number: 2
Predicted class: 1
Probability of predicting class 0: 0.483088763245348
Probability of predicting class 1: 0.516911236754652

Element number: 3
Predicted class: 0
Probability of predicting class 0: 0.869536534985778
Probability of predicting class 1: 0.13046346501422199

Element number: 4
Predicted class: 0
Probability of predicting class 0: 0.8470774295731546
Probability of predicting class 1: 0.15292257042684546



In [16]:
# Take the second column of `prediction_probs` for every test row — the column
# the previous cell printed as the probability of class 1.
prediction_probs[:, 1]

array([0.57651703, 0.50850313, 0.51691124, 0.13046347, 0.15292257,
       0.26162479, 0.50831618, 0.3190805 , 0.37250246, 0.47736442,
       0.15743244, 0.51193665, 0.26832495])

## Tuning the Model

In [18]:
# Score the baseline model on the held-out test split; this is the reference
# value for the tweaked models below, which report the same metric as
# "Mean Accuracy".
lr.score(X_test, y_test)

0.6153846153846154

In [20]:
# One hand-picked alternative configuration, kept in a dict so it can be
# unpacked straight into the estimator's constructor
params = dict(
    solver='liblinear',
    fit_intercept=False,
    penalty='l1',
    max_iter=500,
)

# Build and fit in one step; `fit` returns the estimator
lr_tweaked = LogisticRegression(**params).fit(X_train, y_train)

# Report the full resolved parameter set, then the score on the test split
print(f"Tweaked hyperparameters: {lr_tweaked.get_params()}\n")
print(f"Mean Accuracy: {lr_tweaked.score(X_test, y_test)}\n")

Tweaked hyperparameters: {'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': False, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 500, 'multi_class': 'warn', 'n_jobs': None, 'penalty': 'l1', 'random_state': None, 'solver': 'liblinear', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}

Mean Accuracy: 0.5384615384615384



In [21]:
# Candidate values for each hyperparameter; insertion order of the keys fixes
# the position of each value list in `hp_values`
hyperparams = dict(
    solver=['liblinear'],
    fit_intercept=[True, False],
    penalty=['l1', 'l2'],
    class_weight=[None, "balanced"],
)

# The value lists alone, in key order, ready to feed to itertools.product
hp_values = [*hyperparams.values()]
hp_values

[['liblinear'], [True, False], ['l1', 'l2'], [None, 'balanced']]

In [22]:
# Preview the grid: the Cartesian product yields one tuple per combination,
# with values in the same order as the lists in `hp_values`
hp_grid = itertools.product(*hp_values)
for hp in hp_grid:
    print(hp)

('liblinear', True, 'l1', None)
('liblinear', True, 'l1', 'balanced')
('liblinear', True, 'l2', None)
('liblinear', True, 'l2', 'balanced')
('liblinear', False, 'l1', None)
('liblinear', False, 'l1', 'balanced')
('liblinear', False, 'l2', None)
('liblinear', False, 'l2', 'balanced')


In [23]:
# Fit and score one model per hyperparameter combination.
# NOTE(review): each score is computed on the test split, so picking the best
# row here would tune against the test data — consider a validation split or
# cross-validation before using this for model selection.
hp_names = list(hyperparams)

for hp in itertools.product(*hp_values):

    # Pair every value with its hyperparameter name instead of indexing
    # hp[0]..hp[3], so the call stays correct if entries are added to or
    # reordered in `hyperparams`.
    estimator = LogisticRegression(**dict(zip(hp_names, hp)))

    estimator.fit(X_train, y_train)
    print(f"Parameters Used: {hp}")
    print(f"Mean accuracy of the model: {estimator.score(X_test, y_test)}\n")

Parameters Userd: ('liblinear', True, 'l1', None)
Mean accuracy of the model: 0.5384615384615384

Parameters Userd: ('liblinear', True, 'l1', 'balanced')
Mean accuracy of the model: 0.46153846153846156

Parameters Userd: ('liblinear', True, 'l2', None)
Mean accuracy of the model: 0.38461538461538464

Parameters Userd: ('liblinear', True, 'l2', 'balanced')
Mean accuracy of the model: 0.46153846153846156

Parameters Userd: ('liblinear', False, 'l1', None)
Mean accuracy of the model: 0.5384615384615384

Parameters Userd: ('liblinear', False, 'l1', 'balanced')
Mean accuracy of the model: 0.46153846153846156

Parameters Userd: ('liblinear', False, 'l2', None)
Mean accuracy of the model: 0.3076923076923077

Parameters Userd: ('liblinear', False, 'l2', 'balanced')
Mean accuracy of the model: 0.46153846153846156

