# Stroke Data Benchmark Model

In [16]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV

In [17]:
# Load the preprocessed train/test feature and label tables from CSV.
# Files are read in the listed order and unpacked into names in that same order.
x_train, x_test, y_train, y_test = map(
    pd.read_csv,
    [
        'data/x_train.csv',
        'data/x_test.csv',
        'data/y_train.csv',
        'data/y_test.csv',
    ],
)

## Benchmark Model

### Logistic Regression

In [18]:
# Benchmark estimator: logistic regression with the liblinear solver and a fixed seed.
# NOTE: the next code cell builds this same estimator again, so this cell is redundant.
model = LogisticRegression(
    random_state=0,
    solver='liblinear',
)

## Logistic Regression

A simple binary classifier

In [19]:
# Create the (not yet fitted) logistic-regression classifier; the seed is pinned to 0
model = LogisticRegression(
    random_state=0,
    solver='liblinear',
)

In [20]:
# Fit on the training split; the label frame is flattened to a 1-D array first
train_labels = y_train.values.ravel()
model.fit(x_train, train_labels)

LogisticRegression(random_state=0, solver='liblinear')

In [21]:
# Predict class labels for the test features; the array is shown as the cell output
test_predictions = model.predict(x_test)
test_predictions

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

### Calculate F1 score

In [22]:
# Confusion matrix of the true test labels against predictions from the current model
test_predictions = model.predict(x_test)
matrix = confusion_matrix(y_test, test_predictions)
print(matrix)

[[1589    2]
 [  95    1]]


In [23]:
# Pull the counts out of the 2x2 confusion matrix
# (rows are the actual class, columns are the predicted class).
tp = matrix[1][1]  # actual 1, predicted 1
fn = matrix[1][0]  # actual 1, predicted 0
fp = matrix[0][1]  # actual 0, predicted 1

# Each ratio falls back to 0.0 when its denominator is zero. Without the guard the
# division yields NaN (with a RuntimeWarning) whenever the model gets no positive right.

# Recall: share of actual positives that were predicted positive
recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
print('Recall score: ' + str(round(recall, 4)))

# Precision: share of predicted positives that are actually positive
precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
print('Precision score: ' + str(round(precision, 4)))

# F1: harmonic mean of precision and recall
f1 = (2 * precision * recall / (precision + recall)) if (precision + recall) > 0 else 0.0
print('F1 score: ' + str(round(f1, 4)))

Recall score: 0.0104
Precision score: 0.3333
F1 score: 0.0202


### Tweak model parameters

In [24]:
# Rebuild the classifier with class_weight='balanced' and refit it on the training split.
# The final expression is the fitted estimator, so its repr is the cell output.
model = LogisticRegression(
    class_weight='balanced',
    random_state=0,
    solver='liblinear',
)
train_labels = y_train.values.ravel()
model.fit(x_train, train_labels)

LogisticRegression(class_weight='balanced', random_state=0, solver='liblinear')

In [25]:
# Confusion matrix of the true test labels against the re-weighted model's predictions
test_predictions = model.predict(x_test)
matrix = confusion_matrix(y_test, test_predictions)
print(matrix)

[[1178  413]
 [  24   72]]


In [26]:
# Pull the counts out of the 2x2 confusion matrix
# (rows are the actual class, columns are the predicted class).
tp = matrix[1][1]  # actual 1, predicted 1
fn = matrix[1][0]  # actual 1, predicted 0
fp = matrix[0][1]  # actual 0, predicted 1

# Each ratio falls back to 0.0 when its denominator is zero. Without the guard the
# division yields NaN (with a RuntimeWarning) whenever the model gets no positive right.

# Recall: share of actual positives that were predicted positive
recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
print('Recall score: ' + str(round(recall, 4)))

# Precision: share of predicted positives that are actually positive
precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
print('Precision score: ' + str(round(precision, 4)))

# F1: harmonic mean of precision and recall
f1 = (2 * precision * recall / (precision + recall)) if (precision + recall) > 0 else 0.0
print('F1 score: ' + str(round(f1, 4)))

Recall score: 0.75
Precision score: 0.1485
F1 score: 0.2478


In [27]:
# List every hyperparameter of the current model, including those left at their defaults
model.get_params(deep=True)

{'C': 1.0,
 'class_weight': 'balanced',
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': 0,
 'solver': 'liblinear',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

### Using Grid Search

In [28]:
# Base estimator for the search. The seed is pinned to 0, like every other model
# in this notebook, so repeated runs start from the same random state.
lr = LogisticRegression(random_state=0)

# Parameters held at a single value (not actually searched over).
solver_options = ['liblinear']
# NOTE(review): newer scikit-learn releases deprecate `multi_class`; confirm against
# the installed version before upgrading.
multi_class_options = ['ovr']
max_iter_value = [1000000]

# Candidate weights for class 0: 20 evenly spaced values from 0.0 to 0.99.
# Class 1 receives the complement, so each pair sums to 1.
# NOTE: the first candidate gives class 0 a weight of exactly 0.0.
weights = np.linspace(0.0, 0.99, 20)
class_weight_options = [{0: x, 1: 1.0 - x} for x in weights]

# Candidate values for C
c_values = [1, 10, 100, 1000]

# Full grid: only class_weight and C have more than one candidate.
param_grid = {
    'solver': solver_options,
    'multi_class': multi_class_options,
    'class_weight': class_weight_options,
    'max_iter': max_iter_value,
    'C': c_values,
}

# Score candidates by cross-validated F1; n_jobs=-1 runs the fits in parallel.
# The final expression is the fitted search object, so its repr is the cell output.
grid = GridSearchCV(lr, param_grid, scoring='f1', n_jobs=-1)
grid.fit(x_train, y_train.values.ravel())

GridSearchCV(estimator=LogisticRegression(), n_jobs=-1,
             param_grid={'C': [1, 10, 100, 1000],
                         'class_weight': [{0: 0.0, 1: 1.0},
                                          {0: 0.05210526315789474,
                                           1: 0.9478947368421052},
                                          {0: 0.10421052631578948,
                                           1: 0.8957894736842105},
                                          {0: 0.1563157894736842,
                                           1: 0.8436842105263158},
                                          {0: 0.20842105263157895,
                                           1: 0.791578947368421},
                                          {0: 0.2605263157894737,
                                           1: 0.7394736842105263},
                                          {0: 0.3126315...
                                           1: 0.3226315789473684},
                                         

In [29]:
# Show the estimator the grid search selected as best
best_model = grid.best_estimator_
best_model

LogisticRegression(C=1,
                   class_weight={0: 0.10421052631578948, 1: 0.8957894736842105},
                   max_iter=1000000, multi_class='ovr', solver='liblinear')

In [30]:
# Evaluate the best estimator from the grid search on the test split
matrix = confusion_matrix(y_test, grid.best_estimator_.predict(x_test))
print(matrix)

# Pull the counts out of the 2x2 confusion matrix
# (rows are the actual class, columns are the predicted class).
tp = matrix[1][1]  # actual 1, predicted 1
fn = matrix[1][0]  # actual 1, predicted 0
fp = matrix[0][1]  # actual 0, predicted 1

# Each ratio falls back to 0.0 when its denominator is zero. Without the guard the
# division yields NaN (with a RuntimeWarning) whenever the model gets no positive right.

# Recall: share of actual positives that were predicted positive
recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
print('Recall score: ' + str(round(recall, 4)))

# Precision: share of predicted positives that are actually positive
precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
print('Precision score: ' + str(round(precision, 4)))

# F1: harmonic mean of precision and recall
f1 = (2 * precision * recall / (precision + recall)) if (precision + recall) > 0 else 0.0
print('F1 score: ' + str(round(f1, 4)))

[[1413  178]
 [  48   48]]
Recall score: 0.5
Precision score: 0.2124
F1 score: 0.2981
