# Stroke Data Benchmark Model

In [20]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV

In [21]:
# Read in preprocessed data
x_train = pd.read_csv('data/x_train.csv')
x_test = pd.read_csv('data/x_test.csv')
y_train = pd.read_csv('data/y_train.csv')
y_test = pd.read_csv('data/y_test.csv')

## Benchmark Model

### Linear Regression

In [22]:
model = LogisticRegression(solver='liblinear', random_state=0)

In [23]:
x_train

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,gender_Female,gender_Male,gender_Other,ever_married_No,...,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,62090,51.00,0,0,219.17,29.8,0,1,0,1,...,0,0,1,0,1,0,0,0,1,0
1,15649,54.00,1,0,198.69,28.1,0,1,0,0,...,0,1,0,0,1,0,0,0,0,1
2,27647,80.00,0,1,95.49,31.6,0,1,0,0,...,0,0,1,0,1,0,1,0,0,0
3,4498,71.00,0,1,204.98,28.1,0,1,0,0,...,0,1,0,0,0,1,0,1,0,0
4,36811,23.00,0,0,94.09,30.9,1,0,0,1,...,0,1,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3418,13846,43.00,0,0,88.00,30.6,0,1,0,0,...,0,0,0,0,1,0,0,0,1,0
3419,1307,61.00,1,0,170.05,60.2,1,0,0,0,...,0,1,0,0,1,0,0,0,0,1
3420,31481,1.16,0,0,97.28,17.8,1,0,0,1,...,0,0,0,1,0,1,1,0,0,0
3421,61827,80.00,0,0,196.08,31.0,0,1,0,0,...,0,0,1,0,1,0,0,1,0,0


In [24]:
model.fit(x_train, y_train)

  return f(**kwargs)


LogisticRegression(random_state=0, solver='liblinear')

In [25]:
model.predict(x_test)

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [26]:
model.score(x_test, y_test)

0.942501481920569

In [27]:
matrix = confusion_matrix(y_test, model.predict(x_test))
print(matrix)

[[1589    2]
 [  95    1]]


In [28]:
recall = matrix[1][1] / (matrix[1][1] + matrix[1][0])
print('Recall score: ' + str(round(recall, 4)))

precision = matrix[1][1] / (matrix[1][1] + matrix[0][1])
print('Precision score: ' + str(round(precision, 4)))

f1 = (2 * precision * recall / (precision + recall))
print('F1 score: ' + str(round(f1, 4)))

Recall score: 0.0104
Precision score: 0.3333
F1 score: 0.0202


### Improve the Model

In [29]:
model = LogisticRegression(solver='liblinear', class_weight='balanced', random_state=0)
model.fit(x_train, y_train)

  return f(**kwargs)


LogisticRegression(class_weight='balanced', random_state=0, solver='liblinear')

In [30]:
matrix = confusion_matrix(y_test, model.predict(x_test))
print(matrix)

[[1178  413]
 [  24   72]]


In [31]:
recall = matrix[1][1] / (matrix[1][1] + matrix[1][0])
print('Recall score: ' + str(round(recall, 4)))

precision = matrix[1][1] / (matrix[1][1] + matrix[0][1])
print('Precision score: ' + str(round(precision, 4)))

f1 = (2 * precision * recall / (precision + recall))
print('F1 score: ' + str(round(f1, 4)))

Recall score: 0.75
Precision score: 0.1485
F1 score: 0.2478


In [32]:
model.get_params()

{'C': 1.0,
 'class_weight': 'balanced',
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': 0,
 'solver': 'liblinear',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

### Using Grid Search

In [34]:
lr = LogisticRegression()             # initialize the model

#solver_options = ['newton-cg', 'lbfgs', 'liblinear', 'sag']
solver_options = ['newton-cg']
#multi_class_options = ['ovr', 'multinomial']
#class_weight_options = ['None', 'balanced']
multi_class_options = ['ovr']
class_weight_options = ['balanced']
max_iter_value = [10000]
c_values = [1, 10, 100, 1000]

param_grid = dict(solver = solver_options, multi_class = multi_class_options, class_weight = class_weight_options, max_iter = max_iter_value, C = c_values)

grid = GridSearchCV(lr, param_grid, scoring = 'f1', n_jobs=-1)
grid.fit(x_train, y_train)

exception calling callback for <Future at 0x258231735e0 state=finished raised BrokenProcessPool>
joblib.externals.loky.process_executor._RemoteTraceback: 
"""
Traceback (most recent call last):
  File "C:\Users\eogha\anaconda3\lib\site-packages\joblib\externals\loky\process_executor.py", line 404, in _process_worker
    call_item = call_queue.get(block=True, timeout=timeout)
  File "C:\Users\eogha\anaconda3\lib\multiprocessing\queues.py", line 116, in get
    return _ForkingPickler.loads(res)
  File "C:\Users\eogha\anaconda3\lib\site-packages\sklearn\__init__.py", line 80, in <module>
    from .base import clone
  File "C:\Users\eogha\anaconda3\lib\site-packages\sklearn\base.py", line 21, in <module>
    from .utils import _IS_32BIT
  File "C:\Users\eogha\anaconda3\lib\site-packages\sklearn\utils\__init__.py", line 23, in <module>
    from .class_weight import compute_class_weight, compute_sample_weight
  File "C:\Users\eogha\anaconda3\lib\site-packages\sklearn\utils\class_weight.py", 

BrokenProcessPool: A task has failed to un-serialize. Please ensure that the arguments of the function are all picklable.

In [95]:
grid.best_estimator_

LogisticRegression(C=10, class_weight='balanced', max_iter=10000,
                   multi_class='ovr', solver='newton-cg')

In [96]:
matrix = confusion_matrix(y_test, grid.best_estimator_.predict(x_test))
print(matrix)

recall = matrix[1][1] / (matrix[1][1] + matrix[1][0])
print('Recall score: ' + str(round(recall, 4)))

precision = matrix[1][1] / (matrix[1][1] + matrix[0][1])
print('Precision score: ' + str(round(precision, 4)))

f1 = (2 * precision * recall / (precision + recall))
print('F1 score: ' + str(round(f1, 4)))

[[1179  412]
 [  23   73]]
Recall score: 0.7604
Precision score: 0.1505
F1 score: 0.2513
