# Logistic Regression Modeling

In [11]:
#import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.svm import SVC
from sklearn.metrics import recall_score, precision_score, accuracy_score, make_scorer

import warnings
warnings.filterwarnings("ignore")

# 1.1 Load Data

In [12]:
# non-engineered data
df = pd.read_csv("train.csv")

# engineered data
df_eng = pd.read_csv("train_engineered.csv")

df.head()

Unnamed: 0,pelvic_incidence,pelvic_tilt,lumbar_lordosis_angle,sacral_slope,pelvic_radius,degree_spondylolisthesis,class
0,80.111572,33.942432,85.101608,46.169139,125.593624,100.292107,Spondylolisthesis
1,63.404481,14.115327,48.136806,49.289153,111.916008,31.784495,Spondylolisthesis
2,52.419385,19.011561,35.87266,33.407825,116.559771,1.694705,Hernia
3,91.468741,24.508177,84.620272,66.960564,117.307897,52.623047,Spondylolisthesis
4,44.36249,8.945435,46.902096,35.417055,129.220682,4.994195,Normal


In [13]:
df_eng.head()

Unnamed: 0,lumbar_lordosis_angle,pelvic_radius,class__Hernia,class__Normal,class__Spondylolisthesis
0,85.101608,125.593624,0,0,1
1,48.136806,111.916007,0,0,1
2,35.87266,116.559771,1,0,0
3,84.620272,117.307897,0,0,1
4,46.902096,129.220682,0,1,0


# 1.2 One Hot Encode Target Variable

In [14]:
# ohe non-engineered data
dummies = pd.get_dummies(df["class"], prefix="class_")
df.drop("class", axis=1, inplace=True)
df = pd.concat([df, dummies], axis=1)

df.head()

Unnamed: 0,pelvic_incidence,pelvic_tilt,lumbar_lordosis_angle,sacral_slope,pelvic_radius,degree_spondylolisthesis,class__Hernia,class__Normal,class__Spondylolisthesis
0,80.111572,33.942432,85.101608,46.169139,125.593624,100.292107,0,0,1
1,63.404481,14.115327,48.136806,49.289153,111.916008,31.784495,0,0,1
2,52.419385,19.011561,35.87266,33.407825,116.559771,1.694705,1,0,0
3,91.468741,24.508177,84.620272,66.960564,117.307897,52.623047,0,0,1
4,44.36249,8.945435,46.902096,35.417055,129.220682,4.994195,0,1,0


# 2.1 Preprocessing

Here I will seperate the independent and dependent variables. I will also scale the independent variables.

In [15]:
# seperate independent and dependent variables

# list targets
targets = [col for col in df.columns if col.startswith("class")]

# non-engineered data
X = df.drop(targets, axis=1)
y_hernia = df["class__Hernia"]
y_spond = df["class__Spondylolisthesis"]

# engineered data
Xe = df_eng.drop(targets, axis=1)
ye_hernia = df_eng["class__Hernia"]
ye_spond = df_eng["class__Spondylolisthesis"]

X.shape, y_hernia.shape, y_spond.shape, Xe.shape, ye_hernia.shape, ye_spond.shape

((279, 6), (279,), (279,), (279, 2), (279,), (279,))

In [16]:
# scale data
sc = StandardScaler()

# non-engineered data
X = sc.fit_transform(X)

# engineered data
Xe = sc.fit_transform(Xe)

# 3.1 Baseline Models

Here I will look at the baseline performance of the default LogisticRegression model on the data sets. I will use cross validation.

In [17]:
# instantiate the default logistic regression model
clf = SVC()

In [18]:
# create a function to print desired scores
scorers = [make_scorer(g) for g in [recall_score, precision_score, accuracy_score]]

def find_scores(classifier, X, y, scorers):
    for scorer, score_type in zip(scorers, ["recall", "precision", "accuracy"]):
        print(f"\n\n{score_type}")
        results = cross_validate(clf, X, y, scoring=scorer, cv=4)["test_score"]
        print(results)
        print("avg:", sum(results)/4)
        print("var:", np.array(results).var())

# run function on non-engineered data pertaining to hernias
find_scores(clf, X, y_hernia, scorers)



recall
[0.57142857 0.5        0.61538462 0.61538462]
avg: 0.5755494505494505
var: 0.0022245954594855705


precision
[0.8        0.875      0.57142857 0.66666667]
avg: 0.7282738095238095
var: 0.01376727253401361


accuracy
[0.88732394 0.88571429 0.84057971 0.86956522]
avg: 0.8707957892281223
var: 0.0003525667921510756


In [19]:
# find scores for non-engineered data pertaining to Spondylolisthesis
find_scores(clf, X, y_spond, scorers)



recall
[0.88235294 0.88235294 0.97058824 0.96969697]
avg: 0.9262477718360071
var: 0.0019268554529249732


precision
[0.9375     0.96774194 0.97058824 0.94117647]
avg: 0.9542516603415561
var: 0.00022511247897005738


accuracy
[0.91428571 0.92857143 0.97142857 0.95652174]
avg: 0.9427018633540374
var: 0.0005058398595733191


In [20]:
# find scores for engineered data pertaining to hernia
find_scores(clf, Xe, ye_hernia, scorers)



recall
[0.28571429 0.21428571 0.15384615 0.23076923]
avg: 0.22115384615384615
var: 0.002209500664170993


precision
[0.57142857 0.75       0.66666667 0.6       ]
avg: 0.6470238095238096
var: 0.004728954081632654


accuracy
[0.81690141 0.82857143 0.82608696 0.82608696]
avg: 0.8244116875164028
var: 1.9830197442504622e-05


In [21]:
# find scores for engineered data pertaining to Spondylolisthesis
find_scores(clf, Xe, ye_spond, scorers)



recall
[0.79411765 0.82352941 0.91176471 0.90909091]
avg: 0.8596256684491979
var: 0.002689882467328206


precision
[0.84375    0.82352941 0.81578947 0.85714286]
avg: 0.8350529356479434
var: 0.0002668690323777485


accuracy
[0.82857143 0.82857143 0.85714286 0.88405797]
avg: 0.8495859213250517
var: 0.0005321618250324698


Overall the non-engineered data seemed to work the best so far.

# 4.1 Hyper paramerer tuning

Now I will tune the hyper parameters for the models. From now on I am only concerned with the non-engineered data.

In [23]:
# explore parameter tuning for finding hernias
params = {"C":[0.01, 0.1, 1, 10, 100],
         "kernel":["linear", "sigmoid", "poly"]}
gscv = GridSearchCV(clf, params, cv=4)
gscv.fit(X, y_hernia)

GridSearchCV(cv=4, error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [0.01, 0.1, 1, 10, 100],
                         'kernel': ['linear', 'sigmoid', 'poly']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [26]:
gscv.best_estimator_.get_params()["C"], gscv.best_estimator_.get_params()["kernel"]

(1, 'linear')

In [27]:
# baseline scores
find_scores(clf, X, y_hernia, scorers)



recall
[0.57142857 0.5        0.61538462 0.61538462]
avg: 0.5755494505494505
var: 0.0022245954594855705


precision
[0.8        0.875      0.57142857 0.66666667]
avg: 0.7282738095238095
var: 0.01376727253401361


accuracy
[0.88732394 0.88571429 0.84057971 0.86956522]
avg: 0.8707957892281223
var: 0.0003525667921510756


In [28]:
# after tuning
find_scores(gscv.best_estimator_, X, y_hernia, scorers)



recall
[0.57142857 0.5        0.61538462 0.61538462]
avg: 0.5755494505494505
var: 0.0022245954594855705


precision
[0.8        0.875      0.57142857 0.66666667]
avg: 0.7282738095238095
var: 0.01376727253401361


accuracy
[0.88732394 0.88571429 0.84057971 0.86956522]
avg: 0.8707957892281223
var: 0.0003525667921510756


In [29]:
# explore parameter tuning for finding hernias
params = {"C":[0.01, 0.1, 1, 10, 100]}
gscv2 = GridSearchCV(clf, params, cv=4)
gscv2.fit(X, y_spond)

GridSearchCV(cv=4, error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=None, param_grid={'C': [0.01, 0.1, 1, 10, 100]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [30]:
gscv2.best_estimator_.get_params()["C"], gscv2.best_estimator_.get_params()["kernel"]

(10, 'rbf')

In [31]:
# baseline scores
find_scores(clf, X, y_spond, scorers)



recall
[0.88235294 0.88235294 0.97058824 0.96969697]
avg: 0.9262477718360071
var: 0.0019268554529249732


precision
[0.9375     0.96774194 0.97058824 0.94117647]
avg: 0.9542516603415561
var: 0.00022511247897005738


accuracy
[0.91428571 0.92857143 0.97142857 0.95652174]
avg: 0.9427018633540374
var: 0.0005058398595733191


In [32]:
# after tuning
find_scores(gscv2.best_estimator_, X, y_spond, scorers)



recall
[0.88235294 0.88235294 0.97058824 0.96969697]
avg: 0.9262477718360071
var: 0.0019268554529249732


precision
[0.9375     0.96774194 0.97058824 0.94117647]
avg: 0.9542516603415561
var: 0.00022511247897005738


accuracy
[0.91428571 0.92857143 0.97142857 0.95652174]
avg: 0.9427018633540374
var: 0.0005058398595733191
