# Logistic Regression Modeling

In [22]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE

# 1.1 Load Data

In [23]:
# Read the training and validation splits (paths are relative to the notebook)
train, val = (pd.read_csv(path) for path in ("train.csv", "val.csv"))

# 1.2 Preprocessing

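First I need to make sure every class has at least 6 rows. SMOTE builds each synthetic point from a sample and its nearest same-class neighbors (5 by default), so it fails on any class with fewer than 6 rows.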

In [30]:
# Count the rows in each class
train["Type"].value_counts()

2    48
1    34
7    19
3    10
5     6
6     3
Name: Type, dtype: int64

In [31]:
# Class 6 has only 3 rows, fewer than the 6 per class that SMOTE needs,
# so its rows are duplicated once to reach the minimum.
# The size guard keeps this cell idempotent: re-running it no longer keeps
# doubling class 6 once the class is large enough.
MIN_ROWS_PER_CLASS = 6
copy = train[train["Type"] == 6]
if len(copy) < MIN_ROWS_PER_CLASS:
    # ignore_index=True rebuilds the row index so the duplicated rows do not
    # share index labels with the originals.
    train = pd.concat([train, copy], axis=0, ignore_index=True)

Here I will standardize the features. The scaler is fit on the training set only and then applied to the validation set.

In [32]:
# Separate the "Type" target from the feature columns for each split
y_train, y_val = train["Type"], val["Type"]
X_train = train.drop(columns="Type")
X_val = val.drop(columns="Type")

In [33]:
# Fit the scaler on the training features only, then apply it to both splits
sc = StandardScaler().fit(X_train)
X_train, X_val = sc.transform(X_train), sc.transform(X_val)

Now I will apply SMOTE to balance the classes.

In [35]:
# Oversample the minority classes of the training set so every class has the
# same number of rows. SMOTE draws synthetic points at random, so the seed is
# fixed to keep the resampled data (and everything fitted on it) reproducible
# across Restart & Run All.
sm = SMOTE(random_state=42)
X_train, y_train = sm.fit_resample(X_train, y_train)

# 2.1 Hyper Parameter Tuning

To find the optimal parameters I will use a grid search.

In [36]:
# Coarse sweep of the regularization parameter C over powers of ten.
# NOTE(review): X_train has already been oversampled at this point, so each CV
# fold can contain synthetic points derived from rows in other folds; the
# cross-validated scores are likely optimistic.
params = dict(C=[0.01, 0.1, 1, 10])
gscv = GridSearchCV(estimator=LogisticRegression(), param_grid=params)

gscv.fit(X_train, y_train).best_params_



{'C': 10}

Now I will do a finer-grained search around the best value found above.

In [37]:
# Finer sweep of C over the integers 7 through 13
params = dict(C=[7, 8, 9, 10, 11, 12, 13])
gscv = GridSearchCV(estimator=LogisticRegression(), param_grid=params)

gscv.fit(X_train, y_train).best_params_



{'C': 12}

In [38]:
# Keep the best estimator found by the most recent grid search as `model`
# for the evaluation below
model = gscv.best_estimator_

# 3.1 Evaluation

Here I will see how well the model does on the training and validation sets.

In [39]:
# Fit the selected model on the training data
model.fit(X_train, y_train)

# Predict on the training and validation features
train_pred, val_pred = model.predict(X_train), model.predict(X_val)

# Print a per-class report for each split, training first
for heading, y_true, y_pred in (
    ("training performance", y_train, train_pred),
    ("validation performance", y_val, val_pred),
):
    print(heading)
    print(classification_report(y_true, y_pred))

training performance
              precision    recall  f1-score   support

           1       0.68      0.81      0.74        48
           2       0.76      0.33      0.46        48
           3       0.77      0.96      0.85        48
           5       0.91      1.00      0.95        48
           6       0.96      1.00      0.98        48
           7       1.00      0.98      0.99        48

    accuracy                           0.85       288
   macro avg       0.85      0.85      0.83       288
weighted avg       0.85      0.85      0.83       288

validation performance
              precision    recall  f1-score   support

           1       0.86      0.67      0.75        18
           2       0.50      0.50      0.50         6
           3       0.57      1.00      0.73         4
           5       0.67      1.00      0.80         4
           6       1.00      1.00      1.00         2
           7       1.00      0.83      0.91         6

    accuracy                     

