# Logistic Regression Modeling

In [1]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot
%matplotlib inline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# 1 Load Data

In [2]:
train = pd.read_csv("train_engineered.csv")
val = pd.read_csv("val_engineered.csv")

train.head()

Unnamed: 0,class_p,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x,cap-surface_g,cap-surface_s,cap-surface_y,cap-color_c,...,population_n,population_s,population_v,population_y,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w
0,1,0,0,0,0,1,0,1,0,0,...,0,0,1,0,0,0,0,0,1,0
1,1,0,1,0,0,0,0,0,1,0,...,0,0,1,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,1,0,0,0
3,1,0,1,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,1,0,0
4,0,0,0,0,0,1,0,1,0,0,...,0,1,0,0,1,0,0,0,0,0


# 2 Preprocessing

Since the entire data set here is one-hot-encoded scaling is not necessary

In [3]:
# split x and y variables
X_train, y_train = train.drop("class_p", axis=1), train.class_p
X_val, y_val = val.drop("class_p", axis=1), val.class_p

# 3 Hyper parameter tuning

In [14]:
params = {"C":[0.01, 0.1, 1, 10, 100], "tol":[0.0001, 0.001, 0.01]}
gs = GridSearchCV(LogisticRegression(), params)
gs.fit(X_train, y_train)
gs.best_params_





{'C': 10, 'tol': 0.0001}

In [15]:
params = {"C":[5, 10, 15, 20, 30, 40, 50],  "tol":[0.0001, 0.001, 0.01]}
gs = GridSearchCV(LogisticRegression(), params)
gs.fit(X_train, y_train)
gs.best_params_





{'C': 5, 'tol': 0.0001}

In [16]:
model = gs.best_estimator_

# 4 Evaluation

In [17]:
model.fit(X_train, y_train)
train_pred = model.predict(X_train)
print("training results")
print(classification_report(y_train, train_pred))

val_pred = model.predict(X_val)
print("\n\n\n validation results")
print(classification_report(y_val, val_pred))

training results
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2357
           1       1.00      1.00      1.00      2212

    accuracy                           1.00      4569
   macro avg       1.00      1.00      1.00      4569
weighted avg       1.00      1.00      1.00      4569




 validation results
              precision    recall  f1-score   support

           0       0.71      0.32      0.44       811
           1       0.52      0.85      0.65       713

    accuracy                           0.57      1524
   macro avg       0.62      0.59      0.55      1524
weighted avg       0.62      0.57      0.54      1524





The model is over fitting severely, so I will increase the "C" parameter to 100 and lower the tolerance

In [30]:
model = model.set_params(C=1000, tol=0.0000001)
model

LogisticRegression(C=1000, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=1e-07, verbose=0,
                   warm_start=False)

In [31]:
model.fit(X_train, y_train)
train_pred = model.predict(X_train)
print("training results")
print(classification_report(y_train, train_pred))

val_pred = model.predict(X_val)
print("\n\n\n validation results")
print(classification_report(y_val, val_pred))

training results
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2357
           1       1.00      1.00      1.00      2212

    accuracy                           1.00      4569
   macro avg       1.00      1.00      1.00      4569
weighted avg       1.00      1.00      1.00      4569




 validation results
              precision    recall  f1-score   support

           0       0.69      0.44      0.54       811
           1       0.55      0.77      0.64       713

    accuracy                           0.60      1524
   macro avg       0.62      0.61      0.59      1524
weighted avg       0.62      0.60      0.59      1524



