# Logistic Regression
* In this notebook we train a logistic regression model, experiment with various hyper-parameters using grid search, and try to arrive at the best `Logistic Regression` version.

## Install Libraries

In [21]:
# %pip install scikit-learn

## Import Libraries

In [22]:
import os
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score,recall_score,precision_score,precision_recall_curve
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


# Build an absolute path from this notebook's parent directory
module_path = os.path.abspath(os.path.join('..'))

# Add to sys.path if not already present
if module_path not in sys.path:
    sys.path.append(module_path)

from src.utils import preprocessing
from src.utils import common
from src.utils.training import refit_strategy

## Initialize Directories

In [23]:
# Both artefact roots are resolved relative to the parent of the current
# working directory.
data_root_dir = Path("..").joinpath("data/")
models_root_dir = Path("..").joinpath("models/")

## Read Data

In [24]:
# Load the training features and labels from CSV files under the data root.
X_train = pd.read_csv(data_root_dir / "X_train.csv")
y_train = pd.read_csv(data_root_dir / "y_train.csv")

In [25]:
# preprocessed_data_df = pd.DataFrame(preprocessing.pipeline.fit_transform(
#     X_train,y_train), columns=preprocessing.pipeline.get_feature_names_out())
# preprocessed_data_df.head()

In [26]:
# preprocessed_data_df.isna().sum()

## Training Default Model

In [27]:
# import sklearn


# sklearn.metrics.get_scorer_names() 

In [28]:
# Default model: logistic regression evaluated with 3-fold cross-validation
# on recall, precision and F1.
#
# `Pipeline`, `StandardScaler` and `cross_validate` are imported in the
# notebook's import cell. `StandardScaler` is taken from its public module,
# `sklearn.preprocessing`, rather than from `sklearn.discriminant_analysis`,
# which only exposes it as an internal import.
default_logistic_regression_model = LogisticRegression(max_iter=1000, random_state=42)

model_pipeline = Pipeline([
    ("preprocessing", preprocessing.pipeline),
    ("normalizing", StandardScaler()),
    ("prediction", default_logistic_regression_model)
])

scoring = ["recall", "precision", "f1"]

default_logistic_regression_scores = cross_validate(
    estimator=model_pipeline,
    X=X_train,
    # Flatten the label frame to a 1-D array (presumably a single column —
    # TODO confirm against y_train.csv).
    y=y_train.values.ravel(),
    cv=3,
    scoring=scoring,
    n_jobs=-1,
    verbose=2,
)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.


[CV] END .................................................... total time=   1.9s
[CV] END .................................................... total time=   1.9s
[CV] END .................................................... total time=   1.9s


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    3.3s finished


In [29]:
# Display the raw dictionary returned by `cross_validate` (timings and per-fold test scores).
default_logistic_regression_scores

{'fit_time': array([1.18940568, 1.08584452, 1.07034445]),
 'score_time': array([0.73122835, 0.84357762, 0.82897019]),
 'test_recall': array([0.89233242, 0.88269054, 0.88016529]),
 'test_precision': array([0.85541373, 0.85349612, 0.86176669]),
 'test_f1': array([0.87348315, 0.86784787, 0.87086882])}

In [30]:
# Alias for the default model's cross-validation results.
# NOTE(review): `cv_scores` is not referenced by any other cell visible in this notebook — confirm it is still needed.
cv_scores = default_logistic_regression_scores


In [31]:
# Presumably averages the per-fold test metrics of the default model (helper lives in src.utils.common).
# NOTE(review): the recorded output of this cell prints the same value for "Mean Precision" as for
# "Mean Recall" (0.8850...), whereas the per-fold `test_precision` values shown earlier average to
# roughly 0.857. `common.calculate_mean_from_cv` likely prints (or returns) recall in the precision
# slot — verify the helper before persisting these metrics.
mean_recall,mean_precision,mean_f1 = common.calculate_mean_from_cv(default_logistic_regression_scores)

Mean Recall: 0.8850627486991124, Mean Precision: 0.8850627486991124,Mean F1: 0.8707332810998434


In [32]:
# commenting this code out to avoid overwriting the metrics file. 
# common.update_models_metrics("Logistic Regression", "v0", mean_recall,mean_precision,mean_f1)

Observations:
* With an average recall of `0.88` and an average precision of `0.85`, we already have a better model than the baseline estimator. Although the recall is lower than that of the most-frequent version of the baseline, the average precision and F1 score make this model more promising.


## GridSearch CV v1

In [33]:
## checking params
# preprocessing.pipeline.get_params()

In [34]:
from sklearn.model_selection import GridSearchCV

# v1 search: tune solver, penalty and regularisation strength C of the
# logistic-regression step; preprocessing is left at its defaults.
scoring = ["recall", "precision", "f1"]

param_grid = {
    "prediction__solver": ["liblinear", "saga"],
    "prediction__penalty": ["l1", "l2"],
    "prediction__C": [0.1, 1, 10],
}

model_pipeline = Pipeline(steps=[
    ("preprocessing", preprocessing.pipeline),
    ("normalizing", StandardScaler()),
    ("prediction", LogisticRegression(max_iter=1000, random_state=42)),
])

# Several metrics are scored, so the best candidate is picked by the
# project's `refit_strategy` callable.
grid_search = GridSearchCV(
    estimator=model_pipeline,
    param_grid=param_grid,
    scoring=scoring,
    cv=3,
    n_jobs=-1,
    refit=refit_strategy,
)
grid_search.fit(X_train, y_train.values.ravel())


In [35]:
# Report the configuration selected by the grid search.
print(f"Best Parameters: {grid_search.best_params_}")
# `best_index_` is the row position of the selected candidate in `cv_results_`,
# not a score, so it is labelled as an index.
print(f"Best Index : {grid_search.best_index_}")
# Read the mean recall / precision / F1 of the selected candidate; the helper
# also appears to print them (see the "Mean Recall: ..." line in the cell output).
average_recall, average_precision, average_f1 = common.read_best_mean_grid_search_metrics(
    grid_search.cv_results_, grid_search.best_index_
)

Best Parameters: {'prediction__C': 10, 'prediction__penalty': 'l2', 'prediction__solver': 'liblinear'}
Best Scores : 10
Mean Recall: 0.8858279767370676, Mean Precision: 0.8569368664548503,Mean F1: 0.8711237682528649


In [36]:
# commenting this code out to avoid overwriting the metrics file.
# _, file_name = common.save_model(
#     "Logistic Regression", "v1", grid_search.best_estimator_)
# common.update_models_metrics("Logistic Regression", "v1", average_recall,
#                              average_precision, average_f1, file_name=file_name)
# common.update_model_params(
#     "LogisticRegression", "v1", grid_search.best_params_)

## GridSearch CV v2

In [37]:
# List every nested parameter of the preprocessing pipeline; the grid search
# in the next cell addresses some of them through "preprocessing__..." keys.
preprocessing.pipeline.get_params(deep=True)

{'force_int_remainder_cols': True,
 'n_jobs': None,
 'remainder': 'drop',
 'sparse_threshold': 0.3,
 'transformer_weights': None,
 'transformers': [('preprocess_gender',
   Pipeline(steps=[('default_cat_pipeline',
                    Pipeline(steps=[('fill_empty_strings',
                                     FunctionTransformer(feature_names_out='one-to-one',
                                                         func=<function fill_empty_strings_fn at 0x7f97813b7a60>)),
                                    ('strip_spaces',
                                     FunctionTransformer(feature_names_out='one-to-one',
                                                         func=<function strip_spaces_fn at 0x7f97813b7ce0>)),
                                    ('to_lower_case',
                                     FunctionTransformer(feature_names_out='one-to-one',
                                                         func=<function to_lower_case_fn at 0x7f9780e1f920>)),
                

In [38]:

# v2 search: on top of the v1 logistic-regression grid, also switch several
# preprocessing steps between one-hot and ordinal encoding.
scoring = ["recall", "precision", "f1"]

encoding_choices = ["onehot", "ordinal"]

param_grid = {
    # Encoding experiments on the preprocessing side.
    "preprocessing__sleep_duration_pipeline__sleep_duration_encoding__encoding": encoding_choices,
    "preprocessing__dietary_habits_pipeline__dietary_habits_encoding__encoding": encoding_choices,
    "preprocessing__degree_pipeline__degree_encoding__degree_level_encoding__encoding": encoding_choices,
    "preprocessing__age_pipeline__age_encoding__age_range_encoding__encoding": encoding_choices,
    "preprocessing__cgpa_pipeline__cgpa_encoding__cgpa_range_encoding__encoding": encoding_choices,
    "preprocessing__hours_pipeline__hours_encoding__hours_range_encoding__encoding": encoding_choices,
    # Classifier hyper-parameters (same values as the v1 search).
    "prediction__solver": ["liblinear", "saga"],
    "prediction__penalty": ["l1", "l2"],
    "prediction__C": [0.1, 1, 10],
}

model_pipeline = Pipeline(steps=[
    ("preprocessing", preprocessing.pipeline),
    ("normalizing", StandardScaler()),
    ("prediction", LogisticRegression(max_iter=1000, random_state=42)),
])

# Several metrics are scored, so the best candidate is picked by the
# project's `refit_strategy` callable.
grid_search = GridSearchCV(
    estimator=model_pipeline,
    param_grid=param_grid,
    scoring=scoring,
    cv=3,
    n_jobs=-1,
    refit=refit_strategy,
)
grid_search.fit(X_train, y_train.values.ravel())


In [39]:
# Report the configuration selected by the grid search.
print(f"Best Parameters: {grid_search.best_params_}")
# `best_index_` is the row position of the selected candidate in `cv_results_`,
# not a score, so it is labelled as an index.
print(f"Best Index : {grid_search.best_index_}")
# Read the mean recall / precision / F1 of the selected candidate; the helper
# also appears to print them (see the "Mean Recall: ..." line in the cell output).
average_recall, average_precision, average_f1 = common.read_best_mean_grid_search_metrics(
    grid_search.cv_results_, grid_search.best_index_
)

Best Parameters: {'prediction__C': 10, 'prediction__penalty': 'l2', 'prediction__solver': 'saga', 'preprocessing__age_pipeline__age_encoding__age_range_encoding__encoding': 'ordinal', 'preprocessing__cgpa_pipeline__cgpa_encoding__cgpa_range_encoding__encoding': 'ordinal', 'preprocessing__degree_pipeline__degree_encoding__degree_level_encoding__encoding': 'onehot', 'preprocessing__dietary_habits_pipeline__dietary_habits_encoding__encoding': 'ordinal', 'preprocessing__hours_pipeline__hours_encoding__hours_range_encoding__encoding': 'onehot', 'preprocessing__sleep_duration_pipeline__sleep_duration_encoding__encoding': 'ordinal'}
Best Scores : 757
Mean Recall: 0.886899295990205, Mean Precision: 0.85702636802118,Mean F1: 0.8716857659300494


In [40]:
# commenting this code out to avoid overwriting the metrics file.
# _, file_name = common.save_model(
#     "Logistic Regression", "v2", grid_search.best_estimator_)
# common.update_models_metrics("Logistic Regression", "v2", average_recall,
#                              average_precision, average_f1, file_name=file_name)
# common.update_model_params(
#     "LogisticRegression", "v2", grid_search.best_params_)

Observations:
* In general, there is no significant improvement between the two versions of Logistic Regression.
* We need to try a few more linear and non-linear models to see if we can get better performance.