# Training & Evaluation
* We are going to break down training and evaluation into multiple notebooks, one for each algorithm that we train and evaluate.
* In this notebook, we'll train a default `LogisticRegression` model and evaluate it with 3-fold cross-validation, reporting `recall`, `precision` and `f1`.

## Install Libraries

In [1]:
# %pip install scikit-learn

## Import Libraries

In [2]:
import os
import sys
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score,recall_score,precision_score,precision_recall_curve
import seaborn as sns


# Absolute path of the directory one level above the current working directory
module_path = os.path.abspath(os.pardir)

# Register it on the import search path, skipping the append on re-runs
if module_path not in sys.path:
    sys.path.append(module_path)
    
from src.utils import preprocessing

## Initialize Directories

In [3]:
# Data and model artefact folders, both located next to the notebook's parent directory
data_root_dir = Path("..").joinpath("data/")
models_root_dir = Path("..").joinpath("models/")

## Read Data

In [4]:
# Load the training features and the matching targets from the data directory
X_train = pd.read_csv(data_root_dir / "X_train.csv")
y_train = pd.read_csv(data_root_dir / "y_train.csv")

In [5]:
# preprocessed_data_df = pd.DataFrame(preprocessing.pipeline.fit_transform(
#     X_train,y_train), columns=preprocessing.pipeline.get_feature_names_out())
# preprocessed_data_df.head()

In [6]:
# preprocessed_data_df.isna().sum()

## Training Default Model

In [7]:
# import sklearn


# sklearn.metrics.get_scorer_names() 

In [8]:
# StandardScaler belongs to sklearn.preprocessing's public API. Importing it
# from sklearn.discriminant_analysis only works because that module happens
# to import it for its own internal use.
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.pipeline import Pipeline


# Logistic regression with default hyperparameters apart from max_iter=1000.
default_logistic_regression_model = LogisticRegression(max_iter=1000)

# Chain project preprocessing, feature standardisation and the classifier so
# that every step is fitted on the training portion of each CV split only.
model_pipeline = Pipeline([
    ("preprocessing", preprocessing.pipeline),
    ("normalizing", StandardScaler()),
    ("prediction", default_logistic_regression_model)
])

# Metrics reported for every fold.
scoring = ["recall", "precision", "f1"]

# 3-fold cross-validation. y_train is read as a DataFrame, so it is flattened
# to a 1-D array before being passed as the target.
default_logistic_regression_scores = cross_validate(
    estimator=model_pipeline,
    X=X_train,
    y=y_train.values.ravel(),
    cv=3,
    scoring=scoring,
    n_jobs=-1,
    verbose=2,
)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.


[CV] END .................................................... total time=   1.8s
[CV] END .................................................... total time=   1.9s
[CV] END .................................................... total time=   1.9s


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    3.1s finished


In [9]:
# Show the per-fold fit/score times and test metrics returned by cross_validate
default_logistic_regression_scores

{'fit_time': array([1.14548469, 1.04674959, 1.02783537]),
 'score_time': array([0.70852733, 0.81547213, 0.80321646]),
 'test_recall': array([0.89370983, 0.88269054, 0.88016529]),
 'test_precision': array([0.85522847, 0.85349612, 0.86176669]),
 'test_f1': array([0.8740458 , 0.86784787, 0.87086882])}

In [10]:
# model_pipeline = Pipeline([
#     ("preprocessing", preprocessing.pipeline),
#     ("normalizing", StandardScaler()),
# ])

# temp = model_pipeline.fit_transform(X_train,y_train)
# temp

In [11]:
# print(X_train.shape)
# model_pipeline = Pipeline([
#     ("preprocessing", preprocessing.pipeline),
#     # ("normalizing", StandardScaler()),
# ])

# temp = model_pipeline.fit_transform(X_train,y_train)
# print(temp.shape)

In [12]:
# X_train.isnull().sum()