# Training & Evaluation
* We are going to break down training and evaluation into multiple notebooks, one for each algorithm that we train and evalutate. 
* In this first notebook, we'll create baseline models to get the predictions based on `stratified` and `most frequent` classes

## Install Libraries

In [1]:
# %pip install scikit-learn

## Import Libraries

In [2]:
import os
import sys
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score,recall_score,precision_score,precision_recall_curve
import seaborn as sns


# Build an absolute path from this notebook's parent directory
module_path = os.path.abspath(os.path.join('..'))

# Add to sys.path if not already present
if module_path not in sys.path:
    sys.path.append(module_path)
    
from src.utils import preprocessing

## Initialize Directories

In [3]:
data_root_dir = Path("..", "data/")
models_root_dir = Path("..", "models/")

## Read Data

In [4]:
X_train = pd.read_csv(Path(data_root_dir,"X_train.csv"))
y_train = pd.read_csv(Path(data_root_dir,"y_train.csv"))

In [5]:
preprocessed_data_df = pd.DataFrame(preprocessing.pipeline.fit_transform(
    X_train,y_train), columns=preprocessing.pipeline.get_feature_names_out())
preprocessed_data_df.head()

Unnamed: 0,preprocess_gender__gender_female,preprocess_gender__gender_male,preprocess_profession__profession_student,preprocess_profession__profession_working,sleep_duration_pipeline__sleep_duration_bt_5_6,sleep_duration_pipeline__sleep_duration_bt_7_8,sleep_duration_pipeline__sleep_duration_gt_8,sleep_duration_pipeline__sleep_duration_lt_5,dietary_habits_pipeline__dietary_habits_healthy,dietary_habits_pipeline__dietary_habits_moderate,...,age_pipeline__encode_age_range__age_range_gte_33,cgpa_pipeline__encode_cgpa_range__cgpa_range_4_to_7,cgpa_pipeline__encode_cgpa_range__cgpa_range_gte_7,cgpa_pipeline__encode_cgpa_range__cgpa_range_lt_4,hours_pipeline__encode_hours_range__hours_range_4_to_8,hours_pipeline__encode_hours_range__hours_range_gte_8,hours_pipeline__encode_hours_range__hours_range_lt_4,ratings_column_pipeline__academic_pressure,ratings_column_pipeline__study_satisfaction,ratings_column_pipeline__financial_stress
0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,4.0,1.0,5.0
1,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,3.0,2.0,1.0
2,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,3.0,2.0,5.0
3,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,3.0,5.0,3.0
4,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,3.0,4.0,5.0


In [6]:
preprocessed_data_df.isna().sum()

preprocess_gender__gender_female                                   0
preprocess_gender__gender_male                                     0
preprocess_profession__profession_student                          0
preprocess_profession__profession_working                          0
sleep_duration_pipeline__sleep_duration_bt_5_6                     0
sleep_duration_pipeline__sleep_duration_bt_7_8                     0
sleep_duration_pipeline__sleep_duration_gt_8                       0
sleep_duration_pipeline__sleep_duration_lt_5                       0
dietary_habits_pipeline__dietary_habits_healthy                    0
dietary_habits_pipeline__dietary_habits_moderate                   0
dietary_habits_pipeline__dietary_habits_unhealthy                  0
degree_pipeline__encode_degree_field__degree_field_architecture    0
degree_pipeline__encode_degree_field__degree_field_arts            0
degree_pipeline__encode_degree_field__degree_field_business        0
degree_pipeline__encode_degree_fie

## Training Default Model

In [7]:
# import sklearn


# sklearn.metrics.get_scorer_names() 

In [8]:
from sklearn.discriminant_analysis import StandardScaler
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.pipeline import Pipeline


default_logistic_regression_model = LogisticRegression(max_iter=1000)

model_pipeline = Pipeline([
    ("preprocessing", preprocessing.pipeline),
    ("normalizing", StandardScaler()),
    ("prediction", default_logistic_regression_model)
])

scoring = ["recall", "precision", "f1"]

default_logistic_regression_scores = cross_validate(estimator=model_pipeline,X=X_train,y=y_train,cv=3,scoring=scoring,n_jobs=-1)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [9]:
default_logistic_regression_scores

{'fit_time': array([1.13903427, 1.05161095, 1.03185582]),
 'score_time': array([0.71297598, 0.8227787 , 0.80029511]),
 'test_recall': array([0.89256198, 0.88567493, 0.87970615]),
 'test_precision': array([0.85695393, 0.85335103, 0.86170452]),
 'test_f1': array([0.87439559, 0.86921257, 0.87061229])}

In [10]:
# model_pipeline = Pipeline([
#     ("preprocessing", preprocessing.pipeline),
#     ("normalizing", StandardScaler()),
# ])

# temp = model_pipeline.fit_transform(X_train,y_train)
# temp

In [11]:
# print(X_train.shape)
# model_pipeline = Pipeline([
#     ("preprocessing", preprocessing.pipeline),
#     # ("normalizing", StandardScaler()),
# ])

# temp = model_pipeline.fit_transform(X_train,y_train)
# print(temp.shape)

In [12]:
# X_train.isnull().sum()