# Dry Run

## Import Packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import set_config
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold, train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, precision_recall_curve, roc_curve, roc_auc_score

In [2]:
import os

# Work from the project root so that the relative path `data/heart.csv` and the
# `src` package used by later cells resolve.
# The original unconditional `os.chdir('..')` climbed one more directory every
# time this cell was re-run; the guard makes the cell idempotent by only moving
# up when the data file is not already reachable from the current directory.
if not os.path.exists('data/heart.csv'):
    os.chdir('..')

## Load Data

In [3]:
# Read the raw dataset (path is relative to the current working directory).
df = pd.read_csv('data/heart.csv')

# Preview the first five rows as the cell output.
df.head(n=5)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


## Train Test Split

In [4]:
from src.config import dataset_config

In [5]:
# Feature matrix: numeric, then ordinal, then nominal raw columns, in that order.
X = df[
    [
        *dataset_config.RAW_NUM_FEATS,
        *dataset_config.RAW_ORD_FEATS,
        *dataset_config.RAW_NOM_FEATS,
    ]
]
# Target column as configured in dataset_config.
y = df[dataset_config.RAW_TGT_FEAT]

In [6]:
# Hold out 20% of the rows for testing; stratify on the target and fix the
# seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [7]:
# Show the shapes of the train and test partitions (features, then target).
(
    X_train.shape,
    y_train.shape,
    X_test.shape,
    y_test.shape,
)

((734, 11), (734,), (184, 11), (184,))

## Build Feature Pipeline

In [15]:
from src.config import dataset_config, hyperparams_config
from src.pipeline import training_pipeline

In [10]:
# Show the configured post-transformation feature groups:
# numeric, nominal, ordinal.
(
    dataset_config.TRANS_NUM_FEATS,
    dataset_config.TRANS_NOM_FEATS,
    dataset_config.TRANS_ORD_FEATS,
)

(['RestingBP', 'MaxHR', 'Oldpeak'],
 ['Sex', 'ChestPainType', 'RestingECG', 'ST_Slope'],
 ['Cholesterol_Bucket', 'FastingBS', 'ExerciseAngina', 'Age_Bucket'])

In [11]:
# Obtain the pipeline object from the project's training_pipeline module; it is
# later passed to GridSearchCV as the estimator. Its steps are not visible here.
# NOTE(review): `_fetch_pipeline` is underscore-prefixed (private by convention) —
# confirm whether the module exposes a public entry point that should be used instead.
train_pipe = training_pipeline._fetch_pipeline()

In [12]:
# Display the pipeline object as this cell's output.
train_pipe

In [16]:
# Split the full frame: every column except the heart-disease column is a
# feature, and that column is the target. 20% hold-out, stratified, seeded.
# NOTE(review): this rebinds `y_train` / `y_test` created by the earlier
# `train_test_split(X, y, ...)` cell and introduces lowercase `x_train` / `x_test`
# next to the existing `X_train` / `X_test` — confirm which pair later cells
# are meant to use.
x_train, x_test, y_train, y_test = train_test_split(
    df.drop(columns=dataset_config.RAW_COL_HEART_DISEASE),
    df[dataset_config.RAW_COL_HEART_DISEASE],
    test_size=0.2,
    random_state=42,
    stratify=df[dataset_config.RAW_COL_HEART_DISEASE],
)

In [17]:
# Show the shapes of the training features and training target.
(
    x_train.shape,
    y_train.shape,
)

((734, 11), (734,))

In [18]:

# Stratified 5-fold cross-validation, repeated 3 times, with a fixed seed.
cv = RepeatedStratifiedKFold(n_repeats=3, n_splits=5, random_state=42)

# Grid search over the configured hyperparameter grid, scored by accuracy.
# n_jobs=-1 lets scikit-learn run the fits in parallel on all available cores.
# The search is only constructed here; it is not fitted in this cell.
clf = GridSearchCV(
    estimator=train_pipe,
    param_grid=hyperparams_config.LOGISTIC_REGRESSION_HYPERPARAMS,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
)