In [13]:
import os
import sys
import pickle

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import average_precision_score 

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier

import tensorflow as tf
from tensorflow import keras
from scikeras.wrappers import KerasClassifier
from mlens.ensemble import SuperLearner

from src.exception import CustomException
from src.logger import logging
from src.components.data_ingestion import DataIngestion
from src.components.data_transformation import DataTransformation
from src.utils import evaluate_base_model

In [2]:
# import cleaned_data
data = pd.read_csv('data/cleaned_data.csv')

In [8]:
# declare name of columns
num_cols = [
            'Urea', 
            'Cr', 
            'HbA1c', 
            'Chol', 
            'TG', 
            'HDL', 
            'LDL', 
            'VLDL', 
            'BMI', 
            'AGE'
                ]

# select all categorical / discrete columns
cat_cols = ['Gender', 'CLASS']

In [9]:
# prepare pipeline for numerical columns
num_pipeline = Pipeline(
            steps=[('scaler', StandardScaler(with_mean=False))]
            )
            
# prepare pipeline for categorical columns
cat_pipeline = Pipeline(
            steps=[
                    ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'))
                ]
            )

In [10]:
# combine pipelines using ColumnTransformer
preprocessor = ColumnTransformer(
            transformers=[
                    ('numerical_pipeline', num_pipeline, num_cols),
                    ('categorical_pipeline', cat_pipeline, cat_cols)
                ]
            )

In [16]:
# split data
train_data, test_data = train_test_split(data, test_size=0.2, random_state=62)

In [31]:
train_data.head()

Unnamed: 0,Gender,AGE,Urea,Cr,HbA1c,Chol,TG,HDL,LDL,VLDL,BMI,CLASS
579,M,54,6.8,73,7.5,5.1,2.0,1.3,1.4,1.5,39.0,Y
89,M,30,5.5,79,5.6,3.9,1.6,0.9,3.3,0.8,24.5,N
176,M,43,5.4,62,4.1,5.9,2.0,1.1,3.9,0.9,21.0,Y
977,F,20,4.6,70,9.6,4.1,1.8,1.0,2.3,0.8,30.0,Y
563,M,55,6.66,61,6.7,2.5,4.9,0.8,0.5,33.6,29.0,Y


In [21]:
# apply preprocessor object
input_feature_train_arr = preprocessor.fit_transform(train_data)
input_feature_test_arr = preprocessor.transform(test_data) 

In [34]:
# view one-hot encoded columns
ohe = OneHotEncoder()
encoded_data = ohe.fit_transform(train_data[cat_cols])
encoded_cols = ohe.get_feature_names_out(cat_cols)
print(encoded_cols)

['Gender_F' 'Gender_M' 'CLASS_N' 'CLASS_P' 'CLASS_Y']


# my assumption is correct that I didn't select the appropriate target columns on my modeltrainer

In [23]:
input_train = pd.DataFrame(input_feature_train_arr, columns=encoded_cols)

In [24]:
input_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,2.349064,1.43849,2.972546,3.953644,1.438729,1.856232,1.255888,0.405025,7.956533,6.165589,0.0,1.0,0.0,0.0,1.0
1,1.899978,1.556722,2.219501,3.023375,1.150983,1.285084,2.960307,0.216013,4.998335,3.425327,0.0,1.0,1.0,0.0,0.0
2,1.865433,1.221731,1.624992,4.573823,1.438729,1.570658,3.498545,0.243015,4.284287,4.909636,0.0,1.0,0.0,0.0,1.0
3,1.589072,1.379374,3.804859,3.178419,1.294856,1.427871,2.063245,0.216013,6.12041,2.283552,1.0,0.0,0.0,0.0,1.0
4,2.3007,1.202026,2.655474,1.938061,3.524885,1.142297,0.448531,9.07256,5.916397,6.279767,0.0,1.0,0.0,0.0,1.0


In [49]:
x_train, y_train, x_test, y_test = (
                input_feature_train_arr[:, :-3],
                input_feature_train_arr[:, -3:],
                input_feature_test_arr[:, :-3],
                input_feature_test_arr[:, -3:]
            )

In [51]:
y_train.shape

(798, 3)

In [41]:
y_train = pd.DataFrame(y_train)

In [48]:
y_train.head()

Unnamed: 0,0,1,2
0,0.0,0.0,1.0
1,1.0,0.0,0.0
2,0.0,0.0,1.0
3,0.0,0.0,1.0
4,0.0,0.0,1.0
