## Importing Modules and Data
##### * This section contains code to import modules used in this research notebook
##### * It also contains the code to extract the data from the file provided

In [None]:
import sqlite3
import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, KFold, RandomizedSearchCV
from sklearn.metrics import accuracy_score, recall_score, roc_auc_score, classification_report, confusion_matrix, make_scorer
import xgboost as xgb
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm


from pathlib import Path
import joblib

# Render at most 50 columns when a DataFrame is displayed.
pd.options.display.max_columns = 50 

# Truncate displayed DataFrames to 20 rows.
pd.options.display.max_rows = 20

print("Imported the modules.")

In [None]:
# Defining function to initiate connection to database using sqlite3.
# Defining function to pass SQL syntax and convert the obtained data as a pandas Dataframe.

def initiate_local_connection(db_path=None):
    """Open a sqlite3 connection to a local database file.

    Parameter:
    db_path (str or Path, optional): location of the SQLite database, e.g. "data/survive.db".
        When omitted, the notebook-level variable ``file_name`` is used, so the
        original zero-argument call keeps working.

    Returns:
    sqlite3.Connection on success, or None when the connection could not be made
    (missing file, undefined ``file_name``, or any error raised by sqlite3).
    The outcome is also printed.
    """
    try:
        if db_path is None:
            # Resolved inside the try block so that an undefined global is still
            # reported as a failed connection rather than raising.
            db_path = file_name

        # sqlite3.connect() silently creates an empty database when the file does
        # not exist, which would report success here and only fail at query time.
        if str(db_path) != ':memory:' and not Path(db_path).is_file():
            raise FileNotFoundError(f'database file not found: {db_path}')

        conn = sqlite3.connect(db_path)
        print('[+] Local Connection Successful')
    except Exception as e: 
        print(f'[+] Local Connection Failed: {e}')
        conn = None

    return conn

def get_records(sql_query, connection=None):
    """Run a SQL query and return the result set as a pandas DataFrame.

    Parameter:
    sql_query (str): SQL statement to execute.
    connection (sqlite3.Connection, optional): connection to query. When omitted, the
        notebook-level ``conn`` created by "initiate_local_connection()" is used.

    Returns:
    Pandas dataframe holding the fetched rows. Column labels are the default
    integers (0, 1, 2, ...) because column names are not read from the cursor.
    None when the query fails; the error is printed rather than raised.
    """
    cursor = None
    try:
        if connection is None:
            # Looked up inside the try block so a missing/None global connection
            # is reported through the same error message as any other failure.
            connection = conn

        # create a cursor object and execute the given SQL syntax 
        cursor = connection.cursor()
        cursor.execute(sql_query)

        # Fetch all the records from SQL query output
        results = cursor.fetchall()

        # Convert results into pandas dataframe
        df = pd.DataFrame(results)

        print(f'Successfully retrieved records')

        return df

    except Exception as e:
        print(f'Error encountered: {e}')
        return None

    finally:
        # Release the cursor whether or not the query succeeded.
        if cursor is not None:
            cursor.close()

In [None]:
# Path of the SQLite database file; initiate_local_connection() reads this global.
file_name = "data/survive.db"

# conn is None if the connection attempt failed (the failure is printed).
conn = initiate_local_connection()

In [None]:
# Select every row and column of the `survive` table.
sql_query = f'SELECT * FROM survive'

# get_records() prints the error and yields None if the query fails.
df = get_records(sql_query)

## Problem statement
### * Objective is to predict survival of coronary artery disease using dataset provided.
### * The proposed solution should help doctors formulate pre-emptive medical treatments.

## Preprocessing of data
##### * This section will contain the data cleaning performed on the data.

* First look of the data provided

In [None]:
# Preview the first five rows; columns are still labelled 0..15 at this point.
df.head()

* Looking at the shape of the dataset that we are provided - 15000 rows, 16 columns

In [None]:
# (number of rows, number of columns)
df.shape

* Labelling of the columns based of assessment write up

In [None]:
# Column labels in positional order: position i in this list replaces the
# integer label i that get_records() left on the frame.
df = df.rename(columns=dict(enumerate([
    'ID',
    'Survive',
    'Gender',
    'Smoke',
    'Diabetes',
    'Age',
    'Ejection_Fraction',
    'Sodium',
    'Creatinine',
    'Platelets',
    'Creatine_Phosphokinase',
    'Blood_Pressure',
    'Hemoglobin',
    'Height',
    'Weight',
    'Favourite_Color',
])))

df

#### * The target variable "Survive" is found within the dataset.
#### * This can be considered Supervised Learning problem.


#### Checking and removing of duplicates found within "ID" column

In [None]:
# Count rows whose ID repeats an earlier row, then keep only the first occurrence.
duplicated_id = df.duplicated(subset=['ID']).sum()
print(f'There are {duplicated_id} duplicated ID')
# Rebind instead of mutating with inplace=True, consistent with the other cleaning cells.
df = df.drop_duplicates(subset='ID', keep='first')

#### Dropping "ID" and "Favourite_Color" as they are irrelevant columns for the problem statement.
* 14042 rows remaining

In [None]:
# Both the row identifier and the favourite colour are removed from the frame.
df = df.drop(columns=['ID','Favourite_Color'])
df

#### Checking the values for each column to see if there are any errors.
---
* Error detected, Values that can be corrected according to table below

|Column Name|Error Type|Correction to perform|Remarks for potential feature engineering|
|-|-|-|-|
|Survive|Contains 'No' and 'Yes' alongside '0' and '1'|'No' and 'Yes' to be replaced with '0' and '1' respectively according to write up|Possible binary encoding to 0 and 1|
|Smoke|Contains 2 versions of No and Yes|To replace with 'No' and 'Yes' accordingly|Possible binary encoding to 0 and 1|
|Diabetes|NA|NA|Possible ordinal encoding for Normal>Pre-diabetes>Diabetes based on severity|
|Age|Negative values are present within the dataset|Assuming the negative values are a result of entry error, the absolute value is used as the correct age|Scaling to be performed as the number range within dataset is varied|
|Ejection_Fraction|Contains 'L' and 'N'|To be replaced with 'Low' and 'Normal' assuming that was the intended value|Possible ordinal encoding for Low > Normal > High based on strength |
|Sodium|NA|NA|Scaling to be performed as the number range within dataset is varied|
|Creatinine|Nan present|To check amount of missing values|Scaling to be performed as the number range within dataset is varied|
|Platelets|NA|NA|Scaling to be performed as the number range within dataset is varied|
|Creatine_Phosphokinase|NA|NA|Scaling to be performed as the number range within dataset is varied|
|Blood_Pressure|NA|NA|Scaling to be performed as the number range within dataset is varied|
|Hemoglobin|NA|NA|Scaling to be performed as the number range within dataset is varied|
|Height|NA|NA|Possible to create new feature BMI with Weight| 
|Weight|NA|NA|Possible to create new feature BMI with Height|

In [None]:
# Print the distinct values of every column to spot inconsistent encodings.
for cols in df.columns:
    print (f'The unique values in this column {cols} are:')
    print (df[cols].unique())
    print ()

#### Replacing the unique values with appropriate ones

In [None]:
def replace_value(df: pd.DataFrame) -> pd.DataFrame:
    """Standardise inconsistent values in the Survive, Smoke, Ejection_Fraction and Age columns.

    'Survive': 'No' / 'Yes' become the strings '0' / '1' (other values are left as they are)
    'Smoke': 'NO' / 'YES' become 'No' / 'Yes'
    'Ejection_Fraction': 'L' / 'N' become 'Low' / 'Normal'
    'Age': negative values are assumed to be entry errors and replaced by their absolute value

    Parameter:
    df: pandas dataframe containing the four columns above

    Returns:
    df: the same dataframe object, with the four columns overwritten
    """
    # Assign the result back to the column: calling replace(..., inplace=True) on a
    # column selection is chained assignment, which pandas warns about and which
    # does not update the frame when Copy-on-Write is enabled.
    df['Survive'] = df['Survive'].replace({'No': '0', 'Yes': '1'})
    df['Smoke'] = df['Smoke'].replace({'NO': 'No', 'YES': 'Yes'})
    df['Ejection_Fraction'] = df['Ejection_Fraction'].replace({'L': 'Low', 'N': 'Normal'})
    df['Age'] = df['Age'].abs()
    return df

Checking the results of the transformation

In [None]:
# Apply the value clean-up, then list the distinct values of the four affected columns.
df = replace_value(df)
for col in ['Survive', 'Smoke', 'Ejection_Fraction','Age']:
    print(df[col].unique())

#### Counting the amount of missing values 

In [None]:
# Number of missing values per column.
df.isnull().sum()

* Calculating the percentage of missing data
---
* Missing Values contributes to 3.32% of all observations
* Due to low missing values, they will be removed from the dataset.
* 13576 rows remaining

In [None]:
def calculate_missing_value_percentage(df: pd.DataFrame) -> pd.DataFrame:
    """Report, for every column, the share of rows whose value is missing.

    Parameter:
    df: pandas dataframe

    Returns:
    pandas dataframe with one row per input column and two columns:
    'column_name' and 'missing_value_percentage' (missing count / row count * 100)
    """
    # Missing count per column, divided by the row count and scaled to a percentage.
    percent_missing = df.isnull().sum().div(len(df)).mul(100)

    # Move the column labels out of the index into a regular 'column_name' column.
    return (
        percent_missing
        .rename_axis('column_name')
        .reset_index(name='missing_value_percentage')
    )

In [None]:
# Percentage of missing values for every column of the cleaned frame.
missing_value_percentage_df = calculate_missing_value_percentage(df)
print(missing_value_percentage_df)

In [None]:
# Remove the rows where Creatinine is missing.
df = df.dropna(subset=['Creatinine'])
# NOTE: this expression is not the last one in the cell, so its result is not displayed.
df.isnull().sum()
# info() prints the non-null count and dtype of every column.
df.info()

#### Creating a feature called BMI.
* Body mass index (BMI) is an estimate using height and weight. Commonly used as a body fat estimate. Healthy range is typically 19 to 24
* The formula used to calculate this:
$\text{BMI} = \dfrac{\text{weight (kg)}}{\text{height (m)}^2}$

* e.g. row 1 = 93kg / (1.8*1.8) = 28.7
* Height and Weight columns are dropped as they are not required anymore

In [None]:
# BMI = weight / height^2, rounded to one decimal place.
# Height is divided by 100 first (presumably centimetres -> metres; confirm against the data dictionary).
df['BMI'] = round(df['Weight']/((df['Height']/100)**2),1)
# The two source columns are dropped once BMI has been derived.
df = df.drop(columns=['Height','Weight'])
df

#### Taking a look at the data types assigned to each columns
----
Data type is correct for analysis

In [None]:
# Column dtypes and non-null counts after cleaning.
df.info()

In [None]:
# Display the cleaned frame (output is truncated by the display options).
df

## Exploratory Data Analysis (EDA)
### Univariate Analysis
* Here we split the data into continuous or categorical

In [None]:
# Categorical columns (including the target) for frequency plots.
cat_df = df[['Survive','Gender','Smoke','Diabetes','Ejection_Fraction']]
# Continuous columns; 'Survive' is appended so plots can be coloured by the target.
cont_df = df[['Age','Sodium','Creatinine','Platelets','Creatine_Phosphokinase','Blood_Pressure','Hemoglobin','BMI','Survive']]
# added Target variable into continous df for visualizations later on

In [None]:
# Checking that the categorical subset is correct
cat_df

In [None]:
# Checking that the continuous subset is correct
cont_df

* For categoricial data, checking the frequency to check the distribution
----
Findings
|Feature|Findings|
|-|-|
|Survive|32% positive class (survivor), 68% negative class (non-survivor), imbalanced class|
|Gender|65% males, 35% females|
|Smoke|67% Non-smoker, 33% smokers|
|Diabetes|59% Normal, 21% Pre-diabetes, 20% diabetes|
|Ejection_Fraction|88.2% weak heart, 11.4% normal, 0.4% strong| 

In [None]:
# One bar chart per categorical column, normalised to percentages (histnorm='percent').
for columns in cat_df.columns:
    fig = px.histogram(cat_df, x=columns, title=f'Histogram of {columns}', histnorm='percent')
    fig.show()                   

* For continuous data, checking the frequency to check the distribution
---
Findings
|Feature|Findings|
|-|-|
|Age|Binning of observations shows a normal distribution|
|Sodium|Seems like a normal distribution curve, some outliers detected|
|Creatinine|Counts are skewed to the lower range, transformation might be feasible, outliers detected|
|Platelets|Seems like a normal distribution curve with some outliers on the high side |
|Creatine_Phosphokinase|Counts are skewed to the lower range, transformation might be feasible, some outliers detected|
|Blood_Pressure|Distribution seems even|
|Hemoglobin|Distribution seems even|
|BMI|Seems like a normal distribution curve, binning of observations can be considered|

From `.describe()`
* Age, Sodium, Creatinine and Platelets have means that are close to the median (the 50% row), indicating a roughly symmetric distribution
* Creatine_Phosphokinase has a mean close to the 75th percentile (the 75% row), suggesting outliers that heavily skew the data.

In [None]:
# One histogram per column of cont_df (which also contains 'Survive'),
# with a box plot in the margin to make outliers visible.
for columns in cont_df.columns:
    fig = px.histogram(cont_df, x=columns, 
                       marginal='box',
                       title=f'Histogram of {columns}')
    fig.show() 

In [None]:
# Age histogram with a coarse bin count.
# The title names the column directly: the loop variable `columns` still holds the
# last column of the preceding loop and would label this plot with the wrong column.
fig = px.histogram(cont_df, x='Age', 
                   nbins= 12,
                   title='Histogram of Age')
fig.show() 

In [None]:
# BMI histogram with a coarse bin count.
# The title names the column directly: the loop variable `columns` still holds the
# last column of an earlier loop and would label this plot with the wrong column.
fig = px.histogram(cont_df, x='BMI', 
                   nbins= 6,
                   title='Histogram of BMI')
fig.show() 

In [None]:
# Summary statistics (count, mean, std, quartiles) of the numeric columns.
cont_df.describe()

## Bivariate analysis

* Looking at the proportion of target variable within each categorical feature
---
Findings when compared with each other
|Feature|Findings|
|-|-|
|Gender|Both genders have approximately half of the number of survivors than non-survivors|
|Smoke|Both smokers and non-smokers have approximately half of the number of survivors than non-survivors, only ~5% of smokers are female, non-smokers have equal male vs female|
|Diabetes|All diabetes categories have approximately half of the number of survivors than non-survivors, equal occurrences in pre-diabetes and diabetes|
|Ejection_Fraction|There were no surviors with High ejection fraction, for Normal category, there are 4x of non-survivors as survivors and for Low category, there are approximately half of the number of survivors than non-survivors and the portion of female increased with higher ejection_fraction|

Summary: 
* Given that this dataset is mild imbalanced (32%) with minority class (positive/survivors), similar ratios are found in Gender,Smoke and Diabetes features.
* Ejection_Fraction has a trend on higher mortality (lower chance of survival) with increasing ejection_fractions. 

Questions derived from observation:
* Majority of smokers are males, does it have an influence to survival? No, the proportion is similar to the imbalance of the whole dataset
    - An interesting finding, when observations were separated based on Gender and Smoke, only Female smokers had a different ratio, 3x more survivors.
* Are all High ejection_fraction observations non-diabetic? Yes, all of them are non-diabetic and none of them survived

In [None]:
# Counts of each categorical column, split by the target 'Survive'.
for columns in cat_df.columns:
    fig = px.histogram(cat_df, x=columns, 
                       color='Survive',
                       title=f'Histogram of {columns}', 
                        )
    fig.show()   

In [None]:
# Counts of each categorical column, split by 'Gender'.
for columns in cat_df.columns:
    fig = px.histogram(cat_df, x=columns, 
                       color='Gender',
                       # color_discrete_map={0:'red', 1:'blue'},
                       title=f'Histogram of {columns}', 
                        )
    fig.show()   

In [None]:
# Counts of each categorical column, split by 'Smoke'.
for columns in cat_df.columns:
    fig = px.histogram(cat_df, x=columns, 
                       color='Smoke',
                       # color_discrete_map={0:'red', 1:'blue'},
                       title=f'Histogram of {columns}', 
                        )
    fig.show()   

In [None]:
# Counts of each categorical column, split by 'Diabetes'.
for columns in cat_df.columns:
    fig = px.histogram(cat_df, x=columns, 
                       color='Diabetes',
                       # color_discrete_map={0:'red', 1:'blue'},
                       title=f'Histogram of {columns}', 
                        )
    fig.show() 

In [None]:
# Counts of each categorical column, split by 'Ejection_Fraction'.
for columns in cat_df.columns:
    fig = px.histogram(cat_df, x=columns, 
                       color='Ejection_Fraction',
                       # color_discrete_map={0:'red', 1:'blue'},
                       title=f'Histogram of {columns}', 
                        )
    fig.show() 

In [None]:
# Number of rows for every Gender / Smoke / Survive combination.
# NOTE: despite the `_df` suffix this name holds a pandas GroupBy object, not a DataFrame.
gender_smoke_df = df.groupby(['Gender', 'Smoke','Survive'])
gender_smoke_df.size()

In [None]:
# Number of rows for every Diabetes / Ejection_Fraction / Survive combination.
# NOTE: despite the `_df` suffix this name holds a pandas GroupBy object, not a DataFrame.
diabetes_ef_df = df.groupby(['Diabetes', 'Ejection_Fraction','Survive'])
diabetes_ef_df.size()

* Comparing target variable with the continous features
---
* Findings when compared with each other
|Feature|Findings|
|-|-|
|Age|NA|
|Sodium|NA|
|Creatinine|Positive class occupies the higher ranges|
|Platelets|NA|
|Creatine_Phophokinase|NA|
|Blood_Pressure|NA|
|Hemoglobin|NA|
|BMI|Positive Class occupies the higher value range|

Summary: 
* Creatinine and BMI are the only features that seems to have a difference between the positive and negative classes of the target
* Ejection_Fraction has a trend on higher mortality (lower chance of survival) with increasing ejection_fractions.
* Not much insights could be drawn when the features were compared to each other.

In [None]:
# Distribution of each continuous column split by the target, with marginal box plots.
# cont_df includes 'Survive' itself, so the last figure plots the target against itself.
for columns in cont_df.columns:
    fig = px.histogram(cont_df, x=columns,
                       color='Survive',
                       marginal='box',
                       title=f'Histogram of {columns}')
    fig.show() 

In [None]:
# Scatter of every continuous column, coloured by Age.
# The title says "Scatter" because these figures are scatter plots, not histograms.
for columns in cont_df.columns:
    fig = px.scatter(cont_df, x=columns,
                       color='Age',
                       title=f'Scatter of {columns} coloured by Age')
    fig.show() 

In [None]:
# Scatter of every continuous column, coloured by Sodium.
# The title says "Scatter" because these figures are scatter plots, not histograms.
for columns in cont_df.columns:
    fig = px.scatter(cont_df, x=columns,
                       color='Sodium',
                       title=f'Scatter of {columns} coloured by Sodium')
    fig.show() 

In [None]:
# Scatter of every continuous column, coloured by Creatinine.
# The title says "Scatter" because these figures are scatter plots, not histograms.
for columns in cont_df.columns:
    fig = px.scatter(cont_df, x=columns,
                       color='Creatinine',
                       title=f'Scatter of {columns} coloured by Creatinine')
    fig.show() 

In [None]:
# Scatter of every continuous column, coloured by Platelets.
# The title says "Scatter" because these figures are scatter plots, not histograms.
for columns in cont_df.columns:
    fig = px.scatter(cont_df, x=columns,
                       color='Platelets',
                       title=f'Scatter of {columns} coloured by Platelets')
    fig.show() 

In [None]:
# Scatter of every continuous column, coloured by Creatine_Phosphokinase.
# The title says "Scatter" because these figures are scatter plots, not histograms.
for columns in cont_df.columns:
    fig = px.scatter(cont_df, x=columns,
                       color='Creatine_Phosphokinase',
                       title=f'Scatter of {columns} coloured by Creatine_Phosphokinase')
    fig.show() 

In [None]:
# Scatter of every continuous column, coloured by Blood_Pressure.
# The title says "Scatter" because these figures are scatter plots, not histograms.
for columns in cont_df.columns:
    fig = px.scatter(cont_df, x=columns,
                       color='Blood_Pressure',
                       title=f'Scatter of {columns} coloured by Blood_Pressure')
    fig.show() 

In [None]:
# Scatter of every continuous column, coloured by Hemoglobin.
# The title says "Scatter" because these figures are scatter plots, not histograms.
for columns in cont_df.columns:
    fig = px.scatter(cont_df, x=columns,
                       color='Hemoglobin',
                       title=f'Scatter of {columns} coloured by Hemoglobin')
    fig.show() 

In [None]:
# Scatter of every continuous column, coloured by BMI.
# The title says "Scatter" because these figures are scatter plots, not histograms.
for columns in cont_df.columns:
    fig = px.scatter(cont_df, x=columns,
                       color='BMI',
                       title=f'Scatter of {columns} coloured by BMI')
    fig.show() 

### Log Transformation for skewed columns
* 4 columns were observed to have some skewness to it
* Based on variance, Platelets, Blood_Pressure, Creatinine and Creatine_Phosphokinase have very high variance.
* Log normalization can help to transform the data
* Variance for Platelets and Blood_Pressure is reduced.
* Skewness of Creatinine and Creatine_Phosphokinase is reduced.
* Outliers are still present in log_Platelets and log_Creatinine
* All 4 features will assume the log normalization
  

Checking variance

In [None]:
# Variance of the numeric columns only: df still holds string columns here
# (Gender, Smoke, Diabetes, ...) and pandas >= 2.0 raises on them unless
# numeric_only=True is passed.
print(df.var(numeric_only=True))

In [None]:
def log_norm(df: pd.DataFrame, target_column:str) -> pd.DataFrame:
    """Add the natural logarithm of one column as a new column prefixed with ``log_``.

    Parameter:
    df: pandas dataframe; it is modified in place
    target_column (str): name of the column to transform

    Returns:
    df: the same dataframe, now containing the extra column 'log_<target_column>'
    """
    log_column = f'log_{target_column}'
    df[log_column] = df[target_column].pipe(np.log)
    return df

def log_norm_all(df: pd.DataFrame):
    """Apply log_norm() to the four skewed / high-variance columns.

    Parameter:
    df: pandas dataframe containing Platelets, Creatine_Phosphokinase,
        Blood_Pressure and Creatinine

    Returns:
    df: the dataframe with a 'log_' column added for each of the four columns
    """
    for skewed_column in ('Platelets', 'Creatine_Phosphokinase', 'Blood_Pressure', 'Creatinine'):
        df = log_norm(df, skewed_column)
    return df

Applying the log normalization

In [None]:
# Adds the four log_* columns; the original columns are kept alongside them.
df = log_norm_all(df)
df

Checking the variance post application

In [None]:
# Variance of the numeric columns only: df still holds string columns here and
# pandas >= 2.0 raises on them unless numeric_only=True is passed.
df.var(numeric_only=True)

Visualizing the change

In [None]:
# Each original column placed next to its log-transformed counterpart for comparison.
compare_log_df = df[['Platelets','log_Platelets',
                     'Creatine_Phosphokinase','log_Creatine_Phosphokinase',
                     'Blood_Pressure','log_Blood_Pressure',
                     'Creatinine','log_Creatinine']
                        ]
compare_log_df.describe()

In [None]:
# Histogram plus marginal box plot for every original / log-transformed column.
for columns in compare_log_df.columns:
    fig = px.histogram(compare_log_df, x=columns, 
                       marginal='box',
                       title=f'Histogram of {columns}')
    fig.show() 

### Correlation between features
* Target is positively correlated to Age, Creatinine and BMI
* Target is negatively correlated to Sodium  
* Age is correlated to Creatinine and BMI and target
* Sodium is negatively correlated to target and Creatinine
* Sodium and BMI is negatively correlated
* Creatinie is highly correlated to target variable
* Creatine_Phosphokinase and Hemoglobin are correlated
* BMI is positively correlated to Age, Creatinie, Blood_Pressure
* BMI is negatively correlated to Hemoglobin

In [None]:
# Pairwise correlation of the numeric columns only: the categorical columns are
# still strings at this point and pandas >= 2.0 raises on them unless
# numeric_only=True is passed.
df.corr(numeric_only=True)

In [None]:
# Pairwise scatter/distribution grid, coloured by the target 'Survive'.
g = sns.pairplot(df, hue="Survive", palette="husl")

### Comparing categoricial and continuous features
* No meaningful differences 

In [None]:
# Display the current frame (output is truncated by the display options).
df

In [None]:
# Distribution of every column of df, split by 'Gender'.
for columns in df.columns:
    fig = px.histogram(df, x=columns, 
                       color='Gender',
                       title=f'Histogram of {columns}')
    fig.show() 

In [None]:
# Distribution of every column of df, split by 'Smoke'.
for columns in df.columns:
    fig = px.histogram(df, x=columns, 
                       color='Smoke',
                       title=f'Histogram of {columns}')
    fig.show() 

In [None]:
# Distribution of every column of df, split by 'Diabetes'.
for columns in df.columns:
    fig = px.histogram(df, x=columns, 
                       color='Diabetes',
                       title=f'Histogram of {columns}')
    fig.show() 

In [None]:
# Distribution of every column of df, split by 'Ejection_Fraction'.
for columns in df.columns:
    fig = px.histogram(df, x=columns, 
                       color='Ejection_Fraction',
                       title=f'Histogram of {columns}')
    fig.show() 

### Summary of EDA
##### Univariate 
* Imbalanced class for target variable at positive 32% vs negative 68%
* Gender Male 65% Female 35%
* Non Smoker 67% Smoker 33%
* Diabetes 20%, Pre-diabetes 21%, Normal 59% 
* Sodium, Creatinine, Platelets, Creatine_Phosphokinase needs to handle outliers
* Creatinine and BMI have differences in terms of positive and negative classes
* Log transformation performed on Multiple columns to reduce the variance 

##### Bivariate
* ~5% of smokers are females
* Ejection Fraction has a trend of higher mortality with increased heart strength
* BMI and Creatinine has noticeable difference between positive and negative classes

##### Manipulations done up to this point
* Value Standardization for Survive, Smoke, Ejection_Fraction and Age
* Removing observations with missing data from Creatinine
* Duplicates dropped based on ID
* Features created = BMI
* Features dropped = ID, Weight, Height, Favourite Colour
* Log normalization for Platelets, Blood_Pressure, Creatinine and Creatine_Phosphokinase


## Feature Engineering
---
##### Handling of outliers
* Sodium, Creatinine, Platelets, Creatine_Phosphokinase needs to remove outliers

Visualizing dataset

In [None]:
# Distribution of every column split by the target, with marginal box plots.
# 'Survive' holds the strings '0' / '1' at this stage (it is label-encoded to
# integers only later), so the colour map carries string keys; the integer keys
# are kept so the cell also works when re-run after encoding.
for columns in df.columns:
    fig = px.histogram(df, x=columns, 
                       color='Survive',
                       color_discrete_map={'0': 'red', '1': 'blue', 0: 'red', 1: 'blue'},
                       marginal='box',
                       title=f'Histogram of {columns}', 
                        )
    fig.show()   

In [None]:
def remove_outlier(df: pd.DataFrame, target_column:str , lower_limit_percentile, upper_limit_percentile):
    '''Keep only the rows whose value in one column lies strictly between two quantiles.

    Parameters:
    df: Pandas Dataframe
    target_column: name of the column to filter on
    lower_limit_percentile: quantile (0-1) below which rows are discarded
    upper_limit_percentile: quantile (0-1) above which rows are discarded

    Returns:
    df: filtered dataframe; rows equal to either quantile value are discarded as well'''
    values = df[target_column]
    lower_lim = values.quantile(lower_limit_percentile)
    upper_lim = values.quantile(upper_limit_percentile)
    # Both bounds are exclusive.
    within_bounds = values.between(lower_lim, upper_lim, inclusive='neither')
    return df[within_bounds]

def remove_outlier_all(df: pd.DataFrame):
    """Trim four features in sequence to the range between their 5th and 95th percentiles.

    Each filter runs on the output of the previous one, so the quantiles of the
    later features are computed on the already-reduced dataframe.

    Parameter:
    df: pandas dataframe containing Sodium, log_Platelets,
        log_Creatine_Phosphokinase and log_Creatinine

    Returns:
    df: filtered dataframe
    """
    for feature in ('Sodium', 'log_Platelets', 'log_Creatine_Phosphokinase', 'log_Creatinine'):
        df = remove_outlier(df, feature, .05, 0.95)
    return df

* Checking dataframe after removal of outliers
* Left with 8209 rows

In [None]:
# Apply the four percentile filters one after another and show the remaining rows.
df = remove_outlier_all(df)
df

* Visualize dataset after outlier treatment to see effects
* log_Creatinine and log_Platelets still has outliers
* Will not proceed with a smaller range as the number of observation would become too small. 

##### Encoding Features
* Label encoding for Gender and Smoke, as their values do not have a hierarchy within them
* Ordinal Encoding for Diabetes and Ejection Fraction

Label coding for Gender and Smoke features

In [None]:
# One encoder per column, so each fitted mapping (classes_) stays available for
# inverse_transform; refitting a single shared encoder would keep only the last one.
gender_encoder = LabelEncoder()
smoke_encoder = LabelEncoder()
# `label_encoder` keeps its original name and ends up fitted on 'Survive', as before.
label_encoder = LabelEncoder()

# df is the result of boolean filtering; work on an explicit copy before assigning columns.
df = df.copy()
df['Gender'] = gender_encoder.fit_transform(df['Gender'])
df['Smoke'] = smoke_encoder.fit_transform(df['Smoke'])
df['Survive'] = label_encoder.fit_transform(df['Survive'])

Checking if encoding was applied correctly

In [None]:
# Each encoded column should now contain integer codes only.
print(df['Gender'].unique())
print(df['Smoke'].unique())
print(df['Survive'].unique())
df

Ordinal Encoding for Diabetes and Ejection_Fraction features

Getting the unique values of the features

In [None]:
# Distinct category labels that the ordinal encoders below must cover.
print(df['Diabetes'].unique())
print(df['Ejection_Fraction'].unique())

Creating the list in ranked order for initializing of OrdinalEncoder

In [None]:
# Category order determines the code: the first entry becomes 0, the next 1, and so on.
diabetes_rank = ['Normal', 'Pre-diabetes', 'Diabetes']
ef_rank = ['Low', 'Normal', 'High']
# `categories` takes one list per encoded column, hence the extra brackets.
diabetes_encoder = OrdinalEncoder(categories = [diabetes_rank])
ef_encoder = OrdinalEncoder(categories = [ef_rank])

Applying the encoding

In [None]:
# The encoders expect 2-D input, hence the double brackets selecting a one-column frame.
df['Diabetes'] = diabetes_encoder.fit_transform(df[['Diabetes']])
df['Ejection_Fraction'] = ef_encoder.fit_transform(df[['Ejection_Fraction']])

Checking if encoding was applied correctly

In [None]:
# Both columns should now contain numeric codes instead of labels.
print(df['Diabetes'].unique())
print(df['Ejection_Fraction'].unique())
df

## Model Training

Checking data type - all correct
* Data is mild imbalanced with minority contributing to 30%. 
* Only stratification of the data will be applied for now
* Oversampling or undersampling techniques will be considered if problems arises later on.

In [None]:
# info() prints its report itself and returns None, so it is called directly
# rather than wrapped in print() (which would add a stray "None" line).
df.info()
# Class balance of the target.
print(df['Survive'].value_counts())

Defining X and Y 
* Trying a small feature set to compare results, using only as little features as possible
* As the model aims to aid doctors in preemptive medical treatments
* The features used should prioritise information that is readily available (e.g. BMI, gender, age)
* Instead of some information that might require additional testing 

In [None]:
# Three candidate feature sets were tried; the two commented-out ones are kept for reference.
# This set has overfitting on all model less logreg (Recall score 0.73)
# X = df.drop(columns = ['Survive','Platelets','Creatine_Phosphokinase','Blood_Pressure','Creatinine'])

# Adding on interesting features on the light weight features) (Recall score 0.72)
# X = df[['Age','log_Creatinine','BMI','Sodium','Gender','Smoke','Ejection_Fraction']]

# Trying this set with minimal features (light weight model) (Recall score 0.74)
X = df[['Age','log_Creatinine','BMI','Sodium']] 
y = df['Survive']
print(X.shape,y.shape)


Checking the dataframe to confirm the features for model training

In [None]:
# Feature matrix used for training.
X

Checking the dataframe to confirm the target feature for model training

In [None]:
# Target vector used for training.
y

Splitting dataset into 80% temp, 20% test

In [None]:
# Hold out 20% as the test set; stratify keeps the class ratio of y in both parts.
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=21)
print(X_temp.shape, X_test.shape, y_temp.shape, y_test.shape)

Splits the 80% temp dataset into 60% train and 20% validation (of the full dataset)

In [None]:
# 25% of the remaining 80% becomes the validation set, i.e. 60% train / 20% validation overall.
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, stratify=y_temp, random_state=21)
# Report the shapes of the two splits created here (training and validation).
print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)

Verifying the number of positive and negative classes after train_test_split

In [None]:
# Class counts in the full target and in each split, to confirm stratification kept the ratio.
print(f'Number of observation in the target variable is')
print(y.value_counts())
print()
print(f'Number of observation for target variable in Training set is')
print(y_train.value_counts())
print()
print(f'Number of observation for target variable in Validation set is')
print(y_val.value_counts())
print()
print(f'Number of observation for target variable in Testing set is')
print(y_test.value_counts())

* Fitting the scaler on X_train
* Applying the fitted scaling to X_val and X_test

In [None]:
# Fit the scaler on the training split only, then reuse its statistics for the
# test and validation splits so no information from them leaks into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_val_scaled = scaler.transform(X_val)

Verifying the changes applied to both training and testing sets

In [None]:
# Wrap the scaled array in a DataFrame with the feature names for inspection.
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X.columns)
X_train_scaled_df

In [None]:
# After standard scaling, every training column should have a variance close to 1.
print(X_train_scaled_df.var())

In [None]:
# Wrap the scaled test array in a DataFrame with the feature names for inspection.
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X.columns)
X_test_scaled_df

In [None]:
# Only approximately 1 here, because the scaler was fitted on the training split.
print(X_test_scaled_df.var())

## Model Training

### Choosing the correct metrics to evaluate models 
* Precision measures ratio of true positives against **all positive predictions** were correct
* Recall measures ratio of true positives against **all positive truths** in dataset
* False Positive = ( Model predicts 1, Truth is 0) (model says patient would survive when they **wont**)
* False Negative = ( Model predicts 0,Truth is 1) (model says patient would not survive when they **will**)
* In the context of the problem, it is more important to know the survival of the patient to administer preemptive treatment.
* A false positive is more costly than a false negative
* Therefore, it is more important to identify patients who will survive correctly. (Low False Positives rate)
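* In ML context, this means to observe a **high recall** rate (instead of precision). Note for review: recall = TP / (TP + FN) is lowered by false negatives, whereas false positives lower precision = TP / (TP + FP), so the choice of recall should be re-checked against the cost argument above.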

## Training a Logistic Regression base model 

* Training a base model to observe performance
* Accuracy score of 0.852 on validation data
* Accuracy score of 0.857 on training data
* As the scores are similar, it can be concluded that this model is not overfitting or under fitting 

Generating a base model

In [None]:
# Instantiate Model
logreg_model = LogisticRegression()

# Fit the model with training data
# fit() returns the estimator itself, so `logreg` and `logreg_model` are the same fitted object.
logreg = logreg_model.fit(X_train_scaled, y_train)

# Passing in scaled validation feature data (X_val_scaled) in fitted model to obtain predictions 
logreg_y_predict = logreg.predict(X_val_scaled)

# Flatten the 2x2 confusion matrix into its four cells.
tn, fp, fn, tp = confusion_matrix(y_val, logreg_y_predict).ravel()
lr_validation_set_score = accuracy_score(y_val, logreg_y_predict)
# Printing the results when comparing predicts of scaled validation feature data (X_val_scaled) against ground truth (y_val)
print(confusion_matrix(y_val, logreg_y_predict))
print(tn, fp, fn, tp)
print(classification_report(y_val, logreg_y_predict))
print(accuracy_score(y_val, logreg_y_predict))


* The recall score is a bit low. 
* We can use accuracy score of both training and test set to determine if model is underfitting or overfitting

Generating predictions on the training data using the model (trained with training data)

In [None]:
# This block of code uses the same fitted model above to generate predictions on the training data itself 
# if the score is clearly higher than on the validation set, the model may be overfitting  
lr_train_y_predict = logreg_model.predict(X_train_scaled)
lr_training_set_score = accuracy_score(y_train, lr_train_y_predict)
print(confusion_matrix(y_train, lr_train_y_predict))
# Print the four cells of the *training* confusion matrix; the names tn/fp/fn/tp
# still hold the validation-set values and are deliberately left untouched.
print(*confusion_matrix(y_train, lr_train_y_predict).ravel())
print(classification_report(y_train, lr_train_y_predict))
print(accuracy_score(y_train, lr_train_y_predict))

Training accuracy is higher than validation accuracy by only a small amount.
Thus the model is generalising well to unseen data.

In [None]:
print(f'The accuracy on the training set is '+ str(lr_training_set_score))
print(f'The accuracy on the validation set is '+ str (lr_validation_set_score))

# A training score that is merely higher than the validation score is expected;
# only a clear gap hints at overfitting. The 0.05 tolerance is a rule of thumb.
# Underfitting cannot be read from the gap at all (it shows as both scores being low),
# so the comparison no longer reports it.
lr_accuracy_gap = lr_training_set_score - lr_validation_set_score
if lr_accuracy_gap > 0.05:
    print (f'This model might be overfitting')
else:
    print (f'Training and validation accuracy are within 0.05 of each other; no sign of overfitting')

* Performing cross_validation on the trained logistic regression model
* Using the model trained earlier, scaled training data, 10 folds and looking at recall
* Average recall score was 0.74


In [None]:
# Defining the scoring metric (recall of the positive class); reused by later cells.
recall_scorer = make_scorer(recall_score)
# Kfold: 10 shuffled folds with a fixed seed. This is plain KFold, so folds are not stratified by class.
kf = KFold(n_splits=10, shuffle=True, random_state = 21)
# Obtaining the cross_validation_scores (recall)
# cross_val_score fits clones of the estimator, one per fold.
cv_results = cross_val_score(logreg_model, X_train_scaled, y_train, cv=kf, scoring=recall_scorer)

print(cv_results)
print(cv_results.mean())

* Accuracy score give a general gauge of how well the model is performing
* If training accuracy is high but testing accuracy is low, means the model is overfitting 
* If both training and testing accuracy is low this could be model underfitting.

## Generate a base model using all classifiers
---
* Based on recall score, the weaker models are Logistic_Regression and SVM. 
* As most models have comparable results . It would make sense to choose models that are more efficient and easier to train
* This will reduce training times and require less computation resources
* Models chosen in the end KNN, Decision Tree and XGB

In [None]:
# Baseline (default hyper-parameter) version of every candidate classifier.
# The randomised estimators get the same fixed seed used for the data splits,
# so the comparison below is repeatable between runs.
models = {"Logistic Regression": LogisticRegression(), 
          "KNN": KNeighborsClassifier(),
          "Random_Forest": RandomForestClassifier(random_state=21),
          "Decision_Tree": DecisionTreeClassifier(random_state=21),
          "SVM": svm.SVC(),
          "XGB": xgb.XGBClassifier(random_state=21),        
         }
# Fit each model on the training split and report its metrics on the validation split.
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    model_y_predict = model.predict(X_val_scaled)
    print(confusion_matrix(y_val, model_y_predict))
    print(classification_report(y_val, model_y_predict))
    print(f'The recall score is of '+ f'{name}: {recall_score(y_val, model_y_predict)}')
    print()


* performing kfold cross validation to get a more accurate metric on the models

In [None]:
# 10-fold cross-validated recall for every baseline model, compared side by side.
# The splitter does not depend on the model, so it is built once outside the loop;
# with an integer random_state it yields the same folds on every split() call.
kf = KFold(n_splits=10, random_state=21, shuffle=True)

results = []
for model in models.values():
    cv_results = cross_val_score(model, X_train_scaled, y_train, cv=kf, scoring=recall_scorer)
    results.append(cv_results)

# One box per model, labelled with the dictionary keys; title and axis label
# let the figure stand on its own.
plt.boxplot(results, labels=list(models.keys()))
plt.title("10-fold cross-validated recall by model")
plt.ylabel("Recall")
plt.show()

# Checking of fitting for KNN, Decision Tree and XGB

KNN cross validation

In [None]:
# Build a default KNN classifier and fit it on the scaled training data
knn = KNeighborsClassifier().fit(X_train_scaled, y_train)

# Predict the scaled validation features (X_val_scaled) with the fitted model
knn_y_predict = knn.predict(X_val_scaled)

# Compare the validation predictions against the ground truth (y_val):
# confusion matrix, its four cells, per-class report and overall accuracy
knn_val_cm = confusion_matrix(y_val, knn_y_predict)
tn, fp, fn, tp = knn_val_cm.ravel()
knn_validation_set_score = accuracy_score(y_val, knn_y_predict)

print(knn_val_cm)
print(tn, fp, fn, tp)
print(classification_report(y_val, knn_y_predict))
print(knn_validation_set_score)


In [None]:
# Use the fitted KNN model to predict the training data itself.
# A training score well above the validation score suggests overfitting.
knn_train_y_predict = knn.predict(X_train_scaled)
knn_training_set_score = accuracy_score(y_train, knn_train_y_predict)

# Derive tn/fp/fn/tp from the TRAINING confusion matrix; without this the print
# below would show the counts left over from the validation cell.
knn_train_cm = confusion_matrix(y_train, knn_train_y_predict)
tn, fp, fn, tp = knn_train_cm.ravel()
print(knn_train_cm)
print(tn, fp, fn, tp)
print(classification_report(y_train, knn_train_y_predict))
print(knn_training_set_score)

In [None]:
# Print the KNN training and validation accuracies next to each other
print(f'The accuracy on the training set is {knn_training_set_score}')
print(f'The accuracy on the validation set is {knn_validation_set_score}')

# Higher training than validation accuracy is reported as possible overfitting,
# every other case as possible underfitting.
print(
    'This model might be overfitting'
    if knn_training_set_score > knn_validation_set_score
    else 'This model might be underfitting'
)

* Performing cross_validation on the trained KNN model
* Using the model trained earlier, scaled training data, 10 folds and looking at recall
* Lowest recall score was 0.97
* Highest recall score was 1

In [None]:
# Scorer that makes cross_val_score report recall
recall_scorer = make_scorer(recall_score)

# Seeded, shuffled 10-fold splitter
kf = KFold(n_splits=10, shuffle=True, random_state=21)

# Recall of the KNN model on each of the 10 folds of the scaled training data
knn_cv_results = cross_val_score(
    knn,
    X_train_scaled,
    y_train,
    scoring=recall_scorer,
    cv=kf,
)

print(knn_cv_results)

Decision Tree cross validation

In [None]:
# Instantiate the decision tree. random_state is fixed (same seed as the KFold
# splitter) so the fitted tree and the metrics below are repeatable across runs.
dt = DecisionTreeClassifier(random_state=21)

# Fit the model with training data
dt = dt.fit(X_train_scaled, y_train)

# Predict the scaled validation features (X_val_scaled) with the fitted model
dt_y_predict = dt.predict(X_val_scaled)

# Compare the validation predictions against the ground truth (y_val)
tn, fp, fn, tp = confusion_matrix(y_val, dt_y_predict).ravel()
dt_validation_set_score = accuracy_score(y_val, dt_y_predict)
print(confusion_matrix(y_val, dt_y_predict))
print(tn, fp, fn, tp)
print(classification_report(y_val, dt_y_predict))
print(accuracy_score(y_val, dt_y_predict))


In [None]:
# Use the fitted decision tree to predict the training data itself.
# A training score well above the validation score suggests overfitting.
dt_train_y_predict = dt.predict(X_train_scaled)
dt_training_set_score = accuracy_score(y_train, dt_train_y_predict)

# Derive tn/fp/fn/tp from the TRAINING confusion matrix; without this the print
# below would show the counts left over from the validation cell.
dt_train_cm = confusion_matrix(y_train, dt_train_y_predict)
tn, fp, fn, tp = dt_train_cm.ravel()
print(dt_train_cm)
print(tn, fp, fn, tp)
print(classification_report(y_train, dt_train_y_predict))
print(dt_training_set_score)

In [None]:
# Print the decision tree training and validation accuracies next to each other
print(f'The accuracy on the training set is {dt_training_set_score}')
print(f'The accuracy on the validation set is {dt_validation_set_score}')

# Higher training than validation accuracy is reported as possible overfitting,
# every other case as possible underfitting.
print(
    'This model might be overfitting'
    if dt_training_set_score > dt_validation_set_score
    else 'This model might be underfitting'
)

* Performing cross_validation on the trained decision tree model
* Using the model trained earlier, scaled training data, 10 folds and looking at recall
* Lowest recall score was 0.99
* Highest recall score was 1

In [None]:
# Scorer that makes cross_val_score report recall
recall_scorer = make_scorer(recall_score)

# Seeded, shuffled 10-fold splitter
kf = KFold(n_splits=10, shuffle=True, random_state=21)

# Recall of the decision tree on each of the 10 folds of the scaled training data
dt_cv_results = cross_val_score(
    dt,
    X_train_scaled,
    y_train,
    scoring=recall_scorer,
    cv=kf,
)

print(dt_cv_results)

XGB cross validation

In [None]:
# Build an XGBoost classifier with default settings and fit it on the scaled training data
xgb_model = xgb.XGBClassifier().fit(X_train_scaled, y_train)

# Predict the scaled validation features (X_val_scaled) with the fitted model
xgb_y_predict = xgb_model.predict(X_val_scaled)

# Compare the validation predictions against the ground truth (y_val):
# confusion matrix, its four cells, per-class report and overall accuracy
xgb_val_cm = confusion_matrix(y_val, xgb_y_predict)
tn, fp, fn, tp = xgb_val_cm.ravel()
xgb_validation_set_score = accuracy_score(y_val, xgb_y_predict)

print(xgb_val_cm)
print(tn, fp, fn, tp)
print(classification_report(y_val, xgb_y_predict))
print(xgb_validation_set_score)


* We can use accuracy score of both training and test set to determine if model is underfitting or overfitting

Generating predictions on the training data using the model (trained with training data)

In [None]:
# Use the fitted XGBoost model to predict the training data itself.
# A training score well above the validation score suggests overfitting.
xgb_train_y_predict = xgb_model.predict(X_train_scaled)
xgb_training_set_score = accuracy_score(y_train, xgb_train_y_predict)

# Derive tn/fp/fn/tp from the TRAINING confusion matrix; without this the print
# below would show the counts left over from the validation cell.
xgb_train_cm = confusion_matrix(y_train, xgb_train_y_predict)
tn, fp, fn, tp = xgb_train_cm.ravel()
print(xgb_train_cm)
print(tn, fp, fn, tp)
print(classification_report(y_train, xgb_train_y_predict))
print(xgb_training_set_score)

Training accuracy is only slightly higher than validation accuracy.
Thus the model is generalising well to unseen data.

In [None]:
# Print the XGBoost training and validation accuracies next to each other
print(f'The accuracy on the training set is {xgb_training_set_score}')
print(f'The accuracy on the validation set is {xgb_validation_set_score}')

# Higher training than validation accuracy is reported as possible overfitting,
# every other case as possible underfitting.
print(
    'This model might be overfitting'
    if xgb_training_set_score > xgb_validation_set_score
    else 'This model might be underfitting'
)

In [None]:
# Scorer that makes cross_val_score report recall
recall_scorer = make_scorer(recall_score)

# Seeded, shuffled 10-fold splitter
kf = KFold(n_splits=10, shuffle=True, random_state=21)

# Recall of the XGBoost model on each of the 10 folds of the scaled training data
cv_results = cross_val_score(
    xgb_model,
    X_train_scaled,
    y_train,
    scoring=recall_scorer,
    cv=kf,
)

print(cv_results)

## Tuning hyperparameters for chosen models

* Models are showing good performance; however, with some hyperparameter tuning they might perform even better
* For decision tree only as KNN and XGBoost are achieving good performance

Randomsearch for KNN model

In [None]:
# Hyperparameter space to sample from for KNN
param_dist = {"n_neighbors": range(2,15),
                'weights' : ['uniform','distance'],
                'metric' : ['minkowski','euclidean','manhattan']}

# Randomized search evaluates only n_iter sampled combinations instead of the
# full grid. random_state fixes which combinations are sampled, so re-running
# the notebook reproduces the same search.
random_search_knn = RandomizedSearchCV(
                    estimator = knn,
                    param_distributions = param_dist,
                    n_iter = 50,
                    cv=kf,
                    scoring = recall_scorer,
                    random_state = 21)

random_search_knn.fit(X_train_scaled, y_train)

In [None]:
# Mean cross-validated recall of the best parameter combination
print(random_search_knn.best_score_)

# Best Parameters
print(random_search_knn.best_params_)

# Best estimator: best_estimator_ is the model refitted with the best parameters.
# (.estimator is only the untuned base estimator that was passed in.)
print(random_search_knn.best_estimator_)

Random search for Decision tree model

In [None]:
# Hyperparameter space to sample from for the decision tree
param_dist = {"max_depth": range(2,15),
                "max_features": range(2, 15),
                "min_samples_split": range(2, 15)}

# Randomized search evaluates only n_iter sampled combinations instead of the
# full grid. random_state fixes which combinations are sampled, so re-running
# the notebook reproduces the same search.
random_search_dt = RandomizedSearchCV(
                    estimator = dt,
                    param_distributions = param_dist,
                    n_iter = 50,
                    cv=kf,
                    scoring = recall_scorer,
                    random_state = 21)

random_search_dt.fit(X_train_scaled, y_train)

In [None]:
# Mean cross-validated recall of the best parameter combination
print(random_search_dt.best_score_)

# Best Parameters
print(random_search_dt.best_params_)

# Best estimator: best_estimator_ is the model refitted with the best parameters.
# (.estimator is only the untuned base estimator that was passed in.)
print(random_search_dt.best_estimator_)

Random search XGB 

In [None]:
# Hyperparameter space to sample from for XGBoost
param_dist = {"eta": [0.01,0.1,0.2,0.5,1],
                "max_depth": range(2, 15),
                }

# Randomized search evaluates only n_iter sampled combinations instead of the
# full grid. random_state fixes which combinations are sampled, so re-running
# the notebook reproduces the same search.
random_search_xgb = RandomizedSearchCV(
                    estimator = xgb_model,
                    param_distributions = param_dist,
                    n_iter = 50,
                    cv=kf,
                    scoring = recall_scorer,
                    random_state = 21)

random_search_xgb.fit(X_train_scaled, y_train)

In [None]:
# Mean cross-validated recall of the best parameter combination
print(random_search_xgb.best_score_)

# Best Parameters
print(random_search_xgb.best_params_)

# Best estimator: best_estimator_ is the model refitted with the best parameters.
# (.estimator is only the untuned base estimator that was passed in.)
print(random_search_xgb.best_estimator_)

* Obtained best parameters for both,
* Testing models with new parameters on validation set again 

In [None]:
# Instantiate the decision tree with tuned hyperparameters.
# NOTE(review): these values are presumably copied from random_search_dt.best_params_
# of an earlier run - confirm they still match the current search output.
# random_state is fixed because max_features=5 makes each split consider a random
# subset of features; without a seed the fitted tree changes between runs.
dt_best_params = DecisionTreeClassifier(min_samples_split = 7, max_features= 5, max_depth= 12,
                                        random_state = 21)

# Fit the model with training data
dt_best_params = dt_best_params.fit(X_train_scaled, y_train)

# Predict the scaled validation features (X_val_scaled) with the fitted model
dt_best_y_predict = dt_best_params.predict(X_val_scaled)

# Compare the validation predictions against the ground truth (y_val)
tn, fp, fn, tp = confusion_matrix(y_val, dt_best_y_predict).ravel()
dt_best_validation_set_score = accuracy_score(y_val, dt_best_y_predict)
print(confusion_matrix(y_val, dt_best_y_predict))
print(tn, fp, fn, tp)
print(classification_report(y_val, dt_best_y_predict))
print(accuracy_score(y_val, dt_best_y_predict))


In [None]:
# Predict the training data with the TUNED decision tree (dt_best_params).
# Previously this cell predicted with the untuned `dt` and then scored
# `dt_train_y_predict`, so every number shown belonged to the untuned model.
dt_best_train_y_predict = dt_best_params.predict(X_train_scaled)
dt_best_training_set_score = accuracy_score(y_train, dt_best_train_y_predict)

# Derive tn/fp/fn/tp from the TRAINING confusion matrix; without this the print
# below would show the counts left over from the validation cell.
dt_best_train_cm = confusion_matrix(y_train, dt_best_train_y_predict)
tn, fp, fn, tp = dt_best_train_cm.ravel()
print(dt_best_train_cm)
print(tn, fp, fn, tp)
print(classification_report(y_train, dt_best_train_y_predict))
print(dt_best_training_set_score)

In [None]:
# Print the tuned decision tree's training and validation accuracies next to each other
print(f'The accuracy on the training set is {dt_best_training_set_score}')
print(f'The accuracy on the validation set is {dt_best_validation_set_score}')

# Higher training than validation accuracy is reported as possible overfitting,
# every other case as possible underfitting.
print(
    'This model might be overfitting'
    if dt_best_training_set_score > dt_best_validation_set_score
    else 'This model might be underfitting'
)

In [None]:
# Scorer that makes cross_val_score report recall
recall_scorer = make_scorer(recall_score)

# Seeded, shuffled 10-fold splitter
kf = KFold(n_splits=10, shuffle=True, random_state=21)

# Recall of the tuned decision tree on each of the 10 folds of the scaled training data
dt_cv_results = cross_val_score(
    dt_best_params,
    X_train_scaled,
    y_train,
    scoring=recall_scorer,
    cv=kf,
)

print(dt_cv_results)

### After best params are defined, testing the final 3 models on test data

### Decision Tree

In [None]:
# Instantiate the final decision tree with the tuned hyperparameters.
# random_state is fixed because max_features=5 makes each split consider a random
# subset of features; the exported model should be reproducible.
dt_final = DecisionTreeClassifier(min_samples_split = 7, max_features= 5, max_depth= 12,
                                  random_state = 21)

# Fit the model with training data
dt_final = dt_final.fit(X_train_scaled, y_train)

# Predict the scaled test features (X_test_scaled) with the fitted model
dt_final_y_predict = dt_final.predict(X_test_scaled)

# Compare the test predictions against the ground truth (y_test).
# The test accuracy gets its own name so it no longer overwrites
# dt_best_validation_set_score from the validation cells.
tn, fp, fn, tp = confusion_matrix(y_test, dt_final_y_predict).ravel()
dt_final_test_set_score = accuracy_score(y_test, dt_final_y_predict)
print(confusion_matrix(y_test, dt_final_y_predict))
print(tn, fp, fn, tp)
print(classification_report(y_test, dt_final_y_predict))
print(recall_score(y_test, dt_final_y_predict))

### KNN

In [None]:
# Final KNN: build it with the tuned hyperparameters and fit on the scaled training data
knn_final = KNeighborsClassifier(
    n_neighbors=10,
    weights='distance',
    metric='manhattan',
).fit(X_train_scaled, y_train)

# Predict the scaled test features (X_test_scaled) with the fitted model
knn_final_y_predict = knn_final.predict(X_test_scaled)

# Compare the test predictions against the ground truth (y_test).
# NOTE: despite its name, knn_final_validation_set_score holds the TEST-set accuracy.
knn_final_test_cm = confusion_matrix(y_test, knn_final_y_predict)
tn, fp, fn, tp = knn_final_test_cm.ravel()
knn_final_validation_set_score = accuracy_score(y_test, knn_final_y_predict)

print(knn_final_test_cm)
print(tn, fp, fn, tp)
print(classification_report(y_test, knn_final_y_predict))
print(recall_score(y_test, knn_final_y_predict))

### XGB

In [None]:
# Final XGBoost: build it with the tuned hyperparameters and fit on the scaled training data
xgb_model_final = xgb.XGBClassifier(eta=0.5, max_depth=3).fit(X_train_scaled, y_train)

# Predict the scaled test features (X_test_scaled) with the fitted model
xgb_final_y_predict = xgb_model_final.predict(X_test_scaled)

# Compare the test predictions against the ground truth (y_test).
# NOTE: despite its name, xgb_final_validation_set_score holds the TEST-set accuracy.
xgb_final_test_cm = confusion_matrix(y_test, xgb_final_y_predict)
tn, fp, fn, tp = xgb_final_test_cm.ravel()
xgb_final_validation_set_score = accuracy_score(y_test, xgb_final_y_predict)

print(xgb_final_test_cm)
print(tn, fp, fn, tp)
print(classification_report(y_test, xgb_final_y_predict))
print(recall_score(y_test, xgb_final_y_predict))

### Exporting models into .pkl

* Saves the current models as a checkpoint, which can be used for deployment if a model is deemed serviceable
* Further tuning for features can be performed and compared to this
* Exporting .pkl files allows subsequent deployment endeavours
* Such as containerization with Docker
* A .pkl file can be used to generate a prediction when presented with feature data.

Creating the folder

In [None]:
# Create the model output folder. parents=True also creates any missing
# intermediate directories instead of raising FileNotFoundError, and
# exist_ok=True makes the cell safe to re-run.
Path("../bg10-Chan_Guan_Ling-162D/model").mkdir(parents=True, exist_ok=True)

Exporting models

In [None]:
# Persist the three final fitted models (XGBoost, KNN, decision tree) to .pkl files
# with joblib. The target folder must already exist before these calls run.
# NOTE(review): joblib/pickle files can execute code when loaded - only load
# these files from a trusted location.
joblib.dump(xgb_model_final, '../bg10-Chan_Guan_Ling-162D/model/xgb_model.pkl')
joblib.dump(knn_final, '../bg10-Chan_Guan_Ling-162D/model/knn_model.pkl')
joblib.dump(dt_final, '../bg10-Chan_Guan_Ling-162D/model/dt_model.pkl')