In [22]:
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
from matplotlib import pyplot as plt
from sklearn.model_selection import cross_val_score
from collections import Counter

In [23]:
raw_df = pd.read_csv('heart.csv')

In [24]:
raw_df.shape

(918, 12)

<br>Dataset Attributes
<br>Age : age of the patient [years]
<br>Sex : sex of the patient [M: Male, F: Female]
<br>ChestPainType : chest pain type [TA: Typical Angina, ATA: Atypical Angina, NAP: Non-Anginal Pain, ASY: Asymptomatic]
<br>RestingBP : resting blood pressure [mm Hg]
<br>Cholesterol : serum cholesterol [mm/dl]
<br>FastingBS : fasting blood sugar [1: if FastingBS > 120 mg/dl, 0: otherwise]
<br>RestingECG : resting electrocardiogram results [Normal: Normal, ST: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV), LVH: showing probable or definite left ventricular hypertrophy by Estes' criteria]
<br>MaxHR : maximum heart rate achieved [Numeric value between 60 and 202]
<br>ExerciseAngina : exercise-induced angina [Y: Yes, N: No]
<br>Oldpeak : oldpeak = ST [Numeric value measured in depression]
<br>ST_Slope : the slope of the peak exercise ST segment [Up: upsloping, Flat: flat, Down: downsloping]
<br>HeartDisease : output class [1: heart disease, 0: Normal]

In [25]:
def IQR_method (df,n,features):
    """
    Takes a dataframe and returns an index list corresponding to the observations 
    containing more than n outliers according to the Tukey IQR method.
    """
    outlier_list = []
    
    for column in features:
        # 1st quartile (25%)
        Q1 = np.percentile(df[column], 25)
        # 3rd quartile (75%)
        Q3 = np.percentile(df[column],75)
        # Interquartile range (IQR)
        IQR = Q3 - Q1
        # outlier step
        outlier_step = 1.5 * IQR
        # Determining a list of indices of outliers
        outlier_list_column = df[(df[column] < Q1 - outlier_step) | (df[column] > Q3 + outlier_step )].index
        # appending the list of outliers 
        outlier_list.extend(outlier_list_column)
        
    # selecting observations containing more than x outliers
    outlier_list = Counter(outlier_list)        
    multiple_outliers = list( k for k, v in outlier_list.items() if v > n )
    
    # Calculate the number of records below and above lower and above bound value respectively
    out1 = df[df[column] < Q1 - outlier_step]
    out2 = df[df[column] > Q3 + outlier_step]
    
    print('Total number of deleted outliers is:', out1.shape[0]+out2.shape[0])
    
    return multiple_outliers

In [26]:
numerical_columns = list(raw_df.loc[:,['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']])
categorical_columns = list(raw_df.loc[:,['Sex', 'ChestPainType', 'FastingBS', 'RestingECG', 'ExerciseAngina', 'ST_Slope']])

In [27]:
# detecting outliers
Outliers_IQR = IQR_method(raw_df,1,numerical_columns)

# dropping outliers
df = raw_df.drop(Outliers_IQR, axis = 0).reset_index(drop=True)

Total number of deleted outliers is: 16


In [28]:
df = pd.get_dummies(df, drop_first=True)


In [29]:
X = df.drop('HeartDisease', axis=1)
y = df['HeartDisease']

In [30]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size = 0.3, random_state = 42)

In [31]:
from sklearn.preprocessing import StandardScaler

# Creating function for scaling
def Standard_Scaler (df, col_names):
    features = df[col_names]
    scaler = StandardScaler().fit(features.values)
    features = scaler.transform(features.values)
    df[col_names] = features
    
    return df

In [32]:
col_names = numerical_columns
X_train = Standard_Scaler (X_train, col_names)
X_test = Standard_Scaler (X_test, col_names)

In [33]:
x_heart = np.concatenate((X_train, X_test), axis = 0)
y_heart = np.concatenate((y_train, y_test), axis = 0)

In [34]:
x_heart.shape

(905, 15)

In [35]:
y_heart.shape

(905,)

In [36]:
import pickle
with open('heart.pkl', mode = 'wb') as f:
  pickle.dump([x_heart, y_heart], f)