<span style="color: yellow; font-size: 350%; font-weight: bold;">Dealing with Outliers</span>

In [49]:
import os
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.impute import SimpleImputer, KNNImputer

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline

#  for classification
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.ensemble import (
    RandomForestClassifier,
    RandomForestRegressor,
    GradientBoostingClassifier,
    AdaBoostClassifier,
)
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    classification_report,
    r2_score,
    mean_squared_error,
    mean_absolute_error,
)

# ignore warnings
import warnings

warnings.filterwarnings("ignore")

In [None]:
df = pd.read_csv("./data/heart_disease_uci_imputed.csv")
df.isnull().sum() 

id          0
age         0
sex         0
dataset     0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalch      0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
num         0
dtype: int64

In [64]:
df.head()

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


In [51]:
categorical_cols = ['thal', 'ca', 'slope', 'exang', 'restecg','fbs', 'cp', 'sex', 'num']
bool_cols = ['fbs', 'exang']
numeric_cols = ['oldpeak', 'thalch', 'chol', 'trestbps', 'age']


In [None]:
plt.figure(figsize=(20, 20))

colors = ["red", "green", "blue", "orange", "purple"]

for i, col in enumerate(numeric_cols):
    plt.subplot(3,2,i+1)
    sns.boxplot(x=df[col],color=colors[i])
    plt.title(col)

plt.show()

In [None]:
# fig = px.box(df, y=numeric_cols, points="all", title="Box Plots of Numeric Columns")
for col in numeric_cols:
    fig = px.box(df, y=col, points="all", title=f"Box Plot of {col}")
    fig.update_traces(boxpoints="outliers")  # show only outliers as points
    fig.show()

---

<span style="color: yellow; font-size: 350%; font-weight: bold;">Machine learning</span>

In [54]:
df.head()

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


In [55]:
df['num'].value_counts()

num
0    411
1    265
2    109
3    107
4     28
Name: count, dtype: int64

## The Target Column is `num which is the predicted attribute. We will use this column to predict the heart disease.
## The unique values in this column are: [0, 1, 2, 3, 4], which states that there are 5 types of heart diseases.

- `0= no heart disease`
- `1= mild heart disease`
- `2 moderate heart disease`
- `3 = severe heart disease`
- `4 = critical heart disease`

In [56]:
X = df.drop("num", axis=1)
y = df["num"]


# encode X category or object columns using separate label encoder and ensuring inverse transforming is preserved

encoders = {}
for col in X.columns:
    if X[col].dtype == "object" or X[col].dtype == "category":
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col])
        encoders[col] = le
    else:
        pass

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

Enlist all the models that you will use to predict the heart disease. These models should be classifiers for multi-class classification.
1. logistic regression
2. KNN
3. NB
4. SVM
5. Decision Tree
6. Random Forest
7. XGBoost
8. lightGBM
9. Adaboost
10. GradientboostClassifier

In [57]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import (
    RandomForestClassifier,
    AdaBoostClassifier,
    GradientBoostingClassifier,
)
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    classification_report,
    r2_score,
    mean_squared_error,
    mean_absolute_error,
    confusion_matrix,
)

In [58]:
# Create a list of models to evaluate
models = [
    ("Random Forest", RandomForestClassifier(random_state=42)),
    ("Gradient Boosting", GradientBoostingClassifier(random_state=42)),
    ("Support Vector Machine", SVC(random_state=42)),
    ("Logistic Regression", LogisticRegression(random_state=42)),
    ("K-Nearest Neighbors", KNeighborsClassifier()),
    ("Decision Tree", DecisionTreeClassifier(random_state=42)),
    ("Ada Boost", AdaBoostClassifier(random_state=42)),
    ("XG Boost", XGBClassifier(random_state=42)),
    ("Naive Bayes", GaussianNB()),
]

best_model = None
best_accuracy = 0.0

# iterate over the models and evalutate their performance

for name, model in models:
    pipeline = Pipeline([("model", model)])

    # perform cross validation

    scores = cross_val_score(pipeline, X_train, y_train, cv=5)

    # calculate mean accuracy

    mean_accuracy = scores.mean()

    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    # Print the performance metrics
    print("Model:", name)
    print("Cross-validation Accuracy:", mean_accuracy)
    print("Test Accuracy:", accuracy)
    print()
    # Check if the current mogel has the best accuracy
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = pipeline


print("best model ", best_model)

Model: Random Forest
Cross-validation Accuracy: 0.6739462209302325
Test Accuracy: 0.7028985507246377

Model: Gradient Boosting
Cross-validation Accuracy: 0.6692223837209302
Test Accuracy: 0.6702898550724637

Model: Support Vector Machine
Cross-validation Accuracy: 0.5822795542635658
Test Accuracy: 0.5905797101449275

Model: Logistic Regression
Cross-validation Accuracy: 0.5108769379844962
Test Accuracy: 0.5217391304347826

Model: K-Nearest Neighbors
Cross-validation Accuracy: 0.5807776162790698
Test Accuracy: 0.5869565217391305

Model: Decision Tree
Cross-validation Accuracy: 0.6180232558139535
Test Accuracy: 0.6014492753623188

Model: Ada Boost
Cross-validation Accuracy: 0.5916666666666666
Test Accuracy: 0.5434782608695652

Model: XG Boost
Cross-validation Accuracy: 0.670796996124031
Test Accuracy: 0.6847826086956522

Model: Naive Bayes
Cross-validation Accuracy: 0.5683018410852714
Test Accuracy: 0.5760869565217391

best model  Pipeline(steps=[('model', RandomForestClassifier(random_s

In [61]:
import pickle

# pickle.dump(best_model, open("best_model_for_heart_diseas_prediction.pkl", "wb"))

Assignment Alert: How can you improve the accuracy of this or any other model?
Hints:
1. Feature Engineering
2. Feature Selection instead of 14 use 12, 10, or 8, or 6, features.
3. Data Pre-processing (Scaling and Normalization).
4. Hyperparameter Tuning.
    1. Ensemble Learning (Stacking, Bagging, Boosting) increase their depth.
5. Use different models.