<a href="https://colab.research.google.com/github/empyreanlee/ML_med/blob/update/Heart_Disease_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
#from imblearn.over_sampling import RandomOverSampler
from sklearn.impute import SimpleImputer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import GridSearchCV

In [None]:
# Column labels for the data file; it is read with header=None, so the names
# are kept here and attached to the frame separately.
dataset_cols = [
    "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg",
    "thalach", "exang", "oldpeak", "slope", "ca", "thal", "num",
]
# header=None keeps the first record as data instead of using it as column labels.
df = pd.read_csv("processed.cleveland.data", header=None)

In [None]:
# Attach the descriptive column names to the frame that was read with header=None.
df.columns = dataset_cols
df.head()

In [None]:
# Show the dtype pandas inferred for each column.
df.dtypes

Identify columns with missing values

In [None]:
# List the distinct values in 'ca' to spot placeholder entries such as '?'.
df['ca'].unique()

In [None]:
# List the distinct values in 'thal' to spot placeholder entries such as '?'.
df['thal'].unique()

In [None]:
# Uncomment to count the rows that have a missing value ('?') in either of the two columns
#len(df.loc[(df['ca'] == '?') | (df['thal'] == '?')])
# Locate the rows with missing values
df.loc[(df['ca'] == '?') | (df['thal'] == '?')]

In [None]:
# As there are only 6 such rows, drop every row with a '?' in 'ca' or 'thal'.
has_placeholder = (df['ca'] == '?') | (df['thal'] == '?')
df_clear = df.loc[~has_placeholder]

In [None]:
# Number of rows left after the rows with placeholders were removed.
df_clear.shape[0]

In [None]:
# Preview the first rows of the cleaned frame.
df_clear.head()

Categorical columns such as slope, cp, thal and restecg need to be inspected, as they are likely to have more than two categories.

In [None]:
# Distinct 'thal' values that remain once the '?' rows are gone.
df_clear['thal'].unique()

One-hot encoding is applied to convert a column of categorical values into multiple columns of binary values.

I'll use pandas `get_dummies` to do the encoding. Chest pain (`cp`) has 4 categories, represented by 1-4 — 1: typical angina, 2: atypical angina, 3: non-anginal pain, 4: asymptomatic. `get_dummies` splits `cp` into 4 columns, one per category, where 1 means the patient has that chest-pain type and 0 means they do not.

In [None]:
# Preview one-hot encoding applied to 'cp' alone; the result is displayed, not stored.
pd.get_dummies(df_clear, columns=['cp']).head()

In [None]:
# One-hot encode all multi-category columns in a single pass.
categorical_cols = ['cp', 'restecg', 'slope', 'thal']
X = pd.get_dummies(df_clear, columns=categorical_cols)
X.head()

In [None]:
# Distinct values of the target column 'num'.
X['num'].unique()

For the target, i.e. "num", I've set any value > 0 to 1 so that the target is binary (0 or 1).

In [None]:
# Collapse every positive 'num' value to 1 so the target only takes the values 0 and 1.
num_not_zero = X['num'].gt(0)
X.loc[num_not_zero, 'num'] = 1
X['num'].unique()

Moving the column "num" to the last column position [-1]

In [None]:
# Take the target out of the frame and append it again so it becomes the last
# column; scale_dataset reads the label from the last column.
column_move = 'num'
column = X.pop(column_move)
X[column.name] = column
X.head()

Some columns still contain non-numeric (string) values. Passing the data through `SimpleImputer` converts every column to a numeric type and fills any missing values with the column mean.

In [None]:
# Mean imputation is fitted on the whole frame, target column 'num' included.
imputer = SimpleImputer(strategy='mean')
imputed_values = imputer.fit_transform(X)
# Wrap the imputer output back into a DataFrame carrying the original column labels.
X = pd.DataFrame(imputed_values, columns=X.columns)

In [None]:
# Check the column dtypes after the imputation step.
X.dtypes

In [None]:
# Shuffle once with a fixed seed so the 60/20/20 split (and every metric computed
# from it) is reproducible across kernel restarts.
shuffled = X.sample(frac=1, random_state=42)
train_end, valid_end = int(0.6 * len(shuffled)), int(0.8 * len(shuffled))
# Positional slicing replaces np.split, which goes through the deprecated
# DataFrame.swapaxes when handed a DataFrame.
train = shuffled.iloc[:train_end]
valid = shuffled.iloc[train_end:valid_end]
test = shuffled.iloc[valid_end:]

In [None]:
def scale_dataset(dataframe):
  dataframe = pd.DataFrame(dataframe)
  X = dataframe[dataframe.columns[:-1]].values
  y = dataframe[dataframe.columns[-1]].values

  scaler = StandardScaler()
  X = scaler.fit_transform(X)

  data = np.hstack((X, np.reshape(y, (-1, 1))))

  return data, X, y

In [None]:
train, X_train, y_train = scale_dataset(train)
valid, X_valid, y_valid = scale_dataset(valid)
test, X_test, y_test = scale_dataset(test)

## Logistic Regression

In [None]:
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression

In [None]:
# Fit a logistic-regression model with default settings on the training split.
log_model = LogisticRegression().fit(X_train, y_train)

In [None]:
# Predict on the test split and print the classification report.
y_pred = log_model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# Plot the confusion matrix for the predictions currently held in y_pred.
ConfusionMatrixDisplay.from_predictions(
    y_true=y_test,
    y_pred=y_pred,
    display_labels=["Does not have HD", "Has HD"],
)

## Support Vector Machine

In [None]:
from sklearn.svm import SVC

In [None]:
# Train an SVM with default hyperparameters and a fixed seed on the training split.
svm_model = SVC(random_state=42).fit(X_train, y_train)
svm_model

In [None]:
# Predict on the test split with the SVM and print the classification report.
y_pred = svm_model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

Optimisation

In [None]:
# Candidate hyperparameters for the RBF-kernel SVM. C must be strictly positive,
# so the grid starts at 0.1: a value of 0 is rejected by SVC and every fit that
# uses it fails.
param_grid = {
    'C' : [0.1, 1, 10, 100],
    'gamma' : [1, 0.1, 0.01, 0.001 ],
    'kernel' : ['rbf']
}

svm = SVC(random_state=42)

# 5-fold cross-validation on the training split, ranked by accuracy.
grid_search = GridSearchCV(estimator = svm, param_grid = param_grid, cv = 5, scoring = 'accuracy')

grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print("Best hyperparameters:", best_params)



In [None]:
# Build the tuned SVM from whatever the grid search selected rather than from
# values copied out of one earlier run, so this cell stays in sync with the
# search when the data split or the grid changes.
svm_model = SVC(random_state=42, **grid_search.best_params_)
svm_model.fit(X_train, y_train)

In [None]:
# Predict on the test split with the refitted SVM and print the classification report.
y_pred = svm_model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))