# Heart Disease

## Import all Python Libraries

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.formula.api import ols
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer
from scipy import stats
sns.set_theme()

## Read csv file into Pandas Dataframe

In [2]:
columns = ['age','sex','cp','trestbps','chol','fbs','restecg','thalach','exang','oldpeak','slope','ca','thal','num']
df = pd.read_csv('data.csv', names=columns)

In [3]:
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


## Clean Data and EDA

In [None]:
df.info()

Columns 'ca' and 'thal' have dtype object but appear to be numerical values from the third cell above.

In [None]:
df.describe()

In [None]:
df[df['thal'] == '?']

In [None]:
df[df['ca'] == '?']

We have two rows with unknown 'thal' values and four rows with unknown 'ca' values. These rows will be dropped for a cleaner dataset.

In [4]:
drop_rows = [*df[df['thal'] == '?'].index.values, *df[df['ca'] == '?'].index.values]

In [5]:
df.drop(drop_rows, axis=0, inplace=True)

In [6]:
df[['thal','ca']] = df[['thal','ca']].astype('float64')

In [None]:
sns.displot(df, x="age", hue='sex')

In [None]:
num_m = len(df[df['sex'] == 1.0]) # number of males
num_f = len(df[df['sex'] == 0.0]) # number of females
print('Number of males in study are {} out of {}'.format(num_m,len(df)))
print('Number of females in study are {} out of {}'.format(num_f,len(df)))

In [None]:
sns.countplot(x='sex', hue="num", data=df)

In [None]:
num_md = len(df[(df['num'] > 0) & (df['sex'] == 1.0)]) # males with heart disease
num_fd = len(df[(df['num'] > 0) & (df['sex'] == 0.0)]) # females with heart disease

In [None]:
print('Percentage of males in study with heart disease is {}%'.format(round(num_md/num_m*100,2)))
print('Percentage of females in study with heart disease is {}%'.format(round(num_fd/num_f*100,2)))

In [None]:
df.corr()

In [None]:
sns.displot(x="trestbps", hue='sex', data=df)

In [None]:
df[(df['trestbps'] > 0) & (df['sex'] == 1.0)].trestbps.plot(kind='hist')

In [None]:
df[(df['trestbps'] > 0) & (df['sex'] == 0.0)].trestbps.plot(kind='hist')

## Statistics and Transformations

In [None]:
stats = ols(formula='trestbps ~ age', data=df).fit()

In [None]:
stats.summary()

In [None]:
tmp = stats.params[1]*df['age'] + stats.params[0]

plt.scatter(df['age'], df['trestbps'])
plt.plot(df['age'], tmp, c='r')
plt.xlabel('age')
plt.ylabel('trestbps')
plt.title('trestbps vs. age');

In [None]:
pd.plotting.scatter_matrix(df, figsize=(20,16));

In [None]:
df['trestbps'].skew()
#df['trestbps'].kurtosis()

In [None]:
df['thalach'].skew()

In [None]:
df['chol'].skew()

In [None]:
df['oldpeak'].skew()

In [None]:
pt = PowerTransformer()
pt_fit = pt.fit_transform(df['thalach'].values.reshape(-1,1))

In [None]:
plt.hist(pt_fit)

In [None]:
plt.hist(df['thalach']);

In [None]:
plt.hist(df['trestbps']);

In [None]:
sci_tmp = stats.boxcox_normmax(df['trestbps'].values)

In [None]:
sci_tt = stats.boxcox(df['trestbps'].values, sci_tmp)

In [None]:
plt.hist(sci_tt);

In [None]:
stats.skew(sci_tt)

In [None]:
plt.hist(df['oldpeak']);

### Dummy Variables

In [7]:
data = pd.get_dummies(df, prefix=['slope','thal','ca'], columns=['slope','thal','ca'])

## Model without Transformations

In [8]:
X_train, X_test, y_train, y_test = train_test_split(data.drop('num',axis=1), data['num'], test_size=0.2, random_state=42)

In [9]:
neigh = KNeighborsClassifier()
parameters = {
    'n_neighbors':[2,3,4,5,6,7,8,9],
    'weights':('uniform','distance'),
    'algorithm':('auto', 'ball_tree', 'kd_tree', 'brute'),
    'leaf_size':[15,20,25,30,35,40,45],
    'p':[1,2,3],
}
clf = GridSearchCV(neigh, parameters, cv=5, n_jobs=-1, verbose=5)
clf.fit(X_train, y_train)

Fitting 5 folds for each of 1344 candidates, totalling 6720 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 1072 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done 3088 tasks      | elapsed:    9.7s
[Parallel(n_jobs=-1)]: Done 5680 tasks      | elapsed:   15.4s
[Parallel(n_jobs=-1)]: Done 6720 out of 6720 | elapsed:   18.5s finished


GridSearchCV(cv=5, estimator=KNeighborsClassifier(), n_jobs=-1,
             param_grid={'algorithm': ('auto', 'ball_tree', 'kd_tree', 'brute'),
                         'leaf_size': [15, 20, 25, 30, 35, 40, 45],
                         'n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9],
                         'p': [1, 2, 3], 'weights': ('uniform', 'distance')},
             verbose=5)

In [10]:
clf.best_params_

{'algorithm': 'auto',
 'leaf_size': 15,
 'n_neighbors': 6,
 'p': 1,
 'weights': 'uniform'}

In [11]:
clf.best_score_

0.49796099290780144

In [12]:
clf.score(X_test, y_test)

0.5666666666666667

In [37]:
df_tmp = data.copy()

In [31]:
df_tmp[df_tmp['num'] > 0] = 1

In [55]:
X_tr, X_t, y_tr, y_t = train_test_split(data2.drop('target',axis=1), data2['target'], test_size=0.2, random_state=42)

In [56]:
clf_2 = GridSearchCV(neigh, parameters, cv=5, n_jobs=-1, verbose=5)
clf_2.fit(X_tr, y_tr)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.0s


Fitting 5 folds for each of 1344 candidates, totalling 6720 fits


[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 896 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done 1904 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done 3200 tasks      | elapsed:   10.4s
[Parallel(n_jobs=-1)]: Done 4784 tasks      | elapsed:   15.6s
[Parallel(n_jobs=-1)]: Done 6656 tasks      | elapsed:   22.0s
[Parallel(n_jobs=-1)]: Done 6720 out of 6720 | elapsed:   22.1s finished


GridSearchCV(cv=5, estimator=KNeighborsClassifier(), n_jobs=-1,
             param_grid={'algorithm': ('auto', 'ball_tree', 'kd_tree', 'brute'),
                         'leaf_size': [15, 20, 25, 30, 35, 40, 45],
                         'n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9],
                         'p': [1, 2, 3], 'weights': ('uniform', 'distance')},
             verbose=5)

In [57]:
clf_2.best_params_

{'algorithm': 'auto',
 'leaf_size': 15,
 'n_neighbors': 5,
 'p': 1,
 'weights': 'distance'}

In [58]:
clf_2.best_score_

0.7107142857142856

In [46]:
clf_2.score(X_t,y_t)

1.0

In [40]:
df_tmp.loc[df_tmp['num'] > 0] = 1

In [47]:
df_tmp

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,...,slope_1.0,slope_2.0,slope_3.0,thal_3.0,thal_6.0,thal_7.0,ca_0.0,ca_1.0,ca_2.0,ca_3.0
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,...,0,0,1,0,1,0,1,0,0,0
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1,1,1,1,1,1,1,1,1,1
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1,1,1,1,1,1,1,1,1,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,...,0,0,1,1,0,0,1,0,0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,...,1,0,0,1,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1,1,1,1,1,1,1,1,1,1
298,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1,1,1,1,1,1,1,1,1,1
299,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1,1,1,1,1,1,1,1,1,1
300,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1,1,1,1,1,1,1,1,1,1


In [48]:
df2 = pd.read_csv('heart.csv')

In [50]:
data2 = pd.get_dummies(df2, prefix=['slope','thal','ca'], columns=['slope','thal','ca'])