In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

import warnings 
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
df = pd.read_csv('heart.csv')
df.head()
df.shape

(303, 14)

In [3]:
for cols in df.columns:
    unique_vals = df[cols].unique()
    if len(unique_vals) <= 10:
        print("Feature ", cols, "has ",len(unique_vals), "unique vals are :  ",unique_vals)
    else:
        print("Feature ",cols,"has ",len(unique_vals))

Feature  age has  41
Feature  sex has  2 unique vals are :   [1 0]
Feature  cp has  4 unique vals are :   [3 2 1 0]
Feature  trestbps has  49
Feature  chol has  152
Feature  fbs has  2 unique vals are :   [1 0]
Feature  restecg has  3 unique vals are :   [0 1 2]
Feature  thalach has  91
Feature  exang has  2 unique vals are :   [0 1]
Feature  oldpeak has  40
Feature  slope has  3 unique vals are :   [0 2 1]
Feature  ca has  5 unique vals are :   [0 2 1 3 4]
Feature  thal has  4 unique vals are :   [1 2 3 0]
Feature  target has  2 unique vals are :   [1 0]


In [4]:
cat_features = ['cp','restecg','thal']

In [5]:
#data preprocess
df.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [6]:
#remove duplicate values
df.drop_duplicates(inplace=True)
df.shape

(302, 14)

In [7]:
#convert categorical values to numeric representation
df['cp'].replace({0:'A',1:'B',2:'C',3:'D'}, inplace = True)
df['restecg'].replace({0:'P',1:'Q',2:'R'}, inplace = True)
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,D,145,233,1,P,150,0,2.3,0,0,1,1
1,37,1,C,130,250,0,Q,187,0,3.5,0,0,2,1
2,41,0,B,130,204,0,P,172,0,1.4,2,0,2,1
3,56,1,B,120,236,0,Q,178,0,0.8,2,0,2,1
4,57,0,A,120,354,0,Q,163,1,0.6,2,0,2,1


In [8]:
df['thal'].value_counts()

2    165
3    117
1     18
0      2
Name: thal, dtype: int64

In [9]:
df.index[df['thal']==0].tolist()

[48, 281]

In [10]:
df.drop(index=[48,281],inplace=True)
df['thal'].value_counts()

2    165
3    117
1     18
Name: thal, dtype: int64

In [11]:
df['thal'].replace({1:'fixed defect',2:'reversible defect',3:'three'}, inplace = True)
df.head(20)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,D,145,233,1,P,150,0,2.3,0,0,fixed defect,1
1,37,1,C,130,250,0,Q,187,0,3.5,0,0,reversible defect,1
2,41,0,B,130,204,0,P,172,0,1.4,2,0,reversible defect,1
3,56,1,B,120,236,0,Q,178,0,0.8,2,0,reversible defect,1
4,57,0,A,120,354,0,Q,163,1,0.6,2,0,reversible defect,1
5,57,1,A,140,192,0,Q,148,0,0.4,1,0,fixed defect,1
6,56,0,B,140,294,0,P,153,0,1.3,1,0,reversible defect,1
7,44,1,B,120,263,0,Q,173,0,0.0,2,0,three,1
8,52,1,C,172,199,1,Q,162,0,0.5,2,0,three,1
9,57,1,C,150,168,0,Q,174,0,1.6,2,0,reversible defect,1


In [12]:
df = pd.get_dummies(data = df, columns = cat_features)
df.head()

Unnamed: 0,age,sex,trestbps,chol,fbs,thalach,exang,oldpeak,slope,ca,...,cp_A,cp_B,cp_C,cp_D,restecg_P,restecg_Q,restecg_R,thal_fixed defect,thal_reversible defect,thal_three
0,63,1,145,233,1,150,0,2.3,0,0,...,0,0,0,1,1,0,0,1,0,0
1,37,1,130,250,0,187,0,3.5,0,0,...,0,0,1,0,0,1,0,0,1,0
2,41,0,130,204,0,172,0,1.4,2,0,...,0,1,0,0,1,0,0,0,1,0
3,56,1,120,236,0,178,0,0.8,2,0,...,0,1,0,0,0,1,0,0,1,0
4,57,0,120,354,0,163,1,0.6,2,0,...,1,0,0,0,0,1,0,0,1,0


In [13]:
df.describe()

Unnamed: 0,age,sex,trestbps,chol,fbs,thalach,exang,oldpeak,slope,ca,...,cp_A,cp_B,cp_C,cp_D,restecg_P,restecg_Q,restecg_R,thal_fixed defect,thal_reversible defect,thal_three
count,300.0,300.0,300.0,300.0,300.0,300.0,300.0,300.0,300.0,300.0,...,300.0,300.0,300.0,300.0,300.0,300.0,300.0,300.0,300.0,300.0
mean,54.433333,0.683333,131.626667,246.743333,0.146667,149.663333,0.326667,1.046667,1.396667,0.723333,...,0.473333,0.166667,0.283333,0.076667,0.486667,0.5,0.013333,0.06,0.55,0.39
std,9.07672,0.465953,17.619557,51.837514,0.354364,22.889658,0.469778,1.16376,0.61692,1.008387,...,0.500123,0.373301,0.45137,0.266506,0.500657,0.500835,0.114889,0.237884,0.498325,0.488565
min,29.0,0.0,94.0,126.0,0.0,71.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,47.75,0.0,120.0,211.0,0.0,133.75,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,56.0,1.0,130.0,241.5,0.0,152.5,0.0,0.8,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,1.0,0.0
75%,61.0,1.0,140.0,275.0,0.0,166.0,1.0,1.6,2.0,1.0,...,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0
max,77.0,1.0,200.0,564.0,1.0,202.0,1.0,6.2,2.0,4.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [14]:
X = df.drop(columns = 'target')
y = df['target']


In [16]:
scale_vars = []

for cols in X.columns:
    if X[cols].max() > 1 :
        scale_vars.append(cols)
print(scale_vars)        


['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'slope', 'ca']


In [17]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

scaler = MinMaxScaler()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 20)
print(X_train.shape)
print(X_test.shape)

(240, 20)
(60, 20)


In [18]:
scaler.fit(X_train[scale_vars])
X_train[scale_vars] = scaler.transform(X_train[scale_vars])
X_test[scale_vars] = scaler.transform(X_test[scale_vars])

In [20]:
X_train.head()

Unnamed: 0,age,sex,trestbps,chol,fbs,thalach,exang,oldpeak,slope,ca,cp_A,cp_B,cp_C,cp_D,restecg_P,restecg_Q,restecg_R,thal_fixed defect,thal_reversible defect,thal_three
208,0.425532,1,0.245283,0.13164,0,0.519084,0,0.322581,0.5,0.75,0,0,1,0,0,1,0,0,0,1
92,0.489362,1,0.415094,0.212471,0,0.748092,0,0.0,1.0,1.0,0,0,1,0,0,1,0,0,1,0
234,0.87234,1,0.339623,0.441109,0,0.290076,0,0.387097,0.5,0.75,1,0,0,0,1,0,0,0,1,0
13,0.744681,1,0.150943,0.184758,0,0.557252,1,0.290323,0.5,0.0,0,0,0,1,1,0,0,0,1,0
23,0.680851,1,0.528302,0.258661,1,0.503817,1,0.16129,0.5,0.0,0,0,1,0,0,1,0,0,1,0


# Feature selection

In [21]:
featuresWithZeroVariance = []

for cols in X_train.columns:
    unique_vals = X_train[cols].unique()
    if len(unique_vals) == 1:
        featuresWithZeroVariance.append(cols)
print(featuresWithZeroVariance)

[]


In [22]:
#perason correlation coeffiecence



In [25]:
X_train = X_train.drop(columns = ['thal_reversible_defect','restecg_Q'])
x

KeyError: "['thal_reversible_defect'] not found in axis"