# Importing required libraries

In [1]:
# for preprocessing
import numpy as np
import pandas as pd

# Reading data from files

In [2]:
# Load the four source files. They carry no header row, so pandas labels the
# columns with integers.
data, data2, data3, data4 = (
    pd.read_csv(csv_name, header=None)
    for csv_name in ("cleveland.csv", "hungarian.csv", "switzerland.csv", "va.csv")
)

# Preprocessing

## Appending all the data read from files

## Counting total number of rows in all the files

In [3]:
# Print the shape of every source frame, then the combined number of rows.
row_counts = []
for frame in (data, data2, data3, data4):
    print(frame.shape)
    row_counts.append(frame.shape[0])

rows = sum(row_counts)
print("Total Rows", rows)

(303, 14)
(294, 14)
(123, 14)
(200, 14)
Total Rows 920


In [4]:
# Stack the four source frames into a single dataset.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is
# the supported replacement. As with the chained appends, each file keeps its
# own row labels (ignore_index stays at its default of False).
# The former bare `dataset.shape` line was dropped: only a cell's last
# expression is displayed, so it never produced output.
dataset = pd.concat([data, data2, data3, data4])
dataset.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,63.0,1.0,1.0,145,233,1,2,150,0,2.3,3,0.0,6.0,0
1,67.0,1.0,4.0,160,286,0,2,108,1,1.5,2,3.0,3.0,2
2,67.0,1.0,4.0,120,229,0,2,129,1,2.6,2,2.0,7.0,1
3,37.0,1.0,3.0,130,250,0,0,187,0,3.5,3,0.0,3.0,0
4,41.0,0.0,2.0,130,204,0,2,172,0,1.4,1,0.0,3.0,0


## Deleting previous dataframes to free memory

In [5]:
# The per-file frames are no longer needed once `dataset` exists; drop the names.
del data, data2, data3, data4

## Assigning column names to each column

In [6]:
# Map the positional column labels 0..13 to descriptive attribute names.
column_names = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
                'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target']
dataset = dataset.rename(columns=dict(enumerate(column_names)))
dataset.columns

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target'],
      dtype='object')

## Getting data types of each column

In [7]:
dataset.dtypes

age         float64
sex         float64
cp          float64
trestbps     object
chol         object
fbs          object
restecg      object
thalach      object
exang        object
oldpeak      object
slope        object
ca           object
thal         object
target        int64
dtype: object

## Description of each column

In [8]:
# Data dictionary for the 14 columns, printed one entry per line.
column_descriptions = (
    "age - age in years",
    "sex - (1 = male; 0 = female)",
    "cp - chest pain type",
    "trestbps - resting blood pressure (in mm Hg on admission to the hospital)",
    "chol - serum cholestoral in mg/dl ",
    "fbs - fasting blood sugar 120 mg/dl (1 = true; 0 = false)",
    "restecg - resting electrocardiographic results ",
    "thalach - maximum heart rate achieved",
    "exang - exercise induced angina (1 = yes; 0 = no) ",
    "oldpeak - ST depression induced by exercise relative to rest",
    "slope - the slope of the peak exercise ST segment ",
    "ca - number of major vessels (0-3) colored by flourosopy ",
    "thal - 3 = normal; 6 = fixed defect; 7 = reversable defect",
    "target - (0 means absent. 1, 2, 3, 4 mean present)",
)
for description in column_descriptions:
    print(description)


age - age in years
sex - (1 = male; 0 = female)
cp - chest pain type
trestbps - resting blood pressure (in mm Hg on admission to the hospital)
chol - serum cholestoral in mg/dl 
fbs - fasting blood sugar 120 mg/dl (1 = true; 0 = false)
restecg - resting electrocardiographic results 
thalach - maximum heart rate achieved
exang - exercise induced angina (1 = yes; 0 = no) 
oldpeak - ST depression induced by exercise relative to rest
slope - the slope of the peak exercise ST segment 
ca - number of major vessels (0-3) colored by flourosopy 
thal - 3 = normal; 6 = fixed defect; 7 = reversable defect
target - (0 means absent. 1, 2, 3, 4 mean present)


## Replacing missing values

### Missing 'resting blood pressure' attribute

In [9]:
dataset.loc[dataset['trestbps'] == "?"].shape[0]

59

#### Finding the mode of column

In [10]:
# Most frequent resting blood pressure among the rows where it was recorded.
# NOTE(review): the recorded result is the string '120', so at least part of
# this object column holds strings here; if other rows hold numbers, 120 and
# '120' would be counted as different values - confirm.
known_trestbps = dataset.loc[dataset['trestbps'] != "?", 'trestbps']
mode_of_trestbps = known_trestbps.mode()[0]
mode_of_trestbps

'120'

#### Replacing the cells containing '?' with mode of the column

In [11]:
# Fill the "?" placeholders with the column mode.
# The result is assigned back instead of calling replace(inplace=True) on
# `dataset.trestbps`: that form mutates an intermediate Series, which triggers a
# chained-assignment FutureWarning in pandas 2.x and leaves `dataset` untouched
# once Copy-on-Write is active.
dataset["trestbps"] = dataset["trestbps"].replace({"?": mode_of_trestbps})

In [12]:
dataset.loc[dataset['trestbps'] == "?"].shape

(0, 14)

In [13]:
# Parse the column as numbers, downcast to the smallest float dtype that fits.
trestbps_numeric = pd.to_numeric(dataset["trestbps"], downcast="float")
dataset['trestbps'] = trestbps_numeric

In [14]:
dataset.dtypes

age         float64
sex         float64
cp          float64
trestbps    float32
chol         object
fbs          object
restecg      object
thalach      object
exang        object
oldpeak      object
slope        object
ca           object
thal         object
target        int64
dtype: object

#### Cholesterol missing values

In [15]:
dataset.loc[dataset['chol'] == '?'].shape[0]

30

In [16]:
# Most frequent cholesterol value among the rows where it was recorded.
# NOTE(review): the recorded mode is 0. A serum cholesterol of 0 mg/dl is not a
# plausible reading, so 0 is presumably itself a missing-value marker in part
# of the data - confirm before using it as the fill value.
known_chol = dataset.loc[dataset['chol'] != "?", 'chol']
mode_of_cholesterol = known_chol.mode()[0]
mode_of_cholesterol

0

In [17]:
# Fill the "?" placeholders with the column mode.
# Assigned back rather than replace(inplace=True) on `dataset.chol`, which
# mutates an intermediate Series: FutureWarning in pandas 2.x, no effect on
# `dataset` under Copy-on-Write.
dataset["chol"] = dataset["chol"].replace({"?": mode_of_cholesterol})

In [18]:
dataset.loc[dataset['chol'] == "?"].shape

(0, 14)

In [19]:
# Parse cholesterol as numbers (smallest fitting float dtype), then re-check dtypes.
chol_numeric = pd.to_numeric(dataset["chol"], downcast="float")
dataset['chol'] = chol_numeric
dataset.dtypes

age         float64
sex         float64
cp          float64
trestbps    float32
chol        float32
fbs          object
restecg      object
thalach      object
exang        object
oldpeak      object
slope        object
ca           object
thal         object
target        int64
dtype: object

#### "Fasting Blood Sugar" Missing values

In [20]:
dataset.loc[dataset['fbs'] == "?"].shape

(90, 14)

In [21]:
dataset.loc[dataset['fbs'] == -1].shape

(0, 14)

In [22]:
# Swap the "?" placeholders for the numeric sentinel -1 so the column can be
# parsed as numbers; the mean of the real values is filled in afterwards.
# Assigned back rather than replace(inplace=True) on `dataset.fbs`, which
# mutates an intermediate Series: FutureWarning in pandas 2.x, no effect on
# `dataset` under Copy-on-Write.
dataset["fbs"] = dataset["fbs"].replace({"?": -1})
dataset.loc[dataset['fbs'] == -1].shape

(90, 14)

In [23]:
# Parse fbs as numbers, then average only the rows that are not the -1 sentinel.
fbs_numeric = pd.to_numeric(dataset["fbs"], downcast="float")
dataset["fbs"] = fbs_numeric
mean_of_fasting = dataset.loc[dataset['fbs'] != -1, 'fbs'].mean()
mean_of_fasting

0.16626505553722382

In [24]:
# Replace the -1 sentinel with the mean of the recorded values.
# Assigned back rather than replace(inplace=True) on `dataset.fbs`, which
# mutates an intermediate Series: FutureWarning in pandas 2.x, no effect on
# `dataset` under Copy-on-Write.
# NOTE(review): fbs is described as a 1/0 flag, so filling with the mean
# introduces a fractional value that is neither category - confirm that this
# is intended rather than a mode fill.
dataset["fbs"] = dataset["fbs"].replace({-1: mean_of_fasting})
dataset.loc[dataset['fbs'] == -1].shape

(0, 14)

In [25]:
dataset.dtypes

age         float64
sex         float64
cp          float64
trestbps    float32
chol        float32
fbs         float32
restecg      object
thalach      object
exang        object
oldpeak      object
slope        object
ca           object
thal         object
target        int64
dtype: object

#### "ECG Results" Missing values

In [26]:
dataset.loc[dataset["restecg"] == "?"].shape

(2, 14)

In [27]:
# Most common resting-ECG code among the rows where it was recorded.
known_restecg = dataset.loc[dataset["restecg"] != "?", "restecg"]
mode_of_ecg_results = known_restecg.mode()[0]
mode_of_ecg_results

'0'

In [28]:
# Fill the "?" placeholders with the column mode.
# Assigned back rather than replace(inplace=True) on `dataset.restecg`, which
# mutates an intermediate Series: FutureWarning in pandas 2.x, no effect on
# `dataset` under Copy-on-Write.
dataset["restecg"] = dataset["restecg"].replace({"?": mode_of_ecg_results})

In [29]:
dataset.loc[dataset["restecg"] == "?"].shape

(0, 14)

In [30]:
# Parse restecg as numbers (smallest fitting float dtype), then re-check dtypes.
restecg_numeric = pd.to_numeric(dataset["restecg"], downcast="float")
dataset["restecg"] = restecg_numeric
dataset.dtypes

age         float64
sex         float64
cp          float64
trestbps    float32
chol        float32
fbs         float32
restecg     float32
thalach      object
exang        object
oldpeak      object
slope        object
ca           object
thal         object
target        int64
dtype: object

#### "Maximum heart rate" missing data

In [31]:
dataset.loc[dataset["thalach"] == "?"].shape

(55, 14)

In [32]:
# Most frequent maximum heart rate among the rows where it was recorded.
known_thalach = dataset.loc[dataset["thalach"] != "?", "thalach"]
mode_of_thalach = known_thalach.mode()[0]
mode_of_thalach

'150'

In [33]:
# Fill the "?" placeholders with the column mode.
# Assigned back rather than replace(inplace=True) on `dataset.thalach`, which
# mutates an intermediate Series: FutureWarning in pandas 2.x, no effect on
# `dataset` under Copy-on-Write.
dataset["thalach"] = dataset["thalach"].replace({"?": mode_of_thalach})

In [34]:
dataset.loc[dataset["thalach"] == "?"].shape

(0, 14)

In [35]:
# Parse thalach as numbers (smallest fitting float dtype), then re-check dtypes.
thalach_numeric = pd.to_numeric(dataset["thalach"], downcast="float")
dataset["thalach"] = thalach_numeric
dataset.dtypes

age         float64
sex         float64
cp          float64
trestbps    float32
chol        float32
fbs         float32
restecg     float32
thalach     float32
exang        object
oldpeak      object
slope        object
ca           object
thal         object
target        int64
dtype: object

#### "Exercise induced angina" missing values

In [36]:
dataset.loc[dataset["exang"] == "?"].shape

(55, 14)

In [37]:
# Most common exercise-induced-angina flag among the rows where it was recorded.
known_exang = dataset.loc[dataset["exang"] != "?", "exang"]
mode_of_exang = known_exang.mode()[0]
mode_of_exang

'0'

In [38]:
# Fill the "?" placeholders with the column mode.
# Assigned back rather than replace(inplace=True) on `dataset.exang`, which
# mutates an intermediate Series: FutureWarning in pandas 2.x, no effect on
# `dataset` under Copy-on-Write.
dataset["exang"] = dataset["exang"].replace({"?": mode_of_exang})

In [39]:
dataset.loc[dataset["exang"] == "?"].shape

(0, 14)

In [40]:
# Parse exang as numbers (smallest fitting float dtype), then re-check dtypes.
exang_numeric = pd.to_numeric(dataset["exang"], downcast="float")
dataset["exang"] = exang_numeric
dataset.dtypes

age         float64
sex         float64
cp          float64
trestbps    float32
chol        float32
fbs         float32
restecg     float32
thalach     float32
exang       float32
oldpeak      object
slope        object
ca           object
thal         object
target        int64
dtype: object

#### "Slope" missing values

In [41]:
dataset.loc[dataset["slope"] == "?"].shape

(309, 14)

In [42]:
# Most common ST-segment slope code among the rows where it was recorded.
known_slope = dataset.loc[dataset["slope"] != "?", "slope"]
mode_of_slope = known_slope.mode()[0]
mode_of_slope

'2'

In [43]:
# Fill the "?" placeholders with the column mode.
# Assigned back rather than replace(inplace=True) on `dataset.slope`, which
# mutates an intermediate Series: FutureWarning in pandas 2.x, no effect on
# `dataset` under Copy-on-Write.
dataset["slope"] = dataset["slope"].replace({"?": mode_of_slope})

In [44]:
dataset.loc[dataset["slope"] == "?"].shape

(0, 14)

In [45]:
# Parse slope as numbers (smallest fitting float dtype), then re-check dtypes.
slope_numeric = pd.to_numeric(dataset["slope"], downcast="float")
dataset["slope"] = slope_numeric
dataset.dtypes

age         float64
sex         float64
cp          float64
trestbps    float32
chol        float32
fbs         float32
restecg     float32
thalach     float32
exang       float32
oldpeak      object
slope       float32
ca           object
thal         object
target        int64
dtype: object

#### "ca" missing data

In [46]:
dataset.loc[dataset["ca"] == "?"].shape

(611, 14)

In [47]:
dataset.loc[dataset["ca"] == -1 ].shape

(0, 14)

In [48]:
# Swap the "?" placeholders for the numeric sentinel -1 so the column can be
# parsed as numbers; the mean of the real values is filled in afterwards.
# Assigned back rather than replace(inplace=True) on `dataset.ca`, which
# mutates an intermediate Series: FutureWarning in pandas 2.x, no effect on
# `dataset` under Copy-on-Write.
dataset["ca"] = dataset["ca"].replace({"?": -1})

In [49]:
# Parse ca as numbers, then average only the rows that are not the -1 sentinel.
ca_numeric = pd.to_numeric(dataset["ca"], downcast="float")
dataset["ca"] = ca_numeric
mean_of_ca = dataset.loc[dataset["ca"] != -1, "ca"].mean()
mean_of_ca

0.6763753890991211

In [50]:
# Replace the -1 sentinel with the mean of the recorded values.
# Assigned back rather than replace(inplace=True) on `dataset.ca`, which
# mutates an intermediate Series: FutureWarning in pandas 2.x, no effect on
# `dataset` under Copy-on-Write.
# NOTE(review): ca is described as a count of vessels (0-3) and was missing in
# 611 rows, so the mean fill puts one fractional value into most of the
# column - confirm this is acceptable for the downstream model.
dataset["ca"] = dataset["ca"].replace({-1: mean_of_ca})
dataset.loc[dataset["ca"] == -1 ].shape

(0, 14)

In [51]:
dataset.dtypes

age         float64
sex         float64
cp          float64
trestbps    float32
chol        float32
fbs         float32
restecg     float32
thalach     float32
exang       float32
oldpeak      object
slope       float32
ca          float32
thal         object
target        int64
dtype: object

#### "old peak" missing data

In [52]:
dataset.loc[dataset["oldpeak"] == "?"].shape

(62, 14)

In [53]:
dataset.loc[dataset["oldpeak"] == -1 ].shape

(0, 14)

In [54]:
# Swap the "?" placeholders for the numeric sentinel -1 so the column can be
# parsed as numbers; the mean of the real values is filled in afterwards.
# Assigned back rather than replace(inplace=True) on `dataset.oldpeak`, which
# mutates an intermediate Series: FutureWarning in pandas 2.x, no effect on
# `dataset` under Copy-on-Write.
# NOTE(review): the describe() output in this notebook reports a minimum of
# -2.6 for oldpeak, so -1 may also be a genuine reading. A string "-1" would
# not match the integer comparison below, yet would be treated as missing after
# numeric conversion - verify against the raw files.
dataset["oldpeak"] = dataset["oldpeak"].replace({"?": -1})
dataset.loc[dataset["oldpeak"] == -1 ].shape

(62, 14)

In [55]:
# Parse oldpeak as numbers, then average only the rows that are not the -1 sentinel.
# NOTE(review): any genuine -1 reading is excluded from this mean as well - confirm
# none exist in the raw files.
oldpeak_numeric = pd.to_numeric(dataset["oldpeak"], downcast="float")
dataset["oldpeak"] = oldpeak_numeric
mean_of_oldpeak = dataset.loc[dataset["oldpeak"] != -1, "oldpeak"].mean()
mean_of_oldpeak

0.8831773996353149

In [56]:
# Replace the -1 sentinel with the mean of the recorded values.
# Assigned back rather than replace(inplace=True) on `dataset.oldpeak`, which
# mutates an intermediate Series: FutureWarning in pandas 2.x, no effect on
# `dataset` under Copy-on-Write.
# NOTE(review): oldpeak takes negative values (describe() shows a minimum of
# -2.6), so a genuine -1 reading would be overwritten here along with the
# sentinel rows - verify against the raw files.
dataset["oldpeak"] = dataset["oldpeak"].replace({-1: mean_of_oldpeak})
dataset.loc[dataset["oldpeak"] == -1 ].shape

(0, 14)

In [57]:
dataset.dtypes

age         float64
sex         float64
cp          float64
trestbps    float32
chol        float32
fbs         float32
restecg     float32
thalach     float32
exang       float32
oldpeak     float32
slope       float32
ca          float32
thal         object
target        int64
dtype: object

#### "thal" missing data

In [58]:
dataset.loc[dataset["thal"] == "?"].shape

(486, 14)

In [59]:
dataset.loc[dataset["thal"] == -1 ].shape

(0, 14)

In [60]:
# Swap the "?" placeholders for the numeric sentinel -1 so the column can be
# parsed as numbers; the mean of the real values is filled in afterwards.
# Assigned back rather than replace(inplace=True) on `dataset.thal`, which
# mutates an intermediate Series: FutureWarning in pandas 2.x, no effect on
# `dataset` under Copy-on-Write.
dataset["thal"] = dataset["thal"].replace({"?": -1})
dataset.loc[dataset["thal"] == -1 ].shape

(486, 14)

In [61]:
# Parse thal as numbers, then average only the rows that are not the -1 sentinel.
thal_numeric = pd.to_numeric(dataset["thal"], downcast="float")
dataset["thal"] = thal_numeric
mean_of_thal = dataset.loc[dataset["thal"] != -1, "thal"].mean()
mean_of_thal

5.087557792663574

In [62]:
# Replace the -1 sentinel with the mean of the recorded values.
# Assigned back rather than replace(inplace=True) on `dataset.thal`, which
# mutates an intermediate Series: FutureWarning in pandas 2.x, no effect on
# `dataset` under Copy-on-Write.
# NOTE(review): thal is a code (3 = normal, 6 = fixed defect, 7 = reversable
# defect), so the mean fill gives 486 rows a value that matches no category -
# confirm this is intended rather than a mode fill or a separate
# "unknown" code.
dataset["thal"] = dataset["thal"].replace({-1: mean_of_thal})
dataset.loc[dataset["thal"] == -1 ].shape

(0, 14)

In [63]:
dataset.dtypes

age         float64
sex         float64
cp          float64
trestbps    float32
chol        float32
fbs         float32
restecg     float32
thalach     float32
exang       float32
oldpeak     float32
slope       float32
ca          float32
thal        float32
target        int64
dtype: object

#### Change "sex" from Float to Int

In [64]:
# Store sex in the smallest integer dtype that can hold its values.
sex_as_int = pd.to_numeric(dataset["sex"], downcast="integer")
dataset["sex"] = sex_as_int

#### Change "Age" from Float to Int

In [65]:
# Store age in the smallest integer dtype that can hold its values.
age_as_int = pd.to_numeric(dataset["age"], downcast="integer")
dataset["age"] = age_as_int

### Change values of "Target" from 0-4 to 0 and 1

In [66]:
len(dataset.loc[dataset.target > 1])

244

In [67]:
# Collapse the disease grades 2, 3 and 4 into 1, leaving a 0/1 target
# (0 = absent, 1 = present).
# Assigned back rather than replace(inplace=True) on `dataset.target`, which
# mutates an intermediate Series: FutureWarning in pandas 2.x, no effect on
# `dataset` under Copy-on-Write.
dataset["target"] = dataset["target"].replace({2: 1, 3: 1, 4: 1})

In [68]:
len(dataset.loc[dataset.target > 1])

0

## Displaying first 5 and last 5 rows from the dataset

In [69]:
dataset.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67,1,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,1
2,67,1,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37,1,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41,0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


In [70]:
dataset.tail()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
195,54,0,4.0,127.0,333.0,1.0,1.0,154.0,0.0,0.0,2.0,0.676375,5.087558,1
196,62,1,1.0,120.0,139.0,0.0,1.0,150.0,0.0,0.883177,2.0,0.676375,5.087558,0
197,55,1,4.0,122.0,223.0,1.0,1.0,100.0,0.0,0.0,2.0,0.676375,6.0,1
198,58,1,4.0,120.0,385.0,1.0,2.0,150.0,0.0,0.883177,2.0,0.676375,5.087558,0
199,62,1,2.0,120.0,254.0,0.0,2.0,93.0,1.0,0.0,2.0,0.676375,5.087558,1


### Statistical description of features

In [71]:
dataset.describe(include = 'all')

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,920.0,920.0,920.0,920.0,920.0,920.0,920.0,920.0,920.0,920.0,920.0,920.0,920.0,920.0
mean,53.51087,0.78913,3.25,131.354355,192.636963,0.166265,0.603261,138.290222,0.366304,0.883178,1.847826,0.676373,5.087565,0.553261
std,9.424685,0.408148,0.930969,18.682089,114.559959,0.353831,0.805444,25.311525,0.482057,1.050118,0.51601,0.541667,1.317277,0.497426
min,28.0,0.0,1.0,0.0,0.0,0.0,0.0,60.0,0.0,-2.6,1.0,0.0,3.0,0.0
25%,47.0,1.0,3.0,120.0,164.0,0.0,0.0,120.0,0.0,0.0,2.0,0.676375,5.087558,0.0
50%,54.0,1.0,4.0,130.0,221.0,0.0,0.0,140.0,0.0,0.8,2.0,0.676375,5.087558,1.0
75%,60.0,1.0,4.0,140.0,267.0,0.0,1.0,156.0,1.0,1.5,2.0,0.676375,6.0,1.0
max,77.0,1.0,4.0,200.0,603.0,1.0,2.0,202.0,1.0,6.2,3.0,3.0,7.0,1.0


## Merging and storing dataset to file

In [72]:
# Stack six copies of the dataset on top of each other (920 -> 5520 rows).
# DataFrame.append was removed in pandas 2.0; pd.concat over six references to
# the same frame yields the same rows and the same (repeated) row labels as the
# five chained appends did.
# NOTE(review): the copies are exact duplicates, so they add no information,
# and any later random train/test split would place identical rows on both
# sides - confirm this oversampling is really wanted.
dataset = pd.concat([dataset] * 6)

In [73]:
dataset.shape

(5520, 14)

### Shuffling rows for randomization

The line of code below shuffles a dataframe. 
It is taken from https://stackoverflow.com/questions/29576430/shuffle-dataframe-rows

In [74]:
# Shuffle all rows and renumber them from 0.
# A fixed random_state makes the shuffle, and therefore the CSV written next,
# identical on every Restart & Run All; without it each run produced a
# different row order.
SHUFFLE_SEED = 42
dataset = dataset.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

In [75]:
# Persist the preprocessed rows. The index is written too (to_csv default), so
# the file gains an unnamed leading column holding the row number.
dataset.to_csv(path_or_buf="preprocessed_data.csv")

## Scaling and normalization

In [76]:
# NOTE(review): dead code. The attribute names used below (Age, Chest_pain,
# Resting_blood_pressure, ...) do not match the columns of `dataset`, which were
# renamed to 'age', 'cp', 'trestbps', ... earlier in this notebook, so these
# lines would raise AttributeError if uncommented as-is.
#age = pd.DataFrame(dataset.Age)
#cp = pd.DataFrame(dataset.Chest_pain)
#trestbps = pd.DataFrame(dataset.Resting_blood_pressure )
#chol = pd.DataFrame(dataset.Cholesterol)
#fbs = pd.DataFrame(dataset.Fasting_blood_sugar)
#restecg = pd.DataFrame(dataset.ECG_results)
#thalach = pd.DataFrame(dataset.Maximum_heart_rate)
#exang = pd.DataFrame(dataset.Exercise_induced_angina)
#oldpeak = pd.DataFrame(dataset.ST_depression)
#slope = pd.DataFrame(dataset.ST_slope)
#ca = pd.DataFrame(dataset.Major_vessels)
#thal = pd.DataFrame(dataset.Thalassemia_types)