# Importing required libraries

In [21]:
# for preprocessing
import numpy as np
import pandas as pd

# for visualizations
import seaborn as sns
import matplotlib.pyplot as plt

# Reading data from files

In [22]:
data = pd.read_csv("cleveland.csv", header=None)
data2 = pd.read_csv("hungarian.csv", header=None)
data3 = pd.read_csv("switzerland.csv", header=None)
data4 = pd.read_csv("va.csv", header=None)

# Preprocessing

## Appending all the data read from files

## Counting total number of rows in all the files

In [23]:
rows = 0

for selected_dataframe in [data, data2, data3, data4]:
    print(selected_dataframe.shape)
    rows = rows + selected_dataframe.shape[0]

print("Total Rows", rows)

(303, 14)
(294, 14)
(123, 14)
(200, 14)
Total Rows 920


In [24]:
dataset = data.append(data2).append(data3).append(data4)
dataset.shape

(920, 14)

## Deleting previous dataframes to free memory

In [25]:
del data
del data2
del data3
del data4

## Assinging column names to each column

In [26]:
dataset = dataset.rename(columns ={0:'Age',1:'Sex',2:'Chest_pain',3:'Resting_blood_pressure',4:'Cholesterol',
          5:'Fasting_blood_sugar',6:'ECG_results',7:'Maximum_heart_rate',8:'Exercise_induced_angina',
          9:'ST_depression',10:'Major_vessels',11:'Thalassemia_types',12:'Heart_attack',13:'ST_slope'
    })
dataset.columns

Index(['Age', 'Sex', 'Chest_pain', 'Resting_blood_pressure', 'Cholesterol',
       'Fasting_blood_sugar', 'ECG_results', 'Maximum_heart_rate',
       'Exercise_induced_angina', 'ST_depression', 'Major_vessels',
       'Thalassemia_types', 'Heart_attack', 'ST_slope'],
      dtype='object')

## Getting data types of each column

In [27]:
dataset.dtypes

Age                        float64
Sex                        float64
Chest_pain                 float64
Resting_blood_pressure      object
Cholesterol                 object
Fasting_blood_sugar         object
ECG_results                 object
Maximum_heart_rate          object
Exercise_induced_angina     object
ST_depression               object
Major_vessels               object
Thalassemia_types           object
Heart_attack                object
ST_slope                     int64
dtype: object

## Replacing missing values

### Missing 'resting blood pressure' attribute

In [28]:
dataset.loc[dataset['Resting_blood_pressure'] == "?"].shape[0]

59

#### Finding the mode of column

In [29]:
mode_of_resting_blood_pressure = dataset.loc[dataset['Resting_blood_pressure'] != "?"].Resting_blood_pressure.mode()[0] 
mode_of_resting_blood_pressure

'120'

#### Replacing the cells containing '?' with mode of the column

In [30]:
dataset.Resting_blood_pressure.replace({"?":mode_of_resting_blood_pressure}, inplace=True)

In [31]:
dataset.loc[dataset['Resting_blood_pressure'] == "?"].shape

(0, 14)

In [32]:
dataset['Resting_blood_pressure'] = pd.to_numeric(dataset["Resting_blood_pressure"], downcast="float")

In [33]:
dataset.dtypes

Age                        float64
Sex                        float64
Chest_pain                 float64
Resting_blood_pressure     float32
Cholesterol                 object
Fasting_blood_sugar         object
ECG_results                 object
Maximum_heart_rate          object
Exercise_induced_angina     object
ST_depression               object
Major_vessels               object
Thalassemia_types           object
Heart_attack                object
ST_slope                     int64
dtype: object

# USE MEAN IF THE NUMBER OF MISSING VALUES IS LARGE

# USE MODE IF THE NUMBER OF MISSING VALUES IS BELOW 75

df = dataset.copy()

# Data visualization

## Statistical description of features

59

## Displaying first 5 and last 5 rows from the dataset

In [35]:
dataset.head()

Unnamed: 0,Age,Sex,Chest_pain,Resting_blood_pressure,Cholesterol,Fasting_blood_sugar,ECG_results,Maximum_heart_rate,Exercise_induced_angina,ST_depression,Major_vessels,Thalassemia_types,Heart_attack,ST_slope
0,63.0,1.0,1.0,145.0,233,1,2,150,0,2.3,3,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286,0,2,108,1,1.5,2,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229,0,2,129,1,2.6,2,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250,0,0,187,0,3.5,3,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204,0,2,172,0,1.4,1,0.0,3.0,0


In [36]:
dataset.tail()

Unnamed: 0,Age,Sex,Chest_pain,Resting_blood_pressure,Cholesterol,Fasting_blood_sugar,ECG_results,Maximum_heart_rate,Exercise_induced_angina,ST_depression,Major_vessels,Thalassemia_types,Heart_attack,ST_slope
195,54.0,0.0,4.0,127.0,333,1,1,154,0,0,?,?,?,1
196,62.0,1.0,1.0,120.0,139,0,1,?,?,?,?,?,?,0
197,55.0,1.0,4.0,122.0,223,1,1,100,0,0,?,?,6,2
198,58.0,1.0,4.0,120.0,385,1,2,?,?,?,?,?,?,0
199,62.0,1.0,2.0,120.0,254,0,2,93,1,0,?,?,?,1


## Checking for misssing values

In [12]:
dataset.isna().sum()

Age                        0
Sex                        0
Chest_pain                 0
Resting_blood_pressure     0
Cholesterol                0
Fasting_blood_sugar        0
ECG_results                0
Maximum_heart_rate         0
Exercise_induced_angina    0
ST_depression              0
Major_vessels              0
Thalassemia_types          0
Heart_attack               0
ST_slope                   0
dtype: int64

## Setting color pallete for graphs

In [13]:
color_pallete =["#00876c",
                "#56a474",
                "#8ebf7f",
                "#c6da8e",
                "#fff4a4",
                "#f9ca7a",
                "#f29e5e",
                "#e77051",
                "#d43d51"]
sns.set_style('whitegrid')
sns.set_palette(color_pallete)

In [43]:
dataset.loc[dataset['Age'] == 0].shape

(0, 14)