In [4]:
#Perform the following operations using Python on the Heart Disease data sets 
#a. Data cleaning 
#b. Data integration 
#c. Data transformation 
#d. Error correcting 
#e. Data model building 

In [45]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [46]:
# Reading csv file
df = pd.read_csv("Heart.csv")
df.head()

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [47]:
#a. Data cleaning 
#a.1 Removing Missing or Null Values:
df.dropna(axis=0,how='any')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trtbps    303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalachh  303 non-null    int64  
 8   exng      303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slp       303 non-null    int64  
 11  caa       303 non-null    int64  
 12  thall     303 non-null    int64  
 13  output    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


In [48]:
#a.2 Reading and Removing Duplicate Values
#reading duplicate values
df1 = df.duplicated(subset=['age','cp','chol'])
df1
 #remove duplicate values
df2=df.drop_duplicates(keep=False)
df2

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [49]:
#b. Data integration 1
subSet1 = df[['age', 'cp']]
subSet2 = df[['chol','fbs']]
concatenated_df = pd.concat([subSet1, subSet2], axis=1)
concatenated_df

Unnamed: 0,age,cp,chol,fbs
0,63,3,233,1
1,37,2,250,0
2,41,1,204,0
3,56,1,236,0
4,57,0,354,0
...,...,...,...,...
298,57,0,241,0
299,45,3,264,0
300,68,0,193,1
301,57,0,131,0


In [50]:
#integration 2
subSet1 = df[['age','cp','chol','thalachh']]
subSet2 = df[['exng','slp','output']]
merged_df = subSet1.merge(right=subSet2,how='cross')
merged_df.head()

Unnamed: 0,age,cp,chol,thalachh,exng,slp,output
0,63,3,233,150,0,0,1
1,63,3,233,150,0,0,1
2,63,3,233,150,0,2,1
3,63,3,233,150,0,2,1
4,63,3,233,150,1,2,1


In [51]:
#c. Data transformation 
dt = df.groupby(['age','cp'])
dt.first()

Unnamed: 0_level_0,Unnamed: 1_level_0,sex,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
age,cp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
29,1,1,130,204,0,0,202,0,0.0,2,0,2,1
34,1,0,118,210,0,1,192,0,0.7,2,0,2,1
34,3,1,118,182,0,0,174,0,0.0,2,0,2,1
35,0,0,138,183,0,1,182,0,1.4,2,0,2,1
35,1,1,122,192,0,1,174,0,0.0,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
71,1,0,160,302,0,1,162,0,0.4,2,2,2,1
71,2,0,110,265,1,0,130,0,0.0,2,1,2,1
74,1,0,120,269,0,0,121,1,0.2,2,1,2,1
76,2,0,140,197,0,2,116,0,1.1,1,0,2,1


In [52]:
#d. Error correcting
# Handling outliers
def remove_outliers(df,columns,n_std):
    for col in df.columns:
        print('Working on coloumn: {}'.format(col))
        
        mean = df[col].mean()
        sd = df[col].std()
        
        df = df[(df[col] <= mean+(n_std*sd))]
    return df
df

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [57]:
#e. Data model building 
from sklearn.model_selection import train_test_split
train,test=train_test_split(df,random_state=0,test_size=.25)

In [58]:
 print("Training Dataset:",train.shape)

Training Dataset: (227, 14)


In [59]:
 print("Testing Dataset:",test.shape)

Testing Dataset: (76, 14)
