In [1]:
import numpy as np
import pandas as pd
import io
from sklearn.preprocessing import StandardScaler

In [2]:
#load in data
heart_stroke_df = pd.read_csv('healthcare-dataset-stroke-data.csv')
heart_stroke_df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


## Data Cleaning

In [3]:
heart_stroke_df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [4]:
# replace yes with 1 and no with 0
heart_stroke_df["ever_married"] = heart_stroke_df["ever_married"].replace("Yes", 1)
heart_stroke_df["ever_married"] = heart_stroke_df["ever_married"].replace("No", 0)

In [5]:
#one hot encode data
genderDF = pd.get_dummies(heart_stroke_df["gender"])
workTypeDF = pd.get_dummies(heart_stroke_df["work_type"])
residenceTypeDF = pd.get_dummies(heart_stroke_df["Residence_type"])
smokingStatuesDF = pd.get_dummies(heart_stroke_df["smoking_status"])

In [6]:
#drop columns that are going to be one hot encoded
heart_stroke_df = heart_stroke_df.drop(["id", "gender", "work_type", "Residence_type", "smoking_status"], axis=1)

In [7]:
#merge the one hot encoded data together with main dataset
adjusted_df = genderDF.join([workTypeDF, residenceTypeDF, smokingStatuesDF])
heart_stroke_df = adjusted_df.join(heart_stroke_df)
heart_stroke_df

Unnamed: 0,Female,Male,Other,Govt_job,Never_worked,Private,Self-employed,children,Rural,Urban,...,formerly smoked,never smoked,smokes,age,hypertension,heart_disease,ever_married,avg_glucose_level,bmi,stroke
0,0,1,0,0,0,1,0,0,0,1,...,1,0,0,67.0,0,1,1,228.69,36.6,1
1,1,0,0,0,0,0,1,0,1,0,...,0,1,0,61.0,0,0,1,202.21,,1
2,0,1,0,0,0,1,0,0,1,0,...,0,1,0,80.0,0,1,1,105.92,32.5,1
3,1,0,0,0,0,1,0,0,0,1,...,0,0,1,49.0,0,0,1,171.23,34.4,1
4,1,0,0,0,0,0,1,0,1,0,...,0,1,0,79.0,1,0,1,174.12,24.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5105,1,0,0,0,0,1,0,0,0,1,...,0,1,0,80.0,1,0,1,83.75,,0
5106,1,0,0,0,0,0,1,0,0,1,...,0,1,0,81.0,0,0,1,125.20,40.0,0
5107,1,0,0,0,0,0,1,0,1,0,...,0,1,0,35.0,0,0,1,82.99,30.6,0
5108,0,1,0,0,0,1,0,0,1,0,...,1,0,0,51.0,0,0,1,166.29,25.6,0


In [8]:
#check for null values
heart_stroke_df.isna().sum().sum()

201

In [9]:
#drop null rows
heart_stroke_df = heart_stroke_df.dropna()
heart_stroke_df

Unnamed: 0,Female,Male,Other,Govt_job,Never_worked,Private,Self-employed,children,Rural,Urban,...,formerly smoked,never smoked,smokes,age,hypertension,heart_disease,ever_married,avg_glucose_level,bmi,stroke
0,0,1,0,0,0,1,0,0,0,1,...,1,0,0,67.0,0,1,1,228.69,36.6,1
2,0,1,0,0,0,1,0,0,1,0,...,0,1,0,80.0,0,1,1,105.92,32.5,1
3,1,0,0,0,0,1,0,0,0,1,...,0,0,1,49.0,0,0,1,171.23,34.4,1
4,1,0,0,0,0,0,1,0,1,0,...,0,1,0,79.0,1,0,1,174.12,24.0,1
5,0,1,0,0,0,1,0,0,0,1,...,1,0,0,81.0,0,0,1,186.21,29.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5104,1,0,0,0,0,0,0,1,1,0,...,0,0,0,13.0,0,0,0,103.08,18.6,0
5106,1,0,0,0,0,0,1,0,0,1,...,0,1,0,81.0,0,0,1,125.20,40.0,0
5107,1,0,0,0,0,0,1,0,1,0,...,0,1,0,35.0,0,0,1,82.99,30.6,0
5108,0,1,0,0,0,1,0,0,1,0,...,1,0,0,51.0,0,0,1,166.29,25.6,0


In [10]:
#positive rate
heart_stroke_df['stroke'].value_counts()

0    4700
1     209
Name: stroke, dtype: int64

In [11]:
#drop stroke column
heart_stroke_df.drop('stroke', axis=1).head()
# heart_stroke_df.head()

Unnamed: 0,Female,Male,Other,Govt_job,Never_worked,Private,Self-employed,children,Rural,Urban,Unknown,formerly smoked,never smoked,smokes,age,hypertension,heart_disease,ever_married,avg_glucose_level,bmi
0,0,1,0,0,0,1,0,0,0,1,0,1,0,0,67.0,0,1,1,228.69,36.6
2,0,1,0,0,0,1,0,0,1,0,0,0,1,0,80.0,0,1,1,105.92,32.5
3,1,0,0,0,0,1,0,0,0,1,0,0,0,1,49.0,0,0,1,171.23,34.4
4,1,0,0,0,0,0,1,0,1,0,0,0,1,0,79.0,1,0,1,174.12,24.0
5,0,1,0,0,0,1,0,0,0,1,0,1,0,0,81.0,0,0,1,186.21,29.0


In [12]:
# #reset index
# heart_stroke_df = heart_stroke_df.reset_index()
# heart_stroke_df

PCA

In [13]:
#scaling data
heart_stroke_df_scaled = StandardScaler().fit_transform(heart_stroke_df)
pd.DataFrame(heart_stroke_df_scaled).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,-1.199942,1.200447,-0.014274,-0.383706,-0.067095,0.863918,-0.432978,-0.397906,-0.98564,0.98564,...,2.205673,-0.778346,-0.420302,1.070138,-0.318067,4.381968,0.729484,2.777698,0.981345,4.742155
1,-1.199942,1.200447,-0.014274,-0.383706,-0.067095,0.863918,-0.432978,-0.397906,1.014569,-1.014569,...,-0.453376,1.284775,-0.420302,1.646563,-0.318067,4.381968,0.729484,0.013842,0.459269,4.742155
2,0.833374,-0.833023,-0.014274,-0.383706,-0.067095,0.863918,-0.432978,-0.397906,-0.98564,0.98564,...,-0.453376,-0.778346,2.379241,0.272012,-0.318067,-0.228208,0.729484,1.484132,0.701207,4.742155
3,0.833374,-0.833023,-0.014274,-0.383706,-0.067095,-1.157518,2.309587,-0.397906,1.014569,-1.014569,...,-0.453376,1.284775,-0.420302,1.602222,3.143994,-0.228208,0.729484,1.549193,-0.623083,4.742155
4,-1.199942,1.200447,-0.014274,-0.383706,-0.067095,0.863918,-0.432978,-0.397906,-0.98564,0.98564,...,2.205673,-0.778346,-0.420302,1.690903,-0.318067,-0.228208,0.729484,1.821368,0.013595,4.742155


In [14]:
#covariance matrix
attributes = heart_stroke_df_scaled.T
covariance_matrix = np.cov(attributes)
covariance_matrix

array([[ 1.00020375e+00, -9.99782619e-01, -1.71315422e-02,
         1.51258237e-02, -1.22999597e-02,  3.86190242e-02,
         2.23163042e-02, -9.16379759e-02, -4.59906393e-03,
         4.59906393e-03, -5.79052883e-02, -3.96001774e-02,
         9.40704744e-02, -1.15210927e-02,  3.04628787e-02,
        -2.17342582e-02, -8.28989600e-02,  3.67105396e-02,
        -5.33692356e-02,  2.63657284e-02, -6.85230380e-03],
       [-9.99782619e-01,  1.00020375e+00, -1.18930446e-02,
        -1.49679971e-02,  1.23286933e-02, -3.89798958e-02,
        -2.21386103e-02,  9.18098009e-02,  4.17905970e-03,
        -4.17905970e-03,  5.81822799e-02,  3.86893385e-02,
        -9.37551424e-02,  1.16961069e-02, -3.01553607e-02,
         2.18676908e-02,  8.29998118e-02, -3.61453706e-02,
         5.30186226e-02, -2.60251666e-02,  6.94019367e-03],
       [-1.71315422e-02, -1.18930446e-02,  1.00020375e+00,
        -5.47816782e-03, -9.57913672e-04,  1.23341300e-02,
        -6.18161305e-03, -5.68089791e-03,  1.44849814e

In [15]:
#eigendecomposition
x,y = np.linalg.eig(covariance_matrix)
print(x,y)

[ 3.45463373e+00+0.00000000e+00j  2.08864974e+00+0.00000000e+00j
  2.00918463e+00+0.00000000e+00j  1.70664559e+00+0.00000000e+00j
  2.34105889e-01+0.00000000e+00j  4.19278666e-01+0.00000000e+00j
  1.41240416e+00+0.00000000e+00j  1.25697453e+00+0.00000000e+00j
  6.74171853e-01+0.00000000e+00j  1.17527664e+00+0.00000000e+00j
  1.10922744e+00+0.00000000e+00j  8.02177832e-01+0.00000000e+00j
  8.50956151e-01+0.00000000e+00j  8.69974956e-01+0.00000000e+00j
  1.01390401e+00+0.00000000e+00j  9.92498258e-01+0.00000000e+00j
  9.34214654e-01+0.00000000e+00j -8.01471784e-17+0.00000000e+00j
 -2.39617802e-16+0.00000000e+00j  4.01406909e-16+8.29749741e-17j
  4.01406909e-16-8.29749741e-17j] [[-6.42621565e-02+0.00000000e+00j -6.37986429e-01+0.00000000e+00j
  -6.60085677e-02+0.00000000e+00j -2.04312496e-01+0.00000000e+00j
   3.18291190e-03+0.00000000e+00j -1.72161849e-02+0.00000000e+00j
  -1.69416533e-01+0.00000000e+00j -5.96771164e-02+0.00000000e+00j
   3.88806677e-03+0.00000000e+00j  4.56622148e-02+0.