In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [2]:
# read the stroke dataset from a CSV file (relative path)
DATA_PATH = "healthcare-dataset-stroke-data.csv"
heart_stroke_df = pd.read_csv(DATA_PATH)

In [3]:
# preview the raw dataset exactly as it was loaded
heart_stroke_df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [4]:
# encode the marital-status flag numerically: "Yes" -> 1, "No" -> 0
married_codes = {"Yes": 1, "No": 0}
heart_stroke_df["ever_married"] = heart_stroke_df["ever_married"].replace(married_codes)

In [5]:
# build one indicator (dummy) frame per categorical column
genderDF, workTypeDF, residenceTypeDF, smokingStatuesDF = (
    pd.get_dummies(heart_stroke_df[column])
    for column in ("gender", "work_type", "Residence_type", "smoking_status")
)

In [6]:
# remove the record id together with the categorical columns that were one hot encoded
columns_to_drop = ["id", "gender", "work_type", "Residence_type", "smoking_status"]
heart_stroke_df = heart_stroke_df.drop(columns=columns_to_drop)

In [7]:
# attach the indicator frames (aligned on the row index) in front of the remaining columns
other_indicator_frames = [workTypeDF, residenceTypeDF, smokingStatuesDF]
adjusted_df = genderDF.join(other_indicator_frames, how="left")
heart_stroke_df = adjusted_df.join(heart_stroke_df, how="left")
heart_stroke_df

Unnamed: 0,Female,Male,Other,Govt_job,Never_worked,Private,Self-employed,children,Rural,Urban,...,formerly smoked,never smoked,smokes,age,hypertension,heart_disease,ever_married,avg_glucose_level,bmi,stroke
0,0,1,0,0,0,1,0,0,0,1,...,1,0,0,67.0,0,1,1,228.69,36.6,1
1,1,0,0,0,0,0,1,0,1,0,...,0,1,0,61.0,0,0,1,202.21,,1
2,0,1,0,0,0,1,0,0,1,0,...,0,1,0,80.0,0,1,1,105.92,32.5,1
3,1,0,0,0,0,1,0,0,0,1,...,0,0,1,49.0,0,0,1,171.23,34.4,1
4,1,0,0,0,0,0,1,0,1,0,...,0,1,0,79.0,1,0,1,174.12,24.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5105,1,0,0,0,0,1,0,0,0,1,...,0,1,0,80.0,1,0,1,83.75,,0
5106,1,0,0,0,0,0,1,0,0,1,...,0,1,0,81.0,0,0,1,125.20,40.0,0
5107,1,0,0,0,0,0,1,0,1,0,...,0,1,0,35.0,0,0,1,82.99,30.6,0
5108,0,1,0,0,0,1,0,0,1,0,...,1,0,0,51.0,0,0,1,166.29,25.6,0


In [8]:
# total number of missing cells across the whole frame
heart_stroke_df.isna().to_numpy().sum()

201

In [9]:
# discard every row that contains at least one missing value
heart_stroke_df = heart_stroke_df.dropna(axis="index", how="any")
heart_stroke_df

Unnamed: 0,Female,Male,Other,Govt_job,Never_worked,Private,Self-employed,children,Rural,Urban,...,formerly smoked,never smoked,smokes,age,hypertension,heart_disease,ever_married,avg_glucose_level,bmi,stroke
0,0,1,0,0,0,1,0,0,0,1,...,1,0,0,67.0,0,1,1,228.69,36.6,1
2,0,1,0,0,0,1,0,0,1,0,...,0,1,0,80.0,0,1,1,105.92,32.5,1
3,1,0,0,0,0,1,0,0,0,1,...,0,0,1,49.0,0,0,1,171.23,34.4,1
4,1,0,0,0,0,0,1,0,1,0,...,0,1,0,79.0,1,0,1,174.12,24.0,1
5,0,1,0,0,0,1,0,0,0,1,...,1,0,0,81.0,0,0,1,186.21,29.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5104,1,0,0,0,0,0,0,1,1,0,...,0,0,0,13.0,0,0,0,103.08,18.6,0
5106,1,0,0,0,0,0,1,0,0,1,...,0,1,0,81.0,0,0,1,125.20,40.0,0
5107,1,0,0,0,0,0,1,0,1,0,...,0,1,0,35.0,0,0,1,82.99,30.6,0
5108,0,1,0,0,0,1,0,0,1,0,...,1,0,0,51.0,0,0,1,166.29,25.6,0


In [10]:
# renumber the rows 0..n-1 after the dropna; drop=True discards the old index
# instead of keeping it as an extra "index" column
heart_stroke_df = heart_stroke_df.reset_index(drop=True)
heart_stroke_df

Unnamed: 0,Female,Male,Other,Govt_job,Never_worked,Private,Self-employed,children,Rural,Urban,...,formerly smoked,never smoked,smokes,age,hypertension,heart_disease,ever_married,avg_glucose_level,bmi,stroke
0,0,1,0,0,0,1,0,0,0,1,...,1,0,0,67.0,0,1,1,228.69,36.6,1
1,0,1,0,0,0,1,0,0,1,0,...,0,1,0,80.0,0,1,1,105.92,32.5,1
2,1,0,0,0,0,1,0,0,0,1,...,0,0,1,49.0,0,0,1,171.23,34.4,1
3,1,0,0,0,0,0,1,0,1,0,...,0,1,0,79.0,1,0,1,174.12,24.0,1
4,0,1,0,0,0,1,0,0,0,1,...,1,0,0,81.0,0,0,1,186.21,29.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4904,1,0,0,0,0,0,0,1,1,0,...,0,0,0,13.0,0,0,0,103.08,18.6,0
4905,1,0,0,0,0,0,1,0,0,1,...,0,1,0,81.0,0,0,1,125.20,40.0,0
4906,1,0,0,0,0,0,1,0,1,0,...,0,1,0,35.0,0,0,1,82.99,30.6,0
4907,0,1,0,0,0,1,0,0,1,0,...,1,0,0,51.0,0,0,1,166.29,25.6,0


# PCA with scikit-learn

In [11]:
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt

In [12]:
from sklearn.preprocessing import StandardScaler

# standardize every column of the frame before running PCA
# NOTE(review): heart_stroke_df still contains the "stroke" target column at this
# point, so the label is scaled and passed to PCA with the features - confirm intended
scaler = StandardScaler()
scaled_data = scaler.fit_transform(heart_stroke_df)

In [13]:
from sklearn.decomposition import PCA

# reduce the standardized data to its first two principal components.
# random_state is fixed so the projection is reproducible on re-runs: depending on
# the scikit-learn version, the default svd_solver="auto" may pick the randomized
# solver, which is otherwise seeded differently on every run.
pca = PCA(n_components=2, random_state=0)
pca.fit(scaled_data)
x_pca = pca.transform(scaled_data)

In [14]:
# inspect the projected data (one row per sample, one column per component)
x_pca

array([[-3.13738276,  3.58273636],
       [-2.69010339,  2.25815991],
       [-2.01043884, -0.39076736],
       ...,
       [-0.72363941, -1.34308456],
       [-0.91555572,  1.84133632],
       [ 0.53925124, -0.78002128]])

In [None]:
# dimensions of the standardized array that was fed into PCA
scaled_data.shape

In [None]:
# dimensions of the PCA projection, for comparison with scaled_data.shape
x_pca.shape

In [None]:
# scatter the two principal components, colouring each point by its stroke value
fig, ax = plt.subplots(figsize=(8,6))
ax.scatter(x_pca[:, 0], x_pca[:, 1], c=heart_stroke_df['stroke'], cmap='rainbow')
ax.set_xlabel('First principal component')
ax.set_ylabel('Second Principal Component')

In [None]:
# component vectors of the fitted PCA (one row per component, one entry per input column)
pca.components_

In [None]:
# label every component weight with the column it belongs to and show them as a heatmap.
# the frame is deliberately not called `map`: that name would shadow the builtin
# map() for every cell executed afterwards in this kernel.
pca_loadings = pd.DataFrame(pca.components_, columns=heart_stroke_df.columns)
plt.figure(figsize=(12,6))
sns.heatmap(pca_loadings, cmap='twilight')

# Manual PCA via eigendecomposition

In [None]:
# build a human-readable label series from the stroke target column
stroke_labels = {1: "stroke", 0: "no stroke"}
strokeY = heart_stroke_df["stroke"].replace(stroke_labels)
strokeY

In [None]:
# standardize every column of the frame with a fresh scaler
feature_scaler = StandardScaler()
heart_stroke_df_scaled = feature_scaler.fit_transform(heart_stroke_df)

In [None]:
#assign features and get the covariance matrix
# the covariance is taken over the standardized data (not the raw frame) because the
# eigenvectors derived from it are later applied to heart_stroke_df_scaled; both
# must live in the same feature space for the projection to be meaningful.
# np.cov expects one variable per row, hence the transpose.
feat = heart_stroke_df_scaled.T
covMatrix = np.cov(feat)

In [None]:
#perform eigendecomposition
# a covariance matrix is symmetric, so eigh is the appropriate solver: it always
# returns real eigenvalues/eigenvectors. It yields them in ascending order, so
# reorder to descending; later cells rely on columns 0 and 1 of eigvectors being
# the two directions with the largest variance.
eigvalues, eigvectors = np.linalg.eigh(covMatrix)
descending_order = np.argsort(eigvalues)[::-1]
eigvalues = eigvalues[descending_order]
eigvectors = eigvectors[:, descending_order]

In [None]:
#view vectors
eigvectors

In [None]:
#check that the explained-variance ratios sum to one and find the two that have the most impact
# the eigenvalue total is computed once instead of once per loop iteration
total_variance = np.sum(eigvalues)
variances = [eigvalue / total_variance for eigvalue in eigvalues]

print(np.sum(variances), "\n", variances)

In [None]:
# project the standardized data onto the first two eigenvector columns and
# collect the projections next to the stroke labels
PCA1 = heart_stroke_df_scaled @ eigvectors[:, 0]
PCA2 = heart_stroke_df_scaled @ eigvectors[:, 1]
res = pd.DataFrame({"PCA1": PCA1, "PCA2": PCA2})
res["Y"] = strokeY
res.head()

In [None]:
#graph in one dimension as a line
import matplotlib.pyplot as plt
import seaborn as sns
# x and y are passed by keyword: recent seaborn releases accept them only as
# keyword arguments (the first positional slot is `data`), so the positional
# form raises a TypeError there. All points share y=0 to lay them on one line.
sns.scatterplot(x=res["PCA1"], y=[0] * len(res), hue=res["Y"])

In [None]:
#graph in 2 dimensional space
# x and y are passed by keyword: recent seaborn releases accept them only as
# keyword arguments, so the positional form raises a TypeError there
sns.scatterplot(x=res["PCA1"], y=res["PCA2"], hue=res["Y"])