In [13]:
# Import the modules
import pandas as pd
import numpy as np
import hvplot.pandas
from pathlib import Path
from sklearn.cluster import KMeans

In [20]:
# Location of the combined heart-attack table, relative to the notebook
path = 'Resources/heart_attack_final_table.csv'
# Load the table into a DataFrame
total_df = pd.read_csv(path)
# Preview the first rows
total_df.head()

Unnamed: 0,id,asthma_age,ever_overweight,heart_failure_age,age_chronic_heart_disease,angina_pectoris_age,heart_attack_age,stroke_age,thyroid_age,liver_age,...,100_Cigarettes,age,gender_female,gender_male,race_asian,race_black,race_mexican_american,race_other,race_other_hispanic,race_white
0,109266.0,0.0,1,0.0,0.0,0.0,,0.0,0.0,0.0,...,0,29.0,1,0,1,0,0,0,0,0
1,109274.0,0.0,1,0.0,0.0,0.0,,0.0,0.0,0.0,...,0,68.0,0,1,0,0,0,1,0,0
2,109292.0,52.0,0,0.0,0.0,0.0,,0.0,0.0,0.0,...,0,58.0,0,1,0,0,0,0,1,0
3,109297.0,0.0,0,0.0,0.0,0.0,,0.0,0.0,0.0,...,0,30.0,1,0,1,0,0,0,0,0
4,109307.0,0.0,0,0.0,0.0,42.0,,0.0,0.0,0.0,...,1,47.0,0,1,1,0,0,0,0,0


Because we are considering many different factors to predict an outcome, reducing the number of dimensions will make the analysis simpler.

In [3]:
# `id` is not a factor we want in the analysis, so it is dropped.
# errors='ignore' keeps this cell re-runnable: running it again after `id`
# has already been removed no longer raises a KeyError.
# PCA will not accept NaN values, so they are replaced with 0s. The result is
# reassigned rather than mutated in place so the cell reads as one pipeline.
total_df = total_df.drop(columns='id', errors='ignore').fillna(0)
total_df.head()

Unnamed: 0,asthma_age,ever_overweight,heart_failure_age,age_chronic_heart_disease,angina_pectoris_age,heart_attack_age,stroke_age,thyroid_age,liver_age,fatty_liver,...,100_Cigarettes,age,gender_female,gender_male,race_asian,race_black,race_mexican_american,race_other,race_other_hispanic,race_white
0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,...,0,29.0,1,0,1,0,0,0,0,0
1,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,...,0,68.0,0,1,0,0,0,1,0,0
2,52.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,...,0,58.0,0,1,0,0,0,0,1,0
3,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,...,0,30.0,1,0,1,0,0,0,0,0
4,0.0,0,0.0,0.0,42.0,0.0,0.0,0.0,0.0,0,...,1,47.0,0,1,1,0,0,0,0,0


In [4]:
# (rows, columns) of the cleaned table
total_df.shape

(5478, 52)

In [5]:
from sklearn.decomposition import PCA

# Reduce the feature table to two principal components.
# random_state is fixed because PCA's default svd_solver='auto' may select the
# randomized SVD solver (depending on data shape and scikit-learn version),
# which is stochastic; seeding it makes the components reproducible across
# runs. 0 matches the seed used for the KMeans models in this notebook.
pca = PCA(n_components=2, random_state=0)

In [6]:
# Fit the PCA model on total_df and project every row onto the two components.
# NOTE(review): total_df is passed in unscaled. PCA centers the features but
# does not standardize them, so columns with large numeric ranges will
# dominate the components — confirm this is intended.
data_pca = pca.fit_transform(total_df)

# Show the first 5 rows of the transformed array
data_pca[:5]

array([[-9.15643959e+00,  6.90744905e-02],
       [ 7.65202120e+01, -2.41478310e+01],
       [ 1.30162600e+01, -2.86054496e+00],
       [-2.73279421e+01,  5.77758991e+00],
       [ 2.35817685e+01, -1.00314049e+01]])

In [7]:
# Fraction of the total variance captured by each of the two components
pca.explained_variance_ratio_

array([0.33281198, 0.31174508])

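The first principal component explains 33% of the variance and the second principal component explains 31%, about 64% combined.
These values are relatively low because a large amount of the data is binary. Note also that the features were not standardized before PCA, so columns with large numeric ranges have an outsized influence on the components.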

In [8]:
# Absolute loadings: one row per principal component, one value per column of total_df
print(abs(pca.components_))

[[3.52027852e-03 2.89907679e-05 1.35146491e-02 2.47206191e-02
  1.02237387e-02 2.29758587e-02 1.05303630e-02 1.32418016e-02
  1.92068075e-03 5.80258646e-05 1.47901004e-05 2.58067187e-06
  2.02620437e-05 1.18185733e-05 3.64519105e-06 4.39319142e-03
  3.38500664e-04 2.50297118e-04 1.07032682e-04 3.97858637e-04
  2.86369754e-04 2.72170739e-04 4.47716687e-04 2.01690544e-04
  1.91801103e-04 2.66554289e-04 1.19109019e-05 1.16743616e-04
  1.52888860e-03 4.91617688e-05 2.01384138e-04 3.08552316e-04
  1.83908333e-04 5.87786568e-04 4.34171937e-04 1.06442708e-03
  1.70013826e-02 9.64670704e-01 6.65951434e-03 2.50773680e-01
  2.57341348e-04 5.88009989e-04 1.30059372e-04 6.65931146e-02
  7.60867503e-04 7.60867503e-04 3.67455244e-04 5.98851027e-04
  1.51890156e-05 4.25136372e-05 7.15549236e-05 1.87165481e-04]
 [4.80695120e-03 3.60522984e-04 2.01566717e-04 1.21503424e-02
  5.25459541e-03 1.06342753e-02 8.06832949e-03 6.65316746e-03
  3.16523084e-03 2.68430242e-05 2.31786010e-06 3.37886910e-06
  2.038

In [19]:
# Absolute loading of every original column on each principal component.
# NOTE: this assumes `pca` still holds the fit on total_df; if `pca` has been
# refit on another table, the positions below no longer map to total_df.columns.
pc1_components = (abs(pca.components_))[0]
pc2_components = (abs(pca.components_))[1]

# Locate the largest loading of each component once, then reuse that position
# for the column lookup instead of hard-coding the index (previously 37 / 39),
# so the reported name stays correct if the columns or the fit change.
pc1_top = np.where(pc1_components == max(pc1_components))
pc2_top = np.where(pc2_components == max(pc2_components))

print(pc1_top)
print(f" the variable with the most variation is {total_df.columns[pc1_top[0][0]]}, for the first primary component")
print(" ")
print(pc2_top)
print(f" the most important variable is {total_df.columns[pc2_top[0][0]]}, for the second primary component")

(array([37], dtype=int64),)
 the variable with the most variation is Total_Cholesterol_mg_dL, for the first primary component
 
(array([39], dtype=int64),)
 the most important variable is Drinks_per_Day, for the second primary component


In [9]:
# Create the PCA DataFrame
pca_df = pd.DataFrame(
    data_pca,
    columns=["PCA1", "PCA2"]
)

# Review the PCA DataFrame
pca_df.head()

Unnamed: 0,PCA1,PCA2
0,-9.15644,0.069074
1,76.520212,-24.147831
2,13.01626,-2.860545
3,-27.327942,5.77759
4,23.581769,-10.031405


In [10]:
# Module-level containers shared with the elbow-curve cells:
#   inertia - inertia recorded for each value of k by the most recent kmeans() call
#   k       - candidate cluster counts, 1 through 10
inertia = []
k = list(range(1, 11))


def kmeans(df):
    """Fit one KMeans model per value in `k` on `df` and record each inertia.

    The module-level `inertia` list is emptied before the loop, so calling
    this function again (on the same or another dataset) always leaves
    exactly one inertia value per entry of `k`. Previously the values piled
    up across calls, leaving `inertia` longer than `k`.

    Parameters
    ----------
    df : data accepted by ``KMeans.fit`` (e.g. a DataFrame of PCA components)

    Returns
    -------
    list
        The module-level `inertia` list, ordered like `k`.
    """
    # Clear in place rather than rebinding, so cells that read the
    # module-level `inertia` afterwards see this call's values.
    inertia.clear()
    for n_clusters in k:
        k_model = KMeans(n_clusters=n_clusters, random_state=0)
        k_model.fit(df)
        # `inertia_` is the fitted model's within-cluster sum of squared distances
        inertia.append(k_model.inertia_)

    return inertia

In [11]:
# Record the inertia for every k on the PCA components; the values are also kept in the module-level `inertia`
kmeans(pca_df)

[17625448.30397254,
 9139934.313242251,
 3445520.7819337826,
 1904534.486639371,
 1250032.007000593,
 883583.572218645,
 667837.3202489789,
 533098.3751532625,
 440907.3757505497,
 370506.78338606877]

In [12]:
# Define a DataFrame to hold the values for k and the corresponding inertia
# Pair every candidate k with the inertia measured for it
elbow_data = dict(k=k, inertia=inertia)

# Build the elbow-curve DataFrame from that mapping
df_elbow = pd.DataFrame(data=elbow_data)

# Preview the DataFrame
df_elbow.head()

Unnamed: 0,k,inertia
0,1,17625450.0
1,2,9139934.0
2,3,3445521.0
3,4,1904534.0
4,5,1250032.0


In [13]:
# Plot the DataFrame
# Draw inertia against k, with one tick per candidate k
df_elbow.hvplot.line(x="k", y="inertia", title="Elbow Curve", xticks=k)

In [14]:
# Define the model Kmeans model using the optimal value of k for the number of clusters.
model = KMeans(n_clusters=3, random_state = 0)

# Fit the model
model.fit(pca_df)

# Make predictions
k_3 = model.predict(pca_df)

# Create a copy of the customers_pca_df DataFrame
pca_predictions_df = pca_df.copy()

# Add a class column with the labels
pca_predictions_df["segments"] = k_3

In [15]:
# Plot the clusters
fig1 = pca_predictions_df.hvplot.scatter(
    x="PCA1",
    y="PCA2",
    by="segments"
)
fig1

* Applying K-means to the two PCA components yields three clusters, which we can use to categorize individuals based on certain traits
* Individuals that fall in the same cluster have similar characteristics

In [16]:
# Attach each row's cluster label to a copy of the cleaned feature table
pca_data = total_df.assign(segments=k_3)
pca_data.head()

Unnamed: 0,asthma_age,ever_overweight,heart_failure_age,age_chronic_heart_disease,angina_pectoris_age,heart_attack_age,stroke_age,thyroid_age,liver_age,fatty_liver,...,age,gender_female,gender_male,race_asian,race_black,race_mexican_american,race_other,race_other_hispanic,race_white,segments
0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,...,29.0,1,0,1,0,0,0,0,0,1
1,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,...,68.0,0,1,0,0,0,1,0,0,0
2,52.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,...,58.0,0,1,0,0,0,0,1,0,0
3,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,...,30.0,1,0,1,0,0,0,0,0,1
4,0.0,0,0.0,0.0,42.0,0.0,0.0,0.0,0.0,0,...,47.0,0,1,1,0,0,0,0,0,0


In [17]:
# Save the cleaned features plus the `segments` label (not the PCA components themselves), without the index
pca_data.to_csv('Resources/pca_df.csv', index=False)

# Viewing predictions for data from 2015-2016

In [18]:
# Load the cleaned 2015-2016 medical-conditions table
old_data = pd.read_csv('Resources/medical_conditions_cleaned_2015_2016.csv')
# Preview the first rows
old_data.head()

Unnamed: 0,id,asthma_age,heart_failure_age,coronary_heart_disease_age,angina_pectoris_age,heart_attack_age,stroke_age,emphysema_age,thyroid_age,liver_age,...,liver,asthma_yrs,heart_failure_yrs,coronary_heart_disease_yrs,angina_pectoris_yrs,stroke_yrs,emphysema_yrs,thyroid_yrs,liver_disease_yrs,max_age
0,83732.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,83733.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,83734.0,0.0,0.0,0.0,0.0,58.0,0.0,0.0,39.0,11.0,...,1,0.0,0.0,0.0,0.0,0.0,0.0,19.0,47.0,39.0
3,83735.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,83736.0,10.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0


In [19]:
# Drop the `id` column and replace NaN values with 0 before running PCA.
# errors='ignore' keeps this cell re-runnable: running it again after `id`
# has already been removed no longer raises a KeyError. The result is
# reassigned rather than mutated in place so the cell reads as one pipeline.
old_data = old_data.drop(columns='id', errors='ignore').fillna(0)
old_data.head()

Unnamed: 0,asthma_age,heart_failure_age,coronary_heart_disease_age,angina_pectoris_age,heart_attack_age,stroke_age,emphysema_age,thyroid_age,liver_age,relative_heart_attack,...,liver,asthma_yrs,heart_failure_yrs,coronary_heart_disease_yrs,angina_pectoris_yrs,stroke_yrs,emphysema_yrs,thyroid_yrs,liver_disease_yrs,max_age
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,58.0,0.0,0.0,39.0,11.0,0,...,1,0.0,0.0,0.0,0.0,0.0,0.0,19.0,47.0,39.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0


In [20]:
# Fit the PCA model on the old_data and project every row onto the two components.
# NOTE: this refits the existing `pca` object, so from here on its attributes
# (components_, explained_variance_ratio_) describe old_data rather than total_df.
old_data_pca = pca.fit_transform(old_data)

# Review the first 5 rows of the transformed array
old_data_pca[:5]

array([[-8.88663825e+00, -1.04078707e-02],
       [-8.88663825e+00, -1.04078707e-02],
       [ 4.76062464e+01,  3.68215878e+00],
       [-8.84135020e+00,  7.31783686e-03],
       [ 1.29123124e+00,  2.45678613e-01]])

In [21]:
# Fraction of the total variance captured by each component of the most recent fit
pca.explained_variance_ratio_

array([0.47960941, 0.12848271])

In [22]:
# Wrap the transformed 2015-2016 array in a DataFrame with one named column per component
pca_df_2015_2016 = pd.DataFrame(old_data_pca, columns=["PCA1", "PCA2"])

# Preview the PCA DataFrame
pca_df_2015_2016.head()

Unnamed: 0,PCA1,PCA2
0,-8.886638,-0.010408
1,-8.886638,-0.010408
2,47.606246,3.682159
3,-8.84135,0.007318
4,1.291231,0.245679


In [23]:
# kmeans() appends to the module-level `inertia` list, so start from an empty list for this dataset
inertia = []
# Record the inertia for every k on the 2015-2016 PCA components
kmeans(pca_df_2015_2016)

[5480020.410698921,
 1737764.5450409674,
 999085.578027973,
 638544.0506375663,
 498068.5365501398,
 426334.62480049586,
 360137.0366050299,
 299758.97163612995,
 252408.4883032795,
 213840.5766607421]

In [24]:
# Pair every candidate k with the inertia measured for it
elbow_data = dict(k=k, inertia=inertia)

# Build the elbow-curve DataFrame from that mapping
df_elbow = pd.DataFrame(data=elbow_data)

# Preview the DataFrame
df_elbow.head()

Unnamed: 0,k,inertia
0,1,5480020.0
1,2,1737765.0
2,3,999085.6
3,4,638544.1
4,5,498068.5


In [25]:
# Draw inertia against k, with one tick per candidate k
df_elbow.hvplot.line(x="k", y="inertia", title="Elbow Curve", xticks=k)

In [26]:
# K-means model with k=2, the value chosen as optimal for the number of clusters
model = KMeans(n_clusters=2, random_state=0)

# Fit on the 2015-2016 PCA components and obtain each row's cluster label in one step
k_2 = model.fit_predict(pca_df_2015_2016)

# Copy of pca_df_2015_2016 with the cluster labels attached as a "segments" column
pca_predictions_df_2015_2016 = pca_df_2015_2016.assign(segments=k_2)

In [27]:
# Scatter the two components, grouping points by their assigned cluster
pca_predictions_df_2015_2016.hvplot.scatter(x="PCA1", y="PCA2", by="segments")