In [17]:
# Import the modules
import pandas as pd
import hvplot.pandas
from pathlib import Path
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [18]:
# Load the suicides dataset from its CSV file
suicides_csv_path = Path("output/suicides.csv")
suicides_df = pd.read_csv(suicides_csv_path)

# Preview the first rows to confirm the load
suicides_df.head()

Unnamed: 0,country,year,sex,age,suicides_no,population,suicides_per_100k_pop,gdp_for_year_USD,gdp_per_capita_USD,generation
0,Albania,1987,male,15-24 years,21,312900,6.71,2156625000.0,796,Generation X
1,Albania,1987,male,35-54 years,16,308000,5.19,2156625000.0,796,Silent
2,Albania,1987,female,15-24 years,14,289700,4.83,2156625000.0,796,Generation X
3,Albania,1987,male,75+ years,1,21800,4.59,2156625000.0,796,G.I. Generation
4,Albania,1987,male,25-34 years,9,274300,3.28,2156625000.0,796,Boomers


In [19]:
# Instantiate the PCA instance and declare the number of PCA variables.
# random_state is fixed so the projection is reproducible across
# Restart & Run All: depending on the data shape and scikit-learn version,
# the default svd_solver="auto" may select the randomized solver, whose
# output otherwise varies from run to run.
pca = PCA(n_components=4, random_state=42)

In [20]:
# One-hot encode each categorical column of suicides_df into its own frame.
# The order of the column names below matches the order of the targets.
country_dummies, sex_dummies, age_dummies, generation_dummies = (
    pd.get_dummies(suicides_df[categorical_column])
    for categorical_column in ("country", "sex", "age", "generation")
)

In [22]:
# Standardize the numeric columns only; the categorical columns are
# one-hot encoded in a separate cell.
numeric_columns = [
    "year",
    "suicides_no",
    "population",
    "suicides_per_100k_pop",
    "gdp_for_year_USD",
    "gdp_per_capita_USD",
]

suicides_scaled = StandardScaler().fit_transform(suicides_df[numeric_columns])

# Rebuild a labelled DataFrame from the scaled array. Reusing the source
# index keeps every row aligned with other frames derived from suicides_df
# when they are joined column-wise, even if suicides_df does not have a
# default 0..n-1 index.
suicides_scaled_df = pd.DataFrame(
    suicides_scaled,
    columns=numeric_columns,
    index=suicides_df.index,
)

suicides_scaled_df.head()

Unnamed: 0,year,suicides_no,population,suicides_per_100k_pop,gdp_for_year_USD,gdp_per_capita_USD
0,-1.683615,-0.245639,-0.391617,-0.322032,-0.305056,-0.850864
1,-1.683615,-0.251182,-0.39287,-0.402196,-0.305056,-0.850864
2,-1.683615,-0.2534,-0.397548,-0.421182,-0.305056,-0.850864
3,-1.683615,-0.267811,-0.466035,-0.433839,-0.305056,-0.850864
4,-1.683615,-0.258943,-0.401485,-0.502928,-0.305056,-0.850864


In [23]:
# Combine suicides_scaled_df with dummied data.
# This cell overwrites suicides_scaled_df, so it must be safe to re-run:
# any dummy columns appended by a previous execution are dropped first
# (errors="ignore" makes that a no-op on the first run). Without this,
# every re-run would append a duplicate set of dummy columns.
dummy_frames = [country_dummies, sex_dummies, age_dummies, generation_dummies]
dummy_columns = [column for frame in dummy_frames for column in frame.columns]

suicides_scaled_df = pd.concat(
    [suicides_scaled_df.drop(columns=dummy_columns, errors="ignore"), *dummy_frames],
    axis=1,
)

suicides_scaled_df.head()

Unnamed: 0,year,suicides_no,population,suicides_per_100k_pop,gdp_for_year_USD,gdp_per_capita_USD,Albania,Antigua and Barbuda,Argentina,Armenia,...,35-54 years,5-14 years,55-74 years,75+ years,Boomers,G.I. Generation,Generation X,Generation Z,Millenials,Silent
0,-1.683615,-0.245639,-0.391617,-0.322032,-0.305056,-0.850864,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,-1.683615,-0.251182,-0.39287,-0.402196,-0.305056,-0.850864,1,0,0,0,...,1,0,0,0,0,0,0,0,0,1
2,-1.683615,-0.2534,-0.397548,-0.421182,-0.305056,-0.850864,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,-1.683615,-0.267811,-0.466035,-0.433839,-0.305056,-0.850864,1,0,0,0,...,0,0,0,1,0,1,0,0,0,0
4,-1.683615,-0.258943,-0.401485,-0.502928,-0.305056,-0.850864,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [24]:
# Fit the PCA model on the scaled, one-hot-encoded suicides DataFrame
# and project every row onto the fitted components
suicides_scaled_pca = pca.fit_transform(suicides_scaled_df)

# Show the first 5 rows of the resulting array
suicides_scaled_pca[0:5]

array([[-0.89885927,  1.40103509, -0.83341488, -0.64322269],
       [-0.87230007,  1.40136504, -0.71682841, -0.57184156],
       [-0.99148661,  1.12817893, -1.34259586, -0.64964462],
       [-0.98966892,  1.45836478, -0.70672251, -0.78083316],
       [-0.90896755,  1.3557918 , -0.89112841, -0.60248865]])