In [1]:
# Initial imports
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

In [7]:
# Load the cleaned shopping data.
# The path is built with pathlib.Path (imported in the first cell) instead of
# a mid-notebook `import os`, so all dependencies stay declared in one place.
file_path = Path("Resources") / "shopping_data_cleaned.csv"

shopping_df = pd.read_csv(file_path)

In [8]:
# Standardize the data with scikit-learn's StandardScaler
scaler = StandardScaler()
shopping_scaled = scaler.fit_transform(shopping_df)

# Preview the first five standardized rows
print(shopping_scaled[:5])

[[ 1.12815215 -1.42456879 -1.73899919 -0.43480148]
 [ 1.12815215 -1.28103541 -1.73899919  1.19570407]
 [-0.88640526 -1.3528021  -1.70082976 -1.71591298]
 [-0.88640526 -1.13750203 -1.70082976  1.04041783]
 [-0.88640526 -0.56336851 -1.66266033 -0.39597992]]


In [11]:
# Reduce the standardized data to two principal components
pca = PCA(n_components=2)
shopping_pca = pca.fit_transform(shopping_scaled)

# Wrap the component scores in a labelled DataFrame
two_pc_columns = ["principal component 1", "principal component 2"]
df_shopping_pca = pd.DataFrame(shopping_pca, columns=two_pc_columns)

df_shopping_pca.head()

Unnamed: 0,principal component 1,principal component 2
0,-0.406383,-0.520714
1,-1.427673,-0.36731
2,0.050761,-1.894068
3,-1.694513,-1.631908
4,-0.313108,-1.810483


In [12]:
# Fetch the explained variance ratio of the 2-component model:
# one value per principal component, each the share of total variance it captures
pca.explained_variance_ratio_

array([0.33690046, 0.26230645])

**Sample Analysis**

According to the explained variance, the first principal component contains `33.7%` of the variance and the second principal component contains `26.2%` of the variance. Together, the two components retain only `59.9%` of the information in the original dataset, so we will check whether increasing the number of principal components to 3 increases the explained variance.

In [13]:
# Initialize PCA model for 3 principal components
pca_alt = PCA(n_components=3)

# Get three principal components for the standardized shopping data.
shopping_pca_alt = pca_alt.fit_transform(shopping_scaled)


In [15]:
# Wrap the three component scores in a labelled DataFrame
three_pc_columns = [
    "principal component 1",
    "principal component 2",
    "principal component 3",
]
df_alt_shopping_pca = pd.DataFrame(shopping_pca_alt, columns=three_pc_columns)

df_alt_shopping_pca.head()

Unnamed: 0,principal component 1,principal component 2,principal component 3
0,-0.406383,-0.520714,-2.072527
1,-1.427673,-0.36731,-2.277644
2,0.050761,-1.894068,-0.367375
3,-1.694513,-1.631908,-0.717467
4,-0.313108,-1.810483,-0.42646


In [16]:
# Fetch the explained variance ratio of the 3-component model:
# one value per principal component, each the share of total variance it captures
pca_alt.explained_variance_ratio_

array([0.33690046, 0.26230645, 0.23260639])

**Sample Analysis**

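With three principal components, we retain `83.2%` of the information in the original dataset (`33.7%` + `26.2%` + `23.3%`). We therefore conclude that three principal components preserve most of the information in the original data, and we use them for the clustering below.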

In [17]:
# Columns holding the PCA features. Selecting them explicitly keeps the
# "class" column added below out of the model inputs, so re-running this cell
# always fits on the same three features.
feature_columns = [
    "principal component 1",
    "principal component 2",
    "principal component 3",
]

# Initialize the K-Means model
# TODO: confirm the number of clusters (3 vs. 6), e.g. with an elbow curve.
model = KMeans(n_clusters=3, random_state=42)

# Fit the model and get the cluster label of every row in a single pass
predictions = model.fit_predict(df_alt_shopping_pca[feature_columns])

# Add the predicted class column
df_alt_shopping_pca["class"] = predictions
df_alt_shopping_pca.head()

Unnamed: 0,principal component 1,principal component 2,principal component 3,class
0,-0.406383,-0.520714,-2.072527,1
1,-1.427673,-0.36731,-2.277644,1
2,0.050761,-1.894068,-0.367375,2
3,-1.694513,-1.631908,-0.717467,1
4,-0.313108,-1.810483,-0.42646,2


In [21]:
# BONUS: 3-D scatter of the three principal components,
# coloured and marked by the "class" column.
# plotly is imported here because only this optional cell uses it.
import plotly.express as px

axis_columns = {
    "x": "principal component 3",
    "y": "principal component 2",
    "z": "principal component 1",
}
fig = px.scatter_3d(
    df_alt_shopping_pca,
    color="class",
    symbol="class",
    width=800,
    **axis_columns,
)

# Position the legend at x=0, y=1
fig.update_layout(legend={"x": 0, "y": 1})
fig.show()