In [1]:
# Import the modules
import pandas as pd
import hvplot.pandas
from pathlib import Path
from sklearn.cluster import KMeans

### Read in the CSV file and prepare the Pandas DataFrame

In [2]:
# Load the customers CSV from the Resources folder into a DataFrame
customers_csv_path = Path("../Resources/customers.csv")
customers_transformed_df = pd.read_csv(customers_csv_path)

# Preview the first rows to confirm the data loaded as expected
customers_transformed_df.head()

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10
0,1.148534,4.606077,2.699069,-2.661824,1.526433,1.236671,0.211421,1.482896,-4.445627,-1.936831
1,-1.14941,-1.650549,2.530167,-3.227088,0.572138,4.1626,-0.291679,-1.237575,3.604765,-1.635689
2,0.332427,-0.887985,-0.309216,0.399891,0.828492,3.641945,-0.916946,-1.978024,1.056772,-1.882747
3,2.245599,3.826309,0.264039,0.095471,1.98438,0.373991,-0.280279,1.602786,-5.993331,-2.258925
4,0.705503,-1.312329,0.895406,-0.405408,1.116187,3.699562,-1.427985,-1.494409,1.156908,-1.434964


### Step 1: Use PCA to reduce the dimensionality of the transformed customers DataFrame to 2 principal components

In [3]:
# Import the PCA module
from sklearn.decomposition import PCA

In [4]:
# Build a PCA estimator that keeps two principal components
n_pca_components = 2
pca = PCA(n_components=n_pca_components)

In [5]:
# Fit the PCA model on the customers DataFrame and project its rows onto the 2 principal components
customers_pca = pca.fit_transform(customers_transformed_df)

# Review the first 5 rows of the transformed array (two columns, one per principal component)
customers_pca[:5]

array([[-4.72382358, -0.60489964],
       [ 5.85571568, -1.98331135],
       [ 2.43063042, -3.15456594],
       [-6.96050326, -1.35772617],
       [ 2.47746793, -3.29412896]])

### Step 2: Using the `explained_variance_ratio_` attribute of the fitted PCA model, calculate the percentage of the total variance that is captured by the two PCA variables.

In [6]:
# Show the fraction of the total variance explained by each principal component.
# The values are per component, so add them together for the combined share.
pca.explained_variance_ratio_

array([0.55083554, 0.30256389])

**Question:** What is the explained variance ratio captured by the two PCA variables?
    
**Answer:** Together, the two PCA variables capture about 85% of the total variance (roughly 55.1% from PCA1 and 30.3% from PCA2).

### Step 3: Using the `customers_pca` data, create a Pandas DataFrame called `customers_pca_df`. The columns of the DataFrame should be called "PCA1" and "PCA2".

In [7]:
# Wrap the PCA output array in a DataFrame with one named column per component
pca_column_names = ["PCA1", "PCA2"]
customers_pca_df = pd.DataFrame(data=customers_pca, columns=pca_column_names)

# Preview the first rows of the PCA DataFrame
customers_pca_df.head()

Unnamed: 0,PCA1,PCA2
0,-4.723824,-0.6049
1,5.855716,-1.983311
2,2.43063,-3.154566
3,-6.960503,-1.357726
4,2.477468,-3.294129


### Step 4: Using the `customers_pca_df` DataFrame, utilize the elbow method to determine the optimal value of k.

In [8]:
# Candidate cluster counts to evaluate (k = 1 through 10)
k = [*range(1, 11)]

# Accumulator for the inertia measured at each value of k
inertia = []

In [9]:
# Evaluate the K-means algorithm for every candidate value of k,
# fitting each model on the PCA-reduced customers DataFrame.
# The list is emptied first so that re-running this cell does not append a
# second set of values, which would leave `inertia` longer than `k`.
inertia.clear()
for i in k:
    # A fixed random_state keeps the centroid initialisation reproducible
    k_model = KMeans(n_clusters=i, random_state=0)
    k_model.fit(customers_pca_df)
    # Record the fitted model's inertia (within-cluster sum of squared distances)
    inertia.append(k_model.inertia_)



In [10]:
# Pair each value of k with the inertia measured for it
elbow_data = dict(k=k, inertia=inertia)

# Build a two-column DataFrame from those pairs for plotting
df_elbow = pd.DataFrame(data=elbow_data)

# Preview the first rows
df_elbow.head()

Unnamed: 0,k,inertia
0,1,49585.714978
1,2,23750.95547
2,3,8773.367304
3,4,6840.237425
4,5,5378.566007


In [12]:
# Plot inertia against k; the point where the curve stops dropping sharply
# (the "elbow") suggests the number of clusters to use
df_elbow.hvplot.line(
    x="k",
    y="inertia",
    title="Elbow Curve",
    xticks=k
)

### Step 5: Segment the `customers_pca_df`  DataFrame using the K-means algorithm.

In [12]:
# Define the K-means model using the optimal value of k (3, where the elbow curve bends) for the number of clusters.
# The keyword is `n_clusters` (plural); `n_cluster` is not a KMeans parameter and raises a TypeError.
model = KMeans(n_clusters=3, random_state=0)

# Fit the model


# Make predictions


# Create a copy of the customers_pca_df DataFrame


# Add a class column with the labels


In [13]:
# Plot the clusters


### Step 6: Segment the `customers_transformed_df` DataFrame with all factors using the K-means algorithm

In [14]:
# Define the model Kmeans model using k=3 clusters


# Fit the model


# Make predictions


# Create a copy of the customers_transformed_df DataFrame


# Add a class column with the labels


In [15]:
# Plot the clusters using the first two feature columns


### Step 7: What is the difference between the segmentation results of the PCA DataFrame and the full-factored DataFrame?

**Answer:** It appears that segmenting the customers using the DataFrame with all of the factors yielded results similar to those of the PCA analysis. The data is grouped into clear customer segments in both cases.