# Import software libraries

In [None]:
# Import required libraries.
import sys                                                  # Read system parameters.
import pandas as pd                                         # Manipulate and analyze data.
import matplotlib                                           # Create and format charts.
import matplotlib.pyplot as plt  
import seaborn as sns                                       # Make charting easier.
import yellowbrick                                          # Visualize elbow and silhouette plots.
from yellowbrick.cluster import SilhouetteVisualizer
import sklearn                                              # Train and evaluate machine learning models.
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import pickle                                               # Save Python objects as binary files.
from collections import Counter
import warnings                                             # Suppress warnings.
warnings.filterwarnings('ignore')

# Report the interpreter and library versions this notebook runs against.
# A single print call with a newline separator emits one line per entry.
print(
    'Libraries used in this project:',
    '- Python {}'.format(sys.version),
    '- pandas {}'.format(pd.__version__),
    '- Matplotlib {}'.format(matplotlib.__version__),
    '- Seaborn {}'.format(sns.__version__),
    '- Yellowbrick {}'.format(yellowbrick.__version__),
    '- scikit-learn {}'.format(sklearn.__version__),
    sep='\n',
)

# Read and examine the data

In [None]:
# Read the data.



# Preview the first five rows of the data.



In [None]:
# Get summary statistics for the dataset.
# Count, mean, standard deviation, minimum, maximum, etc.



In [None]:
# Check the structure of the data.



In [None]:
# Get the shape of the data.



# Train a *k*-means clustering model

In [None]:
# Standardize the training data.




In [None]:
# Identify the optimal number of clusters for segmenting customers.

def optimal_number_clusters(data_scaled, max_k=29, random_state=None):
    """Plot an elbow curve to help choose the number of k-means clusters.

    Fits one ``KMeans`` model for every k from 1 through ``max_k`` and plots
    each model's inertia (sum of squared distances) against k. The "elbow"
    of the resulting curve suggests a reasonable number of clusters.

    Args:
        data_scaled: Data passed unchanged to ``KMeans.fit``.
        max_k: Largest number of clusters to try. The default of 29 keeps
            the range of 1 through 29 clusters.
        random_state: Forwarded to ``KMeans``. Leave as ``None`` for the
            library's default seeding, or pass an int for a repeatable curve.

    Returns:
        None. The chart is displayed with ``plt.show()``.

    Raises:
        ValueError: If ``max_k`` is less than 1, since there would be
            nothing to fit or plot.
    """
    if max_k < 1:
        raise ValueError('max_k must be at least 1, got {}'.format(max_k))

    cluster_counts = range(1, max_k + 1)

    # One fitted model per candidate k; only its inertia is kept.
    sum_of_squared_distances = [
        KMeans(n_clusters=k, random_state=random_state).fit(data_scaled).inertia_
        for k in cluster_counts
    ]

    plt.figure(figsize=(20, 6))

    plt.plot(cluster_counts, sum_of_squared_distances, 'bx-')
    plt.xlabel('k')
    plt.ylabel('Sum of Squared Distances')
    plt.title('Elbow Method For Optimal k')
    plt.show()

In [None]:
# Call the function to cluster the scaled data and generate an elbow point.



# Perform silhouette analysis on the clustering model

In [None]:
# Use SilhouetteVisualizer() to generate silhouette plots and scores for a model with 10 clusters.





In [None]:
# Print the number of clusters and the silhouette score.




In [None]:
# Use SilhouetteVisualizer() to generate silhouette plots and scores for a model with 5 clusters.





In [None]:
# Print the number of clusters and the silhouette score.




# Train an optimal clustering model

In [None]:
# Define the optimal number of clusters as identified by the silhouette analysis.



# Generate a k-means clustering model using this optimal number of clusters and fit it on the data.




In [None]:
# Determine the clusters for the users.



# Print a DataFrame that shows each customer and their assigned cluster.




# Obtain the first five rows.



# Evaluate the clustering model as a whole

In [None]:
# Generate a bar chart that shows how many users were assigned to each cluster.






In [None]:
# Get the count of customers in each cluster.



# Evaluate summary statistics for individual clusters

In [None]:
# Get summary statistics for cluster 0.



In [None]:
# Get the average amount spent by cluster 0.



In [None]:
# Get summary statistics for cluster 3.



In [None]:
# Get the average amount spent by cluster 3.



In [None]:
# Get summary statistics for cluster 4.



In [None]:
# Get the average amount spent by cluster 4.



# Evaluate the distribution of features for individual clusters

In [None]:
# Generate violin subplots for the distribution of the "recency" feature for each of the clusters.










In [None]:
# Generate violin subplots for the distribution of the "frequency" feature for each of the clusters.








In [None]:
# Generate violin subplots for the distribution of the "monetary_value" feature for each of the clusters.








# Perform PCA to visualize the clusters in two dimensions

In [None]:
# This function performs principal component analysis (PCA) to facilitate the visualization of clustering in two dimensions.

def visualize(y_kmeans, data_scaled, n_clusters):
    """Plot the clustered customers on their first two principal components.

    Runs PCA on ``data_scaled`` to reduce it to two dimensions, pairs each
    reduced row with its cluster label, and draws a scatter plot coloured
    by cluster.

    Args:
        y_kmeans: One cluster label per row of ``data_scaled``, in the same
            row order. Labels are matched to rows by position, so any index
            carried by a pandas object is ignored.
        data_scaled: Feature data passed to ``PCA.fit_transform``.
        n_clusters: Number of colours taken from the 'Set1' palette.

    Returns:
        None. The chart is displayed with ``plt.show()``.

    Raises:
        ValueError: If the number of labels differs from the number of rows
            in ``data_scaled`` (raised by pandas on column assignment).
    """
    reduced_data = PCA(n_components=2).fit_transform(data_scaled)
    results = pd.DataFrame(reduced_data, columns=['pca1', 'pca2'])

    # Attach labels positionally. Concatenating along axis=1 would align on
    # index and misplace labels (filling NaN) when y_kmeans is a Series or
    # DataFrame whose index is not 0..n-1, and a named Series would not end
    # up in a column called 'cluster'.
    results['cluster'] = pd.DataFrame(y_kmeans).iloc[:, 0].to_numpy()

    cmap = sns.color_palette('Set1', n_colors=n_clusters, desat=.5)

    sns.scatterplot(x='pca1', y='pca2', hue='cluster', data=results,
                    palette=cmap, legend=False)
    plt.title('k-means Clustering with 2 Dimensions')
    plt.show()

In [None]:
# Call the function to perform PCA and plot the clusters.



# Save the optimal model

In [None]:
# Save the optimal model as a pickle file named optimal_clustering_model.pickle.

