# Project 3 - Variation

In this code, we first import the necessary libraries, including pandas for data manipulation, numpy for numerical operations, matplotlib for data visualization, and scikit-learn for machine learning.

In [3]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

Next, we load the Online Retail dataset (published in the UCI Machine Learning Repository) from a local Excel file and prepare it for clustering: we drop rows with missing values, remove duplicate records, and convert the categorical `Description` column into numerical indicator columns using one-hot encoding. We then standardize the resulting features with `StandardScaler` to eliminate any differences in scale or range between the variables, and use PCA to project them onto two components.

In [4]:
# Read the Online Retail workbook into a DataFrame and preview its first rows
data_path = "data//Online_Retail.xlsx"
df = pd.read_excel(data_path)
print(df.head())

  InvoiceNo StockCode                          Description  Quantity  \
0    536365    85123A   WHITE HANGING HEART T-LIGHT HOLDER         6   
1    536365     71053                  WHITE METAL LANTERN         6   
2    536365    84406B       CREAM CUPID HEARTS COAT HANGER         8   
3    536365    84029G  KNITTED UNION FLAG HOT WATER BOTTLE         6   
4    536365    84029E       RED WOOLLY HOTTIE WHITE HEART.         6   

          InvoiceDate  UnitPrice  CustomerID         Country  
0 2010-12-01 08:26:00       2.55     17850.0  United Kingdom  
1 2010-12-01 08:26:00       3.39     17850.0  United Kingdom  
2 2010-12-01 08:26:00       2.75     17850.0  United Kingdom  
3 2010-12-01 08:26:00       3.39     17850.0  United Kingdom  
4 2010-12-01 08:26:00       3.39     17850.0  United Kingdom  


In [14]:
# Data cleaning and preparation:
# first discard rows that contain a missing value in any column,
# then discard rows that exactly repeat an earlier row.
df = df.dropna().drop_duplicates()

KeyboardInterrupt: 

In [6]:
# One-hot encode the categorical columns listed below; every other column
# of df is passed through unchanged.
categorical_columns = ['Description']
df = pd.get_dummies(df, columns=categorical_columns)

In [15]:
# Sanity check: list every column that still contains a missing value,
# together with its count of missing entries (an empty Series means none remain).
has_missing = df.isna().any()
null_columns = df.columns[has_missing]
missing_counts = df[null_columns].isna().sum()
print(missing_counts)

Series([], dtype: float64)


In [16]:
# Standardize the feature columns so differences in scale or range between
# variables do not dominate the clustering.
# The columns listed here are excluded from the feature matrix; everything
# else in df is scaled.
non_feature_columns = [
    'InvoiceNo',
    'StockCode',
    'InvoiceDate',
    'Quantity',
    'UnitPrice',
    'CustomerID',
    'Country',
]
scaler = StandardScaler()
# The reduced frame is passed inline so no extra reference to it is kept alive.
df_scaled = scaler.fit_transform(df.drop(columns=non_feature_columns))

: 

: 

In [None]:
# PCA to reduce dimensionality of dataset
# Project the standardized features onto their first two principal components.
# random_state is fixed (same seed as the KMeans models in this notebook) because
# scikit-learn may choose a randomized SVD solver for large inputs; without a
# seed the 2-D projection, and every result derived from it, can change between runs.
pca = PCA(n_components=2, random_state=42)
df_pca = pca.fit_transform(df_scaled)

To perform clustering analysis, we use the k-means clustering algorithm, which partitions the data into k distinct clusters based on the similarity of the data points. We evaluate the performance of the algorithm using the silhouette score, which measures the similarity of a data point to its own cluster compared to other clusters, and ranges from -1 to 1, with higher values indicating better clustering.

In [None]:
# Model Training and Evaluation
# Fit k-means for each candidate k on the PCA-reduced data and record the
# silhouette score of each clustering.
silhouette_scores = []
k_values = range(2, 11)

# silhouette_score needs pairwise distances between the rows it scores, so its
# cost grows quadratically with the number of rows. Score a fixed-size random
# subsample (seeded, so the scores are reproducible) instead of every row.
silhouette_sample_size = min(10000, len(df_pca))

for k in k_values:
    # n_init is set explicitly because its default differs between
    # scikit-learn releases; pinning it keeps results stable across versions.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    labels = kmeans.fit_predict(df_pca)
    silhouette_scores.append(
        silhouette_score(
            df_pca,
            labels,
            sample_size=silhouette_sample_size,
            random_state=42,
        )
    )

# Plot the silhouette scores to determine the optimal number of clusters
plt.plot(k_values, silhouette_scores)
plt.xlabel('Number of clusters (k)')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Scores for k-means Clustering')
plt.show()

# Fit the k-means clustering algorithm with the optimal number of clusters:
# the k with the highest silhouette score computed above (ties resolve to the
# smallest k), rather than a hard-coded value.
best_k = k_values[silhouette_scores.index(max(silhouette_scores))]
print('Selected number of clusters (k):', best_k)
kmeans = KMeans(n_clusters=best_k, n_init=10, random_state=42)
labels = kmeans.fit_predict(df_pca)

Finally, we visualize the results of the clustering analysis using scatter plots, with each data point colored according to its assigned cluster. We identify customer segments based on the cluster means, and these segments can be used to develop targeted marketing strategies and improve customer satisfaction.

In [None]:
# Visualization of clustering results
# Scatter the two PCA components, coloring each point by its assigned cluster.
plt.scatter(df_pca[:, 0], df_pca[:, 1], c=labels)
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.title('Customer Segmentation using k-means Clustering')
plt.show()

# Identify customer segments
# Attach each row's cluster label to df, then average the columns per cluster.
df['Cluster'] = labels
# numeric_only=True: df still contains text columns (e.g. StockCode, Country);
# averaging those raises TypeError on pandas >= 2.0, so restrict the mean to
# numeric columns.
cluster_means = df.groupby('Cluster').mean(numeric_only=True)
print(cluster_means)