In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Location of the Online Retail CSV in the project repository.
# NOTE(review): this URL carries a `token=` query parameter, i.e. a hardcoded
# credential. It looks like a temporary raw-content access token, so the link
# may already have expired -- confirm, and prefer a token-free public URL or a
# configured local path instead of committing the token.
url = "https://raw.githubusercontent.com/getnetbogale27/Baacumen-Unsupervised-Learning-Project/refs/heads/main/Dataset/Online_Retail_dataset.csv?token=GHSAT0AAAAAACY5XTINSQMUTIPBIXAA4JHWZYRHDCA"

# Read the CSV file from the URL; `data` is kept as the untouched raw table.
data = pd.read_csv(url)

# Every later cell cleans and aggregates a frame called `df`. It was never
# defined, so a fresh "Restart & Run All" failed with NameError. Create it here
# as an independent copy so the cleaning steps cannot alter `data`.
df = data.copy()

# Display the first few rows of the raw dataset as a rich table
data.head()

In [None]:
# Parse the invoice timestamps so date arithmetic works later on
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])

# Keep a row only when it can be attributed to a customer ...
has_customer = df['CustomerID'].notna()

# ... and is a genuine sale: strictly positive quantity and unit price
# (non-positive values are returns or data-entry errors)
is_valid_sale = df['Quantity'].gt(0) & df['UnitPrice'].gt(0)

df = df[has_customer & is_valid_sale]


In [None]:
# Revenue per invoice line: units sold multiplied by the price of one unit
df = df.assign(TotalPrice=lambda lines: lines['Quantity'] * lines['UnitPrice'])


In [None]:
# Reference point for recency: the most recent invoice timestamp in the data
latest_date = df['InvoiceDate'].max()

# Per-customer aggregates. Named aggregation labels each output column
# explicitly, so the result does not depend on the positional order of a
# renaming list, and the built-in 'max' avoids a Python lambda per group.
customer_stats = df.groupby('CustomerID').agg(
    LastPurchase=('InvoiceDate', 'max'),   # most recent purchase per customer
    Frequency=('InvoiceNo', 'nunique'),    # count of unique invoices
    Monetary=('TotalPrice', 'sum'),        # total spending
)

# Create the RFM table: Recency is the number of whole days between the
# customer's last purchase and latest_date, computed for all customers at once.
rfm = (
    customer_stats
    .assign(Recency=(latest_date - customer_stats['LastPurchase']).dt.days)
    .reset_index()
    .reindex(columns=['CustomerID', 'Recency', 'Frequency', 'Monetary'])
)

# Preview the RFM table (bare expression -> rich notebook display)
rfm.head()


In [None]:
# Build the clustering features in a separate frame rather than overwriting
# rfm['Monetary']. Overwriting made this cell non-idempotent (each re-run
# applied log1p again) and left log values in the column later summarised and
# exported as "Monetary". `rfm` now keeps the raw monetary totals.
rfm_features = rfm[['Recency', 'Frequency', 'Monetary']].assign(
    # log1p reduces the skewness of total spending before scaling
    Monetary=lambda frame: np.log1p(frame['Monetary'])
)

# Standardize the features (zero mean, unit variance per column)
scaler = StandardScaler()

# Convert back to a DataFrame, keeping rfm's row index so cluster labels
# computed from rfm_scaled line up with the rows of rfm
rfm_scaled = pd.DataFrame(
    scaler.fit_transform(rfm_features),
    columns=['Recency', 'Frequency', 'Monetary'],
    index=rfm.index,
)


In [None]:
# Elbow method to determine optimal number of clusters:
# fit K-means for k = 1..10 and record the within-cluster sum of squares.
cluster_counts = range(1, 11)
wcss = []
for k in cluster_counts:
    # n_init is set explicitly because its default differs between
    # scikit-learn releases; pinning it keeps the curve comparable across
    # environments and avoids the default-change warning.
    kmeans = KMeans(n_clusters=k, init='k-means++', n_init=10, random_state=42)
    kmeans.fit(rfm_scaled)
    wcss.append(kmeans.inertia_)

# Plot the Elbow curve on an explicit Axes
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(cluster_counts, wcss, marker='o')
ax.set_title('Elbow Method')
ax.set_xlabel('Number of Clusters')
ax.set_ylabel('WCSS')
plt.show()


In [None]:
# Apply K-means with the chosen number of clusters (e.g., 3).
# n_init is pinned so the segmentation does not depend on the scikit-learn
# version's default number of initialisations.
kmeans = KMeans(n_clusters=3, init='k-means++', n_init=10, random_state=42)

# Label each customer; rows of rfm_scaled correspond to rows of rfm
rfm['Cluster'] = kmeans.fit_predict(rfm_scaled)

# Preview the RFM table with cluster assignments (rich notebook display)
rfm.head()


In [None]:
# Scatter plot of Frequency vs Monetary by Cluster
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(
    data=rfm, x='Frequency', y='Monetary', hue='Cluster', palette='viridis', ax=ax
)
ax.set_title('Customer Segments by Frequency and Monetary')
plt.show()

# Pairplot of the RFM data by cluster. `vars` limits the grid to the three RFM
# measures; without it the CustomerID identifier is plotted as if it were a
# feature, adding a meaningless row and column to the grid.
sns.pairplot(
    rfm,
    vars=['Recency', 'Frequency', 'Monetary'],
    hue='Cluster',
    palette='coolwarm',
)
plt.show()


In [None]:
# Summary statistics for each cluster: mean Recency, Frequency and Monetary.
# Only the RFM measures are averaged; CustomerID is an identifier, so its mean
# carries no information and is left out.
cluster_summary = rfm.groupby('Cluster')[['Recency', 'Frequency', 'Monetary']].mean()

# Bare expression -> rich notebook display of the summary table
cluster_summary


In [None]:
# Persist the per-customer RFM table (with any columns added above) to disk,
# leaving the DataFrame's row index out of the file
rfm.to_csv(path_or_buf='customer_segments.csv', index=False)
