# Project 3: Clustering Analysis for Customer Segmentation - Unsupervised Learning


## Step 1: Load the data

In [2]:
import pandas as pd

# Source workbook: the "Online Retail" transactions hosted by the
# UCI Machine Learning Repository (fetched over the network on every run).
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00352/Online%20Retail.xlsx"

# Read the workbook straight from the URL into a DataFrame.
df = pd.read_excel(io=url)

# Preview the first five rows as a sanity check on the load.
print(df.head(5))

  InvoiceNo StockCode                          Description  Quantity  \
0    536365    85123A   WHITE HANGING HEART T-LIGHT HOLDER         6   
1    536365     71053                  WHITE METAL LANTERN         6   
2    536365    84406B       CREAM CUPID HEARTS COAT HANGER         8   
3    536365    84029G  KNITTED UNION FLAG HOT WATER BOTTLE         6   
4    536365    84029E       RED WOOLLY HOTTIE WHITE HEART.         6   

          InvoiceDate  UnitPrice  CustomerID         Country  
0 2010-12-01 08:26:00       2.55     17850.0  United Kingdom  
1 2010-12-01 08:26:00       3.39     17850.0  United Kingdom  
2 2010-12-01 08:26:00       2.75     17850.0  United Kingdom  
3 2010-12-01 08:26:00       3.39     17850.0  United Kingdom  
4 2010-12-01 08:26:00       3.39     17850.0  United Kingdom  


## Step 2: Data Cleaning and Preparation

In [3]:
from sklearn.preprocessing import StandardScaler

# Clean a copy instead of mutating the loaded frame in place:
# exact duplicate rows and rows with any missing field are discarded.
transactions = df.drop_duplicates().dropna()

# Keep only rows with a positive quantity and a positive unit price.
# NOTE(review): non-positive values are presumably returns/cancellations or
# manual adjustments (the dataset description marks cancelled invoices with a
# leading 'C' in InvoiceNo) -- confirm that excluding them is what you want.
# Left in, they would distort the per-customer spend totals computed below.
is_sale = (transactions['Quantity'] > 0) & (transactions['UnitPrice'] > 0)
transactions = transactions.loc[is_sale].assign(
    TotalPrice=lambda t: t['Quantity'] * t['UnitPrice']
)

# Customer segmentation needs one row per customer, not one row per invoice
# line. The previous approach one-hot encoded every product description
# (thousands of dense columns) and then handed StandardScaler the string
# columns StockCode and Country, which cannot be converted to floats.
# Instead, summarise each customer with three numeric features:
#   Recency   - days between the customer's last purchase and the snapshot date
#   Frequency - number of distinct invoices
#   Monetary  - total amount spent (Quantity * UnitPrice summed over all lines)
# The snapshot date is one day after the latest invoice so Recency is >= 1.
snapshot_date = transactions['InvoiceDate'].max() + pd.Timedelta(days=1)
per_customer = transactions.groupby('CustomerID').agg(
    LastPurchase=('InvoiceDate', 'max'),
    Frequency=('InvoiceNo', 'nunique'),
    Monetary=('TotalPrice', 'sum'),
)
per_customer['Recency'] = (snapshot_date - per_customer['LastPurchase']).dt.days

feature_columns = ['Recency', 'Frequency', 'Monetary']

# From here on `df` is the customer-level table (CustomerID + features), so
# that the cluster labels assigned in the modeling step line up row-for-row
# with `df_scaled`. Re-run Step 1 before re-running this cell, because `df`
# no longer holds the raw transactions afterwards.
df = per_customer[feature_columns].reset_index()

# Standardize the features (zero mean, unit variance) so that no single
# feature dominates the Euclidean distances used by k-means.
scaler = StandardScaler()
df_scaled = pd.DataFrame(
    scaler.fit_transform(df[feature_columns]),
    columns=feature_columns,
    index=df.index,
)


## Step 3: Modeling

In [None]:
# Apply k-means clustering algorithm
from sklearn.cluster import KMeans

# n_clusters=4 is a fixed choice; nothing in this notebook derives it from the
# data. n_init is pinned explicitly because its default differs between
# scikit-learn releases (10 in older versions, 'auto' in newer ones), which
# would otherwise make the resulting labels depend on the installed version
# even with a fixed random_state.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)

# Fit on the standardized features and get one cluster label per row.
clusters = kmeans.fit_predict(df_scaled)

# Attach the labels to `df`; this relies on `df` and `df_scaled` having the
# same number of rows in the same order.
df['Cluster'] = clusters

## Step 4: Evaluation

In [None]:
# Evaluate cluster quality with the mean silhouette coefficient.
from sklearn.metrics import silhouette_score

# Scores the standardized feature matrix against the labels produced by the
# modeling step; values range from -1 to 1, higher meaning better separation.
score = silhouette_score(X=df_scaled, labels=clusters)

print("Silhouette Score:", score)
