In [2]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.neighbors import NearestNeighbors
from datetime import datetime

# Load datasets
customers = pd.read_csv('dataset/Customers.csv')
products = pd.read_csv('dataset/Products.csv')
transactions = pd.read_csv('dataset/Transactions.csv')

# Merge datasets
# Step 1: Merge transactions with customers on CustomerID
data = pd.merge(transactions, customers, on='CustomerID')

# Step 2: Merge the result with products on ProductID
data = pd.merge(data, products, on='ProductID')

# Handle duplicate column names after merge
# Rename Price_x to Price and drop Price_y
data.rename(columns={"Price_x": "Price"}, inplace=True)
data.drop(columns=["Price_y"], inplace=True)

# Feature Engineering
# 1. Customer tenure (days since signup)
data['SignupDate'] = pd.to_datetime(data['SignupDate'])
data['Tenure'] = (datetime.now() - data['SignupDate']).dt.days

# 2. Aggregate transaction data for customer-level features
customer_features = data.groupby('CustomerID').agg({
    'Tenure': 'first',  # Tenure is the same for all rows of a customer
    'Region': 'first',  # Region is the same for all rows of a customer
    'TotalValue': ['sum', 'mean'],  # Total and average spending
    'Quantity': 'sum',  # Total quantity purchased
    'Category': lambda x: x.mode()[0],  # Most frequent category
    'Price': 'mean'  # Average product price
}).reset_index()

# Flatten multi-index columns
customer_features.columns = [
    'CustomerID', 'Tenure', 'Region', 
    'TotalSpending', 'AvgSpending', 
    'TotalQuantity', 'FavoriteCategory', 
    'AvgProductPrice'
]

# 3. Encode categorical variables (Region and FavoriteCategory)
encoder = OneHotEncoder(drop='first', sparse_output=False)  # Use dense output
encoded_features = encoder.fit_transform(customer_features[['Region', 'FavoriteCategory']])

# Get feature names for encoded columns
encoded_columns = encoder.get_feature_names_out(['Region', 'FavoriteCategory'])

# Create DataFrame for encoded features
encoded_df = pd.DataFrame(encoded_features, columns=encoded_columns)

# Combine encoded features with numerical features
customer_features_final = pd.concat([customer_features.drop(['Region', 'FavoriteCategory'], axis=1), encoded_df], axis=1)

# 4. Normalize numerical features
scaler = MinMaxScaler()
customer_features_scaled = scaler.fit_transform(customer_features_final.iloc[:, 1:])

# Train KNN Model
knn_model = NearestNeighbors(metric='cosine', n_neighbors=4)  # 4 because we exclude the customer itself
clusters = knn_model.fit(customer_features_scaled)

# Lookalike Recommendations
lookalikes = {}
for i, cust_id in enumerate(customer_features_final['CustomerID']):
    if cust_id in [f'C{str(i).zfill(4)}' for i in range(1, 21)]:  # Only for C0001 to C0020
        distances, indices = knn_model.kneighbors([customer_features_scaled[i]])
        similar_customers = [(customer_features_final['CustomerID'][j], 1 - distances[0][k])  # Convert distance to similarity
                             for k, j in enumerate(indices[0]) if j != i][:3]  # Exclude self and take top 3
        lookalikes[cust_id] = similar_customers

# Save to CSV
lookalike_df = pd.DataFrame(lookalikes.items(), columns=['CustomerID', 'Lookalikes'])
lookalike_df.to_csv('Lookalike.csv', index=False)

# Print results
print(lookalike_df[:10])

  CustomerID                                         Lookalikes
0      C0001  [(C0184, 0.9938610436936237), (C0192, 0.993439...
1      C0002  [(C0134, 0.9888261684811294), (C0106, 0.983359...
2      C0003  [(C0052, 0.9939611009462402), (C0031, 0.990825...
3      C0004  [(C0169, 0.9917355195559103), (C0165, 0.991216...
4      C0005  [(C0007, 0.9950379241347769), (C0140, 0.968600...
5      C0006  [(C0126, 0.9887368103028111), (C0187, 0.985585...
6      C0007  [(C0005, 0.9950379241347769), (C0140, 0.974502...
7      C0008  [(C0189, 0.9807242348140186), (C0065, 0.969508...
8      C0009  [(C0061, 0.9747244536021684), (C0062, 0.972511...
9      C0010  [(C0062, 0.9921708629227519), (C0103, 0.971164...


In [3]:
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score, silhouette_score

db_index = davies_bouldin_score(customer_features_scaled, clusters)
print(f"Davies-Bouldin Index: {db_index}")

# Step 3: Calculate Silhouette Score
silhouette_avg = silhouette_score(customer_features_scaled, clusters)
print(f"Silhouette Score: {silhouette_avg}")

InvalidParameterError: The 'labels' parameter of davies_bouldin_score must be an array-like. Got NearestNeighbors(metric='cosine', n_neighbors=4) instead.