In [None]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import numpy as np

In [None]:
# Read the three input tables from CSV files in the current working directory.
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ("Customers.csv", "Products.csv", "Transactions.csv")
)

In [None]:
# Step 1: Attach customer attributes, then product attributes, to every transaction row.
# Both joins are inner joins (the pandas default), so a transaction whose CustomerID or
# ProductID has no match in the lookup table is dropped.
customer_transactions = pd.merge(
    pd.merge(transactions, customers, on="CustomerID", how="inner"),
    products,
    on="ProductID",
    how="inner",
)

In [None]:
# Collapse the merged transaction rows into one profile row per customer.
# NOTE(review): 'Price_y' is a merge-suffixed column; presumably the price coming from
# the right-hand table of the join that had a duplicate 'Price' column — confirm.
customer_profiles = (
    customer_transactions
    .groupby('CustomerID')
    .agg(
        TotalValue=('TotalValue', 'sum'),        # total revenue generated by the customer
        Quantity=('Quantity', 'sum'),            # total quantity purchased by the customer
        Price_y=('Price_y', 'mean'),             # average price over the customer's rows
        ProductID=('ProductID', 'nunique'),      # number of distinct products purchased
        Category=('Category', lambda cats: ','.join(cats)),  # comma-joined category per row
    )
    .reset_index()  # turn CustomerID back into a regular column (RangeIndex rows)
)

In [None]:
# Build one 0/1 indicator column per product category for each customer:
# join every customer's categories into a comma-separated string, then split that
# string back into indicator columns. The result is indexed by CustomerID.
category_dummies = (
    customer_transactions
    .groupby('CustomerID')['Category']
    .apply(lambda cats: ','.join(cats))
    .str.get_dummies(sep=',')
)

In [None]:
# Combine numeric features and encoded categories into one row per customer.
#
# pd.concat(axis=1) aligns on the index. customer_profiles carries a RangeIndex
# (it was reset_index()'d) while category_dummies is indexed by CustomerID, so
# concatenating them directly would outer-join two disjoint indexes and yield
# twice as many rows, each half filled with NaN. Re-key the dummies to the
# profile row order first so both frames share the same index.
numeric_features = customer_profiles[['TotalValue', 'Quantity', 'Price_y', 'ProductID']]
category_features = (
    category_dummies
    .reindex(customer_profiles['CustomerID'].to_numpy())  # same customer order as the profiles
    .set_axis(numeric_features.index, axis=0)             # same row labels as the profiles
)
customer_features = pd.concat([numeric_features, category_features], axis=1)

# Guard against any future misalignment: exactly one feature row per customer.
assert len(customer_features) == len(customer_profiles), "feature rows must match customers 1:1"

In [None]:
# Any missing value in the feature table becomes 0 so the scaler and the
# similarity computation receive a fully numeric, NaN-free matrix.
customer_features_filled = customer_features.fillna(value=0)

In [None]:
# Standardize every feature column (the numeric aggregates and the category
# indicators alike) before computing similarities.
scaler = StandardScaler()
scaler.fit(customer_features_filled)
customer_features_normalized = scaler.transform(customer_features_filled)

In [None]:
# Step 2: Pairwise cosine similarity between all customer feature rows.
# Entry [i, j] is the similarity between row i and row j of the feature matrix.
similarity_matrix = cosine_similarity(X=customer_features_normalized)

In [None]:
# Step 3: Set up the lookup of top 3 similar customers for the first 20 customers (C0001-C0020)
top_lookalikes = dict()  # CustomerID -> list of (similar CustomerID, similarity score)
customer_ids = customer_profiles.loc[:, 'CustomerID']

In [None]:
for idx, customer_id in enumerate(customer_ids[:20]):
    # Similarity of the current customer to every customer (including itself)
    scores = similarity_matrix[idx]
    # Rank all customers from most to least similar
    ranked = np.argsort(scores)[::-1]
    # Drop the customer itself by position rather than assuming it sorts first:
    # a tie (e.g. an identical feature vector) or floating-point rounding can put
    # another customer at rank 0, which would otherwise leave the customer in its
    # own lookalike list.
    similar_customers = ranked[ranked != idx][:3]
    # Store the top 3 similar customers with their scores; .iloc makes the lookup
    # positional so it matches the row positions of the similarity matrix.
    top_lookalikes[customer_id] = [
        (customer_ids.iloc[sim_idx], scores[sim_idx]) for sim_idx in similar_customers
    ]

In [None]:
# Step 4: Save results to a CSV file
# Accumulator for one flat record per (customer, similar customer) pair.
lookalike_results = list()

In [None]:
# Flatten the per-customer lookalike lists into one record per (customer, lookalike) pair.
lookalike_results.extend(
    {"CustomerID": cust_id, "SimilarCustomerID": similar_cust_id, "Score": score}
    for cust_id, lookalikes in top_lookalikes.items()
    for similar_cust_id, score in lookalikes
)

In [None]:
# Tabulate the flat records and write them out without the row index column.
lookalike_df = pd.DataFrame(data=lookalike_results)
lookalike_df.to_csv(path_or_buf="Lookalike.csv", index=False)

In [None]:
# Confirm to the reader that the output file has been written.
status_message = "Lookalike model results saved to Lookalike.csv"
print(status_message)