# In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Load the three input tables from the current working directory.
customers = pd.read_csv("Customers.csv")
products = pd.read_csv("Products.csv")
transactions = pd.read_csv("Transactions.csv")

# Attach customer and product columns to every transaction row. Both merges
# use pandas' default inner join, so rows without a match on the key are dropped.
merged_data = pd.merge(
    pd.merge(transactions, customers, on="CustomerID"),
    products,
    on="ProductID",
)

# Per-customer aggregates: summed TotalValue and number of distinct transactions.
per_customer = merged_data.groupby("CustomerID")
total_spending = per_customer["TotalValue"].sum()
transaction_count = per_customer["TransactionID"].nunique()

# One-hot encode Category per transaction row, then sum per customer so each
# column holds how many of that customer's rows fall in the category.
product_categories = pd.get_dummies(
    merged_data[["CustomerID", "Category"]],
    columns=["Category"],
)
category_features = product_categories.groupby("CustomerID").sum()

# One-hot encode Region straight from the customers table, indexed by customer.
region_features = pd.get_dummies(customers.set_index("CustomerID")["Region"])

# Align all feature pieces on CustomerID; customers missing from a piece
# (e.g. present in Customers.csv but absent from merged_data) get 0.
profile_parts = [total_spending, transaction_count, category_features, region_features]
customer_profile = pd.concat(profile_parts, axis=1).fillna(0)
customer_profile.columns = [
    "TotalSpending",
    "TransactionCount",
    *category_features.columns,
    *region_features.columns,
]

# Standardise every feature column before comparing customers.
scaler = StandardScaler()
customer_profile_scaled = scaler.fit_transform(customer_profile)

# Pairwise cosine similarity between customers, labelled by CustomerID on both axes.
similarity_matrix = cosine_similarity(customer_profile_scaled)
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=customer_profile.index,
    columns=customer_profile.index,
)

def get_top_similar(customers_df, target_customer, top_n=3):
    """Return the ``top_n`` customers most similar to ``target_customer``.

    Args:
        customers_df: Similarity table. The row labelled ``target_customer``
            is read; its column labels are the candidate customer IDs and
            its values are the similarity scores.
        target_customer: Label of the row to look up.
        top_n: Maximum number of look-alikes to return. Values ``<= 0``
            yield an empty list.

    Returns:
        A list of ``(customer_id, score)`` tuples ordered from highest to
        lowest score. ``target_customer`` itself is never included, and the
        scores are built-in ``float`` objects.

    Raises:
        KeyError: If ``target_customer`` is not a row label of ``customers_df``.
    """
    if top_n <= 0:
        return []

    scores = customers_df.loc[target_customer]

    # Exclude the target by label rather than by assuming it sorts first:
    # a tie at the top, floating-point error in the self-similarity, or an
    # all-zero feature vector (self-similarity 0) can put another customer
    # ahead of it, and positional slicing would then return the target as its
    # own look-alike while dropping a real neighbour.
    candidates = scores.drop(labels=target_customer, errors="ignore")

    # A stable sort keeps tied candidates in their original column order,
    # which makes the result deterministic.
    top_customers = candidates.sort_values(ascending=False, kind="stable").head(top_n)

    # Convert to built-in floats so that stringifying the result (e.g. when it
    # is written to CSV) does not embed NumPy scalar reprs.
    return [(cust_id, float(score)) for cust_id, score in top_customers.items()]

# Customers whose ID compares (as a string) between 'C0001' and 'C0020' inclusive.
target_ids = customers.query("CustomerID >= 'C0001' and CustomerID <= 'C0020'")["CustomerID"]

# Map each selected customer to its list of (look-alike ID, score) pairs.
lookalike_map = {}
for customer in target_ids:
    lookalike_map[customer] = get_top_similar(similarity_df, customer)

# One record per customer; the list of pairs is stored in a single column.
lookalike_data = [
    {"CustomerID": cust_id, "Lookalikes": matches}
    for cust_id, matches in lookalike_map.items()
]
lookalike_df = pd.DataFrame(lookalike_data)

# Write the result without the positional index column.
lookalike_df.to_csv("Dhruv_Baheti_Lookalike.csv", index=False)