In [17]:
import pandas as pd
import csv

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics.pairwise import cosine_similarity


In [3]:
# Load datasets
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')

# Merge transactions with customer and product data
merged = pd.merge(transactions, customers, on='CustomerID', how='left')
merged = pd.merge(merged, products, on='ProductID', how='left')

In [5]:
merged.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,CustomerName,Region,SignupDate,ProductName,Category,Price_y
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
2,T00166,C0127,P067,2024-04-25 7:38:55,1,300.68,300.68,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,Travis Campbell,South America,2024-04-11,ComfortLiving Bluetooth Speaker,Electronics,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Timothy Perez,Europe,2022-03-15,ComfortLiving Bluetooth Speaker,Electronics,300.68


In [6]:
# Features from transactions
transaction_features = transactions.groupby('CustomerID').agg(
    total_transactions=('TransactionID', 'count'),
    total_spent=('TotalValue', 'sum'),
    avg_quantity=('Quantity', 'mean'),
    unique_products=('ProductID', 'nunique')
).reset_index()

In [7]:
# Features from merged data (product categories)
category_counts = merged.groupby(['CustomerID', 'Category']).size().unstack(fill_value=0)
category_features = category_counts.div(category_counts.sum(axis=1), axis=0)  # Normalize

In [8]:
# Combine all features
customer_features = pd.merge(
    customers[['CustomerID', 'Region', 'SignupDate']],
    transaction_features,
    on='CustomerID'
)
customer_features = pd.merge(
    customer_features,
    category_features,
    on='CustomerID'
)

In [9]:
# Add tenure (days since signup)
customer_features['SignupDate'] = pd.to_datetime(customer_features['SignupDate'])
customer_features['tenure_days'] = (pd.Timestamp.now() - customer_features['SignupDate']).dt.days

In [10]:
# Drop non-numeric columns for similarity calculation
customer_features_final = customer_features.drop(['CustomerID', 'SignupDate'], axis=1)

In [11]:
customer_features_final.head()

Unnamed: 0,Region,total_transactions,total_spent,avg_quantity,unique_products,Books,Clothing,Electronics,Home Decor,tenure_days
0,South America,5,3354.52,2.4,5,0.2,0.0,0.6,0.2,934
1,Asia,4,1862.74,2.5,4,0.0,0.5,0.0,0.5,1081
2,South America,4,2725.38,3.5,4,0.0,0.25,0.25,0.5,328
3,South America,8,5354.88,2.875,8,0.375,0.0,0.25,0.375,843
4,Asia,3,2034.24,2.333333,3,0.0,0.0,0.666667,0.333333,898


In [13]:
# Define columns to transform
numerical_cols = ['total_transactions', 'total_spent', 'avg_quantity', 'unique_products', 'tenure_days']
categorical_cols = ['Region']

# Preprocessor pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(), categorical_cols)
    ],
    remainder='passthrough'
)

# Apply preprocessing
customer_features_processed = preprocessor.fit_transform(customer_features_final)

In [15]:
# Compute pairwise similarity matrix
similarity_matrix = cosine_similarity(customer_features_processed)

# Map CustomerID to indices
customer_id_to_index = {cust_id: idx for idx, cust_id in enumerate(customer_features['CustomerID'])}

### For the first 20 customers (C0001-C0020), find top 3 lookalikes.

In [16]:
lookalike_map = {}

# Target customers (first 20)
target_customers = customer_features['CustomerID'].iloc[:20]

for target_cust in target_customers:
    target_idx = customer_id_to_index[target_cust]
    
    # Get similarity scores for the target customer
    sim_scores = list(enumerate(similarity_matrix[target_idx]))
    
    # Sort and exclude self
    sorted_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:4]  # Top 3 excluding self
    
    # Get CustomerIDs and scores
    lookalikes = []
    for idx, score in sorted_scores:
        lookalike_cust = customer_features.iloc[idx]['CustomerID']
        lookalikes.append((lookalike_cust, round(score, 3)))
    
    lookalike_map[target_cust] = lookalikes

In [19]:
with open('Ayusman_Pradhan_Lookalike.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['CustomerID', 'Lookalike1_ID', 'Lookalike1_Score', 
                     'Lookalike2_ID', 'Lookalike2_Score', 'Lookalike3_ID', 'Lookalike3_Score'])
    
    for cust_id, lookalikes in lookalike_map.items():
        row = [cust_id]
        for lookalike in lookalikes:
            row.extend([lookalike[0], lookalike[1]])
        writer.writerow(row)

- Many recommendations have scores > 0.9 (e.g., C0004 and C0113 at 0.967), indicating strong behavioral similarities.
- Some customers (e.g., C0006 with C0190 at 0.731) have unique behaviors or may be outliers.