In [1]:
from collections import Counter
from itertools import combinations
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from matplotlib import pyplot as plt

In [2]:
# Folder that holds the three input CSVs. Kept in one constant so the notebook
# can be pointed at another location by editing a single line.
DATA_DIR = Path(r"C:\Users\drish\Downloads")

# Raw inputs: customer master data, product catalogue, and transaction log.
df_customer = pd.read_csv(DATA_DIR / "Customers.csv")
df_products = pd.read_csv(DATA_DIR / "Products.csv")
df_transactions = pd.read_csv(DATA_DIR / "Transactions.csv")

In [3]:
# Attach product attributes to every transaction row. Both inputs carry a
# `Price` column, so pandas suffixes them as `Price_x` (transactions) and
# `Price_y` (products). `validate` makes the join fail loudly if a ProductID
# appears more than once in the product table, which would otherwise silently
# duplicate transactions and inflate every downstream count and sum.
customer_transactions = df_transactions.merge(
    df_products,
    on='ProductID',
    how='left',
    validate='many_to_one',
)

In [4]:
# Display the merged transaction/product frame as the cell output for a visual sanity check.
customer_transactions

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,ProductName,Category,Price_y
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68
...,...,...,...,...,...,...,...,...,...,...
995,T00496,C0118,P037,2024-10-24 08:30:27,1,459.86,459.86,SoundWave Smartwatch,Electronics,459.86
996,T00759,C0059,P037,2024-06-04 02:15:24,3,1379.58,459.86,SoundWave Smartwatch,Electronics,459.86
997,T00922,C0018,P037,2024-04-05 13:05:32,4,1839.44,459.86,SoundWave Smartwatch,Electronics,459.86
998,T00959,C0115,P037,2024-09-29 10:16:02,2,919.72,459.86,SoundWave Smartwatch,Electronics,459.86


In [5]:
def _top_category(categories):
    """Return the most frequent category, or NaN when no category is known."""
    modes = categories.mode()
    # `mode()` drops NaN, so it is empty when every product lookup missed.
    return modes.iloc[0] if not modes.empty else np.nan


# One row per customer. Spend figures are taken from `TotalValue`
# (price * quantity per transaction); `Price_x` is only the unit price and
# would ignore how many units were bought. Named aggregation ties each output
# column to its source explicitly instead of relying on positional renaming.
customer_data = (
    customer_transactions
    .groupby('CustomerID')
    .agg(
        PurchaseFrequency=('ProductID', 'count'),   # number of purchase rows
        AvgSpend=('TotalValue', 'mean'),            # mean value per transaction
        TotalSpend=('TotalValue', 'sum'),           # total value across transactions
        TopCategory=('Category', _top_category),    # most purchased category
    )
    .reset_index()
    # Bring in the customer master columns.
    .merge(df_customer, on='CustomerID', how='left')
)


In [6]:
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Replace the category labels in `TopCategory` with integer codes.
encoder = LabelEncoder()
encoder.fit(customer_data['TopCategory'])
customer_data['TopCategory'] = encoder.transform(customer_data['TopCategory'])

# Standardise the numeric behaviour features so they share a common scale.
numerical_cols = ['PurchaseFrequency', 'AvgSpend', 'TotalSpend']
scaler = StandardScaler()
scaler.fit(customer_data[numerical_cols])
customer_data[numerical_cols] = scaler.transform(customer_data[numerical_cols])


In [7]:
from sklearn.metrics.pairwise import cosine_similarity

# Columns that describe a customer for the lookalike comparison.
features = ['PurchaseFrequency', 'AvgSpend', 'TotalSpend', 'TopCategory']

# `TopCategory` is a nominal label: feeding a single code column into cosine
# similarity would treat arbitrary category codes as magnitudes. One-hot
# encode it so each category contributes its own 0/1 dimension. Passing
# `columns=` forces the encoding whether the column holds strings or integers.
feature_matrix = pd.get_dummies(
    customer_data[features],
    columns=['TopCategory'],
    dtype=float,
)

# Pairwise customer-to-customer similarity, labelled by CustomerID on both axes.
similarity_matrix = cosine_similarity(feature_matrix)
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=customer_data['CustomerID'],
    columns=customer_data['CustomerID'],
)


In [8]:
def get_top_n_similar(customers, similarity_matrix, n=3):
    """Find the ``n`` most similar *other* customers for each given customer.

    Parameters
    ----------
    customers : iterable
        Customer labels to look up; each must be a column of
        ``similarity_matrix``.
    similarity_matrix : pandas.DataFrame
        Pairwise similarity scores with customer labels on both the index
        and the columns.
    n : int, default 3
        Number of lookalikes to return per customer.

    Returns
    -------
    dict
        Maps each customer to a list of ``(other_customer, score)`` tuples,
        ordered from most to least similar.

    Raises
    ------
    KeyError
        If a requested customer is not a column of ``similarity_matrix``.
    """
    lookalike_map = {}
    for customer in customers:
        # Remove the customer's own entry by label. Skipping "the first row
        # after sorting" is unsafe: with ties or floating-point rounding the
        # self-score is not guaranteed to sort first, which would return the
        # customer as its own lookalike and drop a genuine match.
        scores = similarity_matrix[customer].drop(labels=customer, errors='ignore')
        # Stable sort keeps tie ordering deterministic between runs.
        top_matches = scores.sort_values(ascending=False, kind='mergesort').head(n)
        lookalike_map[customer] = list(zip(top_matches.index, top_matches.values))
    return lookalike_map
# Look up the three closest lookalikes for the first 20 rows of customer_data.
customer_ids = customer_data['CustomerID'].iloc[:20].values
lookalike_map = get_top_n_similar(customer_ids, similarity_df, n=3)

# One row per customer; the `Lookalikes` cell holds a list of (id, score) pairs.
lookalike_df = pd.DataFrame([
    {
        'CustomerID': cust_id,
        # Cast scores to built-in floats: the list is stringified when written
        # to CSV, and NumPy 2 scalar reprs would otherwise end up in the file
        # as `np.float64(...)` instead of plain numbers.
        'Lookalikes': [(other_id, float(score)) for other_id, score in lookalikes],
    }
    for cust_id, lookalikes in lookalike_map.items()
])
lookalike_df.to_csv('Lookalike.csv', index=False)
