# Lookalike Model

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity

<b>Importing Data</b>

In [3]:
# Load the three raw input tables (customers, products, transactions) from the
# current working directory.
cdf, pdf, tdf = (
    pd.read_csv(csv_name)
    for csv_name in ("Customers.csv", "Products.csv", "Transactions.csv")
)

<b>Data Preprocessing</b>

In [4]:
# Build one feature row per customer: aggregated purchase behaviour plus a
# one-hot encoded region, with the numeric columns standardized.

# Parse signup dates so `cdf` carries a real datetime column (the column is
# dropped from the feature table further down).
cdf["SignupDate"] = pd.to_datetime(cdf["SignupDate"])

# Per-customer purchase behaviour aggregated from the transaction log.
customer_transactions = tdf.groupby("CustomerID").agg(
    TotalSpent=("TotalValue", "sum"),
    AvgTransactionValue=("TotalValue", "mean"),
    PurchaseCount=("TransactionID", "count"),
    UniqueProducts=("ProductID", "nunique")
).reset_index()

numerical_features = ["TotalSpent", "AvgTransactionValue", "PurchaseCount", "UniqueProducts"]

# The left join keeps customers that have no transactions; their aggregates
# come back as NaN and are set to 0. Only the aggregate columns are filled:
# a blanket fillna(0) would also write integer 0 into missing Region /
# CustomerName / SignupDate values and corrupt the categorical encoding below.
customer_profiles = pd.merge(
    cdf, customer_transactions, on="CustomerID", how="left"
).fillna(dict.fromkeys(numerical_features, 0))

# One-hot encode Region, dropping the first category to avoid a redundant column.
# The encoder is left at its default (sparse) output and densified with
# .toarray(): the `sparse=` keyword was removed in scikit-learn 1.4 and its
# replacement `sparse_output=` does not exist before 1.2, so this form runs on both.
encoder = OneHotEncoder(drop="first")
region_encoded = encoder.fit_transform(customer_profiles[["Region"]]).toarray()
region_df = pd.DataFrame(
    region_encoded,
    columns=encoder.get_feature_names_out(["Region"]),
    index=customer_profiles.index,  # explicit alignment for the concat below
)
customer_profiles = pd.concat([customer_profiles, region_df], axis=1).drop(
    columns=["Region", "CustomerName", "SignupDate"]
)

# Standardize the numeric features (zero mean, unit variance) so that no single
# column dominates the similarity computation because of its scale.
scaler = StandardScaler()
customer_profiles[numerical_features] = scaler.fit_transform(customer_profiles[numerical_features])



<b>Customer Profile Attributes:</b>
    
TotalSpent → Sum of all transactions for a customer. <br>
AvgTransactionValue → Average amount spent per transaction. <br>
PurchaseCount → Total number of transactions made by the customer. <br>
UniqueProducts → Number of different products purchased by the customer.

Standardizing the values ensures that all numerical features are on the same scale, so that no single feature dominates the similarity calculation.

<b>Similarity Matrix</b>

In [5]:
# Every column except the identifier is a feature; rows stay in the same order
# as `customer_profiles`, so row/column i of the similarity matrix refers to
# the i-th customer of that frame.
feature_frame = customer_profiles.set_index("CustomerID")
customer_matrix = feature_frame.values

# Pairwise cosine similarity between all customer feature vectors.
similarity_matrix = cosine_similarity(customer_matrix)

Convert the processed DataFrame into a feature matrix. <br>
Compute the cosine similarity, which measures how similar two customers are based on their profiles. <br>
It works well for sparse data. <br>
It measures how closely the directions of two customer profile vectors align in a multi-dimensional space. <br>

<b>Top 3 similar customers</b>

In [6]:
# For each of the first 20 customers, collect the 3 most similar *other*
# customers as (CustomerID, similarity score rounded to 4 decimals) pairs.
customer_ids = customer_profiles["CustomerID"].values
lookalike_dict = {}

for idx, cust_id in enumerate(customer_ids[:20]):  # First 20 customers
    # Drop the customer's own entry explicitly instead of assuming it sorts
    # first: another customer with an identical profile (or a score that is
    # marginally higher due to floating-point rounding) can outrank the
    # self-match, which would put the customer in its own lookalike list.
    candidates = [
        (other_idx, score)
        for other_idx, score in enumerate(similarity_matrix[idx])
        if other_idx != idx
    ]
    # sorted() is stable, so ties keep their original row order.
    top_matches = sorted(candidates, key=lambda pair: pair[1], reverse=True)[:3]
    lookalike_dict[cust_id] = [
        (customer_ids[other_idx], round(score, 4)) for other_idx, score in top_matches
    ]


<b>Results to Lookalike.csv</b>

In [7]:
# Flatten {customer: [(lookalike, score), ...]} into one row per
# (customer, lookalike) pair and write the table to disk.
lookalike_df = pd.DataFrame(
    data=[
        (source_id, match_id, match_score)
        for source_id, matches in lookalike_dict.items()
        for match_id, match_score in matches
    ],
    columns=["CustomerID", "LookalikeID", "SimilarityScore"],
)
lookalike_df.to_csv("Lookalike.csv", index=False)

In [9]:
# Preview the first five rows of the exported lookalike table.
lookalike_df.head(5)

Unnamed: 0,CustomerID,LookalikeID,SimilarityScore
0,C0001,C0137,0.9998
1,C0001,C0152,0.9995
2,C0001,C0107,0.9654
3,C0002,C0043,0.987
4,C0002,C0142,0.9769
