# Lookalike Model

In [11]:
import pandas as pd
import numpy as np
import json
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity

<b>Importing Data</b>

In [3]:
# Load the three input tables (customers, products, transactions) from the
# working directory in a single pass.
cdf, pdf, tdf = (
    pd.read_csv(csv_name)
    for csv_name in ("Customers.csv", "Products.csv", "Transactions.csv")
)

<b>Data Preprocessing</b>

In [4]:
# Parse signup dates into a proper datetime dtype.
cdf["SignupDate"] = pd.to_datetime(cdf["SignupDate"])

# One row per customer summarising their transaction history.
customer_transactions = tdf.groupby("CustomerID").agg(
    TotalSpent=("TotalValue", "sum"),
    AvgTransactionValue=("TotalValue", "mean"),
    PurchaseCount=("TransactionID", "count"),
    UniqueProducts=("ProductID", "nunique")
).reset_index()

numerical_features = ["TotalSpent", "AvgTransactionValue", "PurchaseCount", "UniqueProducts"]

# The left join keeps customers that have no transactions. Only their aggregate
# columns are zero-filled, so a missing Region/CustomerName is not silently
# rewritten as the number 0.
customer_profiles = pd.merge(cdf, customer_transactions, on="CustomerID", how="left")
customer_profiles[numerical_features] = customer_profiles[numerical_features].fillna(0)

# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword, so try the current spelling first and fall back for older installs.
try:
    encoder = OneHotEncoder(sparse_output=False, drop="first")
except TypeError:
    encoder = OneHotEncoder(sparse=False, drop="first")
region_encoded = encoder.fit_transform(customer_profiles[["Region"]])
region_df = pd.DataFrame(
    region_encoded,
    columns=encoder.get_feature_names_out(["Region"]),
    index=customer_profiles.index,  # align explicitly with the frame it is concatenated to
)
customer_profiles = pd.concat([customer_profiles, region_df], axis=1).drop(columns=["Region", "CustomerName", "SignupDate"])

# Standardise the numeric features so no single one dominates the similarity score.
scaler = StandardScaler()
customer_profiles[numerical_features] = scaler.fit_transform(customer_profiles[numerical_features])



<b>Customer Profile Attributes:</b>
    
TotalSpent → Sum of all transactions for a customer. <br>
AvgTransactionValue → Average amount spent per transaction. <br>
PurchaseCount → Total number of transactions made by the customer. <br>
UniqueProducts → Number of different products purchased by the customer.

Standardizing the values ensures that all numerical features are on the same scale, which helps the similarity calculations.

<b>Similarity Matrix</b>

In [5]:
# Move CustomerID into the index so only the feature columns remain as values.
profile_features = customer_profiles.set_index("CustomerID")
customer_matrix = profile_features.values
# Pairwise cosine similarity between every pair of customer rows.
similarity_matrix = cosine_similarity(customer_matrix)

The processed DataFrame is converted into a numeric matrix with one row per customer. <br>
Cosine similarity is then computed between every pair of rows, measuring how similar two customers are based on their profiles. <br>
It works well for sparse data. <br>
It measures how close customer profiles are in a multi-dimensional space. <br>

<b>Top 3 similar customers</b>

In [16]:
# How many leading customers to score, and how many lookalikes to keep for each.
N_TARGET_CUSTOMERS = 20
TOP_K = 3

customer_ids = customer_profiles["CustomerID"].values
lookalike_list = []

for idx, cust_id in enumerate(customer_ids[:N_TARGET_CUSTOMERS]):
    # Exclude the customer's own entry by position rather than assuming it sorts
    # first: an identical profile (or float rounding) can tie or beat the
    # self-similarity, and skipping "the first result" would then report the
    # customer as their own lookalike while dropping a real neighbour.
    sim_scores = [
        (i, score)
        for i, score in enumerate(similarity_matrix[idx])
        if i != idx
    ]
    # sorted() is stable, so ties keep their original row order.
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[:TOP_K]
    # float() keeps the scores JSON-serialisable whatever the matrix dtype is.
    similar_customers = [[customer_ids[i], round(float(score), 4)] for i, score in sim_scores]

    lookalike_list.append([cust_id, json.dumps(similar_customers)])


<b>Results to Lookalike.csv</b>

In [17]:
# One row per scored customer: the ID plus its JSON-encoded list of lookalikes.
output_columns = ["CustomerID", "Lookalikes"]
lookalike_df = pd.DataFrame(data=lookalike_list, columns=output_columns)

# Persist the table without the positional index column.
lookalike_df.to_csv("Lookalike.csv", index=False)

In [18]:
# Preview the first rows of the table that was written to Lookalike.csv.
lookalike_df.head()

Unnamed: 0,CustomerID,Lookalikes
0,C0001,"[[""C0137"", 0.9998], [""C0152"", 0.9995], [""C0107..."
1,C0002,"[[""C0043"", 0.987], [""C0142"", 0.9769], [""C0097""..."
2,C0003,"[[""C0133"", 0.9886], [""C0052"", 0.9427], [""C0112..."
3,C0004,"[[""C0108"", 0.9864], [""C0113"", 0.9743], [""C0155..."
4,C0005,"[[""C0159"", 0.9993], [""C0123"", 0.9986], [""C0178..."
