In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

Load the datasets

In [22]:
# Read the three source tables from CSV files in the working directory.
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ("Customers.csv", "Products.csv", "Transactions.csv")
)

Convert dates to datetime format

In [3]:
# Parse the date columns into datetime values, overwriting each column in place.
for frame, date_column in ((customers, "SignupDate"), (transactions, "TransactionDate")):
    frame[date_column] = pd.to_datetime(frame[date_column])

In [4]:
# Attach customer attributes, then product attributes, to every transaction.
# Both joins are inner joins, so a transaction without a matching customer or
# product row is dropped from the combined table.
transactions_with_customers = transactions.merge(customers, on="CustomerID", how="inner")
data = transactions_with_customers.merge(products, on="ProductID", how="inner")

Feature Engineering

In [5]:
# Transaction values grouped per customer; shared by the two monetary features.
value_by_customer = data.groupby("CustomerID")["TotalValue"]

# 1. Total amount spent by each customer
customer_total_value = value_by_customer.sum().rename("TotalTransactionValue")

# 2. Mean value of a customer's transactions (average order value)
customer_aov = value_by_customer.mean().rename("AverageOrderValue")

# 3. Units bought per product category: one column per category,
#    0 where a customer bought nothing in that category
customer_category_preference = data.pivot_table(
    index="CustomerID",
    columns="Category",
    values="Quantity",
    aggfunc="sum",
    fill_value=0,
)


def _mean_gap_days(purchase_dates):
    """Return the mean gap between consecutive purchases, in whole days.

    Yields NaN when there are fewer than two purchases, since no gap exists.
    """
    if len(purchase_dates) <= 1:
        return np.nan
    gaps = np.diff(sorted(purchase_dates))
    # `.days` keeps only the whole-day part of the mean gap.
    return np.mean(gaps).days


# 4. Purchase cadence: average number of days between consecutive purchases
customer_purchase_dates = data.groupby("CustomerID")["TransactionDate"].agg(list)
customer_avg_days_between = customer_purchase_dates.apply(_mean_gap_days).rename(
    "AvgDaysBetweenPurchases"
)

Combine all features into a single DataFrame

In [6]:
# Place the per-customer feature blocks side by side; every block is indexed by
# CustomerID, so rows are aligned on that index.
feature_blocks = [
    customer_total_value,
    customer_aov,
    customer_category_preference,
    customer_avg_days_between,
]
features = pd.concat(feature_blocks, axis=1)

In [9]:
# Replace every missing value with 0. AvgDaysBetweenPurchases is NaN for
# customers with a single purchase, because no gap can be computed for them.
# NOTE(review): 0 also reads as "buys again the same day" -- confirm that this
# is the intended encoding for one-time buyers.
features = features.fillna(0)

In [13]:
# Summarise the combined feature table: index range, column dtypes and non-null counts.
features.info()

<class 'pandas.core.frame.DataFrame'>
Index: 199 entries, C0001 to C0200
Data columns (total 7 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   TotalTransactionValue    199 non-null    float64
 1   AverageOrderValue        199 non-null    float64
 2   Books                    199 non-null    int64  
 3   Clothing                 199 non-null    int64  
 4   Electronics              199 non-null    int64  
 5   Home Decor               199 non-null    int64  
 6   AvgDaysBetweenPurchases  199 non-null    float64
dtypes: float64(3), int64(4)
memory usage: 20.5+ KB


In [14]:
# Show summary statistics (count, mean, std, min, quartiles, max) for every feature column.
features.describe()

Unnamed: 0,TotalTransactionValue,AverageOrderValue,Books,Clothing,Electronics,Home Decor,AvgDaysBetweenPurchases
count,199.0,199.0,199.0,199.0,199.0,199.0,199.0
mean,3467.314372,687.580182,3.422111,2.964824,3.150754,3.211055,62.40201
std,1832.677958,237.936649,3.068694,3.083644,3.072752,3.064241,41.989287
min,82.36,82.36,0.0,0.0,0.0,0.0,0.0
25%,2162.04,542.941667,1.0,0.0,0.0,0.0,37.0
50%,3137.66,677.2075,3.0,3.0,3.0,3.0,56.0
75%,4770.225,828.624167,5.0,4.0,4.5,5.0,79.5
max,10673.87,1323.133333,15.0,15.0,14.0,14.0,295.0


In [10]:
# Put all features on a comparable scale before measuring similarity, so that
# large-valued columns such as TotalTransactionValue do not dominate.
# The scaler is fitted first and then applied to the same feature table.
scaler = StandardScaler().fit(features)
features_scaled = scaler.transform(features)

Calculate pairwise cosine similarity

In [12]:
# Cosine similarity between every pair of rows of the scaled feature matrix;
# the result is a square, symmetric array with ones on the diagonal.
similarity_matrix = cosine_similarity(features_scaled)
# Bare expression so the notebook displays the (truncated) array.
similarity_matrix

array([[ 1.        , -0.2261447 ,  0.19356658, ..., -0.03250585,
         0.31855736, -0.71427799],
       [-0.2261447 ,  1.        ,  0.6299443 , ...,  0.31646807,
         0.49314288, -0.12392431],
       [ 0.19356658,  0.6299443 ,  1.        , ...,  0.13734155,
         0.59004509, -0.01361518],
       ...,
       [-0.03250585,  0.31646807,  0.13734155, ...,  1.        ,
         0.60329767, -0.39673553],
       [ 0.31855736,  0.49314288,  0.59004509, ...,  0.60329767,
         1.        , -0.55068698],
       [-0.71427799, -0.12392431, -0.01361518, ..., -0.39673553,
        -0.55068698,  1.        ]])

Function to find the top 3 lookalike customers

In [15]:
# Find the customers most similar to a given customer
def get_top_lookalikes(customer_id, similarity_matrix, customer_ids, top_n=3):
    """Return the ``top_n`` customers most similar to ``customer_id``.

    Parameters
    ----------
    customer_id
        ID of the customer to find lookalikes for; must occur in ``customer_ids``.
    similarity_matrix
        Square 2-D array in which row ``i`` holds the similarity of
        ``customer_ids[i]`` to every customer, in the same order.
    customer_ids
        Sequence of customer IDs matching the rows of ``similarity_matrix``.
    top_n : int, default 3
        Maximum number of lookalikes to return.

    Returns
    -------
    list of tuple
        ``(customer_id, similarity_score)`` pairs ordered from most to least
        similar. The queried customer is never included. Tied scores keep the
        order in which the customers appear in ``customer_ids``.

    Raises
    ------
    ValueError
        If ``customer_id`` is not present in ``customer_ids``.
    """
    # Accept any sequence (list, tuple, pandas Index), not only a list.
    customer_ids = list(customer_ids)
    customer_index = customer_ids.index(customer_id)
    similarity_scores = np.asarray(similarity_matrix[customer_index])

    # Stable sort on the negated scores gives a deterministic descending order.
    ranked_indices = np.argsort(-similarity_scores, kind="stable")

    # Drop the customer by index rather than assuming it ranks first: another
    # customer with an equal top score could otherwise push the customer
    # itself into the returned lookalikes.
    lookalike_indices = ranked_indices[ranked_indices != customer_index][:top_n]
    return [(customer_ids[idx], similarity_scores[idx]) for idx in lookalike_indices]

In [21]:
# Build the lookalike table for the first 20 customers (C0001 to C0020)
customer_ids = features.index.tolist()

# Build the target IDs once (zero-padded to four digits) instead of on every
# loop iteration; a set gives constant-time membership checks.
target_ids = {f"C{i:04d}" for i in range(1, 21)}

# Target customers that are absent from the feature table are skipped.
# Scores are cast to built-in floats so the CSV holds plain numbers rather than
# NumPy scalar reprs such as "np.float64(0.93)" (NumPy >= 2).
lookalike_data = {
    cust_id: [
        (lookalike_id, float(score))
        for lookalike_id, score in get_top_lookalikes(cust_id, similarity_matrix, customer_ids)
    ]
    for cust_id in customer_ids
    if cust_id in target_ids
}

# One row per customer; "lookalikes" holds the list of (customer ID, score) pairs
lookalike_df = pd.DataFrame(
    [{"cust_id": cust, "lookalikes": lookalikes} for cust, lookalikes in lookalike_data.items()]
)

# Save to CSV
lookalike_df.to_csv("Lookalike.csv", index=False)

print("Lookalike model complete. Results saved to 'Lookalike.csv'.")

Lookalike model complete. Results saved to 'Lookalike.csv'.
