In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [11]:
cust_df = pd.read_csv('./Customers.csv')
pro_df = pd.read_csv('./Products.csv')
tx_df = pd.read_csv('./Transactions.csv')

In [12]:
# Convert dates to datetime
cust_df['SignupDate'] = pd.to_datetime(cust_df['SignupDate'])
tx_df['TransactionDate'] = pd.to_datetime(tx_df['TransactionDate'])

In [13]:
tx_pro = pd.merge(tx_df, pro_df[['ProductID','ProductName', 'Category']] , on='ProductID', how='left')

In [14]:
rec_tx_date = tx_pro['TransactionDate'].max()

In [15]:
cust_df['Tenure'] = (cust_df['SignupDate'] - rec_tx_date).dt.days

In [16]:
(cust_df['Tenure'] > 0).sum()

np.int64(0)

In [17]:
cust_df.drop(columns=['Tenure'], inplace=True)

In [18]:
reg_cust = pd.get_dummies(cust_df['Region'], prefix='Region')
cust_ohe = pd.concat([cust_df[['CustomerID']], reg_cust], axis=1)

In [19]:
tx_stats = tx_pro.groupby('CustomerID').agg(
    transaction_count=('TransactionID', 'count'),
    total_quantity=('Quantity', 'sum'),
    total_value=('TotalValue', 'sum'),
    avg_transaction_value=('TotalValue', 'mean'),
    avg_quantity_per_transaction=('Quantity', 'mean'),
    first_transaction_date=('TransactionDate', 'min'),
    last_transaction_date=('TransactionDate', 'max'),
    unique_products=('ProductID', 'nunique'),
    unique_categories=('Category', 'nunique'),
    avg_price=('Price', 'mean')
).reset_index()

In [20]:
tx_stats['activity_duration'] = (tx_stats['last_transaction_date'] - tx_stats['first_transaction_date']).dt.days
tx_stats['recency'] = (rec_tx_date - tx_stats['last_transaction_date']).dt.days
tx_stats.drop(['first_transaction_date', 'last_transaction_date'], axis=1, inplace=True)

In [21]:
df = pd.merge(tx_stats, cust_ohe, on='CustomerID', how='left')

In [22]:
cust_ids = df['CustomerID']
df.drop(columns=['CustomerID'], inplace=True)

In [23]:
sim_matrix = cosine_similarity(df)

In [24]:
lookalike = {}
for i in range(20):
    tg = cust_ids[i]
    sims = sim_matrix[i]
    sort_sims = np.argsort(sims)[::-1]
    top_3 = []
    k = 0
    while len(top_3) != 3:
        if sort_sims[k] == i:
            k += 1
            continue
        top_3.append(sort_sims[k])
        k += 1
    lookalike[f'{tg}'] = [(cust_ids[t], sims[t]) for t in top_3]

In [25]:
res = pd.DataFrame(lookalike)

In [26]:
res = res.T
res.columns = ['Lookalike1', 'Lookalike2', 'Lookalike3']

In [27]:
res.index.rename('CustomerID', inplace=True)

In [None]:
res.to_csv('Lookalike.csv')

In [28]:
res.head()

Unnamed: 0_level_0,Lookalike1,Lookalike2,Lookalike3
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
C0001,"(C0152, 0.9999815692994218)","(C0164, 0.9999691373270239)","(C0183, 0.999960399011372)"
C0002,"(C0031, 0.9998368577488196)","(C0029, 0.9997944044934609)","(C0010, 0.9996920105350803)"
C0003,"(C0117, 0.9996675973669418)","(C0070, 0.9996317430885981)","(C0006, 0.9996151036089245)"
C0004,"(C0068, 0.9999838232271343)","(C0028, 0.9999759360711772)","(C0145, 0.9999749005812171)"
C0005,"(C0085, 0.9998142752606903)","(C0061, 0.9996272738667601)","(C0115, 0.9995145614239235)"
