In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
from google.colab import files
uploaded = files.upload()


Saving Transactions.csv to Transactions.csv
Saving Products.csv to Products.csv
Saving Customers.csv to Customers.csv


In [4]:
import pandas as pd

customers_df = pd.read_csv('Customers.csv')
products_df = pd.read_csv('Products.csv')
transactions_df = pd.read_csv('Transactions.csv')

print(customers_df.head())
print(products_df.head())
print(transactions_df.head())


  CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15
  ProductID              ProductName     Category   Price
0      P001     ActiveWear Biography        Books  169.30
1      P002    ActiveWear Smartwatch  Electronics  346.30
2      P003  ComfortLiving Biography        Books   44.12
3      P004            BookWorld Rug   Home Decor   95.69
4      P005          TechPro T-Shirt     Clothing  429.31
  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3       

In [6]:
merged_df = pd.merge(transactions_df, customers_df, on='CustomerID', how='left')
merged_df = pd.merge(merged_df, products_df, on='ProductID', how='left')

merged_df['TransactionDate'] = pd.to_datetime(merged_df['TransactionDate'])

customer_profile = merged_df.groupby('CustomerID').agg(
    total_spend=('TotalValue', 'sum'),
    num_transactions=('TransactionID', 'nunique'),
    avg_spend=('TotalValue', 'mean'),
    most_purchased_category=('Category', lambda x: x.mode()[0]),
    recency=('TransactionDate', lambda x: (pd.to_datetime('today') - x.max()).days)
).reset_index()


print(customer_profile.head())


  CustomerID  total_spend  num_transactions  avg_spend  \
0      C0001      3354.52                 5    670.904   
1      C0002      1862.74                 4    465.685   
2      C0003      2725.38                 4    681.345   
3      C0004      5354.88                 8    669.360   
4      C0005      2034.24                 3    678.080   

  most_purchased_category  recency  
0             Electronics       85  
1                Clothing       55  
2              Home Decor      155  
3                   Books       34  
4             Electronics       84  


In [7]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity


features = customer_profile[['total_spend', 'num_transactions', 'avg_spend', 'recency']]


scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)


similarity_matrix = cosine_similarity(scaled_features)


similarity_df = pd.DataFrame(similarity_matrix, index=customer_profile['CustomerID'], columns=customer_profile['CustomerID'])

print(similarity_df.head())


CustomerID     C0001     C0002     C0003     C0004     C0005     C0006  \
CustomerID                                                               
C0001       1.000000  0.715345 -0.595896  0.339112  0.388220 -0.338022   
C0002       0.715345  1.000000  0.068836 -0.261288  0.709845 -0.653791   
C0003      -0.595896  0.068836  1.000000 -0.940056  0.462793  0.057745   
C0004       0.339112 -0.261288 -0.940056  1.000000 -0.726953 -0.132250   
C0005       0.388220  0.709845  0.462793 -0.726953  1.000000  0.005291   

CustomerID     C0007     C0008     C0009     C0010  ...     C0191     C0192  \
CustomerID                                          ...                       
C0001      -0.491568  0.340421  0.302495  0.588296  ...  0.985949 -0.414551   
C0002      -0.143131  0.046294  0.881564  0.984193  ...  0.795619  0.338119   
C0003       0.813245 -0.718298  0.481952  0.197951  ... -0.535058  0.885775   
C0004      -0.875277  0.841557 -0.566660 -0.344587  ...  0.304736 -0.784316   
C0005  

In [8]:
# Get the top 3 most similar customers for the first 20 customers
lookalike_recommendations = {}

for customer_id in customer_profile['CustomerID'][:20]:
    similarities = similarity_df[customer_id]

    sorted_similarities = similarities.drop(customer_id).sort_values(ascending=False)

    top_3_similar = sorted_similarities.head(3)

    lookalike_recommendations[customer_id] = list(zip(top_3_similar.index, top_3_similar.values))

lookalike_df = pd.DataFrame([
    {'CustomerID': cust_id, 'Lookalikes': lookalikes}
    for cust_id, lookalikes in lookalike_recommendations.items()
])

print(lookalike_df.head())


  CustomerID                                         Lookalikes
0      C0001  [(C0056, 0.9958000025797457), (C0190, 0.988015...
1      C0002  [(C0031, 0.9982716805506013), (C0029, 0.989477...
2      C0003  [(C0112, 0.9983440085944362), (C0144, 0.997910...
3      C0004  [(C0101, 0.9977071256667731), (C0075, 0.983313...
4      C0005  [(C0123, 0.9980016654890479), (C0036, 0.984902...


In [9]:
lookalike_df.to_csv('Lookalike.csv', index=False)


The algorithm for the Lookalike Model follows these main steps:

Data Loading and Merging:

Load the Customers.csv, Products.csv, and Transactions.csv datasets into pandas DataFrames.
Merge these datasets on CustomerID and ProductID to create a unified DataFrame containing all transaction details along with customer and product information.
Feature Engineering:

For each customer, create several features that describe their transaction behavior:
Total Spend: Calculate the total monetary value spent by the customer, which is the sum of the TotalValue for each transaction.
Number of Transactions: Count how many unique transactions each customer has made.
Average Spend: Compute the average spending per transaction by calculating the mean of TotalValue.
Most Purchased Category: Identify the most frequently purchased product category by calculating the mode of the Category column for each customer.
Recency: Calculate how many days have passed since the customer's most recent transaction using the difference between today's date and the latest TransactionDate.
Similarity Calculation:

Normalize the features (i.e., standardize them) to ensure that each feature contributes equally to the similarity measure.
Calculate the Cosine Similarity between all customer profiles based on the selected features (total_spend, num_transactions, avg_spend, and recency). Cosine similarity is chosen here because it measures the cosine of the angle between two vectors, which helps identify customers with similar transaction behaviors.
Recommendation Generation:

For each of the first 20 customers (CustomerID: C0001 - C0020), find the top 3 most similar customers by sorting the similarity scores in descending order.
Exclude the customer themselves from the recommendations, as the similarity with themselves is always 1.
Result Compilation:

Store the recommended lookalike customers and their similarity scores in a dictionary, where each key is the CustomerID and the value is a list of tuples containing the CustomerID of similar customers and their respective similarity scores.
Convert this dictionary into a pandas DataFrame and export it to a CSV file (Lookalike.csv) for further use.