###Divya Bhagat

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
Customers=pd.read_csv('Customers.csv')
Customers.head()

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate
0,C0001,Lawrence Carroll,South America,2022-07-10
1,C0002,Elizabeth Lutz,Asia,2022-02-13
2,C0003,Michael Rivera,South America,2024-03-07
3,C0004,Kathleen Rodriguez,South America,2022-10-09
4,C0005,Laura Weber,Asia,2022-08-15


In [None]:
Products=pd.read_csv("Products.csv")
Products.head()

Unnamed: 0,ProductID,ProductName,Category,Price
0,P001,ActiveWear Biography,Books,169.3
1,P002,ActiveWear Smartwatch,Electronics,346.3
2,P003,ComfortLiving Biography,Books,44.12
3,P004,BookWorld Rug,Home Decor,95.69
4,P005,TechPro T-Shirt,Clothing,429.31


In [None]:
Transactions=pd.read_csv("Transactions.csv")
Transactions.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68


In [None]:
Customers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   CustomerID    200 non-null    object
 1   CustomerName  200 non-null    object
 2   Region        200 non-null    object
 3   SignupDate    200 non-null    object
dtypes: object(4)
memory usage: 6.4+ KB


In [None]:
Products.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   ProductID    100 non-null    object 
 1   ProductName  100 non-null    object 
 2   Category     100 non-null    object 
 3   Price        100 non-null    float64
dtypes: float64(1), object(3)
memory usage: 3.3+ KB


In [None]:
Transactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   TransactionID    1000 non-null   object 
 1   CustomerID       1000 non-null   object 
 2   ProductID        1000 non-null   object 
 3   TransactionDate  1000 non-null   object 
 4   Quantity         1000 non-null   int64  
 5   TotalValue       1000 non-null   float64
 6   Price            1000 non-null   float64
dtypes: float64(2), int64(1), object(4)
memory usage: 54.8+ KB


##### Lookalike Model

In [None]:
# Merge dataframes on ProductID and CustomerID
customer_transactions = pd.merge(transactions, customers, on='CustomerID', how='left')
customer_transactions = pd.merge(customer_transactions, products, on='ProductID', how='left')
customer_transactions.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,CustomerName,Region,SignupDate,ProductName,Category,Price_y
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,Travis Campbell,South America,2024-04-11,ComfortLiving Bluetooth Speaker,Electronics,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Timothy Perez,Europe,2022-03-15,ComfortLiving Bluetooth Speaker,Electronics,300.68


In [None]:
# Feature engineering: Calculate total spend, average spend, transaction frequency, and category preferences
customer_features = customer_transactions.groupby('CustomerID').agg(
    total_spent=('TotalValue', 'sum'),
    avg_spent=('TotalValue', 'mean'),
    transaction_count=('TransactionID', 'count')
).reset_index()
customer_features.head()

Unnamed: 0,CustomerID,total_spent,avg_spent,transaction_count
0,C0001,3354.52,670.904,5
1,C0002,1862.74,465.685,4
2,C0003,2725.38,681.345,4
3,C0004,5354.88,669.36,8
4,C0005,2034.24,678.08,3


In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Encode categorical data (e.g., Product Category) using OneHotEncoder
encoder = OneHotEncoder(sparse_output=False)  # Change sparse to sparse_output
product_category_encoded = encoder.fit_transform(customer_transactions[['Category']])
product_category_encoded

array([[0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       ...,
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.]])

In [None]:
# Add encoded features to customer features
category_df = pd.DataFrame(product_category_encoded, columns=encoder.get_feature_names_out())
category_df.head()

Unnamed: 0,Category_Books,Category_Clothing,Category_Electronics,Category_Home Decor
0,0.0,0.0,1.0,0.0
1,0.0,0.0,1.0,0.0
2,0.0,0.0,1.0,0.0
3,0.0,0.0,1.0,0.0
4,0.0,0.0,1.0,0.0


In [None]:
category_df['CustomerID'] = customer_transactions['CustomerID'].values  # Retain the CustomerID column
category_df = category_df.groupby('CustomerID').sum()  # group by CustomerID
category_df.head()

Unnamed: 0_level_0,Category_Books,Category_Clothing,Category_Electronics,Category_Home Decor
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
C0001,1.0,0.0,3.0,1.0
C0002,0.0,2.0,0.0,2.0
C0003,0.0,1.0,1.0,2.0
C0004,3.0,0.0,2.0,3.0
C0005,0.0,0.0,2.0,1.0


In [None]:
# Merge the category features with the original customer features
customer_features = pd.merge(customer_features, category_df, on='CustomerID', how='left')
customer_features.head()

Unnamed: 0,CustomerID,total_spent,avg_spent,transaction_count,Category_Books_x,Category_Clothing_x,Category_Electronics_x,Category_Home Decor_x,Category_Books_y,Category_Clothing_y,Category_Electronics_y,Category_Home Decor_y,Category_Books,Category_Clothing,Category_Electronics,Category_Home Decor
0,C0001,3354.52,670.904,5,1.0,0.0,3.0,1.0,1.0,0.0,3.0,1.0,1.0,0.0,3.0,1.0
1,C0002,1862.74,465.685,4,0.0,2.0,0.0,2.0,0.0,2.0,0.0,2.0,0.0,2.0,0.0,2.0
2,C0003,2725.38,681.345,4,0.0,1.0,1.0,2.0,0.0,1.0,1.0,2.0,0.0,1.0,1.0,2.0
3,C0004,5354.88,669.36,8,3.0,0.0,2.0,3.0,3.0,0.0,2.0,3.0,3.0,0.0,2.0,3.0
4,C0005,2034.24,678.08,3,0.0,0.0,2.0,1.0,0.0,0.0,2.0,1.0,0.0,0.0,2.0,1.0


In [None]:
# Calculate cosine similarity between customers
# We combine all features (numerical + one-hot encoded categories)
feature_matrix = customer_features.drop('CustomerID', axis=1).values
feature_matrix

array([[3.35452e+03, 6.70904e+02, 5.00000e+00, ..., 0.00000e+00,
        3.00000e+00, 1.00000e+00],
       [1.86274e+03, 4.65685e+02, 4.00000e+00, ..., 2.00000e+00,
        0.00000e+00, 2.00000e+00],
       [2.72538e+03, 6.81345e+02, 4.00000e+00, ..., 1.00000e+00,
        1.00000e+00, 2.00000e+00],
       ...,
       [9.31830e+02, 4.65915e+02, 2.00000e+00, ..., 1.00000e+00,
        1.00000e+00, 0.00000e+00],
       [1.97928e+03, 4.94820e+02, 4.00000e+00, ..., 0.00000e+00,
        2.00000e+00, 2.00000e+00],
       [4.75860e+03, 9.51720e+02, 5.00000e+00, ..., 2.00000e+00,
        1.00000e+00, 1.00000e+00]])

In [None]:
similarity_matrix = cosine_similarity(feature_matrix)
similarity_matrix

array([[1.        , 0.9988642 , 0.99886715, ..., 0.96476232, 0.99886717,
        0.99999895],
       [0.9988642 , 1.        , 0.99999872, ..., 0.97618427, 0.99999692,
        0.99886582],
       [0.99886715, 0.99999872, 1.        , ..., 0.97618524, 0.99999897,
        0.99886758],
       ...,
       [0.96476232, 0.97618427, 0.97618524, ..., 1.        , 0.97618446,
        0.96476219],
       [0.99886717, 0.99999692, 0.99999897, ..., 0.97618446, 1.        ,
        0.99886559],
       [0.99999895, 0.99886582, 0.99886758, ..., 0.96476219, 0.99886559,
        1.        ]])

In [None]:
# Function to get top 3 lookalikes for each customer
def get_top_lookalikes(similarity_matrix, num_top=3):
    lookalikes = {}
    for idx, row in enumerate(similarity_matrix):
        top_indices = row.argsort()[-(num_top+1):-1]  # Exclude the customer itself
        lookalikes[customers.loc[idx, 'CustomerID']] = [(customers.loc[i, 'CustomerID'], row[i]) for i in top_indices]
    return lookalikes

In [None]:
# Get top 3 lookalikes for customers C0001 to C0020
lookalikes = get_top_lookalikes(similarity_matrix[:20])

Top 3 Lookalikes with Similarity Scores for the First 20 Customers:

The get_top_lookalikes function computes the cosine similarity between customers.
For each of the first 20 customers (from C0001 to C0020), it finds the top 3 most similar customers (based on cosine similarity).
It then stores the results in a DataFrame (lookalike_df) with CustomerID, LookalikeCustomerID, and SimilarityScore columns.

In [None]:
# Save lookalikes to a CSV
lookalike_data = []
for cust_id, recommendations in lookalikes.items():
    for rec in recommendations:
        lookalike_data.append([cust_id, rec[0], rec[1]])

In [None]:
lookalike_df = pd.DataFrame(lookalike_data, columns=['CustomerID', 'LookalikeCustomerID', 'SimilarityScore'])
lookalike_df.head()

Unnamed: 0,CustomerID,LookalikeCustomerID,SimilarityScore
0,C0001,C0048,1.0
1,C0001,C0055,1.0
2,C0001,C0189,1.0
3,C0002,C0025,0.999999
4,C0002,C0031,0.999999


In [None]:
lookalike_df.to_csv('Lookalike.csv', index=False)

In [None]:
print("Lookalike model and recommendations saved to 'Lookalike.csv'.")

Lookalike model and recommendations saved to 'Lookalike.csv'.
