In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [2]:
customers = pd.read_csv('Customers.csv')
transactions = pd.read_csv('Transactions.csv')
products = pd.read_csv('Products.csv')

In [3]:
# Create a customer profile based on transaction history
transactions['TotalSpend'] = transactions['Price'] * transactions['Quantity']

In [4]:
# Aggregate the transaction data at the customer level
customer_features = transactions.groupby('CustomerID').agg({
    'TotalSpend': 'sum',
    'Quantity': 'sum',
}).reset_index()
print(customer_features)

    CustomerID  TotalSpend  Quantity
0        C0001     3354.52        12
1        C0002     1862.74        10
2        C0003     2725.38        14
3        C0004     5354.88        23
4        C0005     2034.24         7
..         ...         ...       ...
194      C0196     4982.88        12
195      C0197     1928.65         9
196      C0198      931.83         3
197      C0199     1979.28         9
198      C0200     4758.60        16

[199 rows x 3 columns]


In [5]:
# Create a profile of most purchased product categories
product_categories = transactions.merge(products[['ProductID', 'Category']], on='ProductID', how='left')
print(product_categories)

    TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0          T00001      C0199      P067  2024-08-25 12:38:23         1   
1          T00112      C0146      P067  2024-05-27 22:23:54         1   
2          T00166      C0127      P067  2024-04-25 07:38:55         1   
3          T00272      C0087      P067  2024-03-26 22:55:37         2   
4          T00363      C0070      P067  2024-03-21 15:10:10         3   
..            ...        ...       ...                  ...       ...   
995        T00496      C0118      P037  2024-10-24 08:30:27         1   
996        T00759      C0059      P037  2024-06-04 02:15:24         3   
997        T00922      C0018      P037  2024-04-05 13:05:32         4   
998        T00959      C0115      P037  2024-09-29 10:16:02         2   
999        T00992      C0024      P037  2024-04-21 10:52:24         1   

     TotalValue   Price  TotalSpend     Category  
0        300.68  300.68      300.68  Electronics  
1        300.68  300.

In [6]:
# Aggregate the product categories for each customer
category_features = product_categories.groupby('CustomerID')['Category'].value_counts().unstack(fill_value=0)
print("Product Category Features:")
print(category_features.head(), "\n")

Product Category Features:
Category    Books  Clothing  Electronics  Home Decor
CustomerID                                          
C0001           1         0            3           1
C0002           0         2            0           2
C0003           0         1            1           2
C0004           3         0            2           3
C0005           0         0            2           1 



In [7]:
# Combine customer features with category features
customer_profile = pd.merge(customer_features, category_features, on='CustomerID', how='left').fillna(0)

In [8]:
print("Product Category Features:")
print(customer_profile.head(), "\n")

Product Category Features:
  CustomerID  TotalSpend  Quantity  Books  Clothing  Electronics  Home Decor
0      C0001     3354.52        12      1         0            3           1
1      C0002     1862.74        10      0         2            0           2
2      C0003     2725.38        14      0         1            1           2
3      C0004     5354.88        23      3         0            2           3
4      C0005     2034.24         7      0         0            2           1 



In [9]:
# Normalize the data using StandardScaler
scaler = StandardScaler()
customer_profile_scaled = scaler.fit_transform(customer_profile.iloc[:, 1:]) 

print("Normalized Customer Profile:")
print(customer_profile_scaled[:5], "\n")

Normalized Customer Profile:
[[-0.06170143 -0.12203296 -0.3211125  -1.04160638  1.55087763 -0.22104388]
 [-0.87774353 -0.44800021 -1.22113205  0.77663634 -1.14846331  0.67666495]
 [-0.40585722  0.20393428 -1.22113205 -0.13248502 -0.248683    0.67666495]
 [ 1.03254704  1.67078689  1.47892659 -1.04160638  0.65109731  1.57437379]
 [-0.78392861 -0.93695108 -1.22113205 -1.04160638  0.65109731 -0.22104388]] 



In [10]:
# Calculate the cosine similarity matrix
similarity_matrix = cosine_similarity(customer_profile_scaled)

print("Cosine Similarity Matrix (First 5 Customers):")
print(similarity_matrix[:5, :5], "\n")

Cosine Similarity Matrix (First 5 Customers):
[[ 1.         -0.53171524 -0.0017398   0.16570297  0.66179897]
 [-0.53171524  1.          0.72855119 -0.56703123  0.19037039]
 [-0.0017398   0.72855119  1.         -0.17801532  0.45351359]
 [ 0.16570297 -0.56703123 -0.17801532  1.         -0.44741565]
 [ 0.66179897  0.19037039  0.45351359 -0.44741565  1.        ]] 



In [11]:
# Create the lookalike dictionary
lookalike_dict = {}
for i in range(20):  # For the first 20 customers
    similar_customers = sorted(
        [(j, similarity_matrix[i][j]) for j in range(len(similarity_matrix)) if i != j],
        key=lambda x: x[1],
        reverse=True
    )[:3]  # Get top 3 similar customers
    lookalike_dict[customers.iloc[i]['CustomerID']] = [
        (customers.iloc[sim[0]]['CustomerID'], sim[1]) for sim in similar_customers
    ]

print("Lookalike Recommendations for First 20 Customers:")
for cust_id, lookalikes in lookalike_dict.items():
    print(f"Customer {cust_id}:")
    for lookalike in lookalikes:
        print(f"  - Lookalike {lookalike[0]} with Similarity Score: {lookalike[1]:.4f}")
    print()

Lookalike Recommendations for First 20 Customers:
Customer C0001:
  - Lookalike C0069 with Similarity Score: 0.9444
  - Lookalike C0127 with Similarity Score: 0.8946
  - Lookalike C0120 with Similarity Score: 0.8430

Customer C0002:
  - Lookalike C0133 with Similarity Score: 0.9657
  - Lookalike C0062 with Similarity Score: 0.9066
  - Lookalike C0159 with Similarity Score: 0.8991

Customer C0003:
  - Lookalike C0166 with Similarity Score: 0.9434
  - Lookalike C0031 with Similarity Score: 0.8932
  - Lookalike C0106 with Similarity Score: 0.8280

Customer C0004:
  - Lookalike C0017 with Similarity Score: 0.9231
  - Lookalike C0090 with Similarity Score: 0.9211
  - Lookalike C0113 with Similarity Score: 0.9195

Customer C0005:
  - Lookalike C0007 with Similarity Score: 0.9893
  - Lookalike C0196 with Similarity Score: 0.9885
  - Lookalike C0146 with Similarity Score: 0.9079

Customer C0006:
  - Lookalike C0135 with Similarity Score: 0.8802
  - Lookalike C0186 with Similarity Score: 0.8154

In [12]:
lookalike_df = pd.DataFrame([{
    'cust_id': cust, 
    'lookalikes': lookalike_dict[cust]
} for cust in lookalike_dict])

# Flatten the structure into a more readable form
flat_lookalike_df = pd.DataFrame([{
    'CustomerID': cust,
    'LookalikeCustomerID': lookalike[0],
    'SimilarityScore': lookalike[1]
} for cust, lookalikes in lookalike_dict.items() for lookalike in lookalikes])

In [13]:
print(flat_lookalike_df)

   CustomerID LookalikeCustomerID  SimilarityScore
0       C0001               C0069         0.944377
1       C0001               C0127         0.894590
2       C0001               C0120         0.842955
3       C0002               C0133         0.965708
4       C0002               C0062         0.906610
5       C0002               C0159         0.899121
6       C0003               C0166         0.943362
7       C0003               C0031         0.893241
8       C0003               C0106         0.828002
9       C0004               C0017         0.923063
10      C0004               C0090         0.921135
11      C0004               C0113         0.919519
12      C0005               C0007         0.989294
13      C0005               C0196         0.988472
14      C0005               C0146         0.907931
15      C0006               C0135         0.880172
16      C0006               C0186         0.815387
17      C0006               C0171         0.727052
18      C0007               C00

In [14]:
flat_lookalike_df.to_csv('Divyansh_Saxena_Lookalike.csv', index=False)

In [15]:
print("Top 3 Lookalike Recommendations for First 20 Customers: ")
print(flat_lookalike_df.head(60))

Top 3 Lookalike Recommendations for First 20 Customers: 
   CustomerID LookalikeCustomerID  SimilarityScore
0       C0001               C0069         0.944377
1       C0001               C0127         0.894590
2       C0001               C0120         0.842955
3       C0002               C0133         0.965708
4       C0002               C0062         0.906610
5       C0002               C0159         0.899121
6       C0003               C0166         0.943362
7       C0003               C0031         0.893241
8       C0003               C0106         0.828002
9       C0004               C0017         0.923063
10      C0004               C0090         0.921135
11      C0004               C0113         0.919519
12      C0005               C0007         0.989294
13      C0005               C0196         0.988472
14      C0005               C0146         0.907931
15      C0006               C0135         0.880172
16      C0006               C0186         0.815387
17      C0006            