In [9]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler


# Load the three raw tables from CSV files in the working directory.
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')

# Attach customer attributes, then product attributes, to every transaction.
# Left joins keep each transaction row even when its lookup key has no match.
merged_data = pd.merge(transactions, customers, on='CustomerID', how='left')
merged_data = pd.merge(merged_data, products, on='ProductID', how='left')

# Report how many values are missing per column before filling them.
print(merged_data.isnull().sum())

# Fill gaps with a placeholder that matches each column's type: 0 for numeric
# columns, 'Unknown' for everything else. Writing 0 into text columns would
# leave them with mixed int/str values and let 0 show up as a category name.
fill_values = {
    column: 0 if pd.api.types.is_numeric_dtype(dtype) else 'Unknown'
    for column, dtype in merged_data.dtypes.items()
}
merged_data = merged_data.fillna(value=fill_values)


def _most_frequent(values):
    """Return the first entry of the mode of *values*."""
    return values.mode()[0]


# One row per customer: spend total, number of distinct transactions,
# mean spend per transaction row, and the most frequently bought category.
per_customer = merged_data.groupby('CustomerID')
customer_transactions = per_customer.agg(
    total_spent=('TotalValue', 'sum'),
    num_transactions=('TransactionID', 'nunique'),
    avg_spent_per_transaction=('TotalValue', 'mean'),
    most_purchased_category=('Category', _most_frequent),
)
customer_transactions = customer_transactions.reset_index()


# Keep only the profile columns that are joined onto the aggregates.
profile_columns = ['CustomerID', 'Region', 'SignupDate']
customer_profile = customers[profile_columns]

# Left join so every customer with transactions keeps its aggregate row.
customer_data = customer_transactions.merge(
    customer_profile,
    on='CustomerID',
    how='left',
)


# Numeric behaviour features that drive the similarity computation.
feature_columns = ['total_spent', 'num_transactions', 'avg_spent_per_transaction']

# Standardise the features and write the scaled values back in place.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(customer_data[feature_columns])
customer_data[feature_columns] = scaled_features

# Pairwise cosine similarity between customers; row/column i corresponds to
# row i of customer_data.
similarity_matrix = cosine_similarity(customer_data[feature_columns])



# How many customers (taken from the top of customer_data) get
# recommendations, and how many look-alikes are kept for each of them.
NUM_TARGET_CUSTOMERS = 20
TOP_N_LOOKALIKES = 3

# Row i of similarity_matrix was computed from row i of customer_data, so
# customer IDs are looked up by position rather than by index label.
customer_ids = customer_data['CustomerID'].to_numpy()

# Maps each target CustomerID to a list of (look-alike CustomerID, score)
# tuples ordered from most to least similar.
lookalike_dict = {}

for position, customer_id in enumerate(customer_ids[:NUM_TARGET_CUSTOMERS]):
    # Read-only use of the row: indexing the matrix returns a view, so
    # writing a sentinel into it would overwrite similarity_matrix itself.
    similarity_scores = similarity_matrix[position]

    # Rank all customers from most to least similar (stable sort keeps the
    # order of exact ties deterministic), then drop the customer itself by
    # position instead of masking its score.
    ranked = np.argsort(similarity_scores, kind='stable')[::-1]
    similar_indices = ranked[ranked != position][:TOP_N_LOOKALIKES]

    lookalike_dict[customer_id] = list(
        zip(customer_ids[similar_indices], similarity_scores[similar_indices])
    )


# Flatten the per-customer look-alike lists into one row per
# (customer, look-alike, score) triple.
lookalike_data = [
    [source_id, match_id, match_score]
    for source_id, matches in lookalike_dict.items()
    for match_id, match_score in matches
]

output_columns = ['CustomerID', 'LookalikeCustomerID', 'SimilarityScore']
lookalike_df = pd.DataFrame(lookalike_data, columns=output_columns)

# Persist the recommendations without the DataFrame index column.
lookalike_df.to_csv('Lookalike.csv', index=False)

# Show the first few rows as a quick sanity check.
print(lookalike_df.head())


TransactionID      0
CustomerID         0
ProductID          0
TransactionDate    0
Quantity           0
TotalValue         0
Price_x            0
CustomerName       0
Region             0
SignupDate         0
ProductName        0
Category           0
Price_y            0
dtype: int64
  CustomerID LookalikeCustomerID  SimilarityScore
0      C0001               C0137         0.999360
1      C0001               C0152         0.995658
2      C0001               C0121         0.993012
3      C0002               C0029         0.999638
4      C0002               C0199         0.998867
