In [28]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Read the three raw input tables (customers, products, transactions) from the
# current working directory, in that order.
customers, products, transactions = (
    pd.read_csv(csv_path)
    for csv_path in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

In [29]:
# Merge transactions with product data.
# The merged frame is assigned back to `transactions`, so the merge is guarded:
# running this cell a second time would otherwise merge the product columns
# again, pandas would suffix the overlapping columns (`Category_x`/`Category_y`),
# and the `Category` aggregation below would fail with a KeyError.
if "Category" not in transactions.columns:
    transactions = transactions.merge(products, on="ProductID", how="left")


def _favorite_category(categories):
    """Return the most frequent value in `categories`, or "Unknown" if there is none.

    `Series.mode()` is evaluated once per group. When several values tie, the
    first entry returned by `mode()` is used. `mode()` ignores missing values,
    so a group containing only NaN yields "Unknown".
    """
    modes = categories.mode()
    return modes.iloc[0] if not modes.empty else "Unknown"


# Aggregate transaction data for each customer: total spend, number of
# transactions, and the category bought most often.
customer_transactions = transactions.groupby("CustomerID").agg(
    total_spent=("TotalValue", "sum"),
    num_transactions=("TransactionID", "count"),
    favorite_category=("Category", _favorite_category),
).reset_index()


In [30]:
# Build one profile row per customer:
#   1. left-join the per-customer transaction aggregates onto the customer table,
#   2. give customers without any aggregates neutral defaults,
#   3. one-hot encode the categorical columns (first level of each is dropped).
customer_profiles = (
    customers
    .merge(customer_transactions, on="CustomerID", how="left")
    .fillna({
        'total_spent': 0,
        'num_transactions': 0,
        'favorite_category': 'Unknown',
    })
    .pipe(
        pd.get_dummies,
        columns=["favorite_category", "Region"],
        drop_first=True,
    )
)


In [31]:
# Select the similarity features: the two numeric aggregates followed by every
# one-hot column produced for favorite category and region (in frame order).
encoded_prefixes = ("favorite_category_", "Region_")
encoded_columns = [
    name for name in customer_profiles.columns if name.startswith(encoded_prefixes)
]
feature_columns = ['total_spent', 'num_transactions', *encoded_columns]
features = customer_profiles[feature_columns]

# Standardize every feature column with scikit-learn's StandardScaler
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

In [32]:
# Customer IDs in profile-row order; position k here matches row/column k of
# the similarity matrix computed below.
customer_ids = customer_profiles['CustomerID'].tolist()

# Pairwise cosine similarity between all scaled customer feature vectors
similarity_matrix = cosine_similarity(features_scaled)


In [33]:
# Generate lookalike recommendations for the first 20 customers.
# Result: {customer_id: [(lookalike_id, similarity_score), ...]} with the
# most similar customers first and scores rounded to 4 decimals.
NUM_TARGET_CUSTOMERS = 20  # how many customers (from the top of the list) get recommendations
TOP_N_LOOKALIKES = 3       # how many lookalikes to keep per customer

lookalike_data = {}
# Bound the loop by the number of customers so a small dataset cannot raise IndexError.
for i in range(min(NUM_TARGET_CUSTOMERS, len(customer_ids))):
    customer_id = customer_ids[i]
    # Exclude the customer themselves by index rather than by assuming they sort
    # first: another customer with an identical profile (or floating-point
    # rounding) can tie with or outrank the self-similarity, which would put the
    # customer into their own lookalike list and push out a real match.
    similarity_scores = [
        (idx, score)
        for idx, score in enumerate(similarity_matrix[i])
        if idx != i
    ]
    # Stable sort, highest similarity first; ties keep their original row order.
    similarity_scores.sort(key=lambda pair: pair[1], reverse=True)
    top_lookalikes = [
        # float() turns the NumPy scalar into a plain Python float so that the
        # stringified list reads `0.9971` and not `np.float64(0.9971)` on NumPy >= 2.
        (customer_ids[idx], round(float(score), 4))
        for idx, score in similarity_scores[:TOP_N_LOOKALIKES]
    ]
    lookalike_data[customer_id] = top_lookalikes


In [34]:
# Save the results to Devaannamalai_R_Lookalike.csv: one row per customer, with
# the list of (lookalike_id, score) pairs stored as its string representation.
lookalike_output = pd.DataFrame({
    "CustomerID": list(lookalike_data),
    "Top_Lookalikes": list(map(str, lookalike_data.values())),
})
lookalike_output.to_csv("Devaannamalai_R_Lookalike.csv", index=False)

# Print sample output (first five rows)
print(lookalike_output.head())


  CustomerID                                     Top_Lookalikes
0      C0001  [('C0190', 0.9971), ('C0048', 0.9948), ('C0181...
1      C0002  [('C0088', 0.9812), ('C0092', 0.9671), ('C0134...
2      C0003  [('C0052', 0.999), ('C0031', 0.9908), ('C0076'...
3      C0004  [('C0155', 0.9869), ('C0165', 0.977), ('C0087'...
4      C0005  [('C0186', 0.9985), ('C0007', 0.9934), ('C0140...
