<a href="https://colab.research.google.com/github/dhanushba/Data_Science_Assignment/blob/main/Lookalike/Dhanush_B_A_Lookalike.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [16]:
# Step 1: Load datasets
customers_df = pd.read_csv('Customers.csv')  # Customer dataset
products_df = pd.read_csv('Products.csv')    # Product dataset
transactions_df = pd.read_csv('Transactions.csv')  # Transactions dataset


In [17]:
# Step 2: Merge datasets to get relevant information in one dataframe
merged_df = transactions_df.merge(products_df, on='ProductID', how='left')
merged_df = merged_df.merge(customers_df, on='CustomerID', how='left')

In [18]:
# Step 3: Feature Engineering
# Calculate total spent per customer, transaction count, etc.
customer_features = merged_df.groupby('CustomerID').agg(
    total_spent=pd.NamedAgg(column='TotalValue', aggfunc='sum'),
    purchase_count=pd.NamedAgg(column='TransactionID', aggfunc='count'),
    avg_purchase_value=pd.NamedAgg(column='TotalValue', aggfunc='mean')
).reset_index()

# Calculate spend per product category for each customer
category_features = merged_df.groupby(['CustomerID', 'Category']).agg(
    category_spend=pd.NamedAgg(column='TotalValue', aggfunc='sum')
).reset_index()

# Pivot category features to create a category spend matrix
category_pivot = category_features.pivot_table(index='CustomerID', columns='Category', values='category_spend', fill_value=0)


In [19]:
# Step 4: Combine customer features with category features
full_features = pd.merge(customer_features, category_pivot, on='CustomerID', how='left')


In [20]:
# Step 5: Normalize features using StandardScaler
scaler = StandardScaler()
scaled_features = scaler.fit_transform(full_features.drop('CustomerID', axis=1))


In [21]:
# Step 6: Compute Cosine Similarity Matrix
cosine_sim = cosine_similarity(scaled_features)


In [22]:
# Step 7: Function to get top 3 similar customers for each customer
def get_top_similar_customers(customer_id, top_n=3):
    customer_index = full_features[full_features['CustomerID'] == customer_id].index[0]
    similarity_scores = cosine_sim[customer_index]
    similar_indices = similarity_scores.argsort()[-(top_n + 1):-1]  # Exclude the customer itself
    similar_customers = full_features.iloc[similar_indices]

    # Create a map of similar customers and their similarity scores
    recommendations = [
        (full_features.iloc[similar_indices[i]]['CustomerID'], similarity_scores[similar_indices[i]])
        for i in range(top_n)
    ]
    return recommendations

In [23]:
# Step 8: Prepare the Lookalike recommendations for the first 20 customers
lookalike_map = {}
for customer_id in customers_df['CustomerID'][:20]:
    lookalike_map[customer_id] = get_top_similar_customers(customer_id)


In [26]:
# Step 9: Save recommendations as a CSV file
lookalike_list = []
for customer_id, recommendations in lookalike_map.items():
    for rec in recommendations:
        lookalike_list.append([customer_id, rec[0], rec[1]])

lookalike_df = pd.DataFrame(lookalike_list, columns=['CustomerID', 'RecommendedCustomerID', 'SimilarityScore'])

# Display all rows in the dataframe for verification
pd.set_option('display.max_rows', None)  # Show all rows
print(lookalike_df.head(20))  # Preview the first 20 lookalike recommendations

# Save the lookalike recommendations to CSV
lookalike_df.to_csv('Dhanush_B_A_Lookalike.csv', index=False)

# Optional: Reset the option to show default rows
pd.reset_option('display.max_rows')

   CustomerID RecommendedCustomerID  SimilarityScore
0       C0001                 C0072         0.875402
1       C0001                 C0091         0.924337
2       C0001                 C0069         0.968236
3       C0002                 C0134         0.863323
4       C0002                 C0055         0.867274
5       C0002                 C0036         0.878586
6       C0003                 C0007         0.897475
7       C0003                 C0005         0.900679
8       C0003                 C0166         0.926960
9       C0004                 C0090         0.913134
10      C0004                 C0065         0.921354
11      C0004                 C0075         0.980464
12      C0005                 C0003         0.900679
13      C0005                 C0166         0.922584
14      C0005                 C0197         0.979024
15      C0006                 C0200         0.871869
16      C0006                 C0196         0.894301
17      C0006                 C0185         0.