In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [2]:
# Step 1: Load Data
customers = pd.read_csv('/content/Customers.csv')
products = pd.read_csv('/content/Products.csv')
transactions = pd.read_csv('/content/Transactions.csv')



In [3]:
# Merge transactions with product data to get the product details
merged_data = transactions.merge(products, on='ProductID', how='left')



In [7]:
# Step 2: Feature Engineering

# Create customer features: total spend, number of transactions, and product diversity
customer_features = merged_data.groupby('CustomerID').agg(
    total_spend=('TotalValue', 'sum'),
    num_transactions=('TransactionID', 'count'),
    product_diversity=('ProductID', 'nunique')
).reset_index()
# For simplicity, we'll also include the customer region for profiling
customer_region = customers[['CustomerID', 'Region']].set_index('CustomerID')
customer_features = customer_features.set_index('CustomerID').join(customer_region)

# Fill missing values (if any)
customer_features = customer_features.fillna(0)



In [8]:
# Step 3: Create Feature Vectors
# Normalize the data to scale it appropriately for similarity computation
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
features_scaled = scaler.fit_transform(customer_features[['total_spend', 'num_transactions', 'product_diversity']])



In [9]:
# Step 4: Calculate Cosine Similarity
similarity_matrix = cosine_similarity(features_scaled)



In [11]:
# Step 5: Generate Lookalikes for customers C0001 to C0020
lookalikes = {}

# Loop through the first 20 customers (C0001 - C0020)
for i in range(20):
    cust_id = f'C{str(i+1).zfill(4)}'
    # Get similarity scores for the customer
    similarity_scores = similarity_matrix[i]

    # Exclude the customer itself and sort by similarity
    similarity_scores[i] = -1  # Set the similarity to itself as -1 to avoid self-selection
    top_3_similar = np.argsort(similarity_scores)[-3:]  # Get indices of top 3 most similar customers

    # Map the top 3 similar customers and their similarity scores
    similar_customers = customer_features.index[top_3_similar].tolist()
    similar_scores = similarity_scores[top_3_similar].tolist()

    # Store the lookalike details
    lookalikes[cust_id] = list(zip(similar_customers, similar_scores))


In [12]:
# Step 6: Prepare the DataFrame for Lookalike.csv output
lookalike_data = []
for cust_id, similar_list in lookalikes.items():
    for similar_cust, score in similar_list:
        lookalike_data.append([cust_id, similar_cust, score])

lookalike_df = pd.DataFrame(lookalike_data, columns=['CustomerID', 'Lookalike_CustomerID', 'Similarity_Score'])

In [13]:
# Step 7: Save the results to CSV
lookalike_df.to_csv('Lookalike.csv', index=False)

print("Lookalike model completed and saved to Lookalike.csv")

# View the first few rows of the lookalike data
lookalike_df.head()

Lookalike model completed and saved to Lookalike.csv


Unnamed: 0,CustomerID,Lookalike_CustomerID,Similarity_Score
0,C0001,C0056,0.930427
1,C0001,C0152,0.986905
2,C0001,C0137,0.996332
3,C0002,C0010,0.999182
4,C0002,C0199,0.999347
