In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score
import numpy as np

In [2]:
# Read the three raw CSV inputs (paths are relative to the working directory)
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ("Customers.csv", "Products.csv", "Transactions.csv")
)

In [3]:
# Clean each frame in one pass:
#   1. parse the date column (unparseable values are coerced to NaT),
#   2. discard the rows whose date could not be parsed,
#   3. replace any remaining missing values with a placeholder.
customers = (
    customers
    .assign(SignupDate=lambda frame: pd.to_datetime(frame["SignupDate"], errors="coerce"))
    .dropna(subset=["SignupDate"])
    .fillna("Unknown")
)
transactions = (
    transactions
    .assign(TransactionDate=lambda frame: pd.to_datetime(frame["TransactionDate"], errors="coerce"))
    .dropna(subset=["TransactionDate"])
    .fillna(0)
)

# Products has no date column to validate; only the placeholder fill applies
products = products.fillna("Unknown")

## Lookalike Model

In [5]:
# Combine customer and transaction data: total spend and total units per customer
customer_transactions = (
    transactions.groupby("CustomerID")
    .agg({"TotalValue": "sum", "Quantity": "sum"})
    .reset_index()
)
# The left join keeps customers that have no transactions; their totals become 0
customer_profiles = pd.merge(customers, customer_transactions, on="CustomerID", how="left").fillna(0)

# Calculate similarity using cosine similarity
features = ["TotalValue", "Quantity"]
similarity_matrix = cosine_similarity(customer_profiles[features])

# Find top 3 lookalikes for each customer
lookalikes = {}
for i, customer_id in enumerate(customer_profiles["CustomerID"]):
    scores = similarity_matrix[i]
    # Rank every customer from most to least similar. The stable sort keeps
    # tied scores in their original row order, so the result is deterministic.
    ranked = np.argsort(-scores, kind="stable")
    # Exclude self by position rather than assuming it is ranked first: with
    # only two features many customers tie at (or round to) 1.0, and an
    # all-zero profile (no purchases) need not have its own row as the maximum.
    similar_indices = ranked[ranked != i][:3]
    similar_customers = [
        # Built-in float so the tuple is written to the CSV as a plain number
        # instead of a NumPy scalar repr.
        (customer_profiles["CustomerID"].iloc[j], float(round(scores[j], 12)))
        for j in similar_indices
    ]
    lookalikes[customer_id] = similar_customers

# Save lookalikes for the first 20 customers
output = {cust: lookalikes[cust] for cust in customer_profiles["CustomerID"].iloc[:20]}
lookalike_df = pd.DataFrame.from_dict(output, orient="index", columns=["Top1", "Top2", "Top3"])
# The customer IDs live in the index (there is no "index" column to rename),
# so name the index to give the CSV key column a "CustomerID" header.
lookalike_df = lookalike_df.rename_axis("CustomerID")
lookalike_df.to_csv("Lookalike.csv")
print("Lookalike recommendations saved to Lookalike.csv.")

Lookalike recommendations saved to Lookalike.csv.


# Model Development
## 1) Feature Engineering:
 - Aggregated transactional data by CustomerID, calculating:
    - Total purchase value (TotalValue)
    - Total quantity of items purchased (Quantity).
 - Merged the aggregated data with customer profiles to form a comprehensive dataset.
## 2) Similarity Computation:
 - Used Cosine Similarity to compute the similarity matrix between customers based on their transactional features (TotalValue and Quantity).
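 - Cosine similarity ranges from -1 to 1 in general; when both features are non-negative, as purchase totals normally are, the scores fall on a scale of 0 to 1.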
## 3) Recommendation Process:
 - For each customer, identified the top 3 most similar customers by sorting their similarity scores (excluding self-similarity).
 - Stored the results as a dictionary, where each key is a CustomerID and each value is a list of (lookalike CustomerID, similarity score) tuples.
## 4) Output Generation:
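 - Created a CSV file named Lookalike.csv covering the first 20 customers, containing:
   - CustomerID: The target customer, written as the first column of the file (the DataFrame index).
   - Top1, Top2, Top3: One column per lookalike, ordered from most to least similar; each cell holds a (LookalikeID, Score) pair.
   - LookalikeID: The ID of a similar customer (three per target customer).
   - Score: The similarity score, rounded to 12 decimal places.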