In [None]:
# pip install scikit-learn
# %pip install import-ipynb

Collecting import-ipynb
  Downloading import_ipynb-0.2-py3-none-any.whl.metadata (2.3 kB)
Collecting nbformat (from import-ipynb)
  Downloading nbformat-5.10.4-py3-none-any.whl.metadata (3.6 kB)
Collecting fastjsonschema>=2.15 (from nbformat->import-ipynb)
  Downloading fastjsonschema-2.21.1-py3-none-any.whl.metadata (2.2 kB)
Collecting jsonschema>=2.6 (from nbformat->import-ipynb)
  Using cached jsonschema-4.23.0-py3-none-any.whl.metadata (7.9 kB)
Collecting attrs>=22.2.0 (from jsonschema>=2.6->nbformat->import-ipynb)
  Using cached attrs-25.1.0-py3-none-any.whl.metadata (10 kB)
Collecting jsonschema-specifications>=2023.03.6 (from jsonschema>=2.6->nbformat->import-ipynb)
  Using cached jsonschema_specifications-2024.10.1-py3-none-any.whl.metadata (3.0 kB)
Collecting referencing>=0.28.4 (from jsonschema>=2.6->nbformat->import-ipynb)
  Using cached referencing-0.36.2-py3-none-any.whl.metadata (2.8 kB)
Collecting rpds-py>=0.7.1 (from jsonschema>=2.6->nbformat->import-ipynb)
  Using cach

In [10]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import import_ipynb
import os
import sys
from contextlib import redirect_stdout

# Pull `merged_data` and `customers` out of the EDA notebook. Anything written
# to stdout while that import runs is sent to the OS null device so it does not
# clutter this notebook's output.
with open(os.devnull, 'w') as null_sink:
    with redirect_stdout(null_sink):
        from Dharmanshu_Singh_EDA import merged_data, customers

# Task 2: Lookalike Model

# Step 1: Prepare the data
# Collapse the transaction-level rows of `merged_data` into one profile row per customer.
def _most_common(values):
    """Return the most frequent value of a Series, or NaN if it has no non-null values."""
    modes = values.mode()  # computed once; ties are resolved by taking the first mode
    return modes.iloc[0] if not modes.empty else np.nan


customer_features = (
    merged_data.groupby("CustomerID")
    .agg({
        "TotalValue": "sum",       # Total spending by each customer
        "Quantity": "sum",         # Total quantity of products purchased by each customer
        "ProductID": "nunique",    # Count of distinct products (missing IDs are not counted)
        "Category": _most_common,  # Most purchased category
    })
    .reset_index()
    # Rename columns for clarity
    .rename(columns={
        "TotalValue": "TotalSpend",
        "Quantity": "TotalQuantity",
        "ProductID": "UniqueProducts",
        "Category": "MostPurchasedCategory",
    })
)

# Step 2: Handle categorical data
# One-hot encode "MostPurchasedCategory". dtype=float keeps the indicator columns
# numeric; newer pandas versions otherwise emit bool columns, which turn the mixed
# feature matrix into an object array further down.
category_encoded = pd.get_dummies(
    customer_features["MostPurchasedCategory"], prefix="Category", dtype=float
)
customer_features = pd.concat([customer_features, category_encoded], axis=1).drop(
    columns=["MostPurchasedCategory"]
)

# Step 3: Normalize numerical features
# Only the three continuous columns are standardized; the one-hot columns stay 0/1.
scaler = StandardScaler()
numerical_features = ["TotalSpend", "TotalQuantity", "UniqueProducts"]
customer_features[numerical_features] = scaler.fit_transform(customer_features[numerical_features])

# Step 4: Calculate similarity between customers
# Build a square customer-by-customer cosine-similarity table, labelled by
# CustomerID on both axes so it can be looked up by ID.
customer_matrix = customer_features.drop(columns=["CustomerID"]).to_numpy(dtype=float)
similarity_matrix = cosine_similarity(customer_matrix)
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=customer_features["CustomerID"],
    columns=customer_features["CustomerID"],
)

# Function to find top N lookalikes for a given customer
def get_top_lookalikes(customer_id, top_n=3, similarity=None):
    """Return the `top_n` customers most similar to `customer_id`.

    Parameters
    ----------
    customer_id : hashable
        Label of the customer to look up; must be a column of the similarity table.
    top_n : int, default 3
        Maximum number of lookalikes to return. Values <= 0 yield empty lists.
    similarity : pandas.DataFrame, optional
        Square similarity table with customer labels on both index and columns.
        Defaults to the module-level `similarity_df`.

    Returns
    -------
    tuple[list, list[float]]
        Lookalike customer labels and their similarity scores, best match first.
        Scores are plain Python floats so they print cleanly when stringified.

    Raises
    ------
    KeyError
        If `customer_id` is not a column of the similarity table.
    """
    if similarity is None:
        similarity = similarity_df

    # Remove the customer by label instead of assuming they sort into position 0:
    # another customer can tie at (or, through rounding, exceed) the self-similarity,
    # in which case slicing off the first row would drop a real lookalike and keep
    # the customer themselves.
    candidate_scores = similarity[customer_id].drop(labels=customer_id, errors="ignore")

    # A stable sort keeps tied candidates in their original table order.
    ranked = candidate_scores.sort_values(ascending=False, kind="mergesort")
    top_matches = ranked.head(max(int(top_n), 0))

    return top_matches.index.tolist(), top_matches.tolist()

# Step 5: Generate lookalike recommendations for the first 20 customers
N_TARGET_CUSTOMERS = 20   # how many customers (from the top of `customers`) to score
TOP_N_LOOKALIKES = 3      # lookalikes reported per customer
LOOKALIKE_CSV_PATH = "Dharmanshu_Singh_Lookalike.csv"

lookalike_results = {}
skipped_customers = []  # customers with no row in the similarity table
for customer_id in customers["CustomerID"].head(N_TARGET_CUSTOMERS):
    # `similarity_df` is built from transaction data, so a customer listed in
    # `customers` without any aggregated profile has no column here; skip them
    # instead of aborting the whole run with a KeyError.
    if customer_id not in similarity_df.columns:
        skipped_customers.append(customer_id)
        continue
    lookalike_ids, scores = get_top_lookalikes(customer_id, top_n=TOP_N_LOOKALIKES)
    # Cast to plain floats: NumPy scalars can stringify as "np.float64(...)",
    # which would end up verbatim in the CSV below.
    lookalike_results[customer_id] = [
        (lookalike_id, float(score)) for lookalike_id, score in zip(lookalike_ids, scores)
    ]

# Step 6: Save recommendations to Dharmanshu_Singh_Lookalike.csv
# One row per customer; the lookalike list is stored as its string representation.
lookalike_data = [
    {"cust_id": cust_id, "lookalikes": str(lookalikes)}
    for cust_id, lookalikes in lookalike_results.items()
]

# Explicit columns keep the CSV header present even if no customer was scored.
lookalike_df = pd.DataFrame(lookalike_data, columns=["cust_id", "lookalikes"])
lookalike_df.to_csv(LOOKALIKE_CSV_PATH, index=False)

print(f"{LOOKALIKE_CSV_PATH} has been generated with the top {TOP_N_LOOKALIKES} lookalikes for the first {N_TARGET_CUSTOMERS} customers.")
if skipped_customers:
    print(f"Skipped customers with no similarity profile: {skipped_customers}")

# Explaining Model Development
# ------------------------------------
# The lookalike model uses cosine similarity to compare customer profiles based on:
# 1. Total spend: Represents the purchasing power of the customer.
# 2. Total quantity: Indicates the overall volume (number of units) purchased.
# 3. Unique products: Shows the diversity of products purchased.
# 4. Most purchased category: Encoded as one-hot vectors for categorical comparison.
#
# Steps:
# 1. Preprocessed data by aggregating relevant features and normalizing numerical columns.
# 2. Calculated pairwise cosine similarity between customers to identify similar profiles.
# 3. Selected the top 3 similar customers for the first 20 customers based on the similarity scores.
# 4. Results saved in Dharmanshu_Singh_Lookalike.csv for easy reference.


Dharmanshu_Singh_Lookalike.csv has been generated with the top 3 lookalikes for the first 20 customers.
