**Task 2: Lookalike Model**

In [4]:
# Task 2: Lookalike Model

# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors

# Loading datasets
customers_path = "/content/drive/MyDrive/Zeotap Dataset/Customers.csv"
products_path = "/content/drive/MyDrive/Zeotap Dataset/Products.csv"
transactions_path = "/content/drive/MyDrive/Zeotap Dataset/Transactions.csv"

customers = pd.read_csv(customers_path)
products = pd.read_csv(products_path)
transactions = pd.read_csv(transactions_path)

# Standardizing column names
customers.columns = customers.columns.str.strip()
products.columns = products.columns.str.strip()
transactions.columns = transactions.columns.str.strip()

# Convert SignupDate to datetime format
customers['SignupDate'] = pd.to_datetime(customers['SignupDate'])

# Feature Engineering: Creating Aggregated Transaction Data
transactions['TransactionDate'] = pd.to_datetime(transactions['TransactionDate'])
customer_spending = transactions.groupby('CustomerID').agg(
    total_spent=('TotalValue', 'sum'),
    total_transactions=('TransactionID', 'count'),
    avg_order_value=('TotalValue', 'mean'),
    first_purchase=('TransactionDate', 'min'),
    last_purchase=('TransactionDate', 'max')
).reset_index()

# Merge customer information with spending data
customer_data = customers.merge(customer_spending, on='CustomerID', how='left')

# Handle missing values correctly
customer_data['first_purchase'] = customer_data['first_purchase'].fillna(pd.NaT)
customer_data['last_purchase'] = customer_data['last_purchase'].fillna(pd.NaT)
customer_data.fillna(0, inplace=True)  # Handle numerical missing values

# Feature Engineering: Encoding Categorical Features
customer_data['SignupDays'] = (customer_data['SignupDate'].max() - customer_data['SignupDate']).dt.days
customer_data.drop(columns=['SignupDate', 'first_purchase', 'last_purchase'], inplace=True)

# One-hot encoding for categorical features (Region)
customer_data = pd.get_dummies(customer_data, columns=['Region'], drop_first=True)

# Normalizing Data
scaler = StandardScaler()
features = ['total_spent', 'total_transactions', 'avg_order_value', 'SignupDays'] + list(customer_data.columns[4:])
customer_data_scaled = scaler.fit_transform(customer_data[features])

# Training Nearest Neighbors Model
knn = NearestNeighbors(n_neighbors=4, metric='euclidean')  # 4 neighbors (including self)
knn.fit(customer_data_scaled)

# Finding Lookalikes for first 20 customers
customer_ids = customer_data['CustomerID'].values[:20]
lookalike_results = []

for idx, customer in enumerate(customer_ids):
    customer_idx = customer_data[customer_data['CustomerID'] == customer].index[0]
    distances, indices = knn.kneighbors([customer_data_scaled[customer_idx]])
    similar_customers = customer_data.iloc[indices[0][1:]]  # Exclude self
    lookalike_results.append(
        [customer] + [item for sublist in zip(similar_customers['CustomerID'], distances[0][1:]) for item in sublist]
    )

# Convert results to DataFrame and save
lookalike_df = pd.DataFrame(lookalike_results, columns=['CustomerID', 'Lookalike1', 'SimilarityScore1', 'Lookalike2', 'SimilarityScore2', 'Lookalike3', 'SimilarityScore3'])
lookalike_df.to_csv("Dheeraj_Mishra_Lookalike.csv", index=False)

print("Lookalike model successfully generated recommendations!")


Lookalike model successfully generated recommendations!


  customer_data.fillna(0, inplace=True)  # Handle numerical missing values


In [5]:
# Validate Lookalike Model Output

import pandas as pd

# Load the generated Lookalike CSV file
file_path = "Dheeraj_Mishra_Lookalike.csv"
lookalike_df = pd.read_csv(file_path)

# Step 1: Display first few rows
print("\nPreview of Lookalike CSV:")
print(lookalike_df.head())

# Step 2: Check column names and data types
print("\nLookalike DataFrame Info:")
print(lookalike_df.info())

# Step 3: Check for missing values
print("\nMissing Values Count:")
print(lookalike_df.isnull().sum())

# Step 4: Check for duplicate customers
duplicate_count = lookalike_df['CustomerID'].duplicated().sum()
print(f"\nNumber of Duplicate Customer Entries: {duplicate_count}")

# Step 5: Verify recommendations for a sample customer
sample_customer = lookalike_df['CustomerID'].iloc[0]  # Taking first customer in dataset
print(f"\nSample Lookalike Recommendations for {sample_customer}:")
print(lookalike_df[lookalike_df['CustomerID'] == sample_customer])

print("\nLookalike model output validation completed!")



Preview of Lookalike CSV:
  CustomerID Lookalike1  SimilarityScore1 Lookalike2  SimilarityScore2  \
0      C0001      C0152          0.359416      C0174          0.782027   
1      C0002      C0134          0.754951      C0106          1.408437   
2      C0003      C0052          0.376965      C0137          0.583755   
3      C0004      C0108          0.547975      C0113          0.700855   
4      C0005      C0159          0.086665      C0027          0.688693   

  Lookalike3  SimilarityScore3  
0      C0011          0.830964  
1      C0005          1.546777  
2      C0191          0.687550  
3      C0102          0.715591  
4      C0193          1.111066  

Lookalike DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   CustomerID        20 non-null     object 
 1   Lookalike1        20 non-null     object 
 2   Similari