In [43]:
# Import necessary libraries
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import numpy as np

In [3]:
# Read the customer table and the transaction table from CSV files in the working directory
customers, transactions = (
    pd.read_csv(csv_name) for csv_name in ('Customers.csv', 'Transactions.csv')
)

In [5]:
# Data Preparation
# Aggregate each customer's transactions into total spend and total quantity
customer_transactions = transactions.groupby('CustomerID').agg({
    'TotalValue': 'sum',
    'Quantity': 'sum'
}).reset_index()

# Merge customers with their aggregated transaction history.
# A left merge keeps customers that have no transactions at all; their
# TotalValue / Quantity come out as NaN and are filled with 0 in a later cell.
customer_data = customers.merge(customer_transactions, on='CustomerID', how='left')

In [11]:
import warnings
# Silence every warning category for the rest of the session (blanket 'ignore' filter)
warnings.simplefilter('ignore')

In [13]:
# Fill NaN values with 0 for customers with no transactions.
# The result is assigned back rather than calling fillna(inplace=True) on a
# column selection: that chained in-place form is deprecated in pandas 2.x and
# does not update the parent DataFrame once Copy-on-Write is enabled.
customer_data = customer_data.fillna({'TotalValue': 0, 'Quantity': 0})

In [15]:
# Feature Engineering
# Keep only the columns that feed the similarity calculation
features = customer_data.loc[:, ['Region', 'TotalValue', 'Quantity']]

In [17]:
# One-hot encode the 'Region' feature.
# The encoding is built from customer_data rather than from `features` itself,
# so re-running this cell does not fail with a KeyError on the already-removed
# 'Region' column. The resulting columns and their order are unchanged:
# TotalValue, Quantity, then the Region_* dummies (first level dropped).
features = pd.get_dummies(
    customer_data[['Region', 'TotalValue', 'Quantity']],
    columns=['Region'],
    drop_first=True,
)

In [19]:
# Standardize the features: fit the scaler on the feature table, then transform that same table
scaler = StandardScaler()
scaler.fit(features)
features_scaled = scaler.transform(features)

In [21]:
# Pairwise cosine similarity between every pair of rows of the scaled feature matrix
# (Y=None compares the rows of X against themselves)
similarity_matrix = cosine_similarity(X=features_scaled, Y=None)

In [23]:
# Wrap the similarity matrix in a DataFrame labelled by CustomerID on both axes,
# so scores can be looked up by customer ID
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=customer_data['CustomerID'],
    columns=customer_data['CustomerID'],
)

In [31]:
# Look up the n customers most similar to a given customer (3 by default)
def get_top_lookalikes(customer_id, n=3):
    """Return the top `n` similarity scores for `customer_id`, excluding the customer itself.

    Reads the customer's column from the notebook-level `similarity_df`.

    Parameters:
        customer_id: label of a column in `similarity_df`.
        n: number of lookalikes to keep (default 3).

    Returns:
        A Series indexed by the other customers' IDs, sorted from the highest
        score to the lowest and truncated to the first `n` entries.
    """
    ranked = similarity_df[customer_id].sort_values(ascending=False)
    # Drop the customer's own entry before taking the first n rows
    is_other_customer = ranked.index != customer_id
    return ranked[is_other_customer].head(n)

In [27]:
# Build one flat record per (customer, lookalike) pair for the first 20 customers
lookalike_results = [
    {
        'CustomerID': customer_id,
        'LookalikeID': lookalike_id,
        'SimilarityScore': score,
    }
    for customer_id in customer_data['CustomerID'].head(20)
    for lookalikes in [get_top_lookalikes(customer_id)]
    for lookalike_id, score in zip(lookalikes.index, lookalikes.values)
]

In [33]:
# Convert results to DataFrame.
# The columns are named explicitly so the frame (and the CSV written from it)
# keeps the same schema and column order even when no lookalikes were generated.
lookalike_df = pd.DataFrame(
    lookalike_results,
    columns=['CustomerID', 'LookalikeID', 'SimilarityScore'],
)

In [35]:
# Write the lookalike table to Lookalike.csv, leaving out the DataFrame index
lookalike_df.to_csv(path_or_buf='Lookalike.csv', index=False)

In [39]:
# Evaluation of the Lookalike Model

In [45]:
# 1. Model Accuracy and Logic
def evaluate_model_accuracy(similarity_df):
    """Run structural sanity checks on a similarity matrix and report success.

    Parameters:
        similarity_df: the similarity matrix to check.

    Raises:
        AssertionError: if the matrix is not square or contains NaN values.
    """
    shape = similarity_df.shape
    is_square = shape[0] == shape[1]
    assert is_square, "Similarity matrix is not square."

    # Any NaN entry anywhere in the matrix fails the check
    has_nan = np.any(np.isnan(similarity_df))
    assert not has_nan, "Similarity matrix contains NaN values."

    print("Model accuracy checks passed.")
    
# Run the sanity checks on the customer-by-customer similarity matrix built earlier
evaluate_model_accuracy(similarity_df)

Model accuracy checks passed.


In [47]:
# 2. Quality of Recommendations and Similarity Scores
def evaluate_recommendations(lookalike_results):
    """Print each customer's lookalikes together with their similarity scores.

    Parameters:
        lookalike_results: either
            * a mapping ``customer_id -> iterable of (lookalike_id, score)``, or
            * an iterable of flat records (dicts) with the keys 'CustomerID',
              'LookalikeID' and 'SimilarityScore' -- the shape produced by the
              lookalike generation loop in this notebook.

    Output is written with print(); nothing is returned.
    """
    if hasattr(lookalike_results, 'items'):
        grouped = lookalike_results
    else:
        # Group the flat records by customer, preserving first-seen order
        grouped = {}
        for record in lookalike_results:
            grouped.setdefault(record['CustomerID'], []).append(
                (record['LookalikeID'], record['SimilarityScore'])
            )

    for customer_id, lookalikes in grouped.items():
        print(f"Customer ID: {customer_id}")
        for lookalike_id, score in lookalikes:
            print(f"  Lookalike ID: {lookalike_id}, Similarity Score: {score:.4f}")
        print()


In [49]:
# Evaluate the recommendations: print every customer's lookalikes with their similarity scores
evaluate_recommendations(lookalike_results)

Customer ID: C0001
  Lookalike ID: C0107, Similarity Score: 0.9965
  Lookalike ID: C0137, Similarity Score: 0.9958
  Lookalike ID: C0184, Similarity Score: 0.9957

Customer ID: C0002
  Lookalike ID: C0088, Similarity Score: 0.9961
  Lookalike ID: C0142, Similarity Score: 0.9880
  Lookalike ID: C0159, Similarity Score: 0.9729

Customer ID: C0003
  Lookalike ID: C0147, Similarity Score: 0.9978
  Lookalike ID: C0190, Similarity Score: 0.9970
  Lookalike ID: C0174, Similarity Score: 0.9824

Customer ID: C0004
  Lookalike ID: C0113, Similarity Score: 0.9944
  Lookalike ID: C0102, Similarity Score: 0.9795
  Lookalike ID: C0169, Similarity Score: 0.9788

Customer ID: C0005
  Lookalike ID: C0186, Similarity Score: 0.9969
  Lookalike ID: C0159, Similarity Score: 0.9964
  Lookalike ID: C0140, Similarity Score: 0.9910

Customer ID: C0006
  Lookalike ID: C0048, Similarity Score: 0.9936
  Lookalike ID: C0126, Similarity Score: 0.9913
  Lookalike ID: C0187, Similarity Score: 0.9905

Customer ID: C00

In [51]:
# Additional Evaluation Metrics
def calculate_average_similarity(lookalike_results):
    """Print the mean similarity score over all recommended lookalikes.

    Parameters:
        lookalike_results: either
            * a mapping ``customer_id -> iterable of (lookalike_id, score)``, or
            * an iterable of flat records (dicts) carrying a 'SimilarityScore'
              key -- the shape produced by the lookalike generation loop in
              this notebook.

    Output is written with print(); nothing is returned.
    """
    if hasattr(lookalike_results, 'values'):
        scores = [
            score
            for lookalikes in lookalike_results.values()
            for _, score in lookalikes
        ]
    else:
        scores = [record['SimilarityScore'] for record in lookalike_results]
    average_score = np.mean(scores)
    print(f"Average Similarity Score of Recommendations: {average_score:.4f}")

# Report the mean similarity score across the generated recommendations
calculate_average_similarity(lookalike_results)

Average Similarity Score of Recommendations: 0.9905
