In [3]:
# Prepare features for lookalike model
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# Load the raw transaction and customer tables.
transactions_df = pd.read_csv('transactions.csv')
customers_df = pd.read_csv('customers.csv')

# Collapse transactions to one row per customer: total/mean spend,
# total/mean quantity, and the number of distinct products bought.
transaction_features = (
    transactions_df
    .groupby('CustomerID')
    .agg(
        total_spend=('TotalValue', 'sum'),
        avg_transaction_value=('TotalValue', 'mean'),
        total_quantity=('Quantity', 'sum'),
        avg_quantity=('Quantity', 'mean'),
        unique_products=('ProductID', 'nunique'),
    )
    .reset_index()
)

# One-hot encode Region so it can take part in the similarity computation.
customer_features = pd.get_dummies(customers_df, columns=['Region'], prefix='region')

# Left-join the aggregates onto the customer table (every customer row is
# kept), then replace the NaNs left in the frame with 0.
final_features = pd.merge(
    customer_features,
    transaction_features,
    on='CustomerID',
    how='left',
).fillna(0)

# Identifier / descriptive columns are not used as similarity features.
non_feature_cols = ('CustomerID', 'CustomerName', 'SignupDate')
feature_cols = [name for name in final_features.columns if name not in non_feature_cols]

# Standardise every feature column before comparing customers.
scaler = StandardScaler()
scaler.fit(final_features[feature_cols])
scaled_features = scaler.transform(final_features[feature_cols])

# Pairwise cosine similarity between all customers (row i vs. row j).
similarity_matrix = cosine_similarity(scaled_features)

# Function to get top 3 similar customers
def get_top_3_similar(customer_idx, similarity_matrix, customer_ids, exclude_self=True):
    """Return the three customers most similar to the one at ``customer_idx``.

    Args:
        customer_idx: Row/column position of the query customer in
            ``similarity_matrix``.
        similarity_matrix: Square matrix of pairwise similarity scores.
        customer_ids: Sequence mapping matrix positions to customer IDs.
        exclude_self: When True, the query customer is never returned as
            its own match.

    Returns:
        A list of ``(customer_id, similarity)`` tuples ordered from most to
        least similar. It holds three entries, or fewer when not enough
        other customers exist.

    The matrix is only read, never written: the row is not masked in place,
    so the caller's diagonal values are preserved.
    """
    similarities = np.asarray(similarity_matrix[customer_idx])

    # Rank every candidate from highest to lowest similarity.
    ranked = np.argsort(similarities)[::-1]

    if exclude_self:
        # Drop the query customer by position instead of overwriting its
        # score with a sentinel; -1 is itself a valid cosine similarity.
        ranked = ranked[ranked != customer_idx]

    return [(customer_ids[idx], similarities[idx]) for idx in ranked[:3]]

# Generate recommendations for the first 20 customers (or for all of them
# when the table holds fewer than 20, instead of raising IndexError).
lookalike_results = {}
customer_ids = final_features['CustomerID'].values

for i in range(min(20, len(customer_ids))):
    customer_id = customer_ids[i]
    similar_customers = get_top_3_similar(i, similarity_matrix, customer_ids)
    # Store plain Python floats: under NumPy 2.x the repr of a NumPy scalar
    # is "np.float64(...)", which would otherwise end up in the CSV text.
    lookalike_results[customer_id] = [
        (rec_id, float(score)) for rec_id, score in similar_customers
    ]

# Create and save Lookalike.csv: one row per customer, with the list of
# (customer_id, score) recommendations serialised as a string.
lookalike_df = pd.DataFrame({
    'CustomerID': list(lookalike_results),
    'Recommendations': [str(recommendations) for recommendations in lookalike_results.values()],
})
lookalike_df.to_csv('Lookalike.csv', index=False)

# Show the first 5 customers as a quick sanity check.
print("Sample of lookalike recommendations:")
for customer_id, recommendations in list(lookalike_results.items())[:5]:
    print(f"Customer {customer_id}:")
    for rec_id, score in recommendations:
        print(f"  Similar customer: {rec_id}, Similarity score: {score:.3f}")

Sample of lookalike recommendations:
Customer C0001:
  Similar customer: C0107, Similarity score: 0.990
  Similar customer: C0137, Similarity score: 0.981
  Similar customer: C0174, Similarity score: 0.971
Customer C0002:
  Similar customer: C0142, Similarity score: 0.969
  Similar customer: C0186, Similarity score: 0.943
  Similar customer: C0177, Similarity score: 0.937
Customer C0003:
  Similar customer: C0091, Similarity score: 0.874
  Similar customer: C0190, Similarity score: 0.856
  Similar customer: C0129, Similarity score: 0.853
Customer C0004:
  Similar customer: C0113, Similarity score: 0.982
  Similar customer: C0102, Similarity score: 0.956
  Similar customer: C0104, Similarity score: 0.945
Customer C0005:
  Similar customer: C0186, Similarity score: 0.989
  Similar customer: C0159, Similarity score: 0.974
  Similar customer: C0177, Similarity score: 0.968
