In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Load data
# Read the three raw CSV exports, in this order, from paths relative to the
# working directory.
customers, products, transactions = (
    pd.read_csv(csv_path)
    for csv_path in ("customers.csv", "products.csv", "transactions.csv")
)

# Preprocess data
# Parse the date columns into datetime64 so they are no longer plain strings.
customers = customers.assign(SignupDate=pd.to_datetime(customers['SignupDate']))
transactions = transactions.assign(
    TransactionDate=pd.to_datetime(transactions['TransactionDate'])
)

# Feature engineering
# Per-customer aggregates of transaction value and quantity. Named aggregation
# labels each output column at the point where it is computed.
transaction_features = (
    transactions
    .groupby('CustomerID')
    .agg(
        TotalValue_sum=('TotalValue', 'sum'),
        TotalValue_mean=('TotalValue', 'mean'),
        Transaction_count=('TotalValue', 'count'),
        Quantity_sum=('Quantity', 'sum'),
        Quantity_mean=('Quantity', 'mean'),
    )
    .reset_index()
)

# Left join so every customer row is kept, including customers that have no
# matching transactions (their aggregate columns come back as NaN).
customer_data = customers.merge(transaction_features, on='CustomerID', how='left')

# Replace every missing value in the merged frame with 0.
customer_data = customer_data.fillna(0)

# One-hot encode the Region column.
customer_data = pd.get_dummies(customer_data, columns=['Region'])

# Standardise the aggregate columns so they are on a comparable scale.
numerical_features = [
    'TotalValue_sum',
    'TotalValue_mean',
    'Transaction_count',
    'Quantity_sum',
    'Quantity_mean',
]
scaler = StandardScaler()
customer_data[numerical_features] = scaler.fit_transform(customer_data[numerical_features])

# Pairwise cosine similarity between customers, computed on every column
# except the identifier, name and signup-date columns.
feature_matrix = customer_data.drop(columns=['CustomerID', 'CustomerName', 'SignupDate'])
similarity_matrix = cosine_similarity(feature_matrix)

# Get top 3 lookalikes for the first 20 customers
# Row i of `similarity_matrix` corresponds to row i of `customer_data`, so the
# ID column is extracted once and indexed by position. This avoids building a
# whole-row object Series via `.iloc[i]['CustomerID']` on every lookup.
customer_ids = customer_data['CustomerID'].tolist()

# Never iterate past the available rows: a bare range(20) raises IndexError
# when the data holds fewer than 20 customers.
n_targets = min(20, len(customer_ids))

lookalike_results = {}
for i in range(n_targets):
    customer_id = customer_ids[i]
    row_scores = similarity_matrix[i]
    # Exclude self-comparison
    candidates = [idx for idx in range(len(customer_ids)) if idx != i]
    # Sort by similarity score, highest first. list.sort is stable, so tied
    # scores keep their original row order.
    candidates.sort(key=lambda idx: row_scores[idx], reverse=True)
    # Get top 3 (fewer if there are not enough other customers).
    top_3 = candidates[:3]
    # float(...) turns the NumPy scalar into a built-in float. The tuples are
    # stringified when written to CSV, and under NumPy >= 2 a NumPy scalar
    # would appear there as "np.float64(0.98)" instead of "0.98".
    lookalike_results[customer_id] = [
        (customer_ids[idx], float(row_scores[idx])) for idx in top_3
    ]

# Save results to CSV
# One row per target customer: the customer ID (index) followed by up to three
# "(lookalike_id, score)" cells; no header row is written.
lookalike_df = pd.DataFrame.from_dict(lookalike_results, orient='index')
lookalike_df.to_csv("Lookalike.csv", header=False)

print("Lookalike recommendations saved to Lookalike.csv")

Lookalike recommendations saved to Lookalike.csv
