In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler,MultiLabelBinarizer
import numpy as np

# Step 1: Read the three raw CSV tables into DataFrames
customers_df, products_df, transactions_df = (
    pd.read_csv('Customers.csv'),     # customer data
    pd.read_csv('Products.csv'),      # product data
    pd.read_csv('Transactions.csv'),  # transaction data
)


In [2]:
# Step 2: Data preprocessing
# Attach each customer's profile columns to every transaction row. The left
# join keeps all transactions, even if a CustomerID has no match in customers_df.
customer_transaction_data = transactions_df.merge(
    customers_df,
    how='left',
    on='CustomerID',
)

In [3]:
# Enrich each transaction with its product's attributes, matched on ProductID
# (left join, so transactions without a matching product are kept).
# Column names that exist on both sides receive pandas' default _x/_y suffixes.
customer_transaction_data = customer_transaction_data.merge(
    products_df,
    how='left',
    on='ProductID',
)

In [4]:
# Show the merged frame's column labels to check what the two joins produced
print(customer_transaction_data.columns)

Index(['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate',
       'Quantity', 'TotalValue', 'Price_x', 'CustomerName', 'Region',
       'SignupDate', 'ProductName', 'Category', 'Price_y'],
      dtype='object')


In [5]:
# Feature engineering: collapse the transaction rows into one row per customer.
# 'TotalValue' is used as the monetary amount of each transaction.
customer_features = (
    customer_transaction_data
    .groupby('CustomerID')
    .agg({
        # Spend statistics: total, average, and spread of transaction value
        'TotalValue': ['sum', 'mean', 'std'],
        # How many distinct products the customer bought
        'ProductID': 'nunique',
        # Distinct categories bought, packed into one comma-separated string
        'Category': lambda categories: ','.join(categories.unique()),
        # Count of non-null product names across the customer's rows
        'ProductName': 'count',
    })
    .reset_index()
)

In [6]:
# Flatten the two-level column labels produced by the aggregation into single
# strings of the form '<column>_<aggregation>' (e.g. 'TotalValue_sum').
# The MultiIndex guard keeps this cell safe to re-run: joining labels that are
# already flat strings would splice '_' between their individual characters.
if isinstance(customer_features.columns, pd.MultiIndex):
    customer_features.columns = [
        '_'.join(levels).strip() for levels in customer_features.columns.values
    ]


In [7]:
# The grouping key flattens to 'CustomerID_' (its second level is empty); restore the plain name
customer_features = customer_features.rename({'CustomerID_': 'CustomerID'}, axis='columns')

In [8]:
# Standardize the three spend statistics so they share a common scale.
# NOTE(review): only the TotalValue_* columns are scaled here; the count columns
# from the aggregation stay in raw units — confirm that is intended.
value_columns = ['TotalValue_sum', 'TotalValue_mean', 'TotalValue_std']
scaler = StandardScaler()
customer_features[value_columns] = scaler.fit_transform(customer_features[value_columns])

In [9]:
# Demographic slice of the customer table; CustomerID stays in as the join key
customer_profile = customers_df.loc[:, ['CustomerID', 'CustomerName', 'Region', 'SignupDate']]


In [10]:
# Attach the demographic columns to the per-customer features (left join on CustomerID)
customer_data = customer_features.merge(customer_profile, how='left', on='CustomerID')

In [11]:
# Make sure the transaction table carries the product 'Category' column.
# The earlier merge with products_df normally brings it in already, so the
# merge is skipped in that case: repeating it would only add duplicate
# Category_x / Category_y columns, and more on every re-run of this cell.
if 'Category' not in customer_transaction_data.columns:
    customer_transaction_data = pd.merge(
        customer_transaction_data,
        products_df[['ProductID', 'Category']],
        on='ProductID',
        how='left',
    )


In [12]:
# Turn the comma-separated category string back into a list of category names;
# any value that is not a string becomes an empty list
customer_data['Category_list'] = customer_data['Category_<lambda>'].map(
    lambda joined: joined.split(',') if isinstance(joined, str) else []
)


In [13]:
# Initialize the MultiLabelBinarizer with default settings; it is fitted on the
# per-customer category lists in a later cell
mlb = MultiLabelBinarizer()

In [28]:
# Fit the binarizer on the 'Category_list' column and encode it in one step;
# the encoded result is kept in category_encoded
category_encoded = mlb.fit_transform(customer_data['Category_list'])


In [30]:
# Wrap the encoded matrix in a DataFrame with one column per category label.
# The index is copied from customer_data so that the index-aligned join that
# follows pairs every encoded row with the customer it was computed from, rather
# than relying on both frames happening to share a default 0..n-1 index.
category_df = pd.DataFrame(category_encoded, columns=mlb.classes_, index=customer_data.index)


In [32]:
# Append the encoded category columns to the customer table, aligned on row index
customer_data = customer_data.join(category_df, how='left')

In [34]:
# Remove the raw category string and its list form now that the encoded columns exist
customer_data = customer_data.drop(columns=['Category_<lambda>', 'Category_list'])


In [38]:
# Collect the names of all numeric columns; these are the features used for
# cosine similarity (text columns such as CustomerID, CustomerName, Region and
# SignupDate are left out).
# 'number' matches every numeric dtype. The previous ['float64', 'int64'] filter
# can silently drop the encoded category columns on platforms where NumPy's
# default integer is 32-bit (e.g. Windows with NumPy < 2).
numerical_columns = customer_data.select_dtypes(include='number').columns

In [44]:
# Build the numeric feature matrix for the similarity calculation.
# customer_data_numeric was previously used here without ever being assigned,
# so this cell raised NameError on a fresh kernel; select the numeric columns first.
customer_data_numeric = customer_data[numerical_columns]
# Fill missing values with the column mean, e.g. TotalValue_std is NaN for a
# customer with a single transaction (the sample std of one value is undefined).
customer_data_numeric = customer_data_numeric.fillna(customer_data_numeric.mean())

In [46]:
# Calculate pairwise cosine similarity between all rows of the numeric feature
# matrix: cosine_sim[i][j] is the similarity between row i and row j
cosine_sim = cosine_similarity(customer_data_numeric)


In [48]:
# Generate the top-3 lookalikes for the first 20 rows of the similarity matrix.
# The matrix is computed from customer_data's numeric columns (assumes
# customer_data_numeric keeps customer_data's row order), so row positions must
# be translated to IDs through customer_data. Looking them up positionally in
# customers_df attaches the wrong IDs whenever its row order or length differs,
# e.g. when a customer has no transactions and is absent from customer_data.
customer_ids = customer_data['CustomerID'].to_numpy()
recommendations = {}
for idx, row in enumerate(cosine_sim[:20]):
    # Rank every other customer by similarity, highest first, and keep the best 3
    sim_scores = [(i, score) for i, score in enumerate(row) if i != idx]
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[:3]

    # Map the current customer ID to its lookalikes and their similarity scores
    recommendations[customer_ids[idx]] = [
        {'CustomerID': customer_ids[i], 'SimilarityScore': score} for i, score in sim_scores
    ]


In [50]:
# Flatten the recommendations mapping into one row per (customer, lookalike)
# pair for Lookalike.csv. The records get their own name instead of reusing
# lookalike_df for both a list and a DataFrame, and the columns are listed
# explicitly so the frame keeps its header even when there are no recommendations.
lookalike_records = [
    {
        'CustomerID': cust_id,
        'RecommendedCustomerID': rec['CustomerID'],
        'SimilarityScore': rec['SimilarityScore'],
    }
    for cust_id, recs in recommendations.items()
    for rec in recs
]
lookalike_df = pd.DataFrame(
    lookalike_records,
    columns=['CustomerID', 'RecommendedCustomerID', 'SimilarityScore'],
)

In [52]:
# Save the recommendations to Lookalike.csv; index=False leaves the DataFrame's
# row index out of the file
lookalike_df.to_csv('Lookalike.csv', index=False)


In [54]:
# Print a confirmation message in the notebook output
print("Lookalike recommendations have been saved to 'Lookalike.csv'")

Lookalike recommendations have been saved to 'Lookalike.csv'
