In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Load the datasets
# Read the customers, products and transactions CSV files (paths relative to
# the notebook's working directory) into one DataFrame each.
customers_df, products_df, transactions_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'Datasets/Customers.csv',
        'Datasets/Products.csv',
        'Datasets/Transactions.csv',
    )
)

In [3]:
# Data Cleaning and Preprocessing (same as in EDA notebook for consistency)
# The ID columns (CustomerID, ProductID, TransactionID) already carry the names
# used throughout this notebook, so no renaming is required.

# Parse the date columns so the .dt accessor can be used on them.
customers_df['SignupDate'] = pd.to_datetime(customers_df['SignupDate'])
transactions_df['TransactionDate'] = pd.to_datetime(transactions_df['TransactionDate'])

# Calendar-year helper columns derived from the parsed dates.
customers_df['SignupYear'] = customers_df['SignupDate'].dt.year
transactions_df['TransactionYear'] = transactions_df['TransactionDate'].dt.year

In [4]:
# --- Feature Engineering for Lookalike Model ---

# 1. Customer Profile Features
# Work on a copy holding only the ID, region and signup year of each customer.
customer_profile_features = customers_df.loc[:, ['CustomerID', 'Region', 'SignupYear']].copy()

# One-Hot Encode 'Region' into a dense array with one column per fitted category.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
region_encoded = encoder.fit_transform(customer_profile_features[['Region']])
region_feature_names = encoder.get_feature_names_out(['Region'])
region_df = pd.DataFrame(
    region_encoded,
    index=customer_profile_features.index,  # keep rows aligned for the concat below
    columns=region_feature_names,
)

# Replace the raw 'Region' column with its encoded columns.
customer_profile_features = pd.concat(
    [customer_profile_features.drop(columns=['Region']), region_df],
    axis=1,
)

In [5]:
# 2. Customer Transaction Features
# One row per customer: number of transactions, summed TotalValue, mean
# Quantity per transaction, and the date of the latest transaction.
customer_transaction_features = (
    transactions_df
    .groupby('CustomerID')
    .agg(
        TransactionCount=pd.NamedAgg(column='TransactionID', aggfunc='count'),
        TotalTransactionValue=pd.NamedAgg(column='TotalValue', aggfunc='sum'),
        AvgQuantityPerTransaction=pd.NamedAgg(column='Quantity', aggfunc='mean'),
        LastTransactionDate=pd.NamedAgg(column='TransactionDate', aggfunc='max'),
    )
    .reset_index()
)

# Recency is measured in whole days back from the day after the latest
# transaction in the data, so every customer listed here has Recency >= 1.
ref_date = transactions_df['TransactionDate'].max() + pd.Timedelta(days=1)
customer_transaction_features['Recency'] = (
    ref_date - customer_transaction_features['LastTransactionDate']
).dt.days

# The raw date is no longer needed once Recency has been derived from it.
customer_transaction_features = customer_transaction_features.drop(columns='LastTransactionDate')

In [6]:
# 3. Customer Category Preference Features
# Attach each transaction's product category (left join keeps every transaction).
merged_transaction_product = transactions_df.merge(
    products_df[['ProductID', 'Category']],
    on='ProductID',
    how='left',
)

# Wide tables: one row per customer, one column per category; combinations
# that never occur are filled with 0.
category_pref_value = (
    merged_transaction_product
    .groupby(['CustomerID', 'Category'])['TotalValue']
    .sum()
    .unstack(fill_value=0)
)
# Prefix the column names so the value and quantity tables do not collide when merged.
category_pref_value.columns = [f"CategoryValue_{col}" for col in category_pref_value.columns]

category_pref_quantity = (
    merged_transaction_product
    .groupby(['CustomerID', 'Category'])['Quantity']
    .sum()
    .unstack(fill_value=0)
)
category_pref_quantity.columns = [f"CategoryQuantity_{col}" for col in category_pref_quantity.columns]

# Turn CustomerID back into a regular column so it can be used as a merge key.
customer_category_features_value = category_pref_value.reset_index()
customer_category_features_quantity = category_pref_quantity.reset_index()


In [7]:
# Merge all features
# Left joins keep every customer from the profile table, including customers
# that never appear in the transaction data.
customer_features = pd.merge(customer_profile_features, customer_transaction_features, on='CustomerID', how='left')

# Customers without transactions have no Recency. Filling it with 0 would rank
# them as *more* recent than every real buyer (whose Recency is at least 1), so
# use a value just past the largest observed Recency instead. If there are no
# transactions at all this is NaN and the zero-fill below takes over.
no_purchase_recency = customer_transaction_features['Recency'].max() + 1
customer_features['Recency'] = customer_features['Recency'].fillna(no_purchase_recency)

# Every other missing transaction feature becomes 0, indicating no transaction.
customer_features = customer_features.fillna(0)

# Missing category totals likewise become 0.
customer_features = pd.merge(customer_features, customer_category_features_value, on='CustomerID', how='left').fillna(0)
customer_features = pd.merge(customer_features, customer_category_features_quantity, on='CustomerID', how='left').fillna(0)

# Index by CustomerID and drop SignupYear so it does not enter the similarity features.
customer_features = customer_features.set_index('CustomerID').drop('SignupYear', axis=1)

In [8]:
# --- Data Scaling ---
# Standardise every feature column so that no single feature dominates the
# similarity purely because of its scale.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(customer_features)
scaled_customer_features_df = pd.DataFrame(
    scaled_features,
    columns=customer_features.columns,
    index=customer_features.index,
)


# --- Cosine Similarity Calculation ---
# Square customer-by-customer matrix, labelled with CustomerID on both axes.
cosine_sim_matrix = cosine_similarity(scaled_customer_features_df)
cosine_sim_df = pd.DataFrame(
    cosine_sim_matrix,
    columns=scaled_customer_features_df.index,
    index=scaled_customer_features_df.index,
)

In [9]:
# --- Lookalike Recommendation Function ---
def get_lookalikes(customer_id, similarity_matrix_df, top_n=3):
    """Return the customers most similar to ``customer_id``.

    Parameters
    ----------
    customer_id
        Label to look up; it must be present in ``similarity_matrix_df.index``.
    similarity_matrix_df : pandas.DataFrame
        Similarity scores; the column labelled ``customer_id`` is read.
    top_n : int, default 3
        Number of lookalikes to keep after removing the customer itself.

    Returns
    -------
    list of (label, score) tuples, ordered from highest to lowest score, or
    the string ``"Customer ID not found."`` when ``customer_id`` is not in
    the index.
    """
    if customer_id in similarity_matrix_df.index:
        # Highest scores first.
        ranked_scores = similarity_matrix_df[customer_id].sort_values(ascending=False)

        # A customer must not be recommended as its own lookalike.
        best_matches = ranked_scores.drop(customer_id).head(top_n)
        return list(zip(best_matches.index, best_matches.values))

    return "Customer ID not found."


In [10]:
# --- Generate Lookalike Recommendations for first 20 Customers ---
# The "first 20" are the first 20 rows of customers_df, in file order.
customer_ids_for_recommendation = customers_df['CustomerID'].iloc[:20].tolist()

# Map each of those customer IDs to whatever get_lookalikes returns for it.
lookalike_map = {}
for cust_id in customer_ids_for_recommendation:
    lookalikes = get_lookalikes(cust_id, cosine_sim_df)
    lookalike_map[cust_id] = lookalikes

print("Lookalike Recommendations for first 20 customers:", lookalike_map, sep="\n")

Lookalike Recommendations for first 20 customers:
{'C0001': [('C0120', np.float64(0.8242628332695773)), ('C0181', np.float64(0.8227028198320376)), ('C0168', np.float64(0.7432275474372877))], 'C0002': [('C0159', np.float64(0.972351534419235)), ('C0178', np.float64(0.9302581903333679)), ('C0106', np.float64(0.8621560717347246))], 'C0003': [('C0129', np.float64(0.6992516854215831)), ('C0091', np.float64(0.6909768044268577)), ('C0031', np.float64(0.6865599555462545))], 'C0004': [('C0113', np.float64(0.874793862586715)), ('C0012', np.float64(0.8531450253054403)), ('C0148', np.float64(0.7728514213838489))], 'C0005': [('C0140', np.float64(0.92159544752479)), ('C0007', np.float64(0.9153139923935472)), ('C0123', np.float64(0.8324571310027925))], 'C0006': [('C0169', np.float64(0.7730079128858565)), ('C0108', np.float64(0.726602951924099)), ('C0153', np.float64(0.7089250983966404))], 'C0007': [('C0005', np.float64(0.9153139923935472)), ('C0140', np.float64(0.8359833356535368)), ('C0080', np.float

In [11]:
# --- Create Lookalike.csv ---
# One output row per customer. Its recommendations are serialised as
# "[id,score];[id,score];..." with scores formatted to four decimals.
lookalike_list_for_csv = []
for cust_id, recommendations in lookalike_map.items():
    recommendation_str = ";".join(
        f"[{rec_cust_id},{score:.4f}]" for rec_cust_id, score in recommendations
    )
    lookalike_list_for_csv.append(
        {'CustomerID': cust_id, 'LookalikeRecommendations': recommendation_str}
    )

lookalike_csv_df = pd.DataFrame(lookalike_list_for_csv)
lookalike_csv_df.to_csv('Anshuk_Jirli_Lookalike.csv', index=False)

print("\nLookalike.csv created successfully.")
# to_string() prints the rows in full instead of pandas' truncated repr.
print(lookalike_csv_df.head(20).to_string())

print("\nTask 2 completed. Lookalike recommendations and Lookalike.csv generated.")


Lookalike.csv created successfully.
   CustomerID                      LookalikeRecommendations
0       C0001  [C0120,0.8243];[C0181,0.8227];[C0168,0.7432]
1       C0002  [C0159,0.9724];[C0178,0.9303];[C0106,0.8622]
2       C0003  [C0129,0.6993];[C0091,0.6910];[C0031,0.6866]
3       C0004  [C0113,0.8748];[C0012,0.8531];[C0148,0.7729]
4       C0005  [C0140,0.9216];[C0007,0.9153];[C0123,0.8325]
5       C0006  [C0169,0.7730];[C0108,0.7266];[C0153,0.7089]
6       C0007  [C0005,0.9153];[C0140,0.8360];[C0080,0.7563]
7       C0008  [C0109,0.7789];[C0098,0.7586];[C0194,0.7560]
8       C0009  [C0198,0.9632];[C0119,0.8824];[C0060,0.8475]
9       C0010  [C0111,0.9059];[C0044,0.7500];[C0062,0.7317]
10      C0011  [C0107,0.7630];[C0190,0.7603];[C0099,0.7366]
11      C0012  [C0113,0.9045];[C0148,0.8731];[C0104,0.8654]
12      C0013  [C0099,0.9350];[C0188,0.8535];[C0107,0.7895]
13      C0014  [C0060,0.8199];[C0058,0.7869];[C0097,0.7696]
14      C0015  [C0033,0.8978];[C0036,0.8360];[C0131,0.7921]
15 