In [191]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [192]:
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')

In [193]:
# Joining transactions and products dataset
merged = transactions.merge(products, on='ProductID', suffixes=('', '_product'))

In [194]:
# Calculate basic transaction features
tx_features = merged.groupby('CustomerID').agg({
    'TransactionID': 'count',  # Number of transactions
    'TotalValue': ['sum', 'mean'],  # Total and average purchase value
    'Quantity': ['sum', 'mean'],  # Total and average quantity
    'Price': 'mean'  # Average product price
}).round(2)

tx_features.columns = ['tx_count', 'total_value', 'avg_value', 'total_quantity', 'avg_quantity', 'avg_price']

tx_features
        

Unnamed: 0_level_0,tx_count,total_value,avg_value,total_quantity,avg_quantity,avg_price
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
C0001,5,3354.52,670.90,12,2.40,278.33
C0002,4,1862.74,465.68,10,2.50,208.92
C0003,4,2725.38,681.34,14,3.50,195.71
C0004,8,5354.88,669.36,23,2.88,240.64
C0005,3,2034.24,678.08,7,2.33,291.60
...,...,...,...,...,...,...
C0196,4,4982.88,1245.72,12,3.00,416.99
C0197,3,1928.65,642.88,9,3.00,227.06
C0198,2,931.83,465.92,3,1.50,239.70
C0199,4,1979.28,494.82,9,2.25,250.61


In [195]:
# Calculate category preferences (percentage of spend in each category)
category_spend = pd.pivot_table(
    merged,
    values='TotalValue',
    index='CustomerID',
    columns='Category',
    aggfunc='sum',
    fill_value=0
)
category_spend = category_spend.div(category_spend.sum(axis=1), axis=0).round(4)
category_spend.columns = [f'prcnt_{c.lower()}' for c in category_spend.columns]

category_spend


Unnamed: 0_level_0,prcnt_books,prcnt_clothing,prcnt_electronics,prcnt_home decor
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
C0001,0.0342,0.0000,0.8428,0.1230
C0002,0.0000,0.5505,0.0000,0.4495
C0003,0.0000,0.0449,0.5083,0.4468
C0004,0.3527,0.0000,0.2532,0.3942
C0005,0.0000,0.0000,0.5803,0.4197
...,...,...,...,...
C0196,0.2630,0.3182,0.0000,0.4188
C0197,0.0000,0.0000,0.4744,0.5256
C0198,0.0000,0.9710,0.0290,0.0000
C0199,0.0000,0.0000,0.3003,0.6997


In [196]:
# Combine all features
customer_features = tx_features.join(category_spend)

customer_features

Unnamed: 0_level_0,tx_count,total_value,avg_value,total_quantity,avg_quantity,avg_price,prcnt_books,prcnt_clothing,prcnt_electronics,prcnt_home decor
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
C0001,5,3354.52,670.90,12,2.40,278.33,0.0342,0.0000,0.8428,0.1230
C0002,4,1862.74,465.68,10,2.50,208.92,0.0000,0.5505,0.0000,0.4495
C0003,4,2725.38,681.34,14,3.50,195.71,0.0000,0.0449,0.5083,0.4468
C0004,8,5354.88,669.36,23,2.88,240.64,0.3527,0.0000,0.2532,0.3942
C0005,3,2034.24,678.08,7,2.33,291.60,0.0000,0.0000,0.5803,0.4197
...,...,...,...,...,...,...,...,...,...,...
C0196,4,4982.88,1245.72,12,3.00,416.99,0.2630,0.3182,0.0000,0.4188
C0197,3,1928.65,642.88,9,3.00,227.06,0.0000,0.0000,0.4744,0.5256
C0198,2,931.83,465.92,3,1.50,239.70,0.0000,0.9710,0.0290,0.0000
C0199,4,1979.28,494.82,9,2.25,250.61,0.0000,0.0000,0.3003,0.6997


In [197]:
# Add customer profile features

#Add region feature (encoding as dummy variables)
customer_features = customer_features.join(pd.get_dummies(customers.set_index('CustomerID')['Region'], prefix='region'))

# Calculate days since signup
customers['SignupDate'] = pd.to_datetime(customers['SignupDate'])
customer_features['days_since_signup'] = (pd.Timestamp.now() - customers.set_index('CustomerID')['SignupDate']).dt.days

customer_features

Unnamed: 0_level_0,tx_count,total_value,avg_value,total_quantity,avg_quantity,avg_price,prcnt_books,prcnt_clothing,prcnt_electronics,prcnt_home decor,region_Asia,region_Europe,region_North America,region_South America,days_since_signup
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
C0001,5,3354.52,670.90,12,2.40,278.33,0.0342,0.0000,0.8428,0.1230,0,0,0,1,932
C0002,4,1862.74,465.68,10,2.50,208.92,0.0000,0.5505,0.0000,0.4495,1,0,0,0,1079
C0003,4,2725.38,681.34,14,3.50,195.71,0.0000,0.0449,0.5083,0.4468,0,0,0,1,326
C0004,8,5354.88,669.36,23,2.88,240.64,0.3527,0.0000,0.2532,0.3942,0,0,0,1,841
C0005,3,2034.24,678.08,7,2.33,291.60,0.0000,0.0000,0.5803,0.4197,1,0,0,0,896
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C0196,4,4982.88,1245.72,12,3.00,416.99,0.2630,0.3182,0.0000,0.4188,0,1,0,0,965
C0197,3,1928.65,642.88,9,3.00,227.06,0.0000,0.0000,0.4744,0.5256,0,1,0,0,678
C0198,2,931.83,465.92,3,1.50,239.70,0.0000,0.9710,0.0290,0.0000,0,1,0,0,1065
C0199,4,1979.28,494.82,9,2.25,250.61,0.0000,0.0000,0.3003,0.6997,0,1,0,0,786


In [198]:
customer_ids = customer_features.index.to_list()

In [199]:
# Scaling the feature matrix

scaler = StandardScaler()
customer_features = pd.DataFrame(scaler.fit_transform(customer_features), index=customer_ids, columns=customer_features.columns)
customer_features

Unnamed: 0,tx_count,total_value,avg_value,total_quantity,avg_quantity,avg_price,prcnt_books,prcnt_clothing,prcnt_electronics,prcnt_home decor,region_Asia,region_Europe,region_North America,region_South America,days_since_signup
C0001,-0.011458,-0.061701,-0.070280,-0.122033,-0.232998,0.094615,-0.914195,-0.933994,2.320635,-0.475593,-0.532795,-0.579284,-0.548319,1.540416,1.148752
C0002,-0.467494,-0.877744,-0.934953,-0.448000,-0.054528,-0.904011,-1.052428,1.188759,-0.970342,0.757702,1.876893,-0.579284,-0.548319,-0.649175,1.600431
C0003,-0.467494,-0.405857,-0.026292,0.203934,1.730177,-1.094068,-1.052428,-0.760858,1.014475,0.747504,-0.532795,-0.579284,-0.548319,1.540416,-0.713270
C0004,1.356650,1.032547,-0.076769,1.670787,0.623660,-0.447644,0.373151,-0.933994,0.018357,0.548816,-0.532795,-0.579284,-0.548319,1.540416,0.869141
C0005,-0.923530,-0.783929,-0.040028,-0.936951,-0.357927,0.285536,-1.052428,-0.933994,1.295621,0.645138,1.876893,-0.579284,-0.548319,-0.649175,1.038137
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C0196,-0.467494,0.829053,2.351666,-0.122033,0.837825,2.089567,0.010592,0.292999,-0.970342,0.641739,-0.532795,1.726268,-0.548319,-0.649175,1.250149
C0197,-0.923530,-0.841689,-0.188339,-0.610984,0.837825,-0.643025,-1.052428,-0.933994,0.882101,1.045157,-0.532795,1.726268,-0.548319,-0.649175,0.368300
C0198,-1.379566,-1.386975,-0.933942,-1.588886,-1.839232,-0.461168,-1.052428,2.810226,-0.857103,-0.940204,-0.532795,1.726268,-0.548319,-0.649175,1.557414
C0199,-0.467494,-0.813993,-0.812175,-0.610984,-0.500704,-0.304202,-1.052428,-0.933994,0.202273,1.702788,-0.532795,1.726268,-0.548319,-0.649175,0.700146


In [200]:
# implementing Lookalike model

def lookalike(customer_id, n=3):
    if customer_id not in customer_ids:
        raise ValueError(f'Customer with {customer_id} not found')
    
    #Calculate cosine similarity
    customer_vector = customer_features.loc[customer_id].values.reshape(1,-1)
    similarity = cosine_similarity(customer_vector, customer_features)[0]


    # #Get top 3 lookalikes
    lookalikes_index = np.argsort(similarity)[::-1][1:n+1]
    lookalikes = [(customer_ids[ind], similarity[ind]) for ind in lookalikes_index]

    return lookalikes


In [201]:
# Sample lookalike

a = lookalike('C0001')
print(a)

[('C0192', 0.8927178614955527), ('C0120', 0.886413620422104), ('C0112', 0.7871890783013451)]


In [202]:
# Exporting CSV with lookalikes for first 20 customers

output = {}

for customer in customer_ids[:20]:
    output[customer] = lookalike(customer)

output_rows = []
for cust_id, lookalikes in output.items():
    output_rows.append({'customer_id': cust_id, 'lookalikes': lookalikes})

pd.DataFrame(output_rows).to_csv('Lookalike.csv', index=False)




### The lookalike model I've developed uses several key features to determine customer similarity:

#### Transaction Behavior:

 - Transaction frequency
 - Total and average purchase values
 - Quantity of items purchased
 - Average price of products purchased 


#### Category Preferences:

 - Percentage of spend across different product categories (Books, Electronics, Home Decor, Clothing)


#### Customer Details:

 - Region
 - Account age (days since signup)