In [1]:
# Step 1: Load and Preprocess Data
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [3]:
# Load datasets: read the customer, product and transaction tables from CSV.
# The tuple order below fixes which frame is bound to which name.
csv_paths = (
    '/content/Customers.csv',
    '/content/Products.csv',
    '/content/Transactions.csv',
)
customers, products, transactions = (pd.read_csv(path) for path in csv_paths)

In [4]:
# Merge datasets
# Parse the transaction timestamps so they behave as datetimes downstream.
transactions['TransactionDate'] = pd.to_datetime(transactions['TransactionDate'])

# Attach customer attributes first, then product attributes (default inner joins).
# Both transactions and products carry a 'Price' column, so the merged frame
# exposes them as 'Price_x' (transaction side) and 'Price_y' (product side).
transactions_with_customers = pd.merge(transactions, customers, on='CustomerID')
merged_data = pd.merge(transactions_with_customers, products, on='ProductID')


In [8]:
# Preview the first ten merged rows as a sanity check on the joins
merged_data.iloc[:10]

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,CustomerName,Region,SignupDate,ProductName,Category,Price_y
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,Travis Campbell,South America,2024-04-11,ComfortLiving Bluetooth Speaker,Electronics,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Timothy Perez,Europe,2022-03-15,ComfortLiving Bluetooth Speaker,Electronics,300.68
5,T00442,C0188,P067,2024-12-26 14:40:03,1,300.68,300.68,Anna Ball,South America,2022-05-17,ComfortLiving Bluetooth Speaker,Electronics,300.68
6,T00490,C0195,P067,2024-11-24 11:49:48,3,902.04,300.68,Jeremy Mclaughlin,South America,2024-09-17,ComfortLiving Bluetooth Speaker,Electronics,300.68
7,T00536,C0008,P067,2024-09-22 06:13:59,1,300.68,300.68,David Li,North America,2024-01-13,ComfortLiving Bluetooth Speaker,Electronics,300.68
8,T00564,C0157,P067,2024-12-07 17:57:40,3,902.04,300.68,Miguel Wong,North America,2024-01-30,ComfortLiving Bluetooth Speaker,Electronics,300.68
9,T00631,C0130,P067,2024-05-14 23:14:59,2,601.36,300.68,Robert Jones,South America,2023-04-19,ComfortLiving Bluetooth Speaker,Electronics,300.68


In [7]:
# Step 2: Feature Engineering
# Collapse the merged transaction rows into a single feature row per customer.
# Keys are the output column names; each NamedAgg names the source column and reducer.
customer_aggregations = {
    'total_spending': pd.NamedAgg(column='TotalValue', aggfunc='sum'),
    'total_quantity': pd.NamedAgg(column='Quantity', aggfunc='sum'),
    'unique_products': pd.NamedAgg(column='ProductID', aggfunc='nunique'),
    'avg_price': pd.NamedAgg(column='Price_x', aggfunc='mean'),
    # Region and signup date are customer-level attributes repeated on every
    # transaction row, so the first value per group is taken.
    'signup_region': pd.NamedAgg(column='Region', aggfunc='first'),
    'signup_date': pd.NamedAgg(column='SignupDate', aggfunc='first'),
}
transactions_by_customer = merged_data.groupby('CustomerID')
customer_features = transactions_by_customer.agg(**customer_aggregations).reset_index()


In [9]:
# Convert signup date to account age
customer_features['signup_date'] = pd.to_datetime(customer_features['signup_date'])

# Measure age against the most recent transaction in the data instead of the
# wall clock: with pd.Timestamp.now() the column changed on every run, so the
# feature table could not be reproduced from the same input files.
# A customer whose signup date is later than the reference gets a negative age.
reference_date = merged_data['TransactionDate'].max()
customer_features['account_age_days'] = (
    reference_date - customer_features['signup_date']
).dt.days


In [10]:
# Encode categorical data (Region) as indicator columns.
# drop_first=True omits the indicator for the first region level, which is
# therefore represented by all remaining region indicators being off.
customer_features = pd.get_dummies(
    customer_features,
    columns=['signup_region'],
    drop_first=True,
)

# Select the columns used for the similarity calculation:
# the behavioural/tenure measures plus every generated region indicator.
behaviour_columns = [
    'total_spending',
    'total_quantity',
    'unique_products',
    'avg_price',
    'account_age_days',
]
region_columns = [
    name for name in customer_features.columns
    if name.startswith('signup_region_')
]
features = behaviour_columns + region_columns


In [11]:
# Step 3: Normalize Features
# Standardise every selected column so that no single feature dominates
# the similarity purely because of its scale.
feature_frame = customer_features[features]
scaler = StandardScaler()
normalized_data = scaler.fit(feature_frame).transform(feature_frame)

# Compute pairwise similarity: entry [i, j] is the cosine similarity between
# the standardised feature rows of customer i and customer j.
similarity_matrix = cosine_similarity(normalized_data)


In [19]:
# Display the pairwise customer similarity matrix as this cell's output
# (NumPy abbreviates large arrays with "..." when rendering them).
similarity_matrix

array([[ 1.        ,  0.29923423,  0.43793786, ...,  0.03455965,
        -0.16752318, -0.10284484],
       [ 0.29923423,  1.        , -0.01337362, ...,  0.61118976,
         0.41230806,  0.01301186],
       [ 0.43793786, -0.01337362,  1.        , ..., -0.18463441,
        -0.27943101, -0.32603447],
       ...,
       [ 0.03455965,  0.61118976, -0.18463441, ...,  1.        ,
         0.91204699, -0.42943938],
       [-0.16752318,  0.41230806, -0.27943101, ...,  0.91204699,
         1.        , -0.3710931 ],
       [-0.10284484,  0.01301186, -0.32603447, ..., -0.42943938,
        -0.3710931 ,  1.        ]])

In [12]:
# Step 4: Recommend Similar Customers
# Build a CustomerID -> row position lookup; row positions in customer_features
# line up with the rows/columns of similarity_matrix.
customer_ids = customer_features['CustomerID'].values
customer_index_map = dict(zip(customer_ids, range(len(customer_ids))))


In [13]:
# Generate recommendations for the first 20 customers
recommendations = {}
for cust_id in customer_ids[:20]:
    own_row = customer_index_map[cust_id]

    # Pair every other customer with their similarity to this one.
    candidates = []
    for position, score in enumerate(similarity_matrix[own_row]):
        if position == own_row:
            continue  # a customer is never their own lookalike
        candidates.append((customer_ids[position], score))

    # Highest similarity first (stable sort), then keep the top 3.
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    recommendations[cust_id] = candidates[:3]


In [14]:
# Convert recommendations to DataFrame for saving
lookalike_df = pd.DataFrame({
    "CustomerID": recommendations.keys(),
    "SimilarCustomers": [str(similar) for similar in recommendations.values()]
})

In [18]:
# Save the results (row index omitted from the file)
lookalike_df.to_csv(path_or_buf='Lookalike.csv', index=False)

# Display the recommendations
lookalike_df.head(n=20)

Unnamed: 0,CustomerID,SimilarCustomers
0,C0001,"[('C0152', 0.9830526888660549), ('C0011', 0.96..."
1,C0002,"[('C0027', 0.9457418833880694), ('C0159', 0.91..."
2,C0003,"[('C0190', 0.9560200353352262), ('C0031', 0.91..."
3,C0004,"[('C0113', 0.9729224697579103), ('C0102', 0.94..."
4,C0005,"[('C0159', 0.9468517791718506), ('C0007', 0.91..."
5,C0006,"[('C0187', 0.9185811419465975), ('C0048', 0.90..."
6,C0007,"[('C0040', 0.9231895964846478), ('C0005', 0.91..."
7,C0008,"[('C0068', 0.8813467287462325), ('C0065', 0.87..."
8,C0009,"[('C0061', 0.9529690790736022), ('C0167', 0.91..."
9,C0010,"[('C0121', 0.9439406600547386), ('C0197', 0.89..."


In [21]:
# Next Part : Inspect Similarity Scores
# Spot-check a few example customers by printing their three closest matches.
for cust_id in ('C0001', 'C0002', 'C0003'):
    own_row = customer_index_map[cust_id]

    # Rank every other customer by similarity to this one, best first.
    ranked = sorted(
        (
            (customer_ids[position], score)
            for position, score in enumerate(similarity_matrix[own_row])
            if position != own_row
        ),
        key=lambda pair: pair[1],
        reverse=True,
    )

    print(f"\nCustomer: {cust_id}")
    print("Top Similar Customers:")
    for sim_cust, score in ranked[:3]:
        print(f"  CustomerID: {sim_cust}, Similarity Score: {score:.2f}")


Customer: C0001
Top Similar Customers:
  CustomerID: C0152, Similarity Score: 0.98
  CustomerID: C0011, Similarity Score: 0.96
  CustomerID: C0118, Similarity Score: 0.94

Customer: C0002
Top Similar Customers:
  CustomerID: C0027, Similarity Score: 0.95
  CustomerID: C0159, Similarity Score: 0.92
  CustomerID: C0106, Similarity Score: 0.90

Customer: C0003
Top Similar Customers:
  CustomerID: C0190, Similarity Score: 0.96
  CustomerID: C0031, Similarity Score: 0.91
  CustomerID: C0052, Similarity Score: 0.88
