In [1]:
# Import required libraries
import os
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
warnings.filterwarnings('ignore')

In [2]:
# Load datasets
# All three CSV files are read from a single directory. Set the
# LOOKALIKE_DATA_DIR environment variable to point at a different folder;
# when it is unset the original download location is used.
DATA_DIR = Path(os.environ.get('LOOKALIKE_DATA_DIR', r'C:\Users\crai8\Downloads'))

customers_df = pd.read_csv(DATA_DIR / 'Customers.csv')
products_df = pd.read_csv(DATA_DIR / 'Products.csv')
transactions_df = pd.read_csv(DATA_DIR / 'Transactions.csv')

print("Data loaded successfully!")
print("\nShape of datasets:")
print(f"Customers: {customers_df.shape}")
print(f"Products: {products_df.shape}")
print(f"Transactions: {transactions_df.shape}")

Data loaded successfully!

Shape of datasets:
Customers: (200, 4)
Products: (100, 4)
Transactions: (1000, 7)


In [3]:
def create_customer_features(transactions=None, products=None, customers=None):
    """Build one row of numeric features per customer that has transactions.

    Parameters
    ----------
    transactions : pandas.DataFrame, optional
        Must provide the columns 'CustomerID', 'TransactionID', 'ProductID',
        'Quantity' and 'TotalValue'. Defaults to the notebook-level
        ``transactions_df``.
    products : pandas.DataFrame, optional
        Must provide 'ProductID' and 'Category'. Defaults to the
        notebook-level ``products_df``.
    customers : pandas.DataFrame, optional
        Must provide 'CustomerID' and 'Region'. Defaults to the
        notebook-level ``customers_df``.

    Returns
    -------
    pandas.DataFrame
        Indexed by CustomerID (only customers present in ``transactions``).
        Columns: six transaction aggregates, one spend column per product
        category, and one ``region_*`` indicator column per region. The
        category and region columns never contain NaN.
    """
    if transactions is None:
        transactions = transactions_df
    if products is None:
        products = products_df
    if customers is None:
        customers = customers_df

    # Transaction-based features (rounded to 2 decimals)
    spend_stats = transactions.groupby('CustomerID').agg({
        'TransactionID': 'count',  # Number of transactions
        'Quantity': ['sum', 'mean'],  # Total and average quantity
        'TotalValue': ['sum', 'mean', 'max']  # Total spent, average transaction value, max transaction
    }).round(2)

    # Flatten the (column, aggregation) MultiIndex produced by agg()
    spend_stats.columns = ['transaction_count', 'total_quantity', 'avg_quantity',
                           'total_spent', 'avg_transaction', 'max_transaction']

    # Spend per product category: one-hot encode the category of every
    # transaction, weight each indicator by the transaction value, then sum
    # per customer. dtype=float keeps the columns numeric on every pandas version.
    trans_products = transactions.merge(products, on='ProductID')
    category_spend = pd.get_dummies(trans_products['Category'], dtype=float)
    category_spend = category_spend.multiply(trans_products['TotalValue'], axis=0)
    category_spend = category_spend.groupby(trans_products['CustomerID']).sum()

    # Customer region (one-hot encoded)
    region_flags = pd.get_dummies(customers.set_index('CustomerID')['Region'],
                                  prefix='region', dtype=float)

    # Combine all features. Both joins are left joins on the transaction
    # aggregates, so a customer missing from either side would get NaN here.
    customer_features = spend_stats.join(category_spend).join(region_flags)

    # A missing category/region row means "no spend" / "region unknown";
    # encode that as 0 so downstream scaling and similarity never see NaN.
    filled_cols = list(category_spend.columns) + list(region_flags.columns)
    if filled_cols:
        customer_features[filled_cols] = customer_features[filled_cols].fillna(0.0)

    return customer_features

# Build the per-customer feature table and report its column count
print("Creating customer features...")
customer_features = create_customer_features()
feature_count = len(customer_features.columns)
print("\nFeatures created successfully!",
      f"Number of features created: {feature_count}",
      sep="\n")

Creating customer features...

Features created successfully!
Number of features created: 14


In [4]:
# Standardise the feature table: fit the scaler on it, then transform it
scaler = StandardScaler().fit(customer_features)
scaled_features = scaler.transform(customer_features)

# Keep a labelled copy of the scaled values (same index/columns as the input)
customer_ids = customer_features.index
scaled_features_df = pd.DataFrame(
    scaled_features,
    index=customer_ids,
    columns=customer_features.columns,
)

# Pairwise cosine similarity between all customers, labelled by CustomerID
# on both axes
print("Calculating similarity scores...")
similarity_matrix = cosine_similarity(scaled_features)
similarity_df = pd.DataFrame(similarity_matrix, index=customer_ids, columns=customer_ids)
print("Similarity matrix created!")

Calculating similarity scores...
Similarity matrix created!


In [5]:
# Function to get top 3 similar customers
def get_top_3_similar(customer_id, similarity_df, top_n=3):
    """Return the customers most similar to ``customer_id``.

    Parameters
    ----------
    customer_id : hashable
        Column label of the customer to look up in ``similarity_df``.
    similarity_df : pandas.DataFrame
        Similarity scores; the column named ``customer_id`` is read and its
        row labels are treated as the candidate customers.
    top_n : int, default 3
        Maximum number of similar customers to return.

    Returns
    -------
    pandas.Series
        Similarity scores indexed by candidate label, highest first, with
        ``customer_id`` itself excluded. Contains fewer than ``top_n``
        entries when fewer candidates exist. Equal scores keep the order in
        which they appear in ``similarity_df``.

    Raises
    ------
    KeyError
        If ``customer_id`` is not a column of ``similarity_df``.
    """
    if customer_id not in similarity_df.columns:
        raise KeyError(
            f"Customer {customer_id!r} is not present in the similarity matrix"
        )

    scores = similarity_df[customer_id]
    # Drop the customer's own entry (self-similarity) before ranking
    candidates = scores[scores.index != customer_id]
    # mergesort is stable, so ties are resolved deterministically
    ranked = candidates.sort_values(ascending=False, kind='mergesort')
    return ranked.head(top_n)

# Generate recommendations for first 20 customers
print("Generating recommendations for first 20 customers...")
recommendations = {}
first_20_customers = sorted(customers_df['CustomerID'].unique())[:20]

for customer_id in first_20_customers:
    # The IDs come from customers_df, but the similarity matrix is labelled
    # with the index of customer_features, which is built from a groupby on
    # transactions_df. A customer without transactions therefore has no
    # column in similarity_df; report and skip it instead of failing with
    # a KeyError part-way through the loop.
    if customer_id not in similarity_df.columns:
        print(f"Skipping {customer_id}: no similarity scores available")
        continue
    top_3 = get_top_3_similar(customer_id, similarity_df)
    # Store as a list of (similar_customer_id, score) pairs, best match first
    recommendations[customer_id] = list(top_3.items())

Generating recommendations for first 20 customers...


In [6]:
# Create Lookalike.csv
# One row per customer: the customer's ID followed by the flattened
# (similar customer ID, score formatted to 4 decimals) pairs.
lookalike_data = [
    [
        customer_id,
        *(
            cell
            for similar_id, score in similar_customers
            for cell in (similar_id, f"{score:.4f}")
        ),
    ]
    for customer_id, similar_customers in recommendations.items()
]

# Column layout matching the flattened rows above
columns = [
    'CustomerID',
    'Similar1_ID', 'Similar1_Score',
    'Similar2_ID', 'Similar2_Score',
    'Similar3_ID', 'Similar3_Score',
]
lookalike_df = pd.DataFrame(data=lookalike_data, columns=columns)

# Write the table to disk without the positional index column
lookalike_df.to_csv('FirstName_LastName_Lookalike.csv', index=False)

print("Results for first 5 customers:")
print(lookalike_df.head())
print("\nLookalike.csv has been created successfully!")

Results for first 5 customers:
  CustomerID Similar1_ID Similar1_Score Similar2_ID Similar2_Score  \
0      C0001       C0181         0.8534       C0120         0.8053   
1      C0002       C0043         0.8593       C0159         0.8466   
2      C0003       C0091         0.7501       C0129         0.7012   
3      C0004       C0113         0.8356       C0104         0.7993   
4      C0005       C0186         0.9141       C0007         0.8934   

  Similar3_ID Similar3_Score  
0       C0192         0.7761  
1       C0128         0.8317  
2       C0148         0.6911  
3       C0012         0.7556  
4       C0146         0.8902  

Lookalike.csv has been created successfully!
