# In [7]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
import numpy as np

# --- Input data -------------------------------------------------------------
# Three CSV exports: customer master data, product catalogue, transaction log.
customers_df = pd.read_csv(r"C:\Users\harsh\Downloads\Customers.csv")
products_df = pd.read_csv(r"C:\Users\harsh\Downloads\Products.csv")
transactions_df = pd.read_csv(r"C:\Users\harsh\Downloads\Transactions (1).csv")

# Parse the date columns so min/max and subtraction below operate on datetimes.
customers_df['SignupDate'] = pd.to_datetime(customers_df['SignupDate'])
transactions_df['TransactionDate'] = pd.to_datetime(transactions_df['TransactionDate'])

# --- Join -------------------------------------------------------------------
# One row per transaction, enriched first with customer and then with product
# attributes (default inner joins on the shared key columns).
merged_df = pd.merge(transactions_df, customers_df, on='CustomerID')
merged_df = pd.merge(merged_df, products_df, on='ProductID')

# Pick whichever price column the merge produced; None when neither exists.
price_column = next(
    (name for name in ('Price_x', 'UnitPrice') if name in merged_df.columns),
    None,
)

# --- Per-customer features --------------------------------------------------
# The key order here determines the column order of the aggregated frame,
# which the positional renaming below relies on.
aggregation_dict = {
    'TotalValue': 'sum',
    'Quantity': 'sum',
    'ProductID': 'nunique',
    'TransactionDate': ['min', 'max'],
}
if price_column is not None:
    aggregation_dict[price_column] = 'mean'

customer_features = merged_df.groupby('CustomerID').agg(aggregation_dict).reset_index()

# Replace the aggregated column labels with flat, descriptive names
# (same order as the aggregation spec above).
column_names = [
    'CustomerID',
    'TotalValue',
    'Quantity',
    'UniqueProducts',
    'FirstPurchase',
    'LastPurchase',
] + (['AvgPrice'] if price_column is not None else [])
customer_features.columns = column_names

# Days between a customer's first and last transaction; the two raw
# timestamps are only needed to derive this value and are dropped afterwards.
purchase_span = customer_features['LastPurchase'] - customer_features['FirstPurchase']
customer_features['CustomerLifetime'] = purchase_span.dt.days
customer_features = customer_features.drop(columns=['FirstPurchase', 'LastPurchase'])

# --- Scaling, projection and similarity -------------------------------------
# Numeric columns fed into the model, in a fixed order.
feature_cols = ['TotalValue', 'Quantity', 'UniqueProducts', 'CustomerLifetime'] + (
    ['AvgPrice'] if price_column is not None else []
)

# Standardise each feature so no single column dominates by scale.
scaler = StandardScaler()
customer_features_scaled = scaler.fit_transform(customer_features[feature_cols])

# Project the standardised features onto two principal components.
pca = PCA(n_components=2)
customer_features_pca = pca.fit_transform(customer_features_scaled)

# Pairwise cosine similarity between customers; row/column i corresponds to
# row i of customer_features.
# NOTE(review): the similarity is computed on the 2-component projection, so
# it depends only on the direction of each 2-D point -- confirm this is the
# intended notion of "similar".
similarity_matrix = cosine_similarity(customer_features_pca)

def find_similar_customers(customer_index, similarity_matrix, top_n=3, customer_ids=None):
    """Return the ``top_n`` customers most similar to the one at ``customer_index``.

    The query customer is excluded by position rather than by assuming it
    sorts first: another customer can tie with (or, through floating-point
    rounding, marginally exceed) the self-similarity score, in which case
    simply skipping the top-ranked entry would drop a genuine lookalike and
    return the query customer as its own match.

    Args:
        customer_index: Positional row of the query customer in
            ``similarity_matrix``.
        similarity_matrix: Square matrix (array or nested sequence) where
            entry ``[i][j]`` is the similarity between customers ``i`` and
            ``j``.
        top_n: Maximum number of lookalikes to return; values <= 0 yield an
            empty list.
        customer_ids: Optional iterable of customer IDs aligned with the rows
            of ``similarity_matrix``. When omitted, the module-level
            ``customer_features['CustomerID']`` column is used.

    Returns:
        A list of ``(customer_id, score)`` tuples ordered by descending
        score (ties keep row order), with each score as a plain ``float``.
    """
    if customer_ids is None:
        customer_ids = customer_features['CustomerID']
    ids = list(customer_ids)

    scores = np.asarray(similarity_matrix[customer_index], dtype=float)
    # Normalise a negative index so the self-exclusion test below still matches.
    self_position = customer_index % len(scores)

    # Stable sort on the negated scores: descending similarity, with ties
    # resolved deterministically by row order.
    ranked = np.argsort(-scores, kind='stable')

    neighbours = [int(pos) for pos in ranked if pos != self_position][:max(top_n, 0)]
    return [(ids[pos], float(scores[pos])) for pos in neighbours]

# Lookalikes for the first 20 customers; enumerate() positions line up with
# the rows of similarity_matrix because both follow customer_features order.
lookalike_results = {
    source_id: find_similar_customers(position, similarity_matrix)
    for position, source_id in enumerate(customer_features['CustomerID'].iloc[:20])
}

# Flatten each customer's matches into one wide row:
# [CustomerID, id1, score1, id2, score2, id3, score3], scores rounded to 4 dp.
lookalike_list = [
    [source_id] + [
        cell
        for match_id, match_score in matches
        for cell in (match_id, round(match_score, 4))
    ]
    for source_id, matches in lookalike_results.items()
]

# Write the wide table to disk without the pandas index column.
lookalike_df = pd.DataFrame(
    lookalike_list,
    columns=['CustomerID', 'Lookalike1', 'Score1', 'Lookalike2', 'Score2', 'Lookalike3', 'Score3'],
)
lookalike_df.to_csv('Lookalike.csv', index=False)