In [71]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity


In [72]:
# Load the datasets
customers_df =pd.read_csv(r'C:\Users\Admin\ecommerce_analysis\data\Customers.csv')
products_df = pd.read_csv(r'C:\Users\Admin\ecommerce_analysis\data\Products.csv')
transactions_df = pd.read_csv(r'C:\Users\Admin\ecommerce_analysis\data\Transactions.csv')

In [73]:
# Convert date columns to datetime
customers_df['SignupDate'] = pd.to_datetime(customers_df['SignupDate'])
transactions_df['TransactionDate'] = pd.to_datetime(transactions_df['TransactionDate'])


In [74]:
# Create customer features
def create_customer_features():
    # Transaction-based features
    transaction_features = transactions_df.groupby('CustomerID').agg({
        'TotalValue': ['sum', 'mean', 'std'],
        'Quantity': ['sum', 'mean'],
        'TransactionID': 'count'
    }).fillna(0)
    

In [75]:
# Flatten column names
transaction_features.columns = ['_'.join(col).strip() for col in transaction_features.columns.values]
    

In [76]:
# Product category preferences
category_pivot = pd.merge(transactions_df, products_df[['ProductID', 'Category']], on='ProductID')
category_features = pd.crosstab(category_pivot['CustomerID'], category_pivot['Category'])
    

In [77]:
# Customer recency
last_purchase = transactions_df.groupby('CustomerID')['TransactionDate'].max()
first_purchase = transactions_df.groupby('CustomerID')['TransactionDate'].min()
customer_age = (last_purchase - first_purchase).dt.days

In [78]:
# Combine features
def create_feature_matrix(transaction_features, category_features, customer_age):
    feature_matrix = pd.concat([
        transaction_features,
        category_features,
        customer_age.rename('customer_age')
    ], axis=1).fillna(0)
    return feature_matrix

In [79]:
def find_lookalikes(customer_id, feature_matrix, n_recommendations=3):
    # Standardize features
    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(feature_matrix)
    
    # Calculate similarity
    similarity_matrix = cosine_similarity(scaled_features)
    
    # Get customer index
    customer_idx = feature_matrix.index.get_loc(customer_id)
    
    # Find most similar customers
    similar_scores = list(enumerate(similarity_matrix[customer_idx]))
    similar_scores = sorted(similar_scores, key=lambda x: x[1], reverse=True)[1:n_recommendations+1]
    
    return [(feature_matrix.index[idx], score) for idx, score in similar_scores]


In [80]:
# Create feature matrix
feature_matrix = create_customer_features()


In [81]:
# Generate lookalikes for first 20 customers
lookalike_results = {}
first_20_customers = customers_df['CustomerID'].head(20).tolist()

for customer_id in first_20_customers:
    try:
        lookalikes = find_lookalikes(customer_id, feature_matrix)
        lookalike_results[customer_id] = lookalikes
    except Exception as e:
        print(f"Error processing customer {customer_id}: {str(e)}")
        lookalike_results[customer_id] = [None, None, None]

# Create and save results DataFrame
results_df = pd.DataFrame.from_dict(lookalike_results, orient='index')
results_df.columns = ['Lookalike_1', 'Lookalike_2', 'Lookalike_3']

# Save results
results_df.to_csv('FirstName_LastName_Lookalike.csv')

# Display results
print("\nLookalike Results:")
print(results_df)

Error processing customer C0001: Expected 2D array, got scalar array instead:
array=nan.
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.
Error processing customer C0002: Expected 2D array, got scalar array instead:
array=nan.
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.
Error processing customer C0003: Expected 2D array, got scalar array instead:
array=nan.
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.
Error processing customer C0004: Expected 2D array, got scalar array instead:
array=nan.
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.
Error processing customer C0005: Expected 2D array, got scalar array instead