In [19]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import warnings
# NOTE(review): this silences every warning category for the rest of the
# session, including deprecation and numeric warnings raised by the libraries
# used below. Consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [24]:
class CustomerLookalikeModel:
    """Find similar customers ("lookalikes") from profile and purchase data.

    Typical usage: ``load_data`` reads the three CSV files, ``fit`` builds and
    standardises one feature vector per customer, and ``find_lookalikes`` ranks
    the other customers by cosine similarity to a given customer.
    """

    def __init__(self):
        # Standardised feature matrix, one row per customer; populated by fit().
        self.feature_matrix = None
        # Customer IDs aligned row-for-row with feature_matrix; populated by fit().
        self.customer_ids = None
        # Scaler that fit() fits on the raw feature table.
        self.scaler = StandardScaler()

    def load_data(self, customers_path='Customers.csv',
                  products_path='Products.csv',
                  transactions_path='Transactions.csv'):
        """Load the three datasets and parse their date columns.

        Args:
            customers_path: CSV with (at least) ``CustomerID``, ``Region`` and
                ``SignupDate`` columns.
            products_path: CSV with (at least) ``ProductID`` and ``Category``.
            transactions_path: CSV with (at least) ``TransactionID``,
                ``CustomerID``, ``ProductID``, ``TransactionDate``,
                ``Quantity`` and ``TotalValue``.

        Returns:
            Tuple ``(customers_df, products_df, transactions_df)``.
        """
        customers_df = pd.read_csv(customers_path)
        products_df = pd.read_csv(products_path)
        transactions_df = pd.read_csv(transactions_path)

        # Convert dates to datetime
        customers_df['SignupDate'] = pd.to_datetime(customers_df['SignupDate'])
        transactions_df['TransactionDate'] = pd.to_datetime(transactions_df['TransactionDate'])

        return customers_df, products_df, transactions_df

    def create_customer_features(self, customers_df, transactions_df, products_df):
        """Build one numeric feature row per customer.

        Features combine transaction aggregates (count, spend, quantity),
        the span in days between first and last purchase, a capped purchase
        frequency, the share of spend per product category, and one-hot
        encoded region.

        Args:
            customers_df: Customer table with ``CustomerID`` and ``Region``.
            transactions_df: Transaction table (see ``load_data``);
                ``TransactionDate`` must already be datetime.
            products_df: Product table with ``ProductID`` and ``Category``.

        Returns:
            DataFrame indexed by customer ID with float feature columns and no
            NaN/inf values. Customers that appear in only some of the inputs
            (e.g. no transactions) get zeros for the missing features.
        """
        by_customer = transactions_df.groupby('CustomerID')

        # Basic transaction metrics (named aggregation gives flat column names).
        transaction_features = by_customer.agg(
            transaction_count=('TransactionID', 'count'),
            total_spend=('TotalValue', 'sum'),
            avg_transaction_value=('TotalValue', 'mean'),
            total_quantity=('Quantity', 'sum'),
            avg_quantity=('Quantity', 'mean'),
        ).fillna(0)

        # Days between each customer's first and last transaction.
        date_span = by_customer['TransactionDate'].agg(['min', 'max'])
        customer_lifetime = (date_span['max'] - date_span['min']).dt.days

        transaction_features['customer_lifetime'] = customer_lifetime.fillna(1)
        # Lifetime is floored at 1 day so single-purchase customers do not
        # divide by zero; the result is capped at 1 to avoid extreme values.
        transaction_features['purchase_frequency'] = (
            transaction_features['transaction_count']
            / transaction_features['customer_lifetime'].clip(lower=1)
        ).clip(upper=1)

        # Product category preferences: spend per category for each customer.
        category_pivot = pd.merge(transactions_df,
                                  products_df[['ProductID', 'Category']],
                                  on='ProductID')
        category_spend = pd.pivot_table(
            category_pivot,
            values='TotalValue',
            index='CustomerID',
            columns='Category',
            aggfunc='sum',
            fill_value=0
        )

        # Convert to shares of total spend; a zero total divides by 1 instead.
        category_totals = category_spend.sum(axis=1)
        category_spend_pct = category_spend.div(
            category_totals.where(category_totals != 0, 1), axis=0
        ).add_prefix('category_')

        # Region encoding. dtype=float keeps the indicator columns numeric
        # regardless of the pandas version's default dummy dtype.
        region_dummies = pd.get_dummies(
            customers_df.set_index('CustomerID')['Region'],
            prefix='region',
            dtype=float
        )

        # Column-wise concat aligns on the union of customer IDs, so customers
        # missing from one table still get a row (filled with zeros below).
        features = pd.concat([
            transaction_features,
            category_spend_pct,
            region_dummies
        ], axis=1)

        # Final cleaning: no NaN, no infinities, everything numeric.
        features = features.fillna(0)
        features = features.replace([np.inf, -np.inf], 0)
        features = features.apply(pd.to_numeric, errors='coerce')
        features = features.fillna(0)

        return features

    def fit(self, customers_df, transactions_df, products_df):
        """Build customer features and store their standardised matrix.

        Args:
            customers_df, transactions_df, products_df: Tables as returned by
                ``load_data``.

        Returns:
            ``self``, to allow call chaining.
        """
        features = self.create_customer_features(customers_df, transactions_df, products_df)

        # Store customer IDs (row order matches the feature matrix).
        self.customer_ids = features.index

        # Scale features, then guard against any non-finite values.
        self.feature_matrix = self.scaler.fit_transform(features)
        self.feature_matrix = np.nan_to_num(self.feature_matrix, nan=0.0, posinf=0.0, neginf=0.0)

        return self

    def find_lookalikes(self, customer_id, n_recommendations=3):
        """Return the customers most similar to ``customer_id``.

        Args:
            customer_id: ID of the customer to find lookalikes for.
            n_recommendations: Maximum number of lookalikes to return; fewer
                are returned if there are not enough other customers.

        Returns:
            DataFrame with columns ``customer_id`` and ``similarity_score``,
            ordered from most to least similar. The queried customer is never
            included.

        Raises:
            RuntimeError: If the model has not been fitted.
            ValueError: If ``customer_id`` is not in the fitted data.
        """
        if self.feature_matrix is None or self.customer_ids is None:
            raise RuntimeError("Model has not been fitted; call fit() first")

        if customer_id not in self.customer_ids:
            raise ValueError(f"Customer ID {customer_id} not found in the dataset")

        # Get customer index
        customer_idx = np.where(self.customer_ids == customer_id)[0][0]

        # Get customer vector and ensure it's clean
        customer_vector = self.feature_matrix[customer_idx].reshape(1, -1)
        customer_vector = np.nan_to_num(customer_vector, nan=0.0, posinf=0.0, neginf=0.0)

        # Calculate similarity scores
        similarity_scores = cosine_similarity(customer_vector, self.feature_matrix)[0]

        # Rank by descending similarity. The stable sort makes tie order
        # deterministic (original row order).
        ranked = np.argsort(-similarity_scores, kind='stable')

        # Exclude the queried customer by index rather than by rank position:
        # with tied scores (duplicate or all-zero vectors) the customer is not
        # guaranteed to sort first.
        ranked = ranked[ranked != customer_idx]
        similar_indices = ranked[:max(n_recommendations, 0)]

        # Create recommendations dictionary
        recommendations = {
            'customer_id': self.customer_ids[similar_indices],
            'similarity_score': similarity_scores[similar_indices]
        }

        return pd.DataFrame(recommendations)

In [26]:
def main():
    """Run the lookalike pipeline end to end and write 'Lookalike.csv'.

    Loads the data, fits a CustomerLookalikeModel, looks up lookalikes for the
    first 20 customers in the customer table, saves one row per source
    customer to 'Lookalike.csv', and returns that table as a DataFrame.
    Any exception is reported on stdout and then re-raised.
    """
    lookalike_model = CustomerLookalikeModel()

    try:
        print("Loading data...")
        customers_df, products_df, transactions_df = lookalike_model.load_data()

        print("Fitting model...")
        lookalike_model.fit(customers_df, transactions_df, products_df)

        print("Generating recommendations...")
        rows = []
        first_twenty = customers_df['CustomerID'].iloc[:20]
        for source_id in first_twenty:
            matches = lookalike_model.find_lookalikes(source_id)
            matched_ids = matches['customer_id'].tolist()
            matched_scores = matches['similarity_score'].round(4).tolist()
            rows.append({
                'source_customer': source_id,
                'lookalike_customers': matched_ids,
                'similarity_scores': matched_scores,
            })

        # One row per source customer; list-valued columns are written as-is.
        results_df = pd.DataFrame(rows)
        results_df.to_csv('Lookalike.csv', index=False)
        print("Results saved to 'Lookalike.csv'")

        return results_df

    except Exception as e:
        # Report the failure, then let it propagate to the caller.
        print(f"An error occurred: {str(e)}")
        raise

# Entry point: run the pipeline when this code executes as the main module and
# keep the returned results table in `results`.
if __name__ == "__main__":
    results = main()

Loading data...
Fitting model...
Generating recommendations...
Results saved to 'Lookalike.csv'
