In [1]:
# Mount Google Drive at /content/gdrive/ so that the CSV files referenced
# later in this notebook (under '/content/gdrive/My Drive/') can be read and
# written. The google.colab module is only importable inside Colab.
from google.colab import drive
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

class CustomerLookalikeModel:
    """
    Recommend similar ("lookalike") customers from profile and purchase data.

    Typical use: call ``load_data`` with the customers, products and
    transactions frames, then ``get_lookalikes`` for a single customer or
    ``generate_lookalike_csv`` for a range of customers.
    """

    def __init__(self):
        # One row per customer: 'CustomerID' plus the standardized features.
        self.customer_features = None
        # Square cosine-similarity matrix; row/column i belongs to the i-th
        # entry (by position) of ``customer_ids``. Computed lazily.
        self.similarity_matrix = None
        # Series of customer IDs (as strings) in feature-row order.
        self.customer_ids = None

    @staticmethod
    def _parse_dates(series):
        """
        Parse a column of date strings into datetimes.

        ``format='mixed'`` is tried first; if pandas rejects it with a
        ValueError the column is parsed again with default format inference
        (the deprecated ``infer_datetime_format`` flag is no longer used).
        """
        try:
            return pd.to_datetime(series, format='mixed')
        except ValueError:
            return pd.to_datetime(series)

    def load_data(self, customers_df, products_df, transactions_df):
        """
        Load and prepare data for the lookalike model

        Builds one feature row per customer, standardizes every feature with
        ``StandardScaler`` and stores the result on the instance. Any
        previously computed similarity matrix is discarded. The input frames
        are not modified.

        Columns read: customers_df -> CustomerID, Region, SignupDate;
        products_df -> ProductID, Category; transactions_df -> CustomerID,
        TransactionID, ProductID, TransactionDate, Quantity, TotalValue, Price.
        """
        features = self._assemble_features(customers_df, products_df, transactions_df)

        self.customer_ids = features['CustomerID']
        feature_columns = features.columns.difference(['CustomerID'])

        scaler = StandardScaler()
        features[feature_columns] = scaler.fit_transform(features[feature_columns])

        self.customer_features = features
        # A matrix computed from earlier data would no longer match the rows.
        self.similarity_matrix = None

    def _assemble_features(self, customers_df, products_df, transactions_df):
        """
        Build the unscaled feature table: exactly one row per customer row.

        Returns a DataFrame with a fresh 0..n-1 index holding 'CustomerID',
        'signup_days', one 'region_*' indicator per region, the transaction
        aggregates and the per-category purchase shares. Customers without
        any transaction get 0 for all purchase-derived features.
        """
        # Work on copies so the caller's frames keep their original columns/dtypes.
        customers = customers_df.copy().reset_index(drop=True)
        transactions = transactions_df.copy()

        customers['CustomerID'] = customers['CustomerID'].astype(str)
        transactions['CustomerID'] = transactions['CustomerID'].astype(str)

        customers['SignupDate'] = self._parse_dates(customers['SignupDate'])
        transactions['TransactionDate'] = self._parse_dates(transactions['TransactionDate'])

        # Days elapsed since the earliest signup in the data.
        min_date = customers['SignupDate'].min()
        features = pd.DataFrame({
            'CustomerID': customers['CustomerID'],
            'signup_days': (customers['SignupDate'] - min_date).dt.days,
        })

        region_dummies = pd.get_dummies(customers['Region'], prefix='region', dtype=float)
        # Same row index as ``features``, so positional concatenation is safe here.
        features = pd.concat([features, region_dummies], axis=1)

        customer_transactions = self._create_transaction_features(transactions)
        product_preferences = self._create_product_preferences(transactions, products_df)

        # These two frames are indexed by CustomerID, not by row number, so
        # they must be joined on the ID. Concatenating them column-wise would
        # align on the index and attach the values to the wrong (new) rows.
        features = features.join(customer_transactions, on='CustomerID')
        features = features.join(product_preferences, on='CustomerID')

        features = features.fillna(0).reset_index(drop=True)

        # Uniform float dtype so the scaled values can be written back cleanly.
        feature_columns = features.columns.difference(['CustomerID'])
        features[feature_columns] = features[feature_columns].astype(float)

        return features

    def _create_transaction_features(self, transactions_df):
        """
        Create customer-level features from transaction data

        Returns a DataFrame indexed by CustomerID with the columns
        TransactionID_count, TotalValue_sum/mean/std, Quantity_sum/mean,
        Price_mean (rounded to 2 decimals) and purchase_timespan (days
        between the customer's first and last transaction). Expects
        'TransactionDate' to already be a datetime column.
        """
        grouped = transactions_df.groupby('CustomerID')

        transaction_features = grouped.agg({
            'TransactionID': 'count',
            'TotalValue': ['sum', 'mean', 'std'],
            'Quantity': ['sum', 'mean'],
            'Price': 'mean'
        }).round(2)

        # Flatten the (column, statistic) MultiIndex into 'column_statistic'.
        transaction_features.columns = [
            f'{col[0]}_{col[1]}' if col[1] else col[0]
            for col in transaction_features.columns
        ]

        # The std of a single transaction is NaN; treat it as no spread.
        transaction_features = transaction_features.fillna(0)

        purchase_dates = grouped['TransactionDate'].agg(['min', 'max'])
        transaction_features['purchase_timespan'] = (
            purchase_dates['max'] - purchase_dates['min']
        ).dt.days

        transaction_features['purchase_timespan'] = transaction_features['purchase_timespan'].fillna(0)

        return transaction_features

    def _create_product_preferences(self, transactions_df, products_df):
        """
        Create customer-level features for product category preferences

        Returns a DataFrame indexed by CustomerID with one column per
        category; each row holds the fraction of that customer's
        transactions falling in each category (rows sum to 1).
        Transactions whose ProductID is absent from ``products_df`` are
        dropped by the inner merge.
        """
        txn_products = transactions_df.merge(
            products_df[['ProductID', 'Category']],
            on='ProductID'
        )

        category_preferences = pd.crosstab(
            txn_products['CustomerID'],
            txn_products['Category'],
            normalize='index'
        )

        return category_preferences

    def calculate_similarity(self):
        """
        Calculate similarity matrix between customers

        Stores the pairwise cosine similarity of all feature rows in
        ``self.similarity_matrix``.

        Raises:
            ValueError: if ``load_data`` has not been called yet.
        """
        if self.customer_features is None:
            raise ValueError("No customer data loaded; call load_data() first.")

        feature_columns = self.customer_features.columns.difference(['CustomerID'])
        self.similarity_matrix = cosine_similarity(
            self.customer_features[feature_columns]
        )

    def get_lookalikes(self, customer_id, n_recommendations=3):
        """
        Get top N similar customers for a given customer ID

        Args:
            customer_id: ID of the customer to find lookalikes for; it is
                compared as a string, matching how ``load_data`` stores IDs.
            n_recommendations: maximum number of lookalikes to return.

        Returns:
            DataFrame with columns 'similar_customer_id' and
            'similarity_score', most similar first. The customer itself is
            never included.

        Raises:
            IndexError: if ``customer_id`` is unknown (exception type kept
                from the previous implementation for compatibility).
        """
        if self.similarity_matrix is None:
            self.calculate_similarity()

        # Locate the customer by POSITION: matrix rows are positional, while
        # the Series index labels are not guaranteed to be 0..n-1.
        matches = np.flatnonzero((self.customer_ids == str(customer_id)).to_numpy())
        if matches.size == 0:
            raise IndexError(f"Customer ID {customer_id!r} not found in loaded data")
        customer_pos = int(matches[0])

        customer_similarities = np.asarray(self.similarity_matrix[customer_pos])

        # Rank everyone, then drop the customer explicitly. Simply skipping
        # the first entry is wrong when another customer ties at the top.
        ranked = np.argsort(customer_similarities, kind='stable')[::-1]
        ranked = ranked[ranked != customer_pos]
        similar_positions = ranked[:max(int(n_recommendations), 0)]

        recommendations = pd.DataFrame({
            'similar_customer_id': self.customer_ids.iloc[similar_positions],
            'similarity_score': customer_similarities[similar_positions]
        })

        return recommendations

    def generate_lookalike_csv(self, start_id='C0001', end_id='C0020',
                               output_path='/content/gdrive/My Drive/Lookalike.csv',
                               n_recommendations=3):
        """
        Generate lookalike recommendations CSV for a range of customer IDs

        Args:
            start_id: first customer ID of the range (inclusive).
            end_id: last customer ID of the range (inclusive).
            output_path: where to write the CSV; pass None to skip writing.
            n_recommendations: number of lookalikes per customer.

        Returns:
            DataFrame with columns 'customer_id' and 'lookalikes', where
            'lookalikes' is a list of (customer_id, score) tuples with the
            score rounded to 3 decimals. Customers whose lookup fails are
            reported on stdout and skipped.
        """
        results = []

        # IDs are compared as zero-padded strings, i.e. lexicographically.
        padded_ids = self.customer_ids.str.zfill(5)
        customer_range = self.customer_ids[
            (padded_ids >= start_id.zfill(5)) &
            (padded_ids <= end_id.zfill(5))
        ]

        for customer_id in sorted(customer_range):
            try:
                recommendations = self.get_lookalikes(customer_id, n_recommendations)
                # float() keeps plain Python numbers in the list so its text
                # form in the CSV stays '0.123' rather than a numpy repr.
                similar_customers = [
                    (similar_id, round(float(score), 3))
                    for similar_id, score in zip(
                        recommendations['similar_customer_id'],
                        recommendations['similarity_score']
                    )
                ]

                results.append({
                    'customer_id': customer_id,
                    'lookalikes': similar_customers
                })
            except Exception as e:
                # Best effort: report the failure and continue with the rest.
                print(f"Error processing customer {customer_id}: {str(e)}")
                continue

        # Explicit columns keep the CSV header even when no customer matched.
        output_df = pd.DataFrame(results, columns=['customer_id', 'lookalikes'])
        if output_path is not None:
            output_df.to_csv(output_path, index=False)

        return output_df

def print_data_info(df, name):
    """
    Print a short overview of a DataFrame to stdout.

    Emits, in order: a header containing ``name``, the shape, the list of
    column names, the first rows (``df.head()``) and the column dtypes.
    """
    # Each tuple holds the arguments of one print() call, in output order.
    report_lines = (
        (f"\n{name} Info:",),
        (f"Shape: {df.shape}",),
        ("Columns:", df.columns.tolist()),
        ("\nFirst few rows:",),
        (df.head(),),
        ("\nData Types:",),
        (df.dtypes,),
    )
    for parts in report_lines:
        print(*parts)

def main():
    """
    Run the lookalike pipeline end to end.

    Reads the three CSV inputs from Google Drive, prints an overview of
    each, fits a CustomerLookalikeModel, writes the lookalike CSV and prints
    the lookalikes for customer 'C0001'. Any exception is reported on
    stdout together with a traceback instead of being propagated.
    """
    # (display name, CSV path) for each input table, in load order.
    sources = (
        ("Customers", '/content/gdrive/My Drive/Customers.csv'),
        ("Products", '/content/gdrive/My Drive/Products.csv'),
        ("Transactions", '/content/gdrive/My Drive/Transactions.csv'),
    )

    try:
        # Read every file first so a missing input fails before any output.
        frames = {label: pd.read_csv(path) for label, path in sources}

        for label, frame in frames.items():
            print_data_info(frame, label)

        model = CustomerLookalikeModel()
        model.load_data(frames["Customers"], frames["Products"], frames["Transactions"])

        model.generate_lookalike_csv()
        print("\nLookalike recommendations generated successfully!")

        customer_id = 'C0001'
        print(f"\nTop 3 lookalikes for customer {customer_id}:", end="\n")
        print(model.get_lookalikes(customer_id))

    except Exception as e:
        print(f"An error occurred: {str(e)}")
        import traceback
        traceback.print_exc()
        print("Please check your data format and ensure all required columns are present.")

# Run the pipeline only when this code is executed directly (as a script or
# a notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()


Customers Info:
Shape: (200, 4)
Columns: ['CustomerID', 'CustomerName', 'Region', 'SignupDate']

First few rows:
  CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15

Data Types:
CustomerID      object
CustomerName    object
Region          object
SignupDate      object
dtype: object

Products Info:
Shape: (100, 4)
Columns: ['ProductID', 'ProductName', 'Category', 'Price']

First few rows:
  ProductID              ProductName     Category   Price
0      P001     ActiveWear Biography        Books  169.30
1      P002    ActiveWear Smartwatch  Electronics  346.30
2      P003  ComfortLiving Biography        Books   44.12
3      P004            BookWorld Rug   Home Decor   95.69
4      P0