# In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity

class LookalikeFinder:
    """Find the most similar ("lookalike") customers from transaction rows.

    The input frame is aggregated to one row per ``CustomerID.1``; the
    numeric aggregates are standardised, ``Region`` is one-hot encoded, and
    similarity between customers is measured as cosine similarity in that
    feature space.

    Attributes set by :meth:`preprocess_data`:
        customer_features: per-customer aggregate DataFrame.
        preprocessor: the fitted ``ColumnTransformer``.
        X: transformed feature matrix, one row per customer.
        customer_ids: array of customer IDs aligned with the rows of ``X``.
    """

    def __init__(self, data):
        """Store ``data`` and immediately build the customer feature matrix.

        Args:
            data: DataFrame containing at least the columns ``CustomerID.1``,
                ``Region``, ``TotalValue``, ``ProductID`` and ``ProductName``.
        """
        self.data = data
        self.preprocess_data()

    def preprocess_data(self):
        """Aggregate rows per customer, encode the features and fit the index.

        Raises:
            KeyError: if one of the required columns is missing from the data.
        """
        # One row per customer: first region seen, total spend, number of
        # distinct products, and the list of distinct product names.
        self.customer_features = self.data.groupby('CustomerID.1').agg({
            'Region': 'first',
            'TotalValue': 'sum',
            'ProductID': 'nunique',
            'ProductName': lambda x: list(set(x)),
        }).reset_index()

        numeric_features = ['TotalValue', 'ProductID']
        categorical_features = ['Region']

        self.preprocessor = ColumnTransformer(
            transformers=[
                ('num', StandardScaler(), numeric_features),
                ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
            ])

        # Only the numeric and categorical columns feed the model;
        # ProductName is kept in customer_features but is not used here.
        self.X = self.preprocessor.fit_transform(
            self.customer_features[numeric_features + categorical_features])
        self.customer_ids = self.customer_features['CustomerID.1'].values

        # Fit the neighbour index once here instead of on every query; the
        # number of neighbours is supplied per call to ``kneighbors``.
        self._knn = NearestNeighbors(metric='cosine').fit(self.X)

    def knn_top_recommendations(self, target_customer, n_neighbors=3):
        """Return the customers most similar to ``target_customer``.

        Args:
            target_customer: ID to look up in ``customer_ids``.
            n_neighbors: maximum number of lookalikes to return.

        Returns:
            A list of dicts with keys ``'CustomerID.1'`` and
            ``'Similarity Score'`` (1 minus the cosine distance), ordered from
            most to least similar. The list is empty when the customer is
            unknown and may be shorter than ``n_neighbors`` when there are
            not enough other customers.

        Raises:
            ValueError: if ``n_neighbors`` is negative.
        """
        if n_neighbors < 0:
            raise ValueError('n_neighbors must be non-negative')

        # Look the customer up before touching the model so that unknown IDs
        # are cheap and never reach the neighbour search.
        matches = np.flatnonzero(self.customer_ids == target_customer)
        if matches.size == 0 or n_neighbors == 0:
            return []
        target_pos = int(matches[0])

        # Ask for one extra neighbour (the target itself), but never more
        # than the number of fitted customers, which kneighbors rejects.
        k = min(n_neighbors + 1, self.X.shape[0])
        distances, indices = self._knn.kneighbors(
            self.X[[target_pos]], n_neighbors=k)

        # Exclude the target by position rather than assuming it is the first
        # hit: customers with identical or colinear features tie at distance
        # 0 and may be returned ahead of the target.
        recommendations = [
            {
                'CustomerID.1': self.customer_ids[idx],
                'Similarity Score': 1 - dist,
            }
            for idx, dist in zip(indices[0], distances[0])
            if idx != target_pos
        ]
        return recommendations[:n_neighbors]

    @staticmethod
    def _build_row(target_customer, recommendations, n_neighbors):
        """Flatten one customer's recommendations into a single CSV row.

        Slots beyond the available recommendations are filled with ``None``
        so that every row has the same ``n_neighbors`` column pairs.
        """
        row = {'CustomerID.1': target_customer}
        for rank in range(1, n_neighbors + 1):
            rec = recommendations[rank - 1] if rank <= len(recommendations) else None
            row[f'Lookalike{rank}'] = None if rec is None else rec['CustomerID.1']
            row[f'Similarity Score{rank}'] = None if rec is None else rec['Similarity Score']
        return row

    def save_lookalikes(self, output_file='Lookalike.csv', top_n_customers=20,
                        n_neighbors=3):
        """Write lookalikes for customers ``C0001`` .. ``C{top_n_customers:04d}``.

        Args:
            output_file: path of the CSV file to write.
            top_n_customers: number of sequential customer IDs to process,
                generated as ``C`` followed by a zero-padded 4-digit number.
            n_neighbors: number of lookalike/score column pairs per customer.

        Returns:
            The DataFrame that was written to ``output_file``.
        """
        lookalike_data = []
        for i in range(1, top_n_customers + 1):
            target_customer = f'C{i:04d}'  # Format as C0001, C0002, etc.
            recommendations = self.knn_top_recommendations(
                target_customer, n_neighbors=n_neighbors)
            lookalike_data.append(
                self._build_row(target_customer, recommendations, n_neighbors))

        lookalike_df = pd.DataFrame(lookalike_data)
        lookalike_df.to_csv(output_file, index=False)
        return lookalike_df

# Example usage
if __name__ == '__main__':
    # Read the input table from disk.
    df = pd.read_csv('/content/final.csv')

    # Constructing the finder also runs its preprocessing step.
    lookalike_model = LookalikeFinder(df)

    # Export the lookalikes for customers C0001 through C0020.
    lookalike_model.save_lookalikes('Lookalike.csv', 20)
