In [24]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import OneHotEncoder

def top_lookalikes(similarity_row, self_position, customer_ids, k=3):
    """Return the ``k`` highest-scoring entries of one similarity row.

    Args:
        similarity_row: 1-D sequence of similarity scores; entry ``j`` is the
            score between the target customer and the customer at position
            ``j``.
        self_position: Position of the target customer inside
            ``similarity_row``. This position is never returned.
        customer_ids: Sequence of customer IDs in the same order as
            ``similarity_row``.
        k: Maximum number of lookalikes to return.

    Returns:
        A list of ``[customer_id, score]`` pairs sorted by descending score.
        It is shorter than ``k`` when fewer than ``k`` other customers exist.
    """
    scores = np.asarray(similarity_row, dtype=float)
    # A stable sort on the negated scores gives descending order with ties
    # broken by the lower position, so the result is deterministic.
    order = np.argsort(-scores, kind="stable")
    # Drop the target customer by position instead of assuming it sorts first:
    # another customer can tie with the self-similarity score.
    order = order[order != self_position][:k]
    return [[customer_ids[j], float(scores[j])] for j in order]


# The guard keeps the pipeline running as a script or notebook cell (where
# __name__ == "__main__") while allowing top_lookalikes to be imported without
# touching the CSV files.
if __name__ == "__main__":
    # Step 1: Load the data
    customers = pd.read_csv("Customers.csv")
    products = pd.read_csv("Products.csv")
    transactions = pd.read_csv("Transactions.csv")

    # Step 2: Preprocessing and feature engineering

    # One-hot encode 'Region'; it is the only profile feature used for the
    # content-based part of the score.
    encoder = OneHotEncoder()
    region_encoded = encoder.fit_transform(customers[["Region"]]).toarray()
    region_df = pd.DataFrame(region_encoded, columns=encoder.get_feature_names_out(["Region"]))
    customers = pd.concat([customers, region_df], axis=1)

    # Total quantity and value per (customer, product) pair.
    transaction_data = (
        transactions.groupby(['CustomerID', 'ProductID'])
        .agg({'Quantity': 'sum', 'TotalValue': 'sum'})
        .reset_index()
    )

    # User-product matrix for collaborative filtering: one row per customer,
    # one column per product, cells hold the summed TotalValue (0 if none).
    user_product_matrix = transaction_data.pivot_table(
        index='CustomerID', columns='ProductID', values='TotalValue', fill_value=0
    )

    # Step 3: Align both feature sets on ONE customer order.
    # The rows of user_product_matrix define that order. Customers that appear
    # in only one of the two tables are dropped, because the combined score
    # needs both a purchase vector and a region vector.
    has_profile = user_product_matrix.index.isin(customers["CustomerID"])
    user_product_matrix = user_product_matrix.loc[has_profile]
    aligned_ids = user_product_matrix.index

    customers_aligned = (
        customers.drop_duplicates(subset="CustomerID")  # .loc below needs unique IDs
        .set_index("CustomerID")
        .loc[aligned_ids]
        .reset_index()
    )
    region_df_aligned = customers_aligned[list(region_df.columns)]

    # Step 4: Collaborative Filtering (cosine similarity on transaction history)
    user_similarity_matrix = cosine_similarity(user_product_matrix)

    # Step 5: Content-Based Filtering (cosine similarity on 'Region' only)
    profile_similarity_matrix = cosine_similarity(region_df_aligned)

    # Step 6: Combine Collaborative Filtering and Content-Based Filtering
    # Weighted sum of the two matrices; the weights add up to 1 and give the
    # transaction-based similarity slightly more influence than the region.
    weight_cf = 0.53
    weight_cb = 0.47
    combined_similarity = weight_cf * user_similarity_matrix + weight_cb * profile_similarity_matrix

    # Step 7: Map cust_id -> List[[cust_id, score]] with the top 3 lookalikes
    # for each of the first 20 customers listed in Customers.csv.
    n_targets = 20
    n_lookalikes = 3

    # Row position of every customer inside combined_similarity.
    position_of = {cid: pos for pos, cid in enumerate(aligned_ids)}

    cust_dict = {}
    for customer_id in customers["CustomerID"].head(n_targets):
        position = position_of.get(customer_id)
        if position is None:
            # No row in the similarity matrix (e.g. no transactions): keep the
            # customer in the output, but with no lookalikes.
            cust_dict[customer_id] = []
            continue
        cust_dict[customer_id] = top_lookalikes(
            combined_similarity[position], position, aligned_ids, k=n_lookalikes
        )

    # Step 8: Flatten the dictionary into a DataFrame with 7 columns
    n_columns = 1 + 2 * n_lookalikes
    flattened_results = []
    for customer_id, lookalikes in cust_dict.items():
        row = [customer_id]
        for lookalike in lookalikes:
            row.extend(lookalike)  # lookalike customer ID followed by its score
        # Pad so every row has 7 cells even when fewer than 3 lookalikes exist.
        row.extend([None] * (n_columns - len(row)))
        flattened_results.append(row)

    lookalike_df = pd.DataFrame(flattened_results, columns=["CustomerID",
                                                          "Lookalike_CustomerID_1", "Score_1",
                                                          "Lookalike_CustomerID_2", "Score_2",
                                                          "Lookalike_CustomerID_3", "Score_3"])

    # Step 9: Save the Lookalike Recommendations to a CSV file
    lookalike_df.to_csv("Lookalike.csv", index=False)

    print("Lookalike recommendations saved to Lookalike.csv")


Lookalike recommendations saved to Lookalike.csv
