In [9]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Read the customer and transaction tables from CSV
customers = pd.read_csv("/content/Customers.csv")
transactions = pd.read_csv("/content/Transactions.csv")

# Merge customer and transaction data: attach the customer columns to every
# transaction row (default inner join on CustomerID)
merged_data = transactions.merge(customers, on="CustomerID")

## Feature Engineering

In [10]:
# Build one row per customer: total spending, number of transactions, and
# average transaction value, then join the customer table's columns back on
# by CustomerID.
customer_features = (
    merged_data
    .groupby("CustomerID")
    .agg(
        TotalSpending=("TotalValue", "sum"),
        NumTransactions=("TransactionID", "count"),
        AvgTransactionValue=("TotalValue", "mean"),
    )
    .reset_index()
    .merge(customers, on="CustomerID")
)

In [11]:
# One-hot encode the categorical Region column into a dense indicator array
encoder = OneHotEncoder()
region_column = customer_features[["Region"]]
encoded_regions = encoder.fit_transform(region_column).toarray()
encoded_regions_df = pd.DataFrame(
    encoded_regions,
    columns=encoder.get_feature_names_out(["Region"]),
)

# Standardize the numerical spending features
numeric_columns = ["TotalSpending", "NumTransactions", "AvgTransactionValue"]
scaler = StandardScaler()
scaled_features = scaler.fit_transform(customer_features[numeric_columns])

# Combine all features column-wise: scaled numeric columns first, then the region indicators
features = np.concatenate([scaled_features, encoded_regions], axis=1)

# Pairwise cosine similarity between customers (row i vs. row j of `features`)
similarity_matrix = cosine_similarity(features)


In [12]:
# Build lookalike recommendations: for each target customer, find the most
# similar *other* customers by cosine similarity and save them to a CSV.
TOP_N = 3  # number of lookalikes reported per customer

# Target customers C0001 - C0020, built once as a set for fast membership tests
target_ids = {f"C{n:04d}" for n in range(1, 21)}

# Row i of similarity_matrix corresponds to row i of customer_features
customer_ids = customer_features["CustomerID"].to_numpy()

# Maps each target CustomerID to a list of (lookalike CustomerID, similarity score)
lookalike_map = {}

for i, customer_id in enumerate(customer_ids):
    if customer_id not in target_ids:
        continue

    # Work on a copy: similarity_matrix[i] is a view, so writing into it
    # directly would overwrite the diagonal of the shared similarity matrix.
    similarity_scores = similarity_matrix[i].copy()

    # Exclude the customer itself. -inf is used rather than -1 because -1 is
    # a valid cosine similarity and could tie with a real candidate.
    similarity_scores[i] = -np.inf

    # Indices ordered from most to least similar. Dropping i explicitly
    # guarantees no self-match even with fewer than TOP_N other customers.
    ranked_indices = similarity_scores.argsort()[::-1]
    top_indices = ranked_indices[ranked_indices != i][:TOP_N]

    # Customer IDs and similarity scores of the top matches
    lookalike_map[customer_id] = [
        (customer_ids[idx], similarity_scores[idx]) for idx in top_indices
    ]

# Flatten the map into one row per (customer, lookalike) pair. Explicit column
# names keep the frame well-formed even when no target customer was found.
lookalike_records = [
    (source_id, lookalike_id, score)
    for source_id, matches in lookalike_map.items()
    for lookalike_id, score in matches
]
lookalike_df = pd.DataFrame(
    lookalike_records,
    columns=["CustomerID", "LookalikeCustomerID", "SimilarityScore"],
)

# Save to CSV
lookalike_df.to_csv("Lookalike.csv", index=False)

print("Lookalike model completed. Results saved to Lookalike.csv.")


Lookalike model completed. Results saved to Lookalike.csv.
