In [26]:
# Import necessary libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# Lookalike model: build a per-customer feature vector (spend, quantity and
# per-category purchase counts), standardize it, compute pairwise cosine
# similarity, and write the top matches for the first customers to CSV.

# How many customers to report on, and how many lookalikes to keep for each.
N_CUSTOMERS = 20
TOP_N = 3

# Load datasets
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')

# Merge datasets to create a full dataset.
# Inner joins: transactions without a matching customer/product row (and
# customers without any transaction) do not appear in full_data.
transactions_customers = transactions.merge(customers, on="CustomerID", how="inner")
full_data = transactions_customers.merge(products, on="ProductID", how="inner")

# Step 1: Aggregate customer-level data.
# Named aggregation yields flat, explicitly named columns, so no positional
# renaming of a MultiIndex is needed afterwards.
customer_profiles = full_data.groupby("CustomerID").agg(
    TotalSpend=("TotalValue", "sum"),       # Total spend
    AvgSpend=("TotalValue", "mean"),        # Average spend per transaction row
    TotalQuantity=("Quantity", "sum"),      # Total quantity purchased
    Categories=("Category", lambda x: list(x.unique())),  # Unique categories purchased
).reset_index()

# Step 2: One-hot encode the categories and sum per customer, giving the
# number of transaction rows each customer has in every category.
categories = (
    pd.get_dummies(full_data[["CustomerID", "Category"]], columns=["Category"], dtype=int)
    .groupby("CustomerID")
    .sum()
)

# Step 3: Combine customer features with one-hot encoded categories.
# The list-valued "Categories" column is non-numeric, so it is left out.
customer_features = customer_profiles.drop(columns=["Categories"]).merge(categories, on="CustomerID")

# Step 4: Normalize numerical data.
# Scale into a fresh float frame rather than assigning floats in place over
# integer columns via .iloc (deprecated incompatible-dtype assignment).
feature_columns = customer_features.columns.drop("CustomerID")
scaler = StandardScaler()
scaled_values = scaler.fit_transform(customer_features[feature_columns].astype("float64"))
customer_features = pd.concat(
    [
        customer_features[["CustomerID"]],
        pd.DataFrame(scaled_values, columns=feature_columns, index=customer_features.index),
    ],
    axis=1,
)

# Step 5: Compute cosine similarity (customers x customers)
similarity_matrix = cosine_similarity(customer_features[feature_columns])
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=customer_features["CustomerID"],
    columns=customer_features["CustomerID"],
)

# Step 6: Generate top lookalikes for the first N_CUSTOMERS customers
lookalikes = {}

for customer_id in similarity_df.index[:N_CUSTOMERS]:
    # Remove the customer's own entry by label. Skipping "the first row after
    # sorting" is wrong whenever another customer ties with a similarity of 1.0.
    scores = similarity_df.loc[customer_id].drop(labels=customer_id)
    similar_customers = scores.nlargest(TOP_N)
    lookalikes[customer_id] = list(zip(similar_customers.index, similar_customers.values))

# Step 7: Convert lookalike results to the required format for CSV
lookalike_list = [
    {"cust_id": customer_id, "similar_customers": similar_customers}
    for customer_id, similar_customers in lookalikes.items()
]

# Step 8: Create a DataFrame and save to CSV.
# Explicit columns keep the frame well-formed even when there are no customers.
lookalike_df = pd.DataFrame(lookalike_list, columns=["cust_id", "similar_customers"])
lookalike_df["similar_customers"] = lookalike_df["similar_customers"].apply(
    lambda x: [f"({c}, {s:.4f})" for c, s in x]
)
lookalike_df.to_csv("Glaston_Velvarts_Lookalike.csv", index=False)

print("Glaston_Velvarts_Lookalike.csv file generated successfully!")


Glaston_Velvarts_Lookalike.csv file generated successfully!
