#### **Importing Libraries**

In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

#### **Importing Datasets**

In [2]:
# Read the three raw tables in a fixed order: customers, products, transactions.
customers_df, products_df, transactions_df = map(
    pd.read_csv,
    ("data/Customers.csv", "data/Products.csv", "data/Transactions.csv"),
)

#### **2. Feature Engineering**

In [3]:
# One row per customer: spend and quantity aggregates over all of their
# transactions. Named aggregation yields the final flat column names directly,
# so no MultiIndex flattening step is needed afterwards.
transaction_features = (
    transactions_df.groupby("CustomerID")
    .agg(
        total_transaction_value=("TotalValue", "sum"),
        avg_transaction_value=("TotalValue", "mean"),
        transaction_count=("TotalValue", "count"),
        total_quantity=("Quantity", "sum"),
        avg_quantity=("Quantity", "mean"),
    )
    .reset_index()
)

Merge the transaction features into the customer DataFrame

In [4]:
# Left join keeps every customer, including those with no transactions
# (their aggregate columns come back as NaN).
customer_features = pd.merge(
    customers_df,
    transaction_features,
    how="left",
    on="CustomerID",
)

One-Hot Encoding - Region Column

In [5]:
# One indicator column per distinct Region value, named "region_<value>".
region_encoded = pd.get_dummies(customer_features["Region"], prefix="region")

# Drop any indicator columns left over from a previous run of this cell before
# attaching the fresh ones. Without this, re-running the cell appends a second
# copy of every region_* column, and the duplicated names are then picked up
# twice when the feature column list is built.
customer_features = pd.concat(
    [
        customer_features.drop(columns=region_encoded.columns, errors="ignore"),
        region_encoded,
    ],
    axis=1,
)

#### **3. Feature Preparation**

In [6]:
# Model inputs: the five transaction aggregates followed by every one-hot
# region indicator currently present on the customer frame.
feature_columns = [
    "total_transaction_value",
    "avg_transaction_value",
    "transaction_count",
    "total_quantity",
    "avg_quantity",
    *(name for name in customer_features.columns if name.startswith("region_")),
]

Handle missing and infinite values

In [7]:
# Replace NaN with 0 first, then map +/-inf to 0, in a single pass over the
# feature columns.
customer_features[feature_columns] = (
    customer_features[feature_columns]
    .fillna(0)
    .replace([np.inf, -np.inf], 0)
)

Normalization

In [8]:
# Fit the scaler on the feature columns, then transform those same columns.
# `fit` returns the scaler itself, so `scaler` is the fitted instance.
scaler = StandardScaler().fit(customer_features[feature_columns])
normalized_features = scaler.transform(customer_features[feature_columns])

#### **4. Similarity Calculation**

In [9]:
def find_lookalikes(target_customer_id, normalized_features, customer_features, top_n=3):
    """Return the customers most similar to ``target_customer_id``.

    Similarity is the cosine similarity between the target's row of
    ``normalized_features`` and every other row.

    Parameters
    ----------
    target_customer_id
        Value looked up in the ``CustomerID`` column of ``customer_features``.
    normalized_features
        2-D array of feature rows. Row ``i`` is taken to describe the customer
        in row ``i`` (by position) of ``customer_features``; the two must be
        aligned by the caller.
    customer_features
        DataFrame with a ``CustomerID`` column. Its index labels are ignored.
    top_n
        Maximum number of lookalikes to return (default 3).

    Returns
    -------
    list of dict
        ``{"customer_id": ..., "similarity_score": float}`` entries ordered
        from most to least similar; equal scores keep their original row
        order. The target itself is never included, so fewer than ``top_n``
        entries are returned when there are not enough other customers.
        If the ID is not present, a message is printed and ``[]`` is returned.
    """
    # Locate the target by row position rather than by index label:
    # normalized_features is a plain array, so a filtered or re-indexed frame
    # must not shift (or break) the lookup.
    id_matches = (customer_features["CustomerID"] == target_customer_id).to_numpy()
    match_positions = np.flatnonzero(id_matches)
    if match_positions.size == 0:
        print(f"CustomerID {target_customer_id} not found!")
        return []
    target_position = int(match_positions[0])

    # Cosine similarity of the target row against every row (itself included).
    target_vector = normalized_features[target_position].reshape(1, -1)
    similarities = cosine_similarity(target_vector, normalized_features)[0]

    # Remove the target from the candidate set outright. Overwriting its score
    # with -1 is not safe because -1 is a legitimate cosine similarity, so the
    # target could tie with real candidates and leak into the result.
    candidate_positions = np.delete(np.arange(similarities.shape[0]), target_position)

    # Stable sort on the negated scores gives descending order with ties kept
    # in original row order, which makes the output deterministic.
    ranking = np.argsort(-similarities[candidate_positions], kind="stable")
    top_positions = candidate_positions[ranking[: max(top_n, 0)]]

    customer_ids = customer_features["CustomerID"].to_numpy()
    return [
        {
            "customer_id": customer_ids[position],
            "similarity_score": float(similarities[position]),
        }
        for position in top_positions
    ]

#### **5. Saving Results**

In [10]:
# IDs C0001 through C0020: "C" followed by the number zero-padded to 4 digits.
target_customers = [f"C{number:04d}" for number in range(1, 21)]

In [11]:
# Build one record per target customer, then let pandas write the file.
# The lookalike list contains commas, so the field has to be quoted by a real
# CSV writer. The previous hand-written rows put a space before the opening
# quote, which standard CSV parsers do not treat as a quoted field, and the
# header carried a stray leading space (" Lookalikes").
lookalike_rows = []
for customer_id in target_customers:
    lookalikes = find_lookalikes(customer_id, normalized_features, customer_features)

    # Format as "[<id>:<score>, <id>:<score>, ...]". The ID is written as-is:
    # splitting on "C" and re-prefixing would corrupt any ID that contains a
    # second "C".
    lookalike_str = (
        "["
        + ", ".join(
            f'{match["customer_id"]}:{match["similarity_score"]}' for match in lookalikes
        )
        + "]"
    )
    lookalike_rows.append({"CustomerID": customer_id, "Lookalikes": lookalike_str})

# Explicit columns keep the header stable even if there are no rows.
pd.DataFrame(lookalike_rows, columns=["CustomerID", "Lookalikes"]).to_csv(
    "Faizan_Mulla_Lookalike.csv", index=False
)

print("Lookalike CSV file generated successfully!")

Lookalike CSV file generated successfully!
