In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.neighbors import NearestNeighbors

# Load Data
customers = pd.read_csv("Customers.csv")
products = pd.read_csv("Products.csv")
transactions = pd.read_csv("Transactions.csv")

# Attach product attributes to each transaction. This is an inner join, so a
# transaction whose ProductID is not present in Products.csv is dropped here.
# NOTE(review): none of the product columns are used further down in this cell.
transactions = transactions.merge(products, on="ProductID")

# Aggregate transaction data per customer: total spend and total units bought.
customer_spending = (
    transactions.groupby("CustomerID")
    .agg({"TotalValue": "sum", "Quantity": "sum"})
    .reset_index()
)

# Left join keeps customers that have no transactions at all. Only their two
# spending totals are zero-filled; a blanket fillna(0) would also overwrite a
# missing Region / CustomerName / SignupDate with the number 0.
customer_profiles = customers.merge(
    customer_spending, on="CustomerID", how="left"
).fillna({"TotalValue": 0, "Quantity": 0})

# One-Hot Encode Region
encoder = OneHotEncoder()
region_encoded = encoder.fit_transform(customer_profiles[["Region"]]).toarray()
region_df = pd.DataFrame(
    region_encoded,
    columns=encoder.get_feature_names_out(["Region"]),
    index=customer_profiles.index,  # keep rows aligned for the concat below
)

# Append the encoded columns, then drop the columns that are not model features.
# CustomerID stays in the frame as the row identifier.
customer_profiles = pd.concat([customer_profiles, region_df], axis=1).drop(
    columns=["CustomerName", "Region", "SignupDate"]
)

# Standardize every feature column (spending totals and the one-hot Region
# columns alike). CustomerID is excluded by name rather than by position so the
# step does not depend on the column order of Customers.csv.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(customer_profiles.drop(columns=["CustomerID"]))

# Cosine-distance neighbour index over the scaled features.
nbrs = NearestNeighbors(n_neighbors=4, metric="cosine")  # 4 to include self
nbrs.fit(scaled_features)

def get_lookalikes(customer_id, top_n=3):
    """Return the customers most similar to ``customer_id``.

    Similarity is ``1 - cosine distance`` as reported by the fitted ``nbrs``
    index for the customer's row of ``scaled_features``.

    Args:
        customer_id: Value to look up in ``customer_profiles["CustomerID"]``.
        top_n: Maximum number of lookalikes to return.

    Returns:
        A list of ``(CustomerID, score)`` tuples in the order returned by
        ``nbrs.kneighbors``, with the score rounded to 4 decimals. The queried
        customer itself is excluded. An empty list is returned when
        ``customer_id`` is not present in ``customer_profiles``.
    """
    # Locate the customer by row position, not by index label: the position is
    # what indexes scaled_features and what kneighbors() reports back.
    matches = np.flatnonzero((customer_profiles["CustomerID"] == customer_id).to_numpy())
    if matches.size == 0:
        return []
    idx = int(matches[0])

    # Ask for one more neighbour than requested so the customer itself can be
    # filtered out. Never ask for fewer than the index's fitted default (keeps
    # results for small top_n unchanged) nor for more rows than exist.
    k = min(max(nbrs.n_neighbors, top_n + 1), len(scaled_features))
    distances, indices = nbrs.kneighbors([scaled_features[idx]], n_neighbors=k)

    customer_ids = customer_profiles["CustomerID"]
    similar_customers = [
        (customer_ids.iloc[i], round(1 - dist, 4))
        for dist, i in zip(distances[0], indices[0])
        if i != idx
    ]
    return similar_customers[:top_n]

# Build one CSV row per customer for the first 20 customers in customer_profiles:
# the customer's own ID followed by (lookalike ID, score) pairs.
lookalike_data = []
for cust_id in customer_profiles["CustomerID"].iloc[:20]:
    row = [cust_id, *(cell for pair in get_lookalikes(cust_id) for cell in pair)]

    # Pad with "N/A" pairs until the row has at least the 7 cells the header expects.
    pad_pairs = max(0, (7 - len(row) + 1) // 2)
    row.extend(["N/A", "N/A"] * pad_pairs)

    lookalike_data.append(row)

columns = [
    "CustomerID",
    "Lookalike1", "Score1",
    "Lookalike2", "Score2",
    "Lookalike3", "Score3",
]
lookalike_df = pd.DataFrame(lookalike_data, columns=columns)

# Persist without the DataFrame index so the file holds only the 7 named columns.
lookalike_df.to_csv("Lookalike.csv", index=False)

print("Lookalike.csv generated successfully!\n")
print(lookalike_df.head())


Lookalike.csv generated successfully!

  CustomerID Lookalike1  Score1 Lookalike2  Score2 Lookalike3  Score3
0      C0001      C0107  0.9968      C0137  0.9961      C0184  0.9961
1      C0002      C0088  0.9982      C0142  0.9943      C0159  0.9895
2      C0003      C0147  0.9980      C0190  0.9973      C0174  0.9838
3      C0004      C0113  0.9945      C0102  0.9804      C0169  0.9791
4      C0005      C0186  0.9984      C0159  0.9980      C0140  0.9934


In [5]:
import pandas as pd
# Round-trip check: read the exported Lookalike.csv back from disk and print
# its first ten rows.
df = pd.read_csv("Lookalike.csv")
print(df.iloc[:10])

  CustomerID Lookalike1  Score1 Lookalike2  Score2 Lookalike3  Score3
0      C0001      C0107  0.9968      C0137  0.9961      C0184  0.9961
1      C0002      C0088  0.9982      C0142  0.9943      C0159  0.9895
2      C0003      C0147  0.9980      C0190  0.9973      C0174  0.9838
3      C0004      C0113  0.9945      C0102  0.9804      C0169  0.9791
4      C0005      C0186  0.9984      C0159  0.9980      C0140  0.9934
5      C0006      C0048  0.9941      C0126  0.9920      C0187  0.9912
6      C0007      C0146  1.0000      C0177  0.9975      C0178  0.9974
7      C0008      C0018  0.9848      C0122  0.9639      C0046  0.9577
8      C0009      C0198  1.0000      C0014  0.9965      C0063  0.9939
9      C0010      C0019  0.9911      C0073  0.9857      C0166  0.9836
