In [8]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.neighbors import NearestNeighbors
from datetime import datetime

# Load datasets
customers = pd.read_csv('dataset/Customers.csv')
products = pd.read_csv('dataset/Products.csv')
transactions = pd.read_csv('dataset/Transactions.csv')

# Merge datasets
data = pd.merge(transactions, customers, on='CustomerID')
data = pd.merge(data, products, on='ProductID')

# Feature Engineering
# 1. Customer tenure (days since signup)
data['SignupDate'] = pd.to_datetime(data['SignupDate'])
data['Tenure'] = (datetime.now() - data['SignupDate']).dt.days
data.rename(columns= {"Price_x": "Price"},inplace= True)
data

In [12]:

# 2. Aggregate transaction data for customer-level features
customer_features = data.groupby('CustomerID').agg({
    'Tenure': 'first',  # Tenure is the same for all rows of a customer
    'Region': 'first',  # Region is the same for all rows of a customer
    'TotalValue': ['sum', 'mean'],  # Total and average spending
    'Quantity': 'sum',  # Total quantity purchased
    'Category': lambda x: x.mode()[0],  # Most frequent category
    'Price': 'mean'  # Average product price
}).reset_index()

# Flatten multi-index columns
customer_features.columns = [
    'CustomerID', 'Tenure', 'Region', 
    'TotalSpending', 'AvgSpending', 
    'TotalQuantity', 'FavoriteCategory', 
    'AvgProductPrice'
]

# 3. Encode categorical variables (Region and FavoriteCategory)
encoder = OneHotEncoder( drop='first')  # Drop first to avoid multicollinearity
encoded_features = encoder.fit_transform(customer_features[['Region', 'FavoriteCategory']])
encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(['Region', 'FavoriteCategory']))

# Combine encoded features with numerical features
customer_features_final = pd.concat([customer_features.drop(['Region', 'FavoriteCategory'], axis=1), encoded_df], axis=1)

# 4. Normalize numerical features
scaler = MinMaxScaler()
customer_features_scaled = scaler.fit_transform(customer_features_final.iloc[:, 1:])

# Train KNN Model
knn_model = NearestNeighbors(n_neighbors=4, metric='cosine')  # 4 because we exclude the customer itself
knn_model.fit(customer_features_scaled)

# Lookalike Recommendations
lookalikes = {}
for i, cust_id in enumerate(customer_features_final['CustomerID']):
    if cust_id in [f'C{str(i).zfill(4)}' for i in range(1, 21)]:  # Only for C0001 to C0020
        distances, indices = knn_model.kneighbors([customer_features_scaled[i]])
        similar_customers = [(customer_features_final['CustomerID'][j], 1 - distances[0][k])  # Convert distance to similarity
                             for k, j in enumerate(indices[0]) if j != i][:3]  # Exclude self and take top 3
        lookalikes[cust_id] = similar_customers

# Save to CSV
lookalike_df = pd.DataFrame(lookalikes.items(), columns=['CustomerID', 'Lookalikes'])
lookalike_df.to_csv('Lookalike.csv', index=False)

# Print results
print(lookalike_df)

ValueError: Shape of passed values is (199, 1), indices imply (199, 6)