# Imports and dataloading

In [44]:
# Install xgboost
!pip install xgboost

Collecting xgboost
  Downloading xgboost-3.1.1-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.1.1-py3-none-win_amd64.whl (72.0 MB)
   ---------------------------------------- 0.0/72.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/72.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/72.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/72.0 MB ? eta -:--:--
   ---------------------------------------- 0.3/72.0 MB ? eta -:--:--
   ---------------------------------------- 0.5/72.0 MB 1.2 MB/s eta 0:01:00
    --------------------------------------- 1.3/72.0 MB 2.0 MB/s eta 0:00:36
   - -------------------------------------- 2.4/72.0 MB 2.8 MB/s eta 0:00:25
   -- ------------------------------------- 3.7/72.0 MB 3.5 MB/s eta 0:00:20
   -- ------------------------------------- 4.7/72.0 MB 3.9 MB/s eta 0:00:18
   --- ------------------------------------ 6.0/72.0 MB 4.2 MB/s eta 0:00:16
   ---- ---------------------------------

In [45]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score, f1_score, log_loss
import warnings; warnings.filterwarnings('ignore')

# files
customers   = pd.read_csv('data/cleaned_customer_data.csv')
social      = pd.read_excel('data/customer_social_profiles.xlsx')
transactions = pd.read_excel('data/customer_transactions.xlsx')

print("Files loaded:")
print(f"Customers: {customers.shape} | Social: {social.shape} | Transactions: {transactions.shape}")

Files loaded:
Customers: (187, 10) | Social: (155, 5) | Transactions: (150, 6)


In [24]:
# Merge step-by-step
df = (order_items
      .merge(orders, on='order_id')
      .merge(customers, on='customer_id')
      .merge(products, on='product_id')
      .merge(reviews, on='order_id', how='left')
      .merge(category_name, on='product_category_name', how='left'))

# Display shape and preview
print(f"Dataset shape: {df.shape}")
print(df.head())

Dataset shape: (113314, 33)
                           order_id  order_item_id  \
0  00010242fe8c5a6d1ba2dd792cb16214              1   
1  00018f77f2f0320c557190d7a144bdd3              1   
2  000229ec398224ef6ca0657da4fc703e              1   
3  00024acbcdf0a6daa1e931b038114c75              1   
4  00042b26cf59d7ce69dfabb4e55b4fd9              1   

                         product_id                         seller_id  \
0  4244733e06e7ecb4970a6e2683c13e61  48436dade18ac8b2bce089ec2a041202   
1  e5f2d52b802189ee658865ca93d83a8f  dd7ddc04e1b6c2c614352b383efe2d36   
2  c777355d18b72b67abbeef9df44fd0fd  5b51032eddd242adc84c38acab88f23d   
3  7634da152a4610f1595efa32f14722fc  9d7a1d34a5052409006425275ba1c2b4   
4  ac6c3623068f30de03045865e4e10089  df560393f3a51e74553ab94004ba5c87   

   shipping_limit_date   price  freight_value  \
0  2017-09-19 09:45:35   58.90          13.29   
1  2017-05-03 11:05:13  239.90          19.93   
2  2018-01-18 14:48:30  199.00          17.87   
3  2018-08-1

# Feature Engineering

In [25]:
# Clean & engineer features
df['order_purchase_timestamp'] = pd.to_datetime(df['order_purchase_timestamp'])
df['order_approved_at'] = pd.to_datetime(df['order_approved_at'])

# Target: purchased = 1 (all in this dataset are purchases → we’ll simulate "not purchased")
# We'll create negative samples: random user-product pairs that were NOT bought

# Aggregations per user
user_stats = df.groupby('customer_unique_id').agg(
    user_total_spend=('price', 'sum'),
    user_num_orders=('order_id', 'nunique'),
    user_avg_rating=('review_score', 'mean'),
    user_last_purchase=('order_purchase_timestamp', 'max')
).reset_index()

# Aggregations per product
product_stats = df.groupby('product_id').agg(
    product_price_mean=('price', 'mean'),
    product_rating_mean=('review_score', 'mean'),
    product_num_reviews=('review_score', 'count'),
    product_category=('product_category_name_english', 'first')
).reset_index()

# Merge stats back
df = df.merge(user_stats, on='customer_unique_id').merge(product_stats, on='product_id')

# Recency: days since last purchase for this user
latest_date = df['order_purchase_timestamp'].max()
df['days_since_last_purchase'] = (latest_date - df['user_last_purchase']).dt.days

# Click proxy: assume every product in an order was "seen" → clicked = 1 for purchased
df['clicked'] = 1

print("Feature-engineered columns:")
print(df[['price', 'product_rating_mean', 'user_total_spend', 'days_since_last_purchase', 'clicked']].head())

Feature-engineered columns:
    price  product_rating_mean  user_total_spend  days_since_last_purchase  \
0   58.90             4.444444             58.90                       355   
1  239.90             4.000000            252.78                       349   
2  199.00             4.333333            199.00                       231   
3   12.99             4.000000             12.99                        25   
4  199.90             3.833333            199.90                       575   

   clicked  
0        1  
1        1  
2        1  
3        1  
4        1  


# Generate Negative Samples

In [38]:
# Purchased pairs to avoid
purchased_pairs = set(zip(df['customer_unique_id'], df['product_id']))
n_negatives = len(df) * 3
users = df['customer_unique_id'].unique()
products = df['product_id'].unique()

np.random.seed(42)
neg_samples = []
while len(neg_samples) < n_negatives:
    u = np.random.choice(users)
    p = np.random.choice(products)
    if (u, p) not in purchased_pairs:
        neg_samples.append({'customer_unique_id': u, 'product_id': p, 'purchased': 0})

neg_df = pd.DataFrame(neg_samples)
neg_df = neg_df.merge(user_stats, on='customer_unique_id', how='left')
neg_df = neg_df.merge(product_stats, on='product_id', how='left')

# Build modeling columns
neg_df['clicked'] = 0
neg_df['days_since_last_purchase'] = (latest_date - neg_df['user_last_purchase']).dt.days
neg_df['review_score'] = 0

# Fill missing
fill_dict = {
    'days_since_last_purchase': 365,
    'user_total_spend': 0,
    'user_avg_rating': 3.0,
    'product_rating_mean': 3.0,
    'product_num_reviews': 0,
    'product_price_mean': df['price'].mean()
}
neg_df = neg_df.fillna(fill_dict)
neg_df['category_id'] = neg_df['product_category'].astype('category').cat.codes

print(f"Negative samples: {neg_df.shape}")

Negative samples: (339942, 15)


# Positive sample

In [39]:
# Positive (purchased = 1)
pos_df = df[['customer_unique_id', 'product_id', 'price',
             'product_rating_mean', 'product_num_reviews',
             'user_total_spend', 'user_avg_rating',
             'days_since_last_purchase', 'clicked']].copy()

# Use product_price_mean from product_stats for consistency
pos_df = pos_df.merge(product_stats[['product_id', 'product_price_mean', 'product_category']], 
                      on='product_id', how='left')

pos_df['purchased'] = 1
pos_df['category_id'] = pos_df['product_category'].astype('category').cat.codes

# Final merge
common_cols = ['customer_unique_id', 'product_id', 'product_price_mean', 'product_rating_mean',
               'product_num_reviews', 'user_total_spend', 'user_avg_rating',
               'days_since_last_purchase', 'clicked', 'category_id', 'purchased']

final_df = pd.concat([
    pos_df.rename(columns={'price': 'product_price_mean'})[common_cols],
    neg_df.rename(columns={'product_price_mean': 'product_price_mean'})[common_cols]
], ignore_index=True)

print(f"Final dataset ready: {final_df.shape}")
print(final_df['purchased'].value_counts(normalize=True).round(3))

InvalidIndexError: Reindexing only valid with uniquely valued Index objects