# Imports and dataloading

In [44]:
# Install xgboost
!pip install xgboost

Collecting xgboost
  Downloading xgboost-3.1.1-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.1.1-py3-none-win_amd64.whl (72.0 MB)
   ---------------------------------------- 0.0/72.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/72.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/72.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/72.0 MB ? eta -:--:--
   ---------------------------------------- 0.3/72.0 MB ? eta -:--:--
   ---------------------------------------- 0.5/72.0 MB 1.2 MB/s eta 0:01:00
    --------------------------------------- 1.3/72.0 MB 2.0 MB/s eta 0:00:36
   - -------------------------------------- 2.4/72.0 MB 2.8 MB/s eta 0:00:25
   -- ------------------------------------- 3.7/72.0 MB 3.5 MB/s eta 0:00:20
   -- ------------------------------------- 4.7/72.0 MB 3.9 MB/s eta 0:00:18
   --- ------------------------------------ 6.0/72.0 MB 4.2 MB/s eta 0:00:16
   ---- ---------------------------------

In [45]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score, f1_score, log_loss
import warnings; warnings.filterwarnings('ignore')

# files
customers   = pd.read_csv('data/cleaned_customer_data.csv')
social      = pd.read_excel('data/customer_social_profiles.xlsx')
transactions = pd.read_excel('data/customer_transactions.xlsx')

print("Files loaded:")
print(f"Customers: {customers.shape} | Social: {social.shape} | Transactions: {transactions.shape}")

Files loaded:
Customers: (187, 10) | Social: (155, 5) | Transactions: (150, 6)


# Data merging

In [48]:
import pandas as pd

customers   = pd.read_csv('data/cleaned_customer_data.csv')
social      = pd.read_excel('data/customer_social_profiles.xlsx')
transactions = pd.read_excel('data/customer_transactions.xlsx')

print("\n=== CUSTOMERS (first 3 rows) ===")
print(customers.head(3))
print("\nColumns:", customers.columns.tolist())

print("\n=== SOCIAL PROFILES (first 3 rows) ===")
print(social.head(3))
print("\nColumns:", social.columns.tolist())

print("\n=== TRANSACTIONS (first 3 rows) ===")
print(transactions.head(3))
print("\nColumns:", transactions.columns.tolist())


=== CUSTOMERS (first 3 rows) ===
   customer_id_clean  transaction_id purchase_date product_category  \
0                151            1001    2024-01-01           Sports   
1                151            1001    2024-01-01           Sports   
2                192            1002    2024-01-02      Electronics   

   purchase_amount  customer_rating social_media_platform  engagement_score  \
0              408              2.3                TikTok                61   
1              408              2.3               Twitter                72   
2              332              4.2             Instagram                60   

   purchase_interest_score review_sentiment  
0                      1.3          Neutral  
1                      1.6          Neutral  
2                      4.3         Positive  

Columns: ['customer_id_clean', 'transaction_id', 'purchase_date', 'product_category', 'purchase_amount', 'customer_rating', 'social_media_platform', 'engagement_score', 'purchase_

# Auto detect merging

In [53]:
# List of common ID column names
id_candidates = [
    'customer_id', 'CustomerID', 'cust_id', 'user_id', 'id', 'customerID', 'CustomerId',
    'customer_key', 'cust_key', 'user_key', 'client_id', 'ClientID', 'account_id'
]

def find_merge_key(df1, df2, candidates=id_candidates):
    for col in candidates:
        if col in df1.columns and col in df2.columns:
            return col
    return None

# Detect keys
key_tc = find_merge_key(transactions, customers)
key_ts = find_merge_key(transactions, social)

print(f"Detected key (transactions ↔ customers): {key_tc}")
print(f"Detected key (transactions ↔ social):    {key_ts}")

Detected key (transactions ↔ customers): None
Detected key (transactions ↔ social):    None


# Safe merge

In [55]:
import pandas as pd

# Reload the files (in case they changed)
customers   = pd.read_csv('data/cleaned_customer_data.csv')
social      = pd.read_excel('data/customer_social_profiles.xlsx')
transactions = pd.read_excel('data/customer_transactions.xlsx')

print("\nCUSTOMERS columns  :", customers.columns.tolist())
print("SOCIAL columns     :", social.columns.tolist())
print("TRANSACTIONS columns:", transactions.columns.tolist())

candidates = [
    'customer_id', 'CustomerID', 'cust_id', 'user_id', 'id', 'customerID',
    'CustomerId', 'client_id', 'ClientID', 'account_id', 'cust_key',
    'customer_key', 'user_key', 'Customer_ID', 'CustID'
]

def find_common(df1, df2, candidates):
    for c in candidates:
        if c in df1.columns and c in df2.columns:
            return c
    return None

key_tc = find_common(transactions, customers, candidates)
key_ts = find_common(transactions, social, candidates)

df = transactions.copy()                     # start with transactions

if key_tc:
    df = df.merge(customers, on=key_tc, how='left')
    print(f"Customers merged on '{key_tc}'")
else:
    print("No common key with customers – continuing without customer data")

if key_ts:
    df = df.merge(social, on=key_ts, how='left')
    print(f"Social profiles merged on '{key_ts}'")
else:
    print("No common key with social – continuing without social data")

print(f"\nMerged dataset shape: {df.shape}")
print(df.head())


CUSTOMERS columns  : ['customer_id_clean', 'transaction_id', 'purchase_date', 'product_category', 'purchase_amount', 'customer_rating', 'social_media_platform', 'engagement_score', 'purchase_interest_score', 'review_sentiment']
SOCIAL columns     : ['customer_id_new', 'social_media_platform', 'engagement_score', 'purchase_interest_score', 'review_sentiment']
TRANSACTIONS columns: ['customer_id_legacy', 'transaction_id', 'purchase_amount', 'purchase_date', 'product_category', 'customer_rating']
No common key with customers – continuing without customer data
No common key with social – continuing without social data

Merged dataset shape: (150, 6)
   customer_id_legacy  transaction_id  purchase_amount purchase_date  \
0                 151            1001              408    2024-01-01   
1                 192            1002              332    2024-01-02   
2                 114            1003              442    2024-01-03   
3                 171            1004              256   

# Feature Engineering

In [75]:
# 4. FEATURE ENGINEERING – FINAL & 100% WORKING
import pandas as pd
import numpy as np

print("ALL COLUMNS:", df.columns.tolist())


# 1. KEYS (use existing ones)

key_tc   = 'customer_id_legacy'      # customer ID
prod_col = 'product_id'              # already exists!
price_col = 'purchase_amount'
date_col  = 'purchase_date'

print(f"Using: customer='{key_tc}', product='{prod_col}'")


# 2. CLEAN UP DUPLICATE product_category (x and y)

# Use the one from original data: 'product_category_x'
if 'product_category_x' in df.columns:
    df['product_category'] = df['product_category_x']
elif 'product_category_y' in df.columns:
    df['product_category'] = df['product_category_y']
else:
    df['product_category'] = 'unknown'

print(f"Category column set to: 'product_category'")

# 3. DATE
df[date_col] = pd.to_datetime(df[date_col], errors='coerce')
df = df.dropna(subset=[date_col]).copy()
latest_date = df[date_col].max()
print(f"Latest date: {latest_date.date()}")

# 4. FINAL FEATURES (already exist – just verify)
required = ['user_total_spend', 'days_since_last_purchase']
for col in required:
    if col not in df.columns:
        if col == 'user_total_spend':
            df[col] = df.groupby(key_tc)['purchase_amount'].transform('sum')
        elif col == 'days_since_last_purchase':
            df['user_last_purchase'] = df.groupby(key_tc)[date_col].transform('max')
            df[col] = (latest_date - df['user_last_purchase']).dt.days

df['clicked'] = 1
df['purchased'] = 1

# 5. SAMPLE PRINT (safe)
sample_cols = [key_tc, prod_col, price_col, 'days_since_last_purchase', 'user_total_spend', 'product_category']
print(f"\nFEATURE ENGINEERING DONE! → {df.shape}")
print("Sample:")
print(df[sample_cols].head(3))

ALL COLUMNS: ['customer_id_legacy', 'transaction_id', 'purchase_amount', 'purchase_date', 'product_category_x', 'customer_rating', 'product_id', 'user_total_spend', 'user_num_orders', 'user_last_purchase', 'user_avg_rating', 'product_price_mean', 'product_category_y', 'product_avg_rating', 'days_since_last_purchase', 'clicked', 'purchased', 'product_category']
Using: customer='customer_id_legacy', product='product_id'
Category column set to: 'product_category'
Latest date: 2024-05-29

FEATURE ENGINEERING DONE! → (150, 18)
Sample:
   customer_id_legacy        product_id  purchase_amount  \
0                 151       1001_Sports              408   
1                 192  1002_Electronics              332   
2                 114  1003_Electronics              442   

   days_since_last_purchase  user_total_spend product_category  
0                       149               408           Sports  
1                        72               823      Electronics  
2                        44 

# Negative sampling

In [79]:
# -------------------------------------------------
# 5. NEGATIVE SAMPLING – FINAL, 100 % WORKING
# -------------------------------------------------
import pandas as pd
import numpy as np

# 1. ENSURE product_category exists
if 'product_category' not in df.columns:
    if 'product_category_y' in df.columns:
        df['product_category'] = df['product_category_y']
    elif 'product_category_x' in df.columns:
        df['product_category'] = df['product_category_x']
    else:
        df['product_category'] = 'unknown'

# 2. SHARED category → id mapping
all_categories = pd.concat([df['product_category'], pd.Series(['unknown'])]).astype('category')
cat_to_id = dict(zip(all_categories.cat.categories, range(len(all_categories.cat.categories))))

# 3. NEGATIVE SAMPLES
purchased_pairs = set(zip(df[key_tc], df[prod_col]))
n_neg = len(df) * 3
users = df[key_tc].unique()
products = df[prod_col].unique()

np.random.seed(42)
neg_samples = []
while len(neg_samples) < n_neg:
    u = np.random.choice(users)
    p = np.random.choice(products)
    if (u, p) not in purchased_pairs:
        neg_samples.append({key_tc: u, prod_col: p, 'purchased': 0})

neg_df = pd.DataFrame(neg_samples)

# user stats
neg_df = neg_df.merge(
    df[[key_tc, 'user_total_spend', 'user_last_purchase']].drop_duplicates(),
    on=key_tc, how='left'
)

# product stats
neg_df = neg_df.merge(
    df[[prod_col, 'product_price_mean', 'product_category']].drop_duplicates(),
    on=prod_col, how='left'
)

# fill missing
neg_df['clicked'] = 0
neg_df['days_since_last_purchase'] = (latest_date - neg_df['user_last_purchase']).dt.days
neg_df['user_total_spend'] = neg_df['user_total_spend'].fillna(0)
neg_df['product_price_mean'] = neg_df['product_price_mean'].fillna(df[price_col].mean())
neg_df['product_category'] = neg_df['product_category'].fillna('unknown')
neg_df['category_id'] = neg_df['product_category'].map(cat_to_id)

# FIX: Remove duplicate columns
neg_df = neg_df.loc[:, ~neg_df.columns.duplicated()]
neg_df = neg_df.reset_index(drop=True)

# 4. POSITIVE SAMPLES
pos_df = df[[key_tc, prod_col, price_col,
            'product_price_mean', 'user_total_spend',
            'days_since_last_purchase', 'clicked']].copy()

pos_df['purchased'] = 1
pos_df = pos_df.merge(
    df[[prod_col, 'product_category']].drop_duplicates(),
    on=prod_col, how='left'
)
pos_df['product_category'] = pos_df['product_category'].fillna('unknown')
pos_df['category_id'] = pos_df['product_category'].map(cat_to_id)
pos_df = pos_df.rename(columns={price_col: 'product_price_mean'})

# FIX: Remove duplicates & reset
pos_df = pos_df.loc[:, ~pos_df.columns.duplicated()]
pos_df = pos_df.reset_index(drop=True)

# 5. FINAL DATASET
common_cols = [
    key_tc, prod_col, 'product_price_mean', 'user_total_spend',
    'days_since_last_purchase', 'clicked', 'category_id', 'purchased'
]

final_df = pd.concat([pos_df[common_cols], neg_df[common_cols]], ignore_index=True)

print(f"\nFINAL DATASET: {final_df.shape}")
print("Class balance:")
print(final_df['purchased'].value_counts(normalize=True).round(3))
print("\nSample:")
print(final_df.head(3))


FINAL DATASET: (600, 8)
Class balance:
purchased
0    0.75
1    0.25
Name: proportion, dtype: float64

Sample:
   customer_id_legacy        product_id  product_price_mean  user_total_spend  \
0                 151       1001_Sports               408.0               408   
1                 192  1002_Electronics               332.0               823   
2                 114  1003_Electronics               442.0              1034   

   days_since_last_purchase  clicked  category_id  purchased  
0                       149        1            4          1  
1                        72        1            2          1  
2                        44        1            2          1  


# Model training/ the best one 


In [81]:
# -------------------------------------------------
# 6. TRAIN, COMPARE & RECOMMEND (Random Forest, Logistic, XGBoost)
# -------------------------------------------------
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import f1_score, log_loss
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
import joblib
import pickle

# -------------------------------
# 1. Features
# -------------------------------
num_features    = ['product_price_mean', 'user_total_spend', 'days_since_last_purchase']
cat_features    = ['category_id']
binary_features = ['clicked']

X = final_df[num_features + cat_features + binary_features]
y = final_df['purchased']

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42)

# -------------------------------
# 2. Shared Preprocessor
# -------------------------------
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features),
    ('bin', 'passthrough', binary_features)
])

X_train_pre = preprocessor.fit_transform(X_train)
X_val_pre   = preprocessor.transform(X_val)

# -------------------------------
# 3. Train 3 Models
# -------------------------------
models = {}

# Random Forest
models['RandomForest'] = RandomForestClassifier(
    n_estimators=300, max_depth=6, class_weight='balanced',
    n_jobs=-1, random_state=42)
models['RandomForest'].fit(X_train_pre, y_train)

# Logistic Regression
models['Logistic'] = LogisticRegression(
    max_iter=1000, class_weight='balanced', n_jobs=-1, random_state=42)
models['Logistic'].fit(X_train_pre, y_train)

# XGBoost
scale_pos = (y_train == 0).sum() / (y_train == 1).sum()
models['XGBoost'] = xgb.XGBClassifier(
    n_estimators=300, max_depth=6, learning_rate=0.1,
    scale_pos_weight=scale_pos, eval_metric='logloss',
    n_jobs=-1, random_state=42, verbosity=0)
models['XGBoost'].fit(X_train_pre, y_train)

# -------------------------------
# 4. Compare Models
# -------------------------------
results = []
for name, model in models.items():
    y_pred = model.predict(X_val_pre)
    y_prob = model.predict_proba(X_val_pre)[:, 1] if hasattr(model, 'predict_proba') else y_pred
    results.append({
        'Model': name,
        'F1'   : round(f1_score(y_val, y_pred), 3),
        'LogLoss': round(log_loss(y_val, y_prob), 3)
    })

results_df = pd.DataFrame(results).sort_values('F1', ascending=False).reset_index(drop=True)
print("\nMODEL COMPARISON")
print(results_df.to_string(index=False))

# -------------------------------
# 5. Save Best Model
# -------------------------------
best_name = results_df.iloc[0]['Model']
best_model = models[best_name]

joblib.dump(best_model,   f'best_model_{best_name}.pkl')
joblib.dump(preprocessor, 'preprocessor.pkl')

with open('recommendation_artifacts.pkl', 'wb') as f:
    pickle.dump({
        'cat_to_id'   : cat_to_id,
        'latest_date' : latest_date,
        'key_tc'      : key_tc,
        'prod_col'    : prod_col,
        'price_col'   : price_col,
        'df_snapshot' : df[[key_tc, prod_col, 'product_category',
                           'product_price_mean', 'user_total_spend',
                           'user_last_purchase']].drop_duplicates()
    }, f)

print(f"\nBEST MODEL: {best_name}")
print(f"Saved: best_model_{best_name}.pkl | preprocessor.pkl | recommendation_artifacts.pkl")

# -------------------------------
# 6. RECOMMEND FUNCTION (uses best model)
# -------------------------------
def recommend(customer_id, n=5, model=best_model, top_n_cand=100):
    user_row = df[df[key_tc] == customer_id][[key_tc,
                'user_total_spend', 'user_last_purchase']].head(1)
    if user_row.empty:
        return f"Customer {customer_id} not found."

    purchased = set(df[df[key_tc] == customer_id][prod_col])
    cand = df[~df[prod_col].isin(purchased)][[prod_col,
                'product_price_mean', 'product_category']].drop_duplicates()

    if cand.empty:
        return "No new products."

    cand = cand.sample(min(top_n_cand, len(cand)), random_state=42).copy()
    cand = cand.merge(user_row, how='cross')
    cand['clicked'] = 0
    cand['days_since_last_purchase'] = (latest_date - cand['user_last_purchase']).dt.days
    cand['category_id'] = cand['product_category'].map(cat_to_id)

    X_cand = preprocessor.transform(cand[num_features + cat_features + binary_features])
    cand['score'] = model.predict_proba(X_cand)[:, 1]

    out = cand.nlargest(n, 'score')[['product_category', 'product_price_mean', 'score']]
    out.columns = ['Category', 'Price', 'Score']
    return out.round(3)

# -------------------------------
# 7. TEST RECOMMENDATION
# -------------------------------
test_cust = df[key_tc].iloc[0]
print(f"\nTOP 5 RECOMMENDATIONS FOR CUSTOMER {test_cust}:")
print(recommend(test_cust))


MODEL COMPARISON
       Model  F1  LogLoss
RandomForest 1.0    0.057
    Logistic 1.0    0.032
     XGBoost 1.0    0.003

BEST MODEL: RandomForest
Saved: best_model_RandomForest.pkl | preprocessor.pkl | recommendation_artifacts.pkl

TOP 5 RECOMMENDATIONS FOR CUSTOMER 151:
       Category  Price  Score
99     Clothing   64.0  0.053
78       Sports   91.0  0.043
25       Sports   88.0  0.040
30       Sports   85.0  0.038
87  Electronics  228.0  0.032
