# Imports and data loading

In [44]:
# Install xgboost
!pip install xgboost

Collecting xgboost
  Downloading xgboost-3.1.1-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.1.1-py3-none-win_amd64.whl (72.0 MB)
   ---------------------------------------- 0.0/72.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/72.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/72.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/72.0 MB ? eta -:--:--
   ---------------------------------------- 0.3/72.0 MB ? eta -:--:--
   ---------------------------------------- 0.5/72.0 MB 1.2 MB/s eta 0:01:00
    --------------------------------------- 1.3/72.0 MB 2.0 MB/s eta 0:00:36
   - -------------------------------------- 2.4/72.0 MB 2.8 MB/s eta 0:00:25
   -- ------------------------------------- 3.7/72.0 MB 3.5 MB/s eta 0:00:20
   -- ------------------------------------- 4.7/72.0 MB 3.9 MB/s eta 0:00:18
   --- ------------------------------------ 6.0/72.0 MB 4.2 MB/s eta 0:00:16
   ---- ---------------------------------

In [45]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score, f1_score, log_loss
import warnings; warnings.filterwarnings('ignore')

# Read the three source tables: one CSV file and two Excel workbooks
customers = pd.read_csv('data/cleaned_customer_data.csv')
social = pd.read_excel('data/customer_social_profiles.xlsx')
transactions = pd.read_excel('data/customer_transactions.xlsx')

# Confirm the load by reporting the (rows, columns) shape of every table
print(
    "Files loaded:",
    f"Customers: {customers.shape} | Social: {social.shape} | Transactions: {transactions.shape}",
    sep="\n",
)

Files loaded:
Customers: (187, 10) | Social: (155, 5) | Transactions: (150, 6)


# Data merging

In [48]:
import pandas as pd

# Read the three source tables again so this cell can run on its own
customers = pd.read_csv('data/cleaned_customer_data.csv')
social = pd.read_excel('data/customer_social_profiles.xlsx')
transactions = pd.read_excel('data/customer_transactions.xlsx')

# For each table: a banner, the first three rows, then the list of column names
previews = (
    ("\n=== CUSTOMERS (first 3 rows) ===", customers),
    ("\n=== SOCIAL PROFILES (first 3 rows) ===", social),
    ("\n=== TRANSACTIONS (first 3 rows) ===", transactions),
)
for banner, frame in previews:
    print(banner)
    print(frame.head(3))
    print("\nColumns:", frame.columns.tolist())


=== CUSTOMERS (first 3 rows) ===
   customer_id_clean  transaction_id purchase_date product_category  \
0                151            1001    2024-01-01           Sports   
1                151            1001    2024-01-01           Sports   
2                192            1002    2024-01-02      Electronics   

   purchase_amount  customer_rating social_media_platform  engagement_score  \
0              408              2.3                TikTok                61   
1              408              2.3               Twitter                72   
2              332              4.2             Instagram                60   

   purchase_interest_score review_sentiment  
0                      1.3          Neutral  
1                      1.6          Neutral  
2                      4.3         Positive  

Columns: ['customer_id_clean', 'transaction_id', 'purchase_date', 'product_category', 'purchase_amount', 'customer_rating', 'social_media_platform', 'engagement_score', 'purchase_

# Auto-detect merge keys

In [53]:
# Column names commonly used for a customer identifier, in priority order
id_candidates = [
    'customer_id',
    'CustomerID',
    'cust_id',
    'user_id',
    'id',
    'customerID',
    'CustomerId',
    'customer_key',
    'cust_key',
    'user_key',
    'client_id',
    'ClientID',
    'account_id',
]


def find_merge_key(df1, df2, candidates=id_candidates):
    """Return the first candidate name that is a column of both frames.

    Args:
        df1: First DataFrame to inspect.
        df2: Second DataFrame to inspect.
        candidates: Column names to try, in priority order. Matching is exact
            (case-sensitive). Defaults to ``id_candidates``.

    Returns:
        The first name in ``candidates`` present in both ``df1.columns`` and
        ``df2.columns``, or ``None`` when no candidate is shared.
    """
    shared = (
        name
        for name in candidates
        if name in df1.columns and name in df2.columns
    )
    # next() stops at the first shared name; None signals "no common key"
    return next(shared, None)

# Look for a shared ID column between transactions and each of the other tables
key_tc, key_ts = (
    find_merge_key(transactions, customers),
    find_merge_key(transactions, social),
)

# Show what was found (None means no candidate column is shared)
print(
    f"Detected key (transactions ↔ customers): {key_tc}",
    f"Detected key (transactions ↔ social):    {key_ts}",
    sep="\n",
)

Detected key (transactions ↔ customers): None
Detected key (transactions ↔ social):    None


# Safe merge

In [None]:
import pandas as pd

# Read the source files afresh so the merge works on their current contents
customers = pd.read_csv('data/cleaned_customer_data.csv')
social = pd.read_excel('data/customer_social_profiles.xlsx')
transactions = pd.read_excel('data/customer_transactions.xlsx')

# List every table's column names to make the available merge keys visible
column_report = (
    ("\nCUSTOMERS columns  :", customers),
    ("SOCIAL columns     :", social),
    ("TRANSACTIONS columns:", transactions),
)
for label, table in column_report:
    print(label, table.columns.tolist())

# Column names to try as the shared customer identifier, in priority order
candidates = [
    'customer_id',
    'CustomerID',
    'cust_id',
    'user_id',
    'id',
    'customerID',
    'CustomerId',
    'client_id',
    'ClientID',
    'account_id',
    'cust_key',
    'customer_key',
    'user_key',
    'Customer_ID',
    'CustID',
]


def find_common(df1, df2, candidates):
    """Return the first name in `candidates` that is a column of both frames.

    Args:
        df1: First DataFrame to inspect.
        df2: Second DataFrame to inspect.
        candidates: Column names to try, in priority order (exact match).

    Returns:
        The first shared column name, or None when no candidate appears in both.
    """
    for name in candidates:
        if name not in df1.columns:
            continue
        if name in df2.columns:
            return name
    return None

def _left_merge_with_report(base, other, key, merged_msg, skipped_msg):
    """Left-merge `other` onto `base` on `key`, reporting what happened.

    Args:
        base: Left-hand DataFrame; all of its rows are kept.
        other: Right-hand DataFrame to attach.
        key: Name of the column shared by both frames, or a falsy value
            (e.g. None) when no shared column was found.
        merged_msg: Message printed after a successful merge.
        skipped_msg: Message printed when `key` is falsy and the merge is skipped.

    Returns:
        The merged DataFrame, or `base` unchanged when `key` is falsy.
    """
    if not key:
        print(skipped_msg)
        return base

    merged = base.merge(other, on=key, how='left')
    print(merged_msg)

    # A left merge only gains rows when a key value occurs more than once in
    # the right-hand table; surface that instead of silently duplicating rows.
    extra_rows = len(merged) - len(base)
    if extra_rows > 0:
        print(f"  Warning: '{key}' is repeated in the right-hand table – "
              f"the merge added {extra_rows} extra row(s)")
    return merged


# Find a shared ID column between transactions and each of the other tables
key_tc = find_common(transactions, customers, candidates)
key_ts = find_common(transactions, social, candidates)

df = transactions.copy()                     # start with transactions

# Attach customer data, then social data; each step is skipped (with a
# message) when no shared key column was detected.
df = _left_merge_with_report(
    df, customers, key_tc,
    merged_msg=f"Customers merged on '{key_tc}'",
    skipped_msg="No common key with customers – continuing without customer data",
)
df = _left_merge_with_report(
    df, social, key_ts,
    merged_msg=f"Social profiles merged on '{key_ts}'",
    skipped_msg="No common key with social – continuing without social data",
)

print(f"\nMerged dataset shape: {df.shape}")
print(df.head())