In [11]:
import pandas as pd
import numpy as np
import xgboost as xgb

# Load data
df_train = pd.read_csv('/Users/moisempongo/Documents/uni-projects/hotel-estimation/data/raw/subset_train_set_VU_DM.csv')
df_test = pd.read_csv('/Users/moisempongo/Documents/uni-projects/hotel-estimation/data/raw/subset_test_set_VU_DM.csv')

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GroupKFold
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
from sklearn.metrics import ndcg_score




# 1. Data Loading and Preprocessing
# Column alignment
train_cols = set(df_train.columns)
test_cols = set(df_test.columns)

# Add missing columns to test data
for col in train_cols - test_cols:
    if df_train[col].dtype == 'object':
        df_test[col] = ''
    else:
        df_test[col] = 0  # Fill with 0 for numerical columns
        
# Data type alignment
for col in train_cols.intersection(test_cols):
    if df_train[col].dtype != df_test[col].dtype:
        try:
            df_test[col] = df_test[col].astype(df_train[col].dtype)
        except ValueError:
            print(f"Warning: Could not convert column '{col}' to {df_train[col].dtype}")
            # Handle the error appropriately

# Select the subset of columns in train data
df_test = df_test[list(train_cols)] # Reorder columns


# 2. Feature Engineering

# Define a function for feature engineering 
def engineer_features(df):
    # User-centric features
    df['user_total_clicks'] = df.groupby('srch_id')['click_bool'].transform('sum')
    df['user_total_bookings'] = df.groupby('srch_id')['booking_bool'].transform('sum')
    df['user_avg_booking_price'] = df.groupby('srch_id')['price_usd'].transform(
        lambda x: np.mean(x[df['booking_bool'] == 1])
    )
    df['date_time'] = pd.to_datetime(df['date_time'])
    df['day_of_week'] = df['date_time'].dt.dayofweek
    df['month'] = df['date_time'].dt.month
    df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)

    # Hotel-centric features
    competitor_cols = [col for col in df.columns if 'comp' in col and 'rate_percent_diff' in col]
    df['composite_hotel_score'] = df['prop_starrating'] + df['prop_review_score']
    for col in competitor_cols:
        df[col] = df[col].fillna(df[col].median())
        df[col] = (df[col] - df[col].min()) / (df[col].max() - df[col].min())

    # Interaction features
    df['price_diff_avg_booking'] = df['price_usd'] - df['user_avg_booking_price']

    # Target variable creation (for training data)
    if 'booking_bool' in df.columns:
        df['target'] = 0
        df.loc[df['booking_bool'] == 1, 'target'] = 5
        df.loc[df['click_bool'] == 1, 'target'] = 1

    # Feature Scaling
    numerical_features_to_scale = [
        'user_total_clicks', 'user_total_bookings', 'user_avg_booking_price',
        'price_diff_avg_booking', 'price_usd', 'prop_location_score1',
        'prop_location_score2', 'prop_log_historical_price'
    ] + competitor_cols
    scaler = MinMaxScaler()
    df[numerical_features_to_scale] = scaler.fit_transform(df[numerical_features_to_scale])

    return df

# Apply feature engineering to both training and test data
df_train = engineer_features(df_train)
df_test = engineer_features(df_test)

# 3. Data Splitting (for training data)

features = [
    col for col in df_train.columns if col not in ['target', 'srch_id', 'prop_id', 'booking_bool', 'click_bool', 'date_time']
]
X = df_train[features]
y = df_train['target']
groups = df_train['srch_id']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=groups)

# 4. Model Training using XGBoost

# Define parameters for XGBoost
params = {
    'objective': 'rank:pairwise',
    'eval_metric': 'ndcg@5',
    'eta': 0.1,
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
}

# Create DMatrix for training and validation
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)

# Train the XGBoost model
model = xgb.train(params, dtrain, num_boost_round=100, evals=[(dtrain, 'train'), (dval, 'val')], early_stopping_rounds=10)

# 5. Model Evaluation and Prediction

# Prepare test data for prediction
X_test = df_test[features]
dtest = xgb.DMatrix(X_test)

# Make predictions on test data
y_pred = model.predict(dtest)

# Get search IDs for grouping
test_srch_ids = df_test['srch_id'].values

# Calculate NDCG@5
ndcg_score = ndcg_at_k(y_test.values, y_pred, k=5, group_labels=test_srch_ids)
print(f"NDCG@5 on test data: {ndcg_score}")

# 6. Submission File Creation

# Create submission dataframe
submission_df = pd.DataFrame({'SearchId': df_test['srch_id'], 'PropertyId': df_test['prop_id'], 'Score': y_pred})

# Sort by SearchId and Score
submission_df = submission_df.sort_values(['SearchId', 'Score'], ascending=[True, False])

# Save submission file
submission_df[['SearchId', 'PropertyId']].to_csv('submission.csv', index=False)

  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))


[0]	train-ndcg@5:1.00000	val-ndcg@5:1.00000
[1]	train-ndcg@5:1.00000	val-ndcg@5:1.00000
[2]	train-ndcg@5:1.00000	val-ndcg@5:1.00000
[3]	train-ndcg@5:1.00000	val-ndcg@5:1.00000
[4]	train-ndcg@5:1.00000	val-ndcg@5:1.00000
[5]	train-ndcg@5:1.00000	val-ndcg@5:1.00000
[6]	train-ndcg@5:1.00000	val-ndcg@5:1.00000
[7]	train-ndcg@5:1.00000	val-ndcg@5:1.00000
[8]	train-ndcg@5:1.00000	val-ndcg@5:1.00000
[9]	train-ndcg@5:1.00000	val-ndcg@5:1.00000
[10]	train-ndcg@5:1.00000	val-ndcg@5:1.00000


NameError: name 'ndcg_at_k' is not defined

In [None]:
sub

Unnamed: 0,SearchId,PropertyId
15,150,74211
30,150,128711
5,150,39493
8,150,53072
14,150,73865
...,...,...
99105,332757,116758
99100,332757,83037
99095,332757,21727
99099,332757,82013
