In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import xgboost as xgb

In [None]:
# Directory holding the Kaggle Expedia CSV files. Kept in one place so the
# notebook can be pointed at another machine's copy by editing a single line.
DATA_DIR = '/Users/dominicdebiaso/Development/datasets'

# Training rows keep the default RangeIndex; test rows are indexed by their
# `id` column so that predictions can later be written out keyed by id.
df_train = pd.read_csv(DATA_DIR + '/kaggle_expedia_train.csv')
df_test = pd.read_csv(DATA_DIR + '/kaggle_expedia_test.csv', index_col='id')

### Data Processing

In [None]:
# Preview the first five training rows to sanity-check the load.
df_train.head(n=5)

In [None]:
# Per-column count of missing values in the training set.
train_null_mask = df_train.isnull()
train_null_mask.sum()

In [None]:
# Per-column count of missing values in the test set.
test_null_mask = df_test.isnull()
test_null_mask.sum()

In [None]:
# Columns that exist in the training frame but not in the test frame.
train_only_columns = set(df_train.columns).difference(set(df_test.columns))
print(list(train_only_columns))

In [None]:
## Clean up columns
# Name of the label column; selecting it by name (rather than "whatever is
# last") fails loudly with a KeyError if the column was dropped or reordered.
TARGET_COL = 'hotel_cluster'

# Keep only fully-populated numeric columns. 'cnt' and 'is_booking' are dropped
# from train only (presumably they are absent from the test file -- see the
# column-difference cell); 'user_id' is dropped from both frames.
train = df_train.dropna(axis=1)
train = train.drop(['cnt', 'is_booking', 'user_id'], axis=1)
train = train.select_dtypes(include=[np.number])

test = df_test.dropna(axis=1)
test = test.drop(['user_id'], axis=1)
test = test.select_dtypes(include=[np.number])

## Split data into X and y
# dropna(axis=1) ran on each frame independently, so the surviving columns may
# differ between train and test. Restrict both to the shared features, in the
# same order, so the scaler and the model see identical layouts.
feature_cols = [col for col in train.columns
                if col != TARGET_COL and col in test.columns]
assert feature_cols, "train and test share no usable numeric feature columns"

X_train = train[feature_cols]
X_test = test[feature_cols]
y_train = train[TARGET_COL]
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)

# Fit the scaler on train only, then apply the same transform to test.
# StandardScaler returns plain ndarrays, so wrap the results back into
# DataFrames to keep the column names and the test `id` index, which the
# submission step needs.
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train),
                       index=X_train.index, columns=feature_cols)
X_test = pd.DataFrame(scaler.transform(X_test),
                      index=X_test.index, columns=feature_cols)

### Model Development

In [None]:
# Wrap the prepared features / labels in XGBoost's native data container.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test)

param = {
    # 'mae@5' (previously passed to xgb.cv) is not an XGBoost metric name.
    # 'mlogloss' is the documented multiclass log-loss metric, so it is used
    # for CV and early stopping. XGBoost documents `map` as a ranking metric;
    # NOTE(review): if MAP@5 itself must be monitored, supply a custom
    # evaluation function -- confirm.
    'eval_metric': 'mlogloss',
    'objective': 'multi:softprob',  # softprob gives a probability for every class
                                    # softmax gives only the class with the max prob
    'num_class': 100
}

# Cross-validate with early stopping. The returned frame has one row per
# boosting round kept, so its length is the number of rounds to train with.
xgb_cv = xgb.cv(param, dtrain, num_boost_round=1000, nfold=5,
                metrics=[param['eval_metric']], early_stopping_rounds=100)
best_nrounds = xgb_cv.shape[0]

bst = xgb.train(param, dtrain, num_boost_round=best_nrounds)

# multi:softprob yields one probability per class for every row; reshape
# defensively to (n_rows, n_classes) in case the output comes back flattened.
preds = bst.predict(dtest).reshape(dtest.num_row(), -1)

# Rank classes by descending probability and keep the five most likely per
# row. XGBoost requires labels in [0, num_class), so the column position is
# the hotel_cluster label itself.
top5 = np.argsort(-preds, axis=1)[:, :5]

# One space-separated string of cluster ids per test row, keyed by the test
# `id` index. `test` is used for the index because the scaled feature matrix
# may be a bare ndarray without one.
df_xgb = pd.DataFrame(
    {'hotel_cluster': [' '.join(str(cluster) for cluster in row) for row in top5]},
    index=test.index,
)
# df_xgb.to_csv('/Users/dominicdebiaso/Desktop/kaggle_expedia_xgb.csv')