In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import accuracy_score, ndcg_score
import plotly.express as px
import warnings
# NOTE(review): with no category/module argument this ignores *every* warning
# raised after this point, including library deprecation warnings. Consider
# narrowing it (e.g. category=FutureWarning) once the notebook is stable.
warnings.filterwarnings('ignore')

  from pandas import MultiIndex, Int64Index


# Airbnb Dataset

In [2]:
# Read the training users (with country_destination) and the test users,
# then show both shapes as a quick sanity check.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("datasets/airbnb/train_users_2.csv",
                     "datasets/airbnb/test_users.csv")
)
(train_df.shape, test_df.shape)

((213451, 16), (62096, 15))

In [3]:
# Preview the first five training rows.
train_df.head(n=5)

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,country_destination
0,gxn3p5htnn,2010-06-28,20090319043255,,-unknown-,,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,NDF
1,820tgsjxq7,2011-05-25,20090523174809,,MALE,38.0,facebook,0,en,seo,google,untracked,Web,Mac Desktop,Chrome,NDF
2,4ft3gnwmtx,2010-09-28,20090609231247,2010-08-02,FEMALE,56.0,basic,3,en,direct,direct,untracked,Web,Windows Desktop,IE,US
3,bjjt8pjhuk,2011-12-05,20091031060129,2012-09-08,FEMALE,42.0,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Firefox,other
4,87mebub9p4,2010-09-14,20091208061105,2010-02-18,-unknown-,41.0,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,US


In [4]:
# Drop id column
# Rebind instead of mutating in place, and tolerate a missing column, so that
# re-running this cell does not raise KeyError once "id" is already gone.
train_df = train_df.drop(columns="id", errors="ignore")

In [5]:
# Parse the account-creation date and derive year / month / day features
# for both the training and the test frame.
for data in [train_df, test_df]:
    data['date_account_created'] = pd.to_datetime(data['date_account_created'])
    data['account_year'] = data['date_account_created'].dt.year
    data['account_month'] = data['date_account_created'].dt.month
    # Each frame must use its *own* dates: reading train_df here would give
    # test_df the (index-aligned) days of unrelated training rows.
    data['account_day'] = data['date_account_created'].dt.day

In [6]:
# Feature columns shared by the training and test frames. Defined once so the
# two selections cannot drift apart.
feature_cols = [
    'timestamp_first_active',
    'gender',
    'age',
    'signup_method',
    'signup_flow',
    'language',
    'affiliate_channel',
    'affiliate_provider',
    'signup_app',
    'first_device_type',
    'first_browser',
    'account_year',
    'account_month',
    'account_day',
]

# .copy() gives each frame its own data, so the column assignments in later
# cells modify these frames directly rather than a flagged slice of the
# originally loaded data.
# The target stays the last column of train_df.
train_df = train_df[feature_cols + ['country_destination']].copy()
test_df = test_df[feature_cols].copy()

In [7]:
# Treat ages below 18 or above 100 as missing in both frames.
for frame in (train_df, test_df):
    implausible_age = (frame['age'] < 18) | (frame['age'] > 100)
    frame['age'] = frame['age'].mask(implausible_age)

In [8]:
# replace nan age values with mean age (each frame uses its own mean)
for data in [train_df, test_df]:
    # Assign the result back to the column: calling fillna(inplace=True) on
    # `data.age` acts on an intermediate Series and does not update the frame
    # under pandas Copy-on-Write.
    data['age'] = data['age'].fillna(data['age'].mean())

In [9]:
# Per-column count of missing values remaining after pre-processing
train_df.isnull().sum(axis=0)

timestamp_first_active    0
gender                    0
age                       0
signup_method             0
signup_flow               0
language                  0
affiliate_channel         0
affiliate_provider        0
signup_app                0
first_device_type         0
first_browser             0
account_year              0
account_month             0
account_day               0
country_destination       0
dtype: int64

In [10]:
def remove_outliers(df, name="feature", factor=1.5):
    """Drop rows whose ``name`` value lies outside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR`` (both
    inclusive), where Q1/Q3 are the 25th/75th percentiles of ``df[name]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    name : str
        Column the outlier test is applied to.
    factor : float, default 1.5
        Multiplier applied to the interquartile range.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose ``name`` value is within the fences. Rows
        where ``name`` is NaN are never kept.
    """
    values = df[name]
    present = values.notna()
    if not present.any():
        # Nothing to compute percentiles from; every row is NaN (or df is empty).
        return df[present]
    # nanpercentile ignores missing values; a plain percentile would return NaN
    # as soon as one NaN is present and the filter below would drop every row.
    q25, q75 = np.nanpercentile(values, [25, 75])
    iqr_cut = factor * (q75 - q25)
    lower, upper = q25 - iqr_cut, q75 + iqr_cut
    return df[(values >= lower) & (values <= upper)]

In [11]:
# Work on a copy so train_df itself is not affected by the outlier filter.
train = train_df.copy()
train = remove_outliers(train, 'age')

# Define inputs and target cols (selected by name, not by column position)
target_col = ['country_destination']
inputs_col = train.columns.drop(target_col)

# Define inputs
inputs = train[inputs_col].copy()
target = train[target_col].copy()

# Define numerical and categorical columns.
# 'number' / not-'number' partitions every column. The previous
# ['int64', 'float64'] filter silently dropped any other integer width from X
# (the .dt.year/.dt.month/.dt.day parts are int32 on recent pandas releases).
numerical_cols = inputs.select_dtypes(include='number').columns.to_list()
categorical_cols = inputs.select_dtypes(exclude='number').columns.to_list()

# Normalization: scale every numerical column to [0, 1]
scaler = MinMaxScaler().fit(inputs[numerical_cols])
inputs[numerical_cols] = scaler.transform(inputs[numerical_cols])

# label encoding: keep each fitted encoder so the identical category -> integer
# mapping can be re-applied to other data (e.g. test_df) later.
label_encoders = {}
for col in categorical_cols:
    encoder = LabelEncoder().fit(inputs[col])
    inputs[col] = encoder.transform(inputs[col])
    label_encoders[col] = encoder

# Fixed class ids for the target; an unexpected label raises KeyError.
enc_countries = {'NDF':0,'US':1,'FR':2,'CA':3,'GB':4,'ES':5,'IT':6,'PT':7,'NL':8,'DE':9,'AU':10,'other':11}
target['country_destination'] = target['country_destination'].apply(lambda x:enc_countries[x])


# Define X variable (numerical columns first, then the encoded categoricals)
X = inputs[numerical_cols + categorical_cols]
# Define y variable
y = target['country_destination']

In [12]:
# Hold out 25% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=3,
)

In [13]:
# Row counts of the training and held-out partitions
print('X_train:', len(X_train))
print('y_train:', len(y_train))
print('X_val:', len(X_test))
# This line reports the held-out targets, so it is labelled y_val (it used to
# print a second 'y_train:').
print('y_val:', len(y_test))

X_train: 127762
y_train: 127762
X_val: 42588
y_train: 42588


In [14]:
# eval_metric is configured on the estimator: passing it to fit() is deprecated
# in newer xgboost releases and no longer accepted by the latest ones.
xgb = XGBClassifier(random_state=3, n_jobs=-1, max_depth=3, n_estimators=100,
                    objective='multi:softprob', learning_rate=0.3,
                    use_label_encoder=False, eval_metric="merror")
xgb.fit(X_train, y_train)

# Accuracy: arguments in scikit-learn's (y_true, y_pred) order.
train_accuracy_score = accuracy_score(y_train, xgb.predict(X_train))
test_accuracy_score = accuracy_score(y_test, xgb.predict(X_test))

# NDCG over the full class ranking. The one-hot truth matrix is re-indexed to
# xgb.classes_ so its columns line up with predict_proba's columns even when a
# rare class is absent from one of the partitions.
train_relevance = pd.get_dummies(y_train, dtype=int).reindex(columns=xgb.classes_, fill_value=0).to_numpy()
test_relevance = pd.get_dummies(y_test, dtype=int).reindex(columns=xgb.classes_, fill_value=0).to_numpy()
train_ndcg_score = ndcg_score(train_relevance, xgb.predict_proba(X_train))
test_ndcg_score = ndcg_score(test_relevance, xgb.predict_proba(X_test))

In [15]:
# Accuracy (train, test) followed by NDCG (train, test)
(
    train_accuracy_score,
    test_accuracy_score,
    train_ndcg_score,
    test_ndcg_score,
)

(0.6606737527590363,
 0.6601155255001409,
 0.8512604299810991,
 0.8505165619174855)

In [16]:
# One row per feature with its fitted importance, smallest first.
xgb_importance_df = (
    pd.DataFrame({'features': X.columns, 'importance': xgb.feature_importances_})
    .sort_values(by='importance')
)

In [17]:
# Horizontal bar chart: one bar per feature, length = importance.
px.bar(
    data_frame=xgb_importance_df,
    x='importance',
    y='features',
)

In [18]:
# save model

import os
import pickle

# Create the output directory on a fresh checkout instead of failing in open().
os.makedirs("pkl_files", exist_ok=True)
# The context manager closes the file even if pickling raises.
with open("pkl_files/XGBoost.pkl", "wb") as pkl_file:
    pickle.dump(xgb, pkl_file)