In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import accuracy_score, ndcg_score
import plotly.express as px
import random
import warnings
warnings.filterwarnings('ignore')

  from pandas import MultiIndex, Int64Index


# Recommender System

In [2]:
cities = pd.read_csv('datasets/recommender/city_codes.csv', encoding="latin-1")
cities.loc[cities.destination == "IAH"]

Unnamed: 0,city,destination,country
3589,Houston,IAH,United States


In [3]:
data = [   
    ('user_id', 'airport_id', 'airport', 'region', 'season'),
    # IF u355541 flew 8 times in the summer
        '''
        4 to asia    - then asian summer flights get a  4 - 0.2
        2 to florida - if florida destination is twice, then +0.2
        1 to europe  
        1 to latin
        '''
]

In [4]:
origs = [   
    (1, 'ORD', 'Midwest'),
    (2, 'LAX', 'West'),
    (3, 'SFO', 'West'),
    (4, 'IAH', 'South'),
    (5, 'DEN', 'Snow'),
    (6, 'IAD', 'East'),
    (7, 'EWR', 'East')
]

dests = [
    (1, 'ORD', 'Midwest'),
    (2, 'LAX', 'Sun'),
    (3, 'SFO', 'West'),
    (4, 'IAH', 'Sun'),
    (5, 'DEN', 'Snow'),
    (6, 'IAD', 'East'),
    (7, 'EWR', 'East'),
    (8, 'MCO', 'Sun'),
    (9, 'TPA', 'Sun'),
    (10, 'MIA', 'Sun'),
    (11, 'RSW', 'Sun'),

    # Europe
    (12, 'LHR', 'EURO'),
    (13, 'CVG', 'EURO'),
    (14, 'AMS', 'EURO'),
    (15, 'DUB', 'EURO'),
    (16, 'BRU', 'EURO'),
    (17, 'MUC', 'EURO'),
    (18, 'FRA', 'EURO'),
    (19, 'BHM', 'EURO'),

    # Asia
    (20, 'ICN', 'ASIA'),
    (21, 'NRT', 'ASIA'),
    (22, 'KIX', 'ASIA'),
    (23, 'PVG', 'ASIA'),
    (24, 'HKG', 'ASIA'),
    (25, 'PEK', 'ASIA'),
    (26, 'TPE', 'ASIA'),
    (27, 'SIN', 'ASIA'),

    # Latin America
    (28, 'EZE', 'LAT'),
    (29, 'GRU', 'LAT'),
    (30, 'GIG', 'LAT'),
    (31, 'BOG', 'LAT'),
    (32, 'SCL', 'LAT'),
    (33, 'TGU', 'LAT'),
    (34, 'MEX', 'LAT')
]

In [5]:
# Create an empty DataFrame
data = pd.DataFrame()

# Select amount of random data to generate 
n = 3000

# Append random data to new DataFrame columns
data['user_id'] = [random.choice(range(1,99)) for i in range(n)] 
data['origin_tuple'] = [random.choice(origs) for i in range(n)]
data['destination_tuple'] = [random.choice(dests) for i in range(n)]

# Transform Tuple() items to DataFrame column
data = data.join(pd.DataFrame(data['origin_tuple'].values.tolist(), columns = 
                              ['orig_id', 'orig', 'orig_reg']
                             )
                )
data = data.join(pd.DataFrame(data['destination_tuple'].values.tolist(), columns = 
                              ['dest_id', 'dest', 'dest_reg']
                             )
                )
data = data[[
    'user_id',
    'dest_id',
    'orig',
    'dest',
    'dest_reg'
]]

In [6]:
counts = data.groupby(['user_id', 'dest_reg'], as_index = False)['dest'].count()
counts = counts.rename(columns = {'dest': 'freq'})

data = pd.merge(data, counts, on = ['user_id', 'dest_reg'], how = 'left')

In [77]:
data.to_csv("datasets/recommender/user_flight_data.csv", index=False)

In [8]:
# flight_data = pd.read_csv("datasets/recommender/user_flight_data.csv")

In [40]:
import implicit
import scipy.sparse as sparse

'''
The implicit library expects data as a item-user matrix so we create two matricies,
one for fitting the model (item-user) and one for recommendations (user-item)
'''
sparse_item_user = sparse.csr_matrix((data['freq'].astype(float), (data['dest_id'], data['user_id'])))
sparse_user_item = sparse.csr_matrix((data['freq'].astype(float), (data['user_id'], data['dest_id'])))

In [41]:
# Initialize the als model and fit it using the sparse item-user matrix
model = implicit.als.AlternatingLeastSquares(factors = 20, regularization = 0.1, iterations = 20)

# Calculate the confidence by multiplying it by our alpha value.
alpha_val = 15
data_conf = (sparse_user_item * alpha_val).astype('double')

#Fit the model
model.fit(data_conf)

100%|██████████| 20/20 [00:00<00:00, 1538.46it/s]


In [78]:
# Create recommendations for user with id
user_id = int(input('Enter User ID: '))

# Use the implicit recommender
indexes, score = model.recommend(user_id, sparse_user_item[user_id])

airports = []
scores = []

for i in range(len(indexes)):
    if not data.dest.loc[data.user_id == indexes[i]].empty:
        airports.append(data.dest.loc[data.user_id == indexes[i]].iloc[0])
        scores.append(score[i].round(3))

# recommendations = pd.DataFrame({"destination": airports , "score": scores})
recommendations = pd.DataFrame({"destination": airports})
recommendations

Unnamed: 0,destination
0,TPA
1,IAH
2,GIG
3,LHR
4,HKG
5,MIA
6,BRU
7,IAD
8,BRU
9,DUB


In [76]:
final = pd.merge(recommendations, cities, on="destination", how="left")
final

Unnamed: 0,destination,score,city,country
0,TPA,1.023,Tampa,United States
1,IAH,0.932,Houston,United States
2,GIG,0.877,Rio De Janeiro,Brazil
3,LHR,0.819,London,United Kingdom
4,HKG,0.773,Hong Kong,China
5,MIA,0.711,Miami,United States
6,BRU,0.685,Brussels,Belgium
7,IAD,0.58,"Dulles, DC",United States
8,BRU,0.54,Brussels,Belgium
9,DUB,0.51,Dublin,Ireland


# Airbnb Dataset

In [None]:
train_df = pd.read_csv("datasets/airbnb/train_users_2.csv")
test_df = pd.read_csv("datasets/airbnb/test_users.csv")
train_df.shape, test_df.shape

In [None]:
train_df.head()

In [None]:
# Drop id column
train_df.drop(columns="id", inplace=True)

In [None]:
for data in [train_df, test_df]:
    data.date_account_created = pd.to_datetime(data['date_account_created'])
    data['account_year'] = data.date_account_created.dt.year
    data['account_month'] = data.date_account_created.dt.month
    data['account_day'] = train_df.date_account_created.dt.day

In [None]:
train_df = train_df[['timestamp_first_active',
 'gender',
 'age',
 'signup_method',
 'signup_flow',
 'language',
 'affiliate_channel',
 'affiliate_provider',
 'signup_app',
 'first_device_type',
 'first_browser',
 'account_year',
 'account_month',
 'account_day','country_destination']]

test_df = test_df[['timestamp_first_active',
 'gender',
 'age',
 'signup_method',
 'signup_flow',
 'language',
 'affiliate_channel',
 'affiliate_provider',
 'signup_app',
 'first_device_type',
 'first_browser',
 'account_year',
 'account_month',
 'account_day']]

In [None]:
for data in [train_df,test_df]:
    data.age = data.age.apply(lambda x: np.nan if x<18 else x)
    data.age = data.age.apply(lambda x: np.nan if x>100 else x)

In [None]:
# replace nan age values with mean age
for data in [train_df,test_df]:
    data.age.fillna(data.age.mean(),inplace=True)

In [None]:
# check for any nan values after pre-processing
train_df.isna().sum()

In [None]:
def remove_outliers(df, name="feature"):
    #Store the 25th and 75th percentile 
    q25,q75 = np.percentile(df[name],25), np.percentile(df[name],75)
    #Calculate the Interquartile range
    iqr_cut = 1.5*(q75-q25)
    #Create variable of lower and upper cut
    lower,upper = q25-iqr_cut, q75+iqr_cut
    #Remove the outliers
    df = df[(df[name] >= lower) & (df[name] <= upper)]
    return df

In [None]:
train = train_df.copy()
train = remove_outliers(train,'age')

# Define inputs and target cols
inputs_col = train.columns[:-1]
target_col = ['country_destination']

# Define inputs
inputs = train[inputs_col].copy()
target = train[target_col].copy()

# Define numerical and categorical columns
numerical_cols = inputs.select_dtypes(include=['int64','float64']).columns.to_list()
categorical_cols = inputs.select_dtypes(include='object').columns.to_list()

# Normalization
scaler = MinMaxScaler().fit(inputs[numerical_cols])
inputs[numerical_cols] = scaler.transform(inputs[numerical_cols])

# label encoding
for col in categorical_cols:
    # label encoding
    encoder = LabelEncoder().fit(inputs[col])
    inputs[col] = encoder.transform(inputs[col])
    
enc_countries = {'NDF':0,'US':1,'FR':2,'CA':3,'GB':4,'ES':5,'IT':6,'PT':7,'NL':8,'DE':9,'AU':10,'other':11}
target['country_destination'] = target['country_destination'].apply(lambda x:enc_countries[x])


# Define X variable
X = inputs[numerical_cols + categorical_cols]
# Define y variable
y = target['country_destination']

In [None]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=3)

In [None]:
print('X_train:', len(X_train))
print('y_train:', len(y_train))
print('X_val:', len(X_test))
print('y_train:', len(y_test))

In [None]:
xgb = XGBClassifier(random_state=3, n_jobs=-1, max_depth=3, n_estimators=100, objective='multi:softprob', learning_rate=0.3, use_label_encoder=False)
xgb.fit(X_train, y_train, eval_metric="merror")
train_accuracy_score = accuracy_score(xgb.predict(X_train), y_train)
test_accuracy_score = accuracy_score(xgb.predict(X_test), y_test)
train_ndcg_score = ndcg_score(pd.get_dummies(y_train).to_numpy(), xgb.predict_proba(X_train))
test_ndcg_score = ndcg_score(pd.get_dummies(y_test).to_numpy(), xgb.predict_proba(X_test))

In [None]:
train_accuracy_score, test_accuracy_score, train_ndcg_score, test_ndcg_score

In [None]:
xgb_importance_df = pd.DataFrame({'features': X.columns,
                  'importance': xgb.feature_importances_}).sort_values('importance',ascending=True)

In [None]:
px.bar(xgb_importance_df, x='importance', y='features')

In [None]:
# save model

import pickle
pkl_file = open("pkl_files/XGBoost_airbnb.pkl", "wb")
pickle.dump(xgb, pkl_file)
pkl_file.close()