# Capstone: Airbnb Price Listing Prediction
## Part 4 Model Tuning

_Authors: Evonne Tham_

In the previous notebook, XGBoost produced high $R^2$ scores of 0.9665 and 0.7849 on the train and validation sets respectively, and an $RMSE$ of 8995.01. Despite this, the model still needs to be tuned: narrowing the 324 features down to a more manageable number should make the model more generalisable and make inferences about the data easier to draw.

This will be done using the feature importances, a built-in attribute of XGBoost models, once the model has been fitted. The resulting model will be used as the production model in the next notebook.

## Contents of this notebook
- [1. Import Necessary Libraries and Load Data](#1.-Import-Necessary-Libraries-and-Load-Data)
- [2. Model Prep](#2.-Model-Prep)
- [3. GridSearch for Hyperparameter Tuning](#3.-GridSearch-for-Hyperparameter-Tuning)
    - [3.1. Defining Function for modelling](#3.1-Defining-Function-for-modelling)
    - [3.2. Fitting Models](#3.2.-Fitting-Models)
- [4. Model Evaluation](#4.-Model-Evaluation)
- [5. Re-training the Best Model (XGBoost)](#5.-Re-training-the-Best-Model-(XGBoost))
- [6. Feature Selection](#6.-Feature-Selection)
    - [6.1. Feature Importances](#6.1.-Feature-Importances)
    - [6.2. Dropping Features](#6.2.-Dropping-Features)
- [7. Re-training the XGBoost with Selected Features](#7.-Re-training-the-XGBoost-with-Selected-Features)
- [8. Learning Curve](#8.-Learning-Curve)


## 1. Import Necessary Libraries and Load Data

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# modelling
import time
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler 
from sklearn.linear_model import LinearRegression, ElasticNetCV
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score

# from sklearn.metrics import explained_variance_score, mean_squared_error, r2_score, accuracy_score
# from xgboost import plot_importance


#Hide warnings
import warnings
warnings.filterwarnings('ignore')

  import pandas.util.testing as tm


In [2]:
# Read the training listings and use the listing id as the row index
train = pd.read_csv('../datasets/train.csv').set_index('id')

# Report the size of the loaded table (rows x columns, id excluded)
print(f"Total Number of Listing: {train.shape[0]} | Total Number of Features: {train.shape[1]}")

Total Number of Listing: 10789 | Total Number of Features: 300


---
## 2. Model Prep

In [3]:
# Predictors: every column returned by train._get_numeric_data(),
# apart from the target and the identifier columns
features = [
    col
    for col in train._get_numeric_data().columns
    if col not in ('price', 'id', 'host_id')
]

# Design matrix and target
X, y = train[features], train['price']

# Hold out 25% of the listings as a validation set (fixed seed so the split is repeatable)
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.25, random_state=42
)

---
## 3. GridSearch for Hyperparameter Tuning

### 3.1 Defining Function for modelling

In [4]:
def _grid_search_spec(classifier):
    """Return ``(step_name, estimator, param_grid, n_jobs)`` for a model key.

    Only the requested estimator is instantiated.

    Raises:
        ValueError: if ``classifier`` is not "lr", "enet", "svr" or "xgb".
    """
    if classifier == "lr":
        # NOTE(review): `normalize` was removed from LinearRegression in
        # scikit-learn 1.2; drop this grid entry when upgrading.
        return ("clf_lr",
                LinearRegression(),
                [{'clf_lr__fit_intercept': [True, False],
                  'clf_lr__normalize': [True, False]}],
                1)

    if classifier == "enet":
        return ("clf_enet",
                ElasticNetCV(),
                [{'clf_enet__l1_ratio': [.1, .5, .7, .9, .95, .99, 1],
                  'clf_enet__n_alphas': [1, 10, 100, 1000, 10000]}],
                1)

    if classifier == "svr":
        return ("clf_svr",
                SVR(),
                [{"clf_svr__C": [1, 10],
                  "clf_svr__gamma": [0.001, 0.01, 0.1, 1],
                  "clf_svr__kernel": ('linear', 'rbf')}],
                1)

    if classifier == "xgb":
        # XGBoost is the only search run on all cores (n_jobs=-1).
        return ("clf_xgb",
                XGBRegressor(),
                [{'clf_xgb__gamma': [0, 0.3],
                  'clf_xgb__learning_rate': [0.05, 0.3],
                  'clf_xgb__max_depth': [2, 3, 5],
                  'clf_xgb__n_estimators': [1000],
                  'clf_xgb__subsample': [0.05, 0.3, 0.5]}],
                -1)

    raise ValueError(
        f"Unknown classifier {classifier!r}; expected one of 'lr', 'enet', 'svr', 'xgb'"
    )


def grid_modeller_val_scorer(classifier):
    """Grid-search one regression pipeline and report its scores.

    The pipeline is ``RobustScaler`` followed by the chosen estimator. It is
    tuned with a 5-fold ``GridSearchCV`` scored on R2, fitted on the
    notebook-level ``X_train`` / ``y_train`` and evaluated on ``X_val`` /
    ``y_val``. Run time, scores, the best estimator and the best parameters
    are printed.

    Args:
        classifier: one of "lr" (LinearRegression), "enet" (ElasticNetCV),
            "svr" (SVR) or "xgb" (XGBRegressor).

    Returns:
        list: ``[train_score, val_score, gs.best_score_, r2, mse, rmse]``,
        the same list that is printed. The scores labelled "accuracy" in the
        printout are R2 values, because the search is scored with "r2".

    Raises:
        ValueError: if ``classifier`` is not a supported key.
    """
    # Resolve the model first so that a bad key fails fast with a clear
    # message instead of an unbound ``gs`` further down.
    step_name, estimator, param_grid, n_jobs = _grid_search_spec(classifier)

    start = time.time()

    # Scaling lives inside the pipeline, so it is re-fitted on each CV fold.
    pipe = Pipeline([('rs', RobustScaler()),
                     (step_name, estimator)])

    gs = GridSearchCV(pipe,
                      param_grid,
                      cv=5,
                      n_jobs=n_jobs,
                      verbose=1,
                      scoring="r2")

    gs.fit(X_train, y_train)

    end = time.time()

    # Scores of the refitted best estimator on both splits
    train_score = gs.score(X_train, y_train)
    val_score = gs.score(X_val, y_val)
    y_pred = gs.predict(X_val)

    # R2, MSE and RMSE on the validation split
    r2 = r2_score(y_val, y_pred)
    mse = mean_squared_error(y_val, y_pred)
    rmse = mse ** 0.5

    metrics_list = [train_score, val_score, gs.best_score_, r2, mse, rmse]

    # Print out total run time
    print(f"Time taken to run: {round((end - start)/60,1)} minutes")
    print('==================================================================================')
    print('')

    # Print out scores, estimator and parameters from GridSearchCV
    print(f'Best train accuracy score = {train_score}')
    print(f'Best validation accuracy score = {val_score}')
    print(f'Best grid search score = {gs.best_score_}')
    print(f'R2 score = {r2}')
    print(f'Mean Square Error = {mse}')
    print(f"Root mean squared error = {rmse}")
    print('==================================================================================')
    print('')

    print(f'Best estimator = {gs.best_estimator_}')
    print(f'Best parameters = {gs.best_params_}')
    print('==================================================================================')
    print('')

    print(f"metrics list for {classifier}:", metrics_list)

    # Hand the metrics back so callers can build comparison tables directly
    # instead of copying numbers from the printed output.
    return metrics_list

In [5]:
# # Check for Parameters of model
# clf_xgb = XGBRegressor()
# pipe_xgb = Pipeline([('ss', RobustScaler()),
#                             ("clf_xgb",clf_xgb)])
# pipe_xgb.get_params().keys()

### 3.2. Fitting Models

#### i. Linear Regression

In [6]:
# Grid-search the RobustScaler + LinearRegression pipeline; run time, scores and best parameters are printed
grid_modeller_val_scorer("lr")

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    4.7s finished


Time taken to run: 0.1 minutes

Best train accuracy score = 0.8304163219962034
Best validation accuracy score = -1472075.2379907111
Best grid search score = -2170959888198331.2
R2 score = -1472075.2379907111
Mean Square Error = 629065072969101.9
Root mean squared error = 25081169.689013746

Best estimator = Pipeline(memory=None,
         steps=[('rs',
                 RobustScaler(copy=True, quantile_range=(25.0, 75.0),
                              with_centering=True, with_scaling=True)),
                ('clf_lr',
                 LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
                                  normalize=False))],
         verbose=False)
Best parameters = {'clf_lr__fit_intercept': True, 'clf_lr__normalize': False}

metrics list for lr: [0.8304163219962034, -1472075.2379907111, -2170959888198331.2, -1472075.2379907111, 629065072969101.9, 25081169.689013746]


#### ii. ElasticNetCV

In [None]:
# Grid-search the RobustScaler + ElasticNetCV pipeline; run time, scores and best parameters are printed
grid_modeller_val_scorer("enet")

Fitting 5 folds for each of 35 candidates, totalling 175 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


#### iii. Support Vector Regressor

In [None]:
# Grid-search the RobustScaler + SVR pipeline; run time, scores and best parameters are printed
grid_modeller_val_scorer("svr")

#### iv. Extreme Gradient Boosting Trees Regressor ("XGB") 

In [None]:
# Grid-search the RobustScaler + XGBRegressor pipeline; run time, scores and best parameters are printed
grid_modeller_val_scorer("xgb")

---
## 4. Model Evaluation
The evaluation metrics used will be mean squared error (for loss) and $R^2$ (for goodness of fit).

In [None]:
# Column labels, in the same order as the metrics list printed by the modelling function
metrics_list = ["train_score", "val_score", "gs.best_score_", "r2", "mse", "rmse"]

# Hard-coded metric values per model (presumably copied from an earlier run's printed output)
lr = [
    0.3874269279942756, -1.1662068491359909e+17,
    -1.7179846319682506e+16, -1.1662068491359909e+17,
    4.865832573216899e+25, 6975552001968.661,
]

enet = [
    0.3735451842108308, 0.35577415679938695,
    0.3441756543727746, 0.35577415679938695,
    268794090.4030937, 16394.94100029316,
]

svr = [
    0.09799514764047101, 0.09759825210558948,
    0.08737507533950861, 0.09759825210558948,
    376514322.6765994, 19403.97698093356,
]

xgb = [
    0.9665104649768522, 0.7849005858043501,
    0.7623192112420136, 0.7849005858043501,
    80910129.01162435, 8995.005781633738,
]

eval_data = [lr, enet, svr, xgb]
column_names = metrics_list
index = ["Linear Regression", "ElasticNetCV", "Support Vector Regressor", "XGBoost"]

# One row per model, one column per metric, rounded to 4 decimal places
eval_df = pd.DataFrame.from_dict(
    dict(zip(index, eval_data)),
    orient="index",
    columns=column_names,
).round(4)
eval_df

Based on this table and our particular output reproduced above, we can see that linear regression performed the worst: it has an $R^2$ of 0.3874 on the train set and $-1.17 \times 10^{17}$ on the validation set. A negative $R^2$ of this size means the model predicts far worse than simply predicting the mean price, i.e. it does not follow the trend of the data at all.

The Support Vector Regressor and ElasticNet also performed poorly, even though their train and validation scores are close to each other: $R^2$ scores of 0.098 and 0.3735 on the train set and 0.0976 and 0.3558 on the validation set, respectively.

Hence the clear winner is eXtreme Gradient Boosting. It has the highest $R^2$, 0.785 on the entire validation set, which means that our model is able to account for almost 80% of the variance in the target variable. Its $RMSE$ of 8995.01 means that our model's predictions are typically off by about ¥8995 when predicting the property's price. This is pretty good for a preliminary model using regression techniques.

***The best parameters are:*** 
- gamma: 0, 
- learning_rate: 0.05, 
- max_depth: 5, 
- n_estimators: 1000, 
- subsample: 0.3


---
## 5. Re-training the Best Model (XGBoost)

In [None]:
# Predictors: every column returned by train._get_numeric_data(),
# apart from the target and the identifier columns
features = [
    col
    for col in train._get_numeric_data().columns
    if col not in ('price', 'id', 'host_id')
]
X, y = train[features], train['price']

# Same 75/25 train/validation split as before (fixed seed)
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Learn the scaling on the training split only, then apply it to both splits
rs = RobustScaler().fit(X_train)
X_train_rs = rs.transform(X_train)
X_val_rs = rs.transform(X_val)

In [None]:
# Rebuild XGBoost with the best hyperparameters listed above
xgb = XGBRegressor(**{
    'gamma': 0,
    'learning_rate': 0.05,
    'max_depth': 5,
    'n_estimators': 1000,
    'subsample': 0.3,
})

# Train on the scaled training split
xgb.fit(X_train_rs, y_train)

# Validation predictions and their errors (actual minus predicted)
pred = xgb.predict(X_val_rs)
residuals = y_val - pred

In [None]:
# # Plot Residuals
# plt.figure(figsize=(15,8))

# plt.scatter(pred, residuals)
# plt.axhline(0, linestyle='-', color='r')
# plt.title('Residual distribution plot for XGBoost regression', fontsize=14)
# plt.xlabel('Predicted values', fontsize=12)
# plt.ylabel('Residuals', fontsize=12);

--- 
## 6. Feature Selection

### 6.1. Feature Importances

In [None]:
# One row per feature (column 0 holds its importance in the fitted model),
# ordered from most to least important
key_features = (
    pd.DataFrame([xgb.feature_importances_], columns=X.columns)
    .T
    .sort_values(by=0, ascending=False)
)
key_features.head()

In [None]:
# Horizontal bar chart with one bar per feature, in the order stored in key_features
fig, ax = plt.subplots(figsize=(10, 50))
ax.barh(key_features.index, key_features[0], align='center')
ax.set_title("Feature importances in the XGBoost model", fontsize=20)
ax.set_xlabel("Feature importance")
ax.margins(y=0.01)  # trim the empty space above and below the bars
plt.show()

<div class="alert alert-block alert-warning">
A good number of features have a feature importance of 0 in this XGBoost regression model, and could potentially be removed.

The top 10 most important features are:


The most important feature is the rental being the entire flat, which makes sense: the asking price is higher if the offer is for the entire flat/house. This could also suggest that offering the flat/house as a whole, rather than each bedroom individually, may be better overall, given the large difference in importance compared to the second most important feature.

It is not surprising that the second most important feature is how many people the property accommodates, as that's one of the main things you would use to search for properties in the first place.

It is perhaps more surprising that location features did not appear in the top ten. Although we can observe that belonging to certain neighbourhoods increases price more than others, and Score (the accessibility measure) also shows some importance, they are of relatively low importance compared to the top 3 features. Review Scores Location is higher on the importance list (number 11). That is, it is likely that renters put more weight on others' opinions about the location than on judging the location by the neighbourhood and the venues around the property. This could also be because Edinburgh is a small and walkable city with good transportation services. Thus, location is not a major obstacle to reaching the main tourist attractions and amenities.

The eighth most important feature is related to how many other listings the host manages on Airbnb, rather than to the listing itself. A similar result appeared in an analysis of Airbnb listings in London, except that there this feature was the third most important. The researcher (a former data scientist at an Airbnb management company) explains that this does not mean that a host who manages more properties will obtain higher prices for a listing; it could instead be due to experienced hosts setting higher prices. Also, it could be that big Airbnb management companies that have lots of listings tend to manage more expensive properties than single-listing hosts.
</div>

### 6.2. Dropping Features

## 7. Re-training the XGBoost with Selected Features

## 8. Learning Curve

In [None]:
# from sklearn.model_selection import learning_curve
# import matplotlib.pyplot as plt
# plt.style.use('ggplot')
# %matplotlib inline

# def plot_learning_curve(estimator, clf, X, y, ylim=None, cv=None, train_sizes=None):
#     plt.figure()
#     plt.title(f'Learning Curves ({clf})')
#     plt.ylim(*ylim)
#     plt.xlabel("Training examples")
#     plt.ylabel("Score")
#     train_sizes, train_scores, test_scores = learning_curve(
#         estimator, X, y, cv=cv, train_sizes=train_sizes)
#     train_scores_mean = np.mean(train_scores, axis=1)
#     train_scores_std = np.std(train_scores, axis=1)
#     test_scores_mean = np.mean(test_scores, axis=1)
#     test_scores_std = np.std(test_scores, axis=1)

#     plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
#                      train_scores_mean + train_scores_std, alpha=0.1,
#                      color="r")
#     plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
#                      test_scores_mean + test_scores_std, alpha=0.1, color="g")
#     plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
#              label="Training score")
#     plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
#              label="Cross-validation score")

#     plt.legend(loc="best")
#     plt.grid(True)
#     return

In [None]:
# train_sizes = np.linspace(.1, 1.0, 5)
# ylim = (0.9, 1.01)
# cv = 5

# plot_learning_curve(pipe_lr, "Linear Regression", X_val, y_val, 
#                     ylim=ylim, cv=cv, train_sizes=train_sizes)
# plot_learning_curve(pipe_enet, "ElasticNetCV", X_val, y_val, 
#                     ylim=ylim, cv=cv, train_sizes=train_sizes)
# plot_learning_curve(pipe_svr, "SVR", X_val, y_val, 
#                     ylim=ylim, cv=cv, train_sizes=train_sizes)
# plot_learning_curve(pipe_xgb, "XGBoost", X_val, y_val,
#                     ylim=ylim, cv=cv, train_sizes=train_sizes)

# plt.show()

----> Proceed to the next notebook for [Production Model](./05_Production_Model.ipynb)