# Predicting Airbnb Prices for Munich

The goal of our data mining project is to predict prices for new Airbnb listings in Munich. To achieve this, we will train a regression model on existing Airbnb data from www.insideairbnb.com.

## Table of Contents
##### [1 Preprocessing](#preprocessing)
##### [2 Data Mining](#data_mining)
##### [3 Interpretation and Evaluation](#interpretation_evaluation)

<a id='preprocessing'></a>
## 1 Preprocessing

In [49]:
%run modules/preprocessing.py
%run modules/evaluation.py

In [50]:
features, label = load_and_preprocess_dataset()

2019-11-24 20:37:00 : Dataset loaded and preprocessed.


In [57]:
#selected_features = select_best_features_f(features, label, number_of_features = 10)
selected_features = features[['accommodates', 'beds', 'bedrooms', 'host_is_superhost', 'distance_centre']]
features_train, features_test, label_train, label_test = stratified_train_test_split(selected_features, label)
features_train, label_train = delete_price_outliers(features_train, label_train)
print(selected_features.head())

   accommodates  beds  bedrooms  host_is_superhost  distance_centre
0             2   1.0       1.0                  0         0.009353
1             2   1.0       1.0                  1         0.090967
2             5   3.0       1.0                  0         0.063802
3             4   1.0       1.0                  1         0.018608
4             4   1.0       1.0                  0         0.109902


<a id='data_mining'></a>
## 2 Data Mining

### 2.1 Evaluation of a Dummy Regressor

In [43]:
baseline_prediction(features_train, features_test, label_train, label_test)

Performance of Dummy Regressor (Mean) : 142.69050160645492


### 2.2 Evaluation of Support Vector Regression

In [56]:
# Basic Single Run of Support Vector Regression

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR

scores = []
k_fold_cross_validation = KFold(10, True, 1)
for train_index, test_index in k_fold_cross_validation.split(features_train):
        
    # Support Vector Regressor (SVR)
    svr = SVR(kernel='linear', C=1, epsilon=0.1)
    svr.fit(features_train, label_train)  
    
    # Evaluation using testing dataset
    scores.append(svr.score(features_test, label_test))  

# Calculate performance measures
print(np.mean(scores), " - ", features_test.columns)

0.2902009321279804  -  Index(['accommodates', 'beds', 'bedrooms'], dtype='object')


In [8]:
# Iterate every number of best features on SVR

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR

best_r_squared = 0

for i in range(1, 15):
    
    # Prepare features
    selected_features = select_best_features_f(features, label, number_of_features = i)
    features_train, features_test, label_train, label_test = stratified_train_test_split(selected_features, label)
    features_train, label_train = delete_price_outliers(features_train, label_train)
    print(selected_features.columns)

    # Support Vector Regressor (SVR)
    svr = SVR(kernel='linear', C=0.7, epsilon=0.7)
    svr.fit(features_train, label_train)  

    # Evaluate Results
    r_squared = svr.score(features_test, label_test)
    predictions = svr.predict(features_test)
    root_mean_squared_error = calculate_root_mean_squared_error(label_test, predictions)

    print("R2 :", str(r_squared))
    print("Root Mean Squared Error :", str(root_mean_squared_error))
    
    if(r_squared > best_r_squared):
        best_r_squared = r_squared
    
    print("Best R2 :", best_r_squared)

Index(['accommodates'], dtype='object')
R2 : 0.28231957313550826
Root Mean Squared Error : 120.44740971028456
Best R2 : 0.28231957313550826
Index(['accommodates', 'beds'], dtype='object')
R2 : 0.2865571595502657
Root Mean Squared Error : 120.09128883003923
Best R2 : 0.2865571595502657
Index(['accommodates', 'beds', 'bedrooms'], dtype='object')
R2 : 0.2901585923205554
Root Mean Squared Error : 119.7877971750042
Best R2 : 0.2901585923205554
Index(['accommodates', 'beds', 'bedrooms', 'cleaning_fee'], dtype='object')
R2 : 0.29005303041719566
Root Mean Squared Error : 119.79670378262847
Best R2 : 0.2901585923205554
Index(['accommodates', 'beds', 'bedrooms', 'cleaning_fee', 'room_type'], dtype='object')
R2 : 0.2889670717406365
Root Mean Squared Error : 119.88829129855677
Best R2 : 0.2901585923205554
Index(['accommodates', 'beds', 'bedrooms', 'cleaning_fee', 'room_type',
       'cancellation_policy'],
      dtype='object')
R2 : 0.2876677750742961
Root Mean Squared Error : 119.9977794493154
Be

In [9]:
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import f_regression
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import itertools
from math import sqrt

# test different regression approaches
estimators = [ LinearRegression(), Ridge(), KNeighborsRegressor(), DecisionTreeRegressor(), MLPRegressor(), SVR() ]
svr = [ SVR(kernel='linear') ]
pipeline = Pipeline( [ ('preprocessing', StandardScaler()), ('estimator', None) ])

# define a parameter grid
parameters = {
    'estimator': svr
}

# define and run a grid search using MSE as scoring metric
search = GridSearchCV(pipeline, parameters, cv=10, scoring='neg_mean_squared_error')
search.fit(features_train, label_train)

# evaluate on test set
predictions = search.predict(features_test)
mse = mean_squared_error(label_test, predictions)
r2 = r2_score(label_test, predictions)

print("Best Model: {}".format(search.best_params_))
print("RMSE: {}".format(sqrt(mse)))
print("R^2: {}".format(r2))

Best Model: {'estimator': SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
    gamma='auto_deprecated', kernel='linear', max_iter=-1, shrinking=True,
    tol=0.001, verbose=False)}
RMSE: 119.91004283624567
R^2: 0.288709040495277


### 2.3 Evaluation of the Support Vector Machine

In [60]:
%run modules/evaluation.py

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR

best_r2 = 0

# Generate all feature combinations
feature_combinations = generate_feature_combinations(features_train)

for feature_combination in feature_combinations:
    
    # Filter the selected features
    selected_features_train = features_train[feature_combination]
    selected_features_test = features_test[feature_combination]

    # Support Vector Regressor (SVR) using training dataset
    svr = SVR(kernel='linear', C=0.2, epsilon=0.1)
    svr.fit(selected_features_train, label_train)

    # Evaluation using testing dataset
    score = svr.score(selected_features_test, label_test) 

    # Calculate performance measures
    print(score, " - ", feature_combination)

    # Save best model
    if(score > best_r2):
          best_r2 = score
            
    print("Best: ", best_r2)

0.2834380434283179  -  ['accommodates']
Best:  0.2834380434283179
0.15482218098492562  -  ['beds']
Best:  0.2834380434283179
0.07205478751841932  -  ['bedrooms']
Best:  0.2834380434283179
-0.07003971842387413  -  ['host_is_superhost']
Best:  0.2834380434283179
-0.08010452658419909  -  ['distance_centre']
Best:  0.2834380434283179
0.2891591720060819  -  ['accommodates', 'beds']
Best:  0.2891591720060819
0.28442730223231827  -  ['accommodates', 'bedrooms']
Best:  0.2891591720060819
0.2823672676349661  -  ['accommodates', 'host_is_superhost']
Best:  0.2891591720060819
0.2832951087475435  -  ['accommodates', 'distance_centre']
Best:  0.2891591720060819
0.15771569035969157  -  ['beds', 'bedrooms']
Best:  0.2891591720060819
0.1547815809946922  -  ['beds', 'host_is_superhost']
Best:  0.2891591720060819
0.15505859341669592  -  ['beds', 'distance_centre']
Best:  0.2891591720060819
0.07707866181322964  -  ['bedrooms', 'host_is_superhost']
Best:  0.2891591720060819
0.07199101526620133  -  ['bedro

### 2.4 Evaluation of Linear Regression

In [102]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from math import sqrt
from sklearn.model_selection import train_test_split

#, stratify=y_binned
#, random_state = 42
x_train, x_test, y_train, y_test = train_test_split(features, label, test_size = 0.2, random_state = 42, stratify=bins)

#x_train, y_train = delete_price_outliers(x_train, y_train)

reg = LinearRegression()
reg.fit(x_train, y_train)
prediction = reg.predict(x_test)
mse = mean_squared_error(y_test, prediction)
r2 = r2_score(y_test, prediction)
print("MSE:", mse)
print("RMSE:", sqrt(mse))
print("R^2:", r2)

MSE: 12205.405453113695
RMSE: 110.47807679858342
R^2: 0.39620656940976784


### 2.5 Evaluation of Advanced Regression

In [128]:
#1. Ridge Regression
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

rr_est = Ridge()
param = {"alpha": [1e-10, 1e-8, 1e-4,1e-3, 1e-2, 1, 5, 10, 20]}

rr_est_opt = GridSearchCV(rr_est, param, scoring="neg_mean_squared_error", cv=5)
rr_est_opt.fit(x_train, y_train)

# Get best param
print(rr_est_opt.best_params_)
print(rr_est_opt.best_score_)

{'alpha': 20}
-4696.814613726855


In [129]:
# Test with alpha 20
from sklearn.linear_model import ridge
rr = Ridge(alpha = 20)
rr.fit(x_train, y_train)
rr.coef_

#Test and Evaluate
rr_predictions = rr.predict(x_test)

mse = mean_squared_error(y_test, rr_predictions)
r2 = r2_score(y_test, rr_predictions)

print("MSE:", mse)
print("RMSE:", sqrt(mse))
print("R^2:", r2)

MSE: 11276.885237588422
RMSE: 106.19267977402407
R^2: 0.3831739616006318


## 2.6. Evaluation of KNN Regression

In [2]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn import neighbors
import numpy as np
import matplotlib.pyplot as plt
from sklearn import neighbors
from sklearn import metrics
from sklearn.metrics import fbeta_score, make_scorer, mean_squared_error
import itertools
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import plotly.express as px

#### Evaluate development of error term with different values for K

In [None]:
%run modules/evaluation.py

best_nmse = -100000

neg_mse = []
for K in range(30):
    K = K + 1
    knn = neighbors.KNeighborsRegressor(n_neighbors=K)

    # k-fold cross-validation (k = 10)
    scores = []
    k_fold_cross_validation = KFold(10, True, 1)
    for train_index, test_index in k_fold_cross_validation.split(features):

        # KNN Regression using training dataset
        knn = neighbors.KNeighborsRegressor(n_neighbors=K)
        knn.fit(x_train, y_train)

        # Evaluation using testing dataset
        scores.append(cross_val_score(knn, x_train, y_train, scoring='neg_mean_squared_error'))  

    # Calculate performance measures
    print("Negative mean squared error: ", str(np.mean(scores)), "for a K of", K)
    neg_mse.append(np.mean(scores))
    
    # Save best model
    if(np.mean(scores) > best_nmse):
          best_nmse = np.mean(scores)
            
# plotting the rmse values against k values
curve = px.line(data, x="Value of K", y="Negative Mean Squared Error", title='Change of NMSE for different values of K')
curve.show()


print("Best: ", best_nmse)

Negative Mean squared error:  -21477.76092454774 for a K of 1
Negative Mean squared error:  -18172.159012237866 for a K of 2
Negative Mean squared error:  -17441.135494347203 for a K of 3
Negative Mean squared error:  -16850.657433589662 for a K of 4
Negative Mean squared error:  -16791.13280286299 for a K of 5
Negative Mean squared error:  -16765.48719289242 for a K of 6
Negative Mean squared error:  -16734.8804725024 for a K of 7
Negative Mean squared error:  -16628.455845007862 for a K of 8
Negative Mean squared error:  -16626.042864958483 for a K of 9
Negative Mean squared error:  -16627.058175746464 for a K of 10
Negative Mean squared error:  -16611.733558632426 for a K of 11
Negative Mean squared error:  -16613.513410009924 for a K of 12
Negative Mean squared error:  -16660.906663192593 for a K of 13
Negative Mean squared error:  -16697.553606518184 for a K of 14
Negative Mean squared error:  -16750.762711447842 for a K of 15
Negative Mean squared error:  -16744.852812006735 for 

#### GridSearch to evaluate the best parameter values

In [11]:
# create an estimator
knn_estimator = neighbors.KNeighborsRegressor()
parameters = {
    'n_neighbors': range(2, 30),
    'weights': ['uniform', 'distance'],
    'metric': ['manhattan', 'euclidean', 'minkowski', 'chebyshev']
}

# specify the cross validation
k_fold_cross_validation = KFold(10, True, 1)

grid_search_estimator = GridSearchCV(knn_estimator, parameters, cv=k_fold_cross_validation, scoring='neg_mean_squared_error')
grid_search_estimator.fit(x_train,y_train)
grid_search_estimator.best_params_

# evaluate on test set
predictions = grid_search_estimator.predict(x_test)
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

print("Best Model: {}".format(grid_search_estimator.best_params_))
print("RMSE: {}".format(sqrt(mse)))
print("R^2: {}".format(r2))


# print the best parameter setting
print("best score is {} with params {}".format(grid_search_estimator.best_score_, grid_search_estimator.best_params_))

Best Model: {'metric': 'manhattan', 'n_neighbors': 12, 'weights': 'distance'}
RMSE: 130.14655641574794
R^2: 0.16208196988783952
best score is -15456.122970596134 with params {'metric': 'manhattan', 'n_neighbors': 12, 'weights': 'distance'}


#### Tryout different feature combinations

In [None]:
%run modules/evaluation.py

best_r2 = 0
best_feature_combi = 0

# Generate all feature combinations
feature_combinations = generate_feature_combinations(features)

for feature_combination in feature_combinations:
    # Filter the selected features
    
    x_train = x_train[feature_combination]
    y_train = y_train[feature_combination]
            
    # Tryout with manhatten
    knn = KNeighborsRegressor(n_neighbors = 9, metric = 'manhattan', weights= 'uniform')
    knn.fit(x_train, y_train)
    price_predicted = knn.predict(x_test)

    # evaluate using different measures
    mse = mean_squared_error(y_test, price_predicted)
    r2 = r2_score(y_test, price_predicted)
        
    print("Evaluation of feature combination", feature_combination)
    print("MSE:", mse)
    print("RMSE:", sqrt(mse))
    print("R^2:", r2)
        
    if(r2 > best_r2):
        best_r2 = r2
        best_feature_combi = feature_combination
            
    print("Best score so far:", best_r2, "with the features:", best_feature_combi)
        
print("Best score overall: ", best_r2, "with the features:", best_feature_combi)

<a id='interpretation_evaluation'></a>
## 3 Interpretation and Evaluation

In [10]:
# Check correlation of independant variables
corr_matrix = preprocessed_df.corr()
for (column_name, column_data) in corr_matrix.iteritems():
    for row_name, value in column_data.iteritems():
        if(value > 0.6 and column_name != row_name):
            print(column_name, " ", row_name, " ", value)

accommodates   beds   0.7060608155629018
bedrooms   beds   0.6501529074983061
beds   accommodates   0.7060608155629018
beds   bedrooms   0.6501529074983061
require_guest_profile_picture   require_guest_phone_verification   0.7871857495927765
require_guest_phone_verification   require_guest_profile_picture   0.7871857495927765
verification_government_id   verification_jumio   0.9765520138913213
verification_jumio   verification_government_id   0.9765520138913213
verification_offline_government_id   verification_selfie   0.6336336790497336
verification_selfie   verification_offline_government_id   0.6336336790497336
