# Predicting Airbnb Prices for Munich

The goal of our data mining project is to predict prices for new Airbnb listings in Munich. To achieve this, we will train a regression model on existing Airbnb data from www.insideairbnb.com.

## Table of Contents
##### [1 Preprocessing](#preprocessing)
##### [2 Data Mining](#data_mining)
##### [3 Interpretation and Evaluation](#interpretation_evaluation)

<a id='preprocessing'></a>
## 1 Preprocessing

In [87]:
# Execute the project's helper scripts so that their definitions become
# available in the notebook namespace.
%run modules/preprocessing.py
%run modules/evaluation.py
# Build the preprocessed listings DataFrame that the later cells work on.
# NOTE(review): load_and_preprocess_dataset is presumably defined in
# modules/preprocessing.py — confirm.
preprocessed_df = load_and_preprocess_dataset()

2019-11-21 21:28:23 : Dataset loaded and preprocessed.


In [78]:
# Sanity check: list the distinct values left in the preprocessed
# 'minimum_nights' column.
print(preprocessed_df.loc[:, 'minimum_nights'].unique())

[0 1 2]


In [86]:
# Report every pair of distinct columns whose correlation coefficient is
# above 0.6 (positive correlations only). The matrix is symmetric, so each
# pair is printed twice, once per ordering.
corr_matrix = preprocessed_df.corr()
# .items() replaces .iteritems(), which was deprecated in pandas 1.5 and
# removed in pandas 2.0; both yield (label, Series) / (label, value) pairs.
for column_name, column_data in corr_matrix.items():
    for row_name, value in column_data.items():
        if value > 0.6 and column_name != row_name:
            print(column_name, " ", row_name, " ", value)

accommodates   beds   0.7060608155629018
bedrooms   beds   0.6501529074983061
beds   accommodates   0.7060608155629018
beds   bedrooms   0.6501529074983061
require_guest_profile_picture   require_guest_phone_verification   0.7871857495927765
require_guest_phone_verification   require_guest_profile_picture   0.7871857495927765
verification_government_id   verification_jumio   0.9765520138913213
verification_jumio   verification_government_id   0.9765520138913213
verification_offline_government_id   verification_selfie   0.6336336790497336
verification_selfie   verification_offline_government_id   0.6336336790497336


In [88]:
%run modules/preprocessing.py
features = select_best_features(preprocessed_df, number_of_features = 47) # Total number of features: 47
label = preprocessed_df['maximum_price']

ValueError: k should be >=0, <= n_features = 45; got 47. Use k='all' to return all features.

In [None]:
# Optional: to remove price outliers, run the following after the train-test split (it filters the training data only):

# %run modules/preprocessing.py
# x_train, y_train = delete_price_outliers(x_train, y_train)

<a id='data_mining'></a>
## 2 Data Mining

### 2.1 Baseline Prediction

In [4]:
# Reload the evaluation helpers so the latest definitions are in scope.
%run modules/evaluation.py
# Compute the reference score the regression models should beat; the
# recorded output labels it "Performance of Dummy Regressor".
baseline_prediction(features, label)

Performance of Dummy Regressor: 143.85943496805479


### 2.2 Evaluation of different Regression Approaches

In [31]:
from sklearn.linear_model import Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFECV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import f_regression
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import itertools
from math import sqrt

# Candidate regressors. The grid search plugs each of them into the
# pipeline's 'estimator' step, after the inputs have been standardised.
estimators = [
    LinearRegression(),
    Ridge(),
    KNeighborsRegressor(),
    DecisionTreeRegressor(),
    MLPRegressor(),
    SVR(),
]
pipeline = Pipeline(steps=[('preprocessing', StandardScaler()), ('estimator', None)])
parameters = {'estimator': estimators}
search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    cv=10,
    scoring='neg_mean_squared_error',
)

# Repeat the model comparison while requesting 1 to 14 selected features.
for i in range(1, 15):
    features = select_best_features(preprocessed_df, number_of_features=i)
    x_train, x_test, y_train, y_test = train_test_split(
        features, label, test_size=0.2, random_state=0
    )
    search.fit(x_train, y_train)

    # Evaluate the winning pipeline on the held-out 20 % of the data.
    predictions = search.predict(x_test)
    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)

    print("Best Model: {}".format(search.best_params_))
    print("RMSE: {}".format(sqrt(mse)))
    print("R^2: {}".format(r2))

Selected Features:  16    maximum_nights
Name: Specs, dtype: object
Best Model: {'estimator': MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=(100,), learning_rate='constant',
             learning_rate_init=0.001, max_iter=200, momentum=0.9,
             n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
             random_state=None, shuffle=True, solver='adam', tol=0.0001,
             validation_fraction=0.1, verbose=False, warm_start=False)}
RMSE: 133.0154338203911
R^2: -0.006317069604503622
Selected Features:  16      maximum_nights
13    security_deposit
Name: Specs, dtype: object
Best Model: {'estimator': MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=(100,), learning_rate='constant',
             learning_rate_init=0.001,

In [16]:
from sklearn.linear_model import Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFECV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import f_regression
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import itertools
from math import sqrt

# Hold out 20 % of the data for the final evaluation.
x_train, x_test, y_train, y_test = train_test_split(features, label, test_size = 0.2, random_state = 0)

# Remove price outliers from the training data only; the test set stays untouched.
x_train, y_train = delete_price_outliers(x_train, y_train)

# Regression approaches to compare. Each one is plugged into the
# 'estimator' step of the pipeline, after the inputs have been standardised.
# (The unused `svr = [ SVR() ]` list of the original cell was dropped.)
estimators = [ LinearRegression(), Ridge(), KNeighborsRegressor(), DecisionTreeRegressor(), MLPRegressor(), SVR() ]
pipeline = Pipeline( [ ('preprocessing', StandardScaler()), ('estimator', None) ])

# Parameter grid: the only parameter varied is the estimator itself.
parameters = {
    'estimator': estimators
}

# Run a 10-fold cross-validated grid search using (negated) MSE as the scoring metric.
search = GridSearchCV(pipeline, parameters, cv=10, scoring='neg_mean_squared_error')
search.fit(x_train, y_train)

# Evaluate the best pipeline on the held-out test set.
predictions = search.predict(x_test)
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

print("Best Model: {}".format(search.best_params_))
print("RMSE: {}".format(sqrt(mse)))
print("R^2: {}".format(r2))

Best Model: {'estimator': MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=(100,), learning_rate='constant',
             learning_rate_init=0.001, max_iter=200, momentum=0.9,
             n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
             random_state=None, shuffle=True, solver='adam', tol=0.0001,
             validation_fraction=0.1, verbose=False, warm_start=False)}
RMSE: 142.5537543342086
R^2: 0.2519363931631057


### 2.3 Evaluation of the Support Vector Machine

In [89]:
from sklearn.linear_model import Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFECV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import f_regression
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import itertools
from math import sqrt

%run modules/preprocessing.py

x_train, x_test, y_train, y_test = train_test_split(preprocessed_df.drop(['maximum_price'], 1), label, test_size = 0.2, random_state = 0)
tmp = select_best_features_f(x_train, y_train, 0.05)

                               feature      F value  p_value
21                            internet     0.075386   0.7837
20    require_guest_phone_verification     0.124135   0.7246
19       require_guest_profile_picture     0.371246   0.5423
27                           breakfast     0.801910   0.3705
35                  verification_phone     0.901576   0.3424
3               host_identity_verified     1.099106   0.2945
2                 host_has_profile_pic     1.157993   0.2819
29                    24-hour check-in     1.366989   0.2424
32                     smoking allowed     1.476614   0.2243
25                              washer     1.494722   0.2215
38                  verification_jumio     1.638808   0.2005
34                  verification_email     1.851011   0.1737
33                host_lives_in_munich     2.178843   0.1400
43                 verification_google     2.440783   0.1183
37          verification_government_id     2.809352   0.0938
42             verificat

In [50]:
%run modules/evaluation.py
%run modules/preprocessing.py

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR

# 10-fold cross-validation of an RBF-kernel SVR, reporting the mean RMSE
# over the folds.
# NOTE(review): relies on `tmp_features`, `label`, `np`, `sqrt` and
# `mean_squared_error` being defined by previously executed cells/modules.
# (The unused `best_r2` variable of the original cell was dropped.)
scores = []
# Keyword arguments: `shuffle` and `random_state` are keyword-only in
# current scikit-learn releases.
k_fold_cross_validation = KFold(n_splits=10, shuffle=True, random_state=1)
for train_index, test_index in k_fold_cross_validation.split(tmp_features):

    # KFold yields positional indices, so rows must be selected with .iloc;
    # label-based selection only works when the index is exactly 0..n-1.
    x_train, x_test = tmp_features.iloc[train_index], tmp_features.iloc[test_index]
    y_train, y_test = label.iloc[train_index], label.iloc[test_index]

    # Support Vector Regressor (SVR) fitted on the training folds.
    # gamma='auto' (1 / n_features) is what the removed 'auto_deprecated'
    # placeholder resolved to, so the model is unchanged.
    svr = SVR(C=0.7, cache_size=200, coef0=0.0, degree=3, epsilon=0.7,
              gamma='auto', kernel='rbf', max_iter=-1, shrinking=True,
              tol=0.001, verbose=False)
    svr.fit(x_train, y_train)

    # Evaluation on the held-out fold.
    predictions = svr.predict(x_test)
    scores.append(sqrt(mean_squared_error(y_test, predictions)))

# Mean RMSE over all folds.
print("rmse: ", str(np.mean(scores)))

rmse:  128.45771016278348


In [None]:
from sklearn.linear_model import Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFECV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import f_regression
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import itertools
from math import sqrt

# Regressors under comparison; GridSearchCV substitutes each one for the
# placeholder 'estimator' step that follows the StandardScaler.
estimators = [
    LinearRegression(),
    Ridge(),
    KNeighborsRegressor(),
    DecisionTreeRegressor(),
    MLPRegressor(),
    SVR(),
]
pipeline = Pipeline(steps=[('preprocessing', StandardScaler()), ('estimator', None)])
parameters = {'estimator': estimators}
search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    cv=10,
    scoring='neg_mean_squared_error',
)

# One grid search per requested feature count (1 to 14).
for i in range(1, 15):
    features = select_best_features(preprocessed_df, number_of_features=i)
    x_train, x_test, y_train, y_test = train_test_split(
        features, label, test_size=0.2, random_state=0
    )
    search.fit(x_train, y_train)

    # Test-set performance of the best pipeline found for this feature count.
    predictions = search.predict(x_test)
    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)

    print("Best Model: {}".format(search.best_params_))
    print("RMSE: {}".format(sqrt(mse)))
    print("R^2: {}".format(r2))

In [48]:
%run modules/evaluation.py

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR

# 10-fold cross-validation of a linear-kernel SVR on three hand-picked
# features, reporting the mean R^2 over the folds.
# NOTE(review): the recorded run of this cell ended in a KeyboardInterrupt;
# the features are not scaled here, which may be why fitting was slow —
# confirm before relying on this cell.
# (The unused `best_r2` variable of the original cell was dropped.)
tmp_features = preprocessed_df[['accommodates', 'host_is_superhost', 'distance_centre']]

scores = []
# Keyword arguments: `shuffle` and `random_state` are keyword-only in
# current scikit-learn releases.
k_fold_cross_validation = KFold(n_splits=10, shuffle=True, random_state=1)
for train_index, test_index in k_fold_cross_validation.split(tmp_features):

    # KFold yields positional indices, so rows must be selected with .iloc;
    # label-based selection only works when the index is exactly 0..n-1.
    x_train, x_test = tmp_features.iloc[train_index], tmp_features.iloc[test_index]
    y_train, y_test = label.iloc[train_index], label.iloc[test_index]

    # Support Vector Regressor (SVR) fitted on the training folds.
    svr = SVR(kernel='linear', C=0.7)
    svr.fit(x_train, y_train)

    # SVR.score returns the R^2 on the held-out fold.
    scores.append(svr.score(x_test, y_test))

# Mean R^2 over all folds.
print("r2: ", str(np.mean(scores)))

KeyboardInterrupt: 

In [33]:
%run modules/evaluation.py

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR

# Exhaustive feature-subset search: every feature combination is scored with
# a 10-fold cross-validated linear-kernel SVR, and the combination with the
# highest mean R^2 is remembered.
# NOTE(review): `already_preprocessed` is not defined in any visible cell —
# presumably it is the preprocessed DataFrame (`preprocessed_df`); confirm.

# R^2 can be negative, so start below every possible score; starting at 0
# would never record a best model if all combinations score below zero.
best_r2 = float('-inf')
best_feature_combination = None

# Generate all feature combinations
feature_combinations = generate_feature_combinations(already_preprocessed)

# The splitter is seeded, so it yields the same folds for every combination
# and can be created once. Keyword arguments are required because `shuffle`
# and `random_state` are keyword-only in current scikit-learn releases.
k_fold_cross_validation = KFold(n_splits=10, shuffle=True, random_state=1)

for feature_combination in feature_combinations:

    # Filter the selected features
    selected_features = already_preprocessed[feature_combination]

    # k-fold cross-validation (k = 10)
    scores = []
    for train_index, test_index in k_fold_cross_validation.split(selected_features):

        # KFold yields positional indices, so rows must be selected with
        # .iloc; label-based selection only works when the index is 0..n-1.
        x_train, x_test = selected_features.iloc[train_index], selected_features.iloc[test_index]
        y_train, y_test = label.iloc[train_index], label.iloc[test_index]

        # Support Vector Regressor (SVR) fitted on the training folds
        svr = SVR(kernel='linear', C=0.7)
        svr.fit(x_train, y_train)

        # SVR.score returns the R^2 on the held-out fold
        scores.append(svr.score(x_test, y_test))

    # Mean R^2 of this combination over all folds
    mean_r2 = np.mean(scores)
    print(mean_r2, " - ", feature_combination)

    # Remember the best score together with the combination that produced it
    if mean_r2 > best_r2:
        best_r2 = mean_r2
        best_feature_combination = feature_combination

print("Best: ", best_r2)
print("Best feature combination: ", best_feature_combination)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

### 2.4 Evaluation of Linear Regression

<a id='interpretation_evaluation'></a>
## 3 Interpretation and Evaluation