# Predicting Airbnb Prices for Munich

The goal of our data mining project is to predict prices for new Airbnb listings in Munich. To achieve this, we will train a regression model on existing Airbnb data from www.insideairbnb.com.

## Table of Contents
##### [1 Preprocessing](#preprocessing)
##### [2 Data Mining](#data_mining)
##### [3 Interpretation and Evaluation](#interpretation_evaluation)

<a id='preprocessing'></a>
## 1 Preprocessing

In [1]:
# Execute the preprocessing script inside the notebook's namespace so that the names
# it defines become available to this and the following cells.
# NOTE(review): load_and_preprocess_dataset is presumably defined by that script — confirm.
%run modules/preprocessing.py
# Load the data: `label` is used as the prediction target and `features` as the
# feature table by the cells below.
label, features = load_and_preprocess_dataset()

2019-11-20 00:06:31 : Dataset loaded and preprocessed.


In [2]:
import pandas as pd
# Imported explicitly so the cell does not depend on a `preprocessing` name that
# happens to be left in the namespace by an earlier %run.
from sklearn.preprocessing import MinMaxScaler

pd.set_option('display.max_columns', 27)

# Columns excluded from the modelling table.
# NOTE(review): presumably these still need dedicated preprocessing — confirm.
NOT_YET_PREPROCESSED = [
    'host_since', 'host_location', 'amenities', 'security_deposit', 'cleaning_fee',
    'extra_people', 'guests_included', 'minimum_nights', 'maximum_nights',
    'host_verifications', 'cancellation_policy', 'zipcode', 'is_location_exact',
    'availability_30', 'availability_60', 'availability_90', 'availability_365',
]

# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which pandas
# deprecated in 1.3 and removed in 2.0.
already_preprocessed = features.drop(columns=NOT_YET_PREPROCESSED)

# Min-max scale every remaining column (default range [0, 1]). The original index is
# carried over so the scaled rows stay aligned with `already_preprocessed`.
normalized_features = pd.DataFrame(
    MinMaxScaler().fit_transform(already_preprocessed),
    columns=already_preprocessed.columns,
    index=already_preprocessed.index,
)
print(already_preprocessed)

      host_is_superhost  host_total_listings_count  host_has_profile_pic  \
0                     0                        1.0                     1   
1                     1                        1.0                     1   
2                     0                        3.0                     1   
3                     1                        2.0                     1   
4                     0                        1.0                     1   
5                     1                        1.0                     1   
6                     1                        2.0                     1   
7                     1                        2.0                     1   
8                     1                        1.0                     1   
9                     1                        1.0                     1   
10                    0                        1.0                     1   
11                    0                        2.0                     1   
12          

In [3]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
import pandas as pd

# Score every feature against the target and keep the K_BEST highest-scoring ones.
# NOTE(review): chi2 is documented for classification targets and non-negative
# features; for a continuous price target f_regression may be the better score
# function — TODO evaluate.
K_BEST = 8
selector = SelectKBest(score_func=chi2, k=K_BEST)
fit = selector.fit(already_preprocessed, label)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(already_preprocessed.columns)

# Feature name next to its score, in the original column order.
featureScores = pd.concat([dfcolumns, dfscores], axis=1)
featureScores.columns = ['Specs', 'Score']

# Apply the selection. The previous version indexed with *all* column names and
# therefore returned the unfiltered frame; get_support() is the boolean mask of the
# K_BEST columns the selector actually chose.
best_features = already_preprocessed.loc[:, fit.get_support()]
print(best_features)

      host_is_superhost  host_total_listings_count  host_has_profile_pic  \
0                     0                        1.0                     1   
1                     1                        1.0                     1   
2                     0                        3.0                     1   
3                     1                        2.0                     1   
4                     0                        1.0                     1   
5                     1                        1.0                     1   
6                     1                        2.0                     1   
7                     1                        2.0                     1   
8                     1                        1.0                     1   
9                     1                        1.0                     1   
10                    0                        1.0                     1   
11                    0                        2.0                     1   
12          

<a id='data_mining'></a>
## 2 Data Mining

### 2.1 Evaluation of different Regression Approaches

In [49]:
from sklearn.linear_model import Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFECV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import f_regression
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import itertools
from math import sqrt

# Hold out 20 % of the rows for the final evaluation; the fixed seed makes the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    already_preprocessed, label, test_size=0.2, random_state=0
)

# Candidate regressors. Estimators with internal randomness get a fixed seed so the
# comparison (and the reported best model) is the same on every run.
estimators = [
    LinearRegression(),
    Ridge(),
    KNeighborsRegressor(),
    DecisionTreeRegressor(random_state=0),
    MLPRegressor(random_state=0),
    SVR(),
]

# Standardise the features, then fit whichever estimator the grid search plugs into
# the 'estimator' step.
pipeline = Pipeline([('preprocessing', StandardScaler()), ('estimator', None)])

# The only "hyper-parameter" searched over is the estimator itself.
parameters = {
    'estimator': estimators
}

# 10-fold cross-validated search on the training split, ranked by (negated) MSE.
search = GridSearchCV(pipeline, parameters, cv=10, scoring='neg_mean_squared_error')
search.fit(x_train, y_train)

# Evaluate the refitted best pipeline on the held-out test split.
predictions = search.predict(x_test)
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

print("Best Model: {}".format(search.best_params_))
print("RMSE: {}".format(sqrt(mse)))
print("R^2: {}".format(r2))

Best Model: {'estimator': KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                    weights='uniform')}
RMSE: 75.8739390296274
R^2: 0.2601077836400705


### 2.2 Evaluation of the Support Vector Machine

In [4]:
%run modules/evaluation.py

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR

# Imported explicitly; the cell previously relied on `np` already being present in
# the notebook namespace.
import numpy as np

# KFold.split yields *positional* row numbers. Index the features with .iloc and the
# target as a plain array so the folds are correct even when the DataFrame index is
# not a gap-free 0..n-1 range (label-based .loc would pick the wrong or missing rows).
target = np.asarray(label)

# k-fold cross-validation (k = 10); keyword arguments are required by current
# scikit-learn, where shuffle/random_state are keyword-only.
scores = []
k_fold_cross_validation = KFold(n_splits=10, shuffle=True, random_state=1)
for train_index, test_index in k_fold_cross_validation.split(already_preprocessed):

    # Split the dataset for training and testing
    x_train, x_test = already_preprocessed.iloc[train_index], already_preprocessed.iloc[test_index]
    y_train, y_test = target[train_index], target[test_index]

    # Support Vector Regressor (SVR) using training dataset
    svr = SVR(kernel='linear', C=0.7)
    svr.fit(x_train, y_train)

    # Evaluation using testing dataset (SVR.score returns R^2)
    scores.append(svr.score(x_test, y_test))

# Mean R^2 over the 10 folds
print("r2: ", str(np.mean(scores)))


r2:  0.24676258439810939


In [33]:
%run modules/evaluation.py

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR

# Imported explicitly; the cell previously relied on `np` already being present in
# the notebook namespace.
import numpy as np


def cross_validated_r2(feature_frame, target, n_splits=10, seed=1):
    """Return the mean R^2 of a linear SVR (C=0.7) over a shuffled k-fold split.

    Parameters
    ----------
    feature_frame : pandas.DataFrame
        Feature columns to train on.
    target : array-like
        Target values, one per row of ``feature_frame``.
    n_splits : int
        Number of folds.
    seed : int
        Random state of the shuffled split.
    """
    # KFold.split yields positional row numbers, so use .iloc / plain-array indexing.
    # Label-based .loc can select wrong or missing rows when the index has gaps, which
    # may be what produced the "Input contains NaN" error recorded for this cell.
    target_values = np.asarray(target)
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=seed)

    fold_scores = []
    for train_index, test_index in splitter.split(feature_frame):
        svr = SVR(kernel='linear', C=0.7)
        svr.fit(feature_frame.iloc[train_index], target_values[train_index])
        fold_scores.append(svr.score(feature_frame.iloc[test_index], target_values[test_index]))
    return np.mean(fold_scores)


# R^2 can be negative, so start below every possible score instead of at 0.
best_r2 = -np.inf
best_feature_combination = None

# Generate all feature combinations
feature_combinations = generate_feature_combinations(already_preprocessed)

for feature_combination in feature_combinations:

    # Filter the selected features
    selected_features = already_preprocessed[feature_combination]

    # 10-fold cross-validated R^2 for this feature subset
    mean_r2 = cross_validated_r2(selected_features, label)
    print(mean_r2, " - ", feature_combination)

    # Remember the best score together with the features that produced it
    if mean_r2 > best_r2:
        best_r2 = mean_r2
        best_feature_combination = feature_combination

print("Best: ", best_r2)
print("Best features: ", best_feature_combination)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

<a id='interpretation_evaluation'></a>
## 3 Interpretation and Evaluation