# Predicting Airbnb Prices for Munich

The goal of our data mining project is to predict prices for new Airbnb listings in Munich. To achieve this, we will train a regression model on existing Airbnb data from www.insideairbnb.com.

## Table of Contents
##### [1 Preprocessing](#preprocessing)
##### [2 Data Mining](#data_mining)
##### [3 Interpretation and Evaluation](#interpretation_evaluation)

<a id='preprocessing'></a>
## 1 Preprocessing

In [None]:
%run modules/preprocessing.py
# Load the dataset through the helper that the %run above is expected to
# bring into the notebook namespace.
df = load_and_preprocess_dataset()

# Separate the predictors from the target column. The column is named via
# the `columns` keyword because passing the axis positionally
# (`df.drop('max_price', 1)`) is no longer accepted by pandas >= 2.0.
features = df.drop(columns='max_price')

# Rescale the features with MinMaxScaler. fit_transform returns a bare array,
# so the column names are restored explicitly.
# NOTE(review): the resulting frame gets a fresh 0..n-1 index, while `label`
# keeps the index of `df` - confirm both are aligned if `df` was filtered.
normalized_features = pd.DataFrame(preprocessing.MinMaxScaler().fit_transform(features), columns=features.columns)

# Regression target
label = df['max_price']

<a id='data_mining'></a>
## 2 Data Mining

### 2.1 Evaluation of Different Regression Approaches

In [None]:
from sklearn.linear_model import Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFECV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import f_regression
from sklearn.svm import SVR

# Candidate regressors, each one tried with its default hyperparameters
estimators = [
    LinearRegression(),
    Ridge(),
    KNeighborsRegressor(),
    DecisionTreeRegressor(),
    MLPRegressor(),
    SVR(),
]

# Two-step pipeline: standardize the features, then fit the regressor.
# The 'estimator' step starts out empty and is filled in by the grid search.
pipeline = Pipeline([
    ('preprocessing', StandardScaler()),
    ('estimator', None),
])

# The only dimension of the grid is the choice of regressor
parameters = {'estimator': estimators}

# Keep 20% of the samples aside for the final evaluation (fixed seed)
x_train, x_test, y_train, y_test = train_test_split(
    x_select,
    y,
    test_size=0.2,
    random_state=0,
)

# 10-fold cross-validated search on the training part, scored by negated MSE
search = GridSearchCV(
    pipeline,
    parameters,
    cv=10,
    scoring='neg_mean_squared_error',
)
search.fit(x_train, y_train)

# Score the selected pipeline on the held-out samples
predictions = search.predict(x_test)
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

# Report the winning estimator together with its test-set metrics
print("Best Model: {}".format(search.best_params_))
print("RMSE: {}".format(sqrt(mse)))
print("R^2: {}".format(r2))

### 2.2 Evaluation of the Support Vector Machine

In [2]:
%run modules/evaluation.py

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR

# Exhaustive search over feature combinations: each combination is scored by
# the mean of SVR.score over a 10-fold cross-validation, and the best mean
# together with its feature combination is reported at the end.

# Start below any attainable score so that the best combination is recorded
# even when every mean score is negative.
best_r2 = float('-inf')
best_features = None

# Generate all feature combinations
feature_combinations = generate_feature_combinations(x)

# KFold.split yields *positional* indices, so the target is accessed as a
# plain array instead of relying on the labels of a pandas index.
y_values = np.asarray(y)

# k-fold cross-validation (k = 10), shuffled with a fixed seed. The arguments
# are passed by keyword because current scikit-learn releases reject
# positional `shuffle` / `random_state`. With a fixed integer seed every call
# to split() produces the same folds, so one splitter serves all combinations.
k_fold_cross_validation = KFold(n_splits=10, shuffle=True, random_state=1)

for feature_combination in feature_combinations:

    # Filter the selected features
    selected_features = x[feature_combination]

    scores = []
    for train_index, test_index in k_fold_cross_validation.split(selected_features):

        # Split the dataset for training and testing (positional indexing)
        x_train, x_test = selected_features.iloc[train_index], selected_features.iloc[test_index]
        y_train, y_test = y_values[train_index], y_values[test_index]

        # Support Vector Regressor (SVR) using training dataset
        svr = SVR(kernel='linear', C=0.7)
        svr.fit(x_train, y_train)

        # Evaluation using testing dataset
        scores.append(svr.score(x_test, y_test))

    # Calculate performance measures (mean score over the folds)
    mean_r2 = np.mean(scores)
    print(mean_r2, " - ", feature_combination)

    # Remember the best score and the features that produced it
    if mean_r2 > best_r2:
        best_r2 = mean_r2
        best_features = feature_combination


print("Best: ", best_r2)
print("Best features: ", best_features)

MAE= 49.54 , RMSE= 74.42 , R2= 0.001  -  ['room_type']
MAE= 41.04 , RMSE= 61.68 , R2= 0.314  -  ['accommodates']
MAE= 49.38 , RMSE= 72.06 , R2= 0.063  -  ['bedrooms']
MAE= 46.81 , RMSE= 68.37 , R2= 0.157  -  ['beds']
MAE= 50.61 , RMSE= 73.32 , R2= 0.03  -  ['max_people']
MAE= 51.75 , RMSE= 76.74 , R2= -0.062  -  ['distance_centre']
MAE= 40.82 , RMSE= 61.31 , R2= 0.322  -  ['room_type', 'accommodates']
MAE= 46.61 , RMSE= 69.5 , R2= 0.129  -  ['room_type', 'bedrooms']
MAE= 45.43 , RMSE= 67.05 , R2= 0.189  -  ['room_type', 'beds']
MAE= 48.39 , RMSE= 71.1 , R2= 0.088  -  ['room_type', 'max_people']
MAE= 49.3 , RMSE= 73.88 , R2= 0.015  -  ['room_type', 'distance_centre']
MAE= 40.96 , RMSE= 61.62 , R2= 0.315  -  ['accommodates', 'bedrooms']
MAE= 41.02 , RMSE= 61.67 , R2= 0.314  -  ['accommodates', 'beds']
MAE= 41.13 , RMSE= 61.73 , R2= 0.313  -  ['accommodates', 'max_people']
MAE= 40.51 , RMSE= 61.11 , R2= 0.326  -  ['accommodates', 'distance_centre']
MAE= 46.17 , RMSE= 67.78 , R2= 0.171  - 

<a id='interpretation_evaluation'></a>
## 3 Interpretation and Evaluation