In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from table_reader import TableReader

from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.multioutput import MultiOutputRegressor

import warnings
# Silences every warning category for the rest of the session. This also hides
# any warnings the estimators emit during the cross-validation cells below, so
# a poor score will not be accompanied by a diagnostic message.
# NOTE(review): consider narrowing this to specific categories — confirm which
# warnings were the original motivation.
warnings.simplefilter('ignore')

In [2]:
# Read the three source tables through a single TableReader and release it
# afterwards.
tr = TableReader()
try:
    # `include_amenitites` is spelled exactly as in the original call;
    # presumably it matches the keyword TableReader declares — verify against
    # table_reader before "correcting" it.
    df = tr.properties_vector(include_amenitites=True)
    geodata = tr.geodata_vector()
    # Keep only the join key and the two review columns used later.
    ratings = tr.reviews_vector()[['listingID','num_reviews', 'rating']]
finally:
    # Close the reader even when one of the reads above raises, so a failed
    # cell does not leave it open in the kernel.
    tr.close()


In [3]:
# Left join on listingID: every row of df is kept, geodata columns are
# attached where a matching listingID exists.
df = df.merge(geodata, how='left', on='listingID')

In [4]:
# Left join on listingID: rows without a match in ratings are kept and get
# missing values in the num_reviews / rating columns.
df = df.merge(ratings, how='left', on='listingID')

In [5]:
# Keep only rows that have a zipcode and more than 10 reviews. Rows whose
# num_reviews is missing fail the comparison and are dropped as well.
df = (
    df
    .dropna(subset=['zipcode'])
    .loc[lambda frame: frame['num_reviews'] > 10]
)

In [6]:
# Predictors for the price models: every column except the target itself and
# the listing identifier.
features = df.drop(columns=['price', 'listingID'])
# Target: the price column as a Series.
label = df.loc[:, 'price']

## Linear, targeting price

In [7]:
# Display name -> unfitted regressor. The cross-validation cells iterate over
# this mapping, so the insertion order is the order results are printed in.
# Every estimator with a random component is seeded so that re-running the
# notebook reproduces the same scores.
SINGLE_OUT_ESTIMATORS = {
    # NOTE(review): max_features=32 assumes the feature matrix has at least
    # 32 columns — TODO confirm against the loaded data.
    "Extra trees": ExtraTreesRegressor(n_estimators=10,
                                       max_features=32,
                                       random_state=0),
    "K-nn": KNeighborsRegressor(),
    "Linear regression": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "ElasticNet": ElasticNet(random_state=0),
    "RandomForestRegressor": RandomForestRegressor(max_depth=4, random_state=2),
    # Seeded like the other tree-based models: a decision tree permutes the
    # features at each split, so equally good splits are otherwise resolved
    # differently from run to run.
    "Decision Tree Regressor": DecisionTreeRegressor(max_depth=5, random_state=0)
}

In [8]:
# 10-fold cross-validation of each estimator on the price target; prints the
# mean of the per-fold scores.
for name, estimator in SINGLE_OUT_ESTIMATORS.items():
    fold_scores = cross_val_score(estimator, features, label, cv=10)
    print(f"Average R^2 for {name}: {fold_scores.mean()!s}")

Average R^2 for Extra trees: 0.5786629423352914
Average R^2 for K-nn: 0.36925311377681214
Average R^2 for Linear regression: 0.6195716554273784
Average R^2 for Ridge: 0.6171356225843643
Average R^2 for Lasso: 0.5385026751640745
Average R^2 for ElasticNet: 0.47995124329128513
Average R^2 for RandomForestRegressor: 0.5456357042501481
Average R^2 for Decision Tree Regressor: 0.5211697604584234


## Linear, targeting rating

In [9]:
# Predictors for the rating models: drop the identifier, the target, and
# num_reviews. Note that price stays in as a predictor here.
rating_features = df.drop(columns=['listingID', 'num_reviews', 'rating'])
# Use a 1-D Series as the target, matching how the price label is built.
# A single-column DataFrame is a column vector, which single-output
# estimators have to flatten (emitting a DataConversionWarning that the
# global warnings filter hides).
rating_label = df['rating']

In [10]:
# Same estimators as before, now scored with 10-fold cross-validation on the
# rating target.
for name in SINGLE_OUT_ESTIMATORS:
    estimator = SINGLE_OUT_ESTIMATORS[name]
    fold_scores = cross_val_score(estimator, rating_features, rating_label, cv=10)
    mean_score = fold_scores.mean()
    print("Average R^2 for {}: {}".format(name, str(mean_score)))

Average R^2 for Extra trees: -0.0317766806537122
Average R^2 for K-nn: -0.10344004014442684
Average R^2 for Linear regression: 0.10204533947076636
Average R^2 for Ridge: 0.10257308091539222
Average R^2 for Lasso: -0.024250369984792776
Average R^2 for ElasticNet: -0.006897679777952803
Average R^2 for RandomForestRegressor: 0.052398073664614374
Average R^2 for Decision Tree Regressor: -0.011633697482042338


## Polynomial, targeting price

In [11]:
def build_poly(model, degree=1):
    """Wrap ``model`` in a pipeline that standardizes its inputs first.

    Parameters
    ----------
    model : estimator
        Final regression step of the pipeline (step name ``"reg"``).
    degree : int, default 1
        Passed to ``PolynomialFeatures``. When equal to 1 the polynomial
        expansion step is left out entirely.

    Returns
    -------
    Pipeline
        ``std -> reg`` when ``degree == 1``, otherwise ``std -> poly -> reg``.
    """
    steps = [("std", StandardScaler())]
    if degree != 1:
        # Expansion runs on the already standardized features.
        steps.append(("poly", PolynomialFeatures(degree)))
    steps.append(("reg", model))
    return Pipeline(steps)

In [14]:
# 10-fold cross-validation on price, with each estimator wrapped in the
# standardize + degree-2 polynomial pipeline from build_poly.
for name, estimator in SINGLE_OUT_ESTIMATORS.items():
    poly_model = build_poly(estimator, degree=2)
    fold_scores = cross_val_score(poly_model, features, label, cv=10)
    print(f"Average R^2 for {name}: {fold_scores.mean()!s}")

Average R^2 for Extra trees: 0.5815050736097197
Average R^2 for K-nn: 0.4480275622390442
Average R^2 for Linear regression: -8406403964423514.0
Average R^2 for Ridge: -6.757101976046664
Average R^2 for Lasso: 0.6360945654824148
Average R^2 for ElasticNet: 0.6247501864948364
Average R^2 for RandomForestRegressor: 0.5538917263131917
Average R^2 for Decision Tree Regressor: 0.5130065732570964


## Multiple output, targeting price and rating

In [15]:
# All single-output estimators plus two boosting regressors wrapped in
# MultiOutputRegressor. The boosters are seeded so the scores are
# reproducible across runs, consistent with the seeded estimators in
# SINGLE_OUT_ESTIMATORS.
MULTI_OUT_ESTIMATORS = {
    **SINGLE_OUT_ESTIMATORS,
    "MultiO/P GBR": MultiOutputRegressor(
        GradientBoostingRegressor(n_estimators=5, random_state=0)
    ),
    "MultiO/P AdaB": MultiOutputRegressor(
        AdaBoostRegressor(n_estimators=5, random_state=0)
    ),
}

In [16]:
# Predictors for the joint model: drop the identifier, num_reviews and both
# target columns.
multi_features = df.drop(columns=['price', 'listingID', 'num_reviews', 'rating'])
# Two-column target frame: price and rating.
multi_labels = df.loc[:, ['price', 'rating']]

In [17]:
# 10-fold cross-validation of every estimator against the two-column
# (price, rating) target; prints the mean of the per-fold scores.
for name, estimator in MULTI_OUT_ESTIMATORS.items():
    fold_scores = cross_val_score(estimator, multi_features, multi_labels, cv=10)
    print("Average R^2 for %s: %s" % (name, str(fold_scores.mean())))

Average R^2 for Extra trees: 0.5654554671217886
Average R^2 for K-nn: 0.5351820518347556
Average R^2 for Linear regression: 0.6083586325752434
Average R^2 for Ridge: 0.6057717503105784
Average R^2 for Lasso: 0.5257313338192381
Average R^2 for ElasticNet: 0.4642904503834554
Average R^2 for RandomForestRegressor: 0.5436730419258623
Average R^2 for Decision Tree Regressor: 0.5259789700316642
Average R^2 for MultiO/P GBR: 0.1635027603470701
Average R^2 for MultiO/P AdaB: 0.24160298378944028
