In [52]:
%matplotlib notebook
import os
import json
import time
import pickle
import requests
import pandas as pd
import seaborn as sns
import numpy as np 
import matplotlib.pyplot as plt
from table_reader import TableReader
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, RidgeCV, LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures

Load Data

In [21]:
tr = TableReader()
df = tr.properties_vector(include_amenitites=True)
tr.close()

<_io.TextIOWrapper name='config.json' mode='r' encoding='UTF-8'>


In [22]:
features = df[df.columns.drop(['price', 'listingID'])]
label = df['price']

df.describe()

Index(['listingID', 'accomodates', 'bathrooms', 'bedrooms', 'beds', 'price',
       'Apartment', 'Condominium', 'Guest suite', 'House',
       'Serviced apartment', 'Townhouse', 'Entire home/apt', 'Private room',
       'Shared room', 'Washer', 'Keypad', 'Shampoo', 'Cable TV',
       'Indoor fireplace', 'Host greets you', 'Pets allowed',
       'Pets live on this property', 'Laptop friendly workspace',
       'translation missing: en.hosting_amenity_50',
       'Buzzer/wireless intercom', 'Coffee maker',
       'Extra pillows and blankets', 'Fire extinguisher', 'Wifi', 'TV',
       'translation missing: en.hosting_amenity_49', 'Smoke detector', 'Oven',
       'Free parking on premises', 'Smoking allowed', 'Self check-in',
       'Bathtub', 'Free street parking', 'Private entrance', 'Breakfast',
       'Doorman', 'Other', 'Family/kid friendly', 'Iron', 'Hangers',
       'Private living room', 'Hot water', 'Gym', 'Safety card', 'Kitchen',
       'Internet', 'Air conditioning', 'Dishes an

Unnamed: 0,listingID,accomodates,bathrooms,bedrooms,beds,price,Apartment,Condominium,Guest suite,House,...,Hot tub,Elevator,Hair dryer,Lockbox,Wheelchair accessible,24-hour check-in,Smart lock,Suitable for events,First aid kit,Essentials
count,12517.0,12517.0,12517.0,12517.0,12517.0,12517.0,12517.0,12517.0,12517.0,12517.0,...,12517.0,12517.0,12517.0,12517.0,12517.0,12517.0,12517.0,12517.0,12517.0,12517.0
mean,18473130.0,3.655988,1.352995,1.371842,1.946249,199.795957,0.453543,0.091156,0.043061,0.208916,...,0.030199,0.219382,0.571782,0.068786,0.033474,0.120157,0.010226,0.050971,0.407046,0.914197
std,9250700.0,2.34447,0.822944,0.9572,1.587389,336.427271,0.497857,0.287842,0.203004,0.40655,...,0.171141,0.413844,0.49484,0.253101,0.179879,0.325158,0.10061,0.219947,0.491303,0.280085
min,3344.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,12216660.0,2.0,1.0,1.0,1.0,79.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,18226450.0,3.0,1.0,1.0,1.0,115.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,25972100.0,4.0,1.5,2.0,2.0,195.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
max,34888100.0,17.0,50.0,11.0,50.0,13000.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


Split data to training data and test data

In [51]:
X_train, X_test, y_train, y_test = tts(features, label, test_size=0.2)


Simple Regression with Ordinary Least Squares (OLS)

In [30]:
regr = LinearRegression()
regr.fit(X_train,y_train)
print(mean_squared_error(y_test, regr.predict(X_test)))
print(regr.score(X_test,y_test))

60317.01243172686
0.23320458625117768


Ridge Regression

In [31]:
clf = Ridge(alpha=0.5)
clf.fit(X_train, y_train)
print(mean_squared_error(y_test, clf.predict(X_test)))
print(clf.score(X_test, y_test))

60315.05178702025
0.2332295114473769


Choose alpha for Ridge Regression

In [32]:
import numpy as np

# try 200 different alphas between -10 and -2
n_alphas = 200
alphas = np.logspace(-10, -2, n_alphas)

clf = RidgeCV(alphas=alphas)
clf.fit(X_train, y_train)

#which alpha did it pick?
print(clf.alpha_)

0.01


In [33]:
clf.score(X_test, y_test)

0.2332042723657325

Lasso Regression

In [34]:
clf = Lasso(alpha=0.5)
clf.fit(X_train, y_train)
print(mean_squared_error(y_test, clf.predict(X_test)))
clf.score(X_test, y_test)

60221.50512962824


0.234418747203059

Logistic Regression

In [53]:
model = LogisticRegression().fit(X_train, y_train)
get_internal_params(model)




classes_ [    0    10    13    14    15    16    17    18    19    20    21    22
    23    24    25    26    27    28    29    30    31    32    33    34
    35    36    37    38    39    40    41    42    43    44    45    46
    47    48    49    50    51    52    53    54    55    56    57    58
    59    60    61    62    63    64    65    66    67    68    69    70
    71    72    73    74    75    76    77    78    79    80    81    82
    83    84    85    86    87    88    89    90    91    92    93    94
    95    96    97    98    99   100   101   102   103   104   105   106
   107   108   109   110   111   112   113   114   115   116   117   118
   119   120   121   122   123   124   125   126   127   128   129   130
   131   132   133   134   135   136   137   138   139   140   141   142
   143   144   145   146   147   148   149   150   151   152   153   154
   155   156   157   158   159   160   161   162   163   164   165   166
   167   168   169   170   171   172   173

In [54]:
print(mean_squared_error(y_test, clf.predict(X_test)))
clf.score(X_test, y_test)

93494.24816734677


0.1914133626415957