In [3]:
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
%matplotlib inline
import statsmodels.formula.api as sm
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold
from pygam import LinearGAM, s, f
from sklearn.preprocessing import LabelEncoder

# Seaborn visualization library
import seaborn as sns

In [4]:
#load training data
listing_train = pd.read_csv("../data/listings_train.csv").dropna()

#load test data
listing_test = pd.read_csv("../data/listings_test.csv").dropna()

listing_train.head()

Unnamed: 0,host_total_listings_count,room_type,latitude,longitude,bathrooms,bedrooms,beds,price,security_deposit,cleaning_fee,availability_365,number_of_reviews
0,1,Private room,42.347956,-71.155196,1.0,1,1,52,1,65,365,26
1,85,Entire home/apt,42.349299,-71.08347,1.0,0,1,110,1,104,107,38
2,6,Entire home/apt,42.341902,-71.073792,1.0,1,1,67,45,56,322,9
3,1,Entire home/apt,42.319235,-71.105016,2.0,2,2,103,8,113,341,49
4,1,Entire home/apt,42.346452,-71.134896,1.0,0,1,8,24,82,41,13


In [5]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [6]:
train_list = listing_train.drop(labels=['room_type'], axis=1)
test_list = listing_test.drop(labels=['room_type'], axis=1)

room_type = OneHotEncoder(drop='first', sparse=False).fit(listing_train.room_type.to_numpy().reshape(-1,1))

room_type_train = pd.DataFrame(room_type.transform(listing_train.room_type.to_numpy().reshape(-1,1)))
room_type_test = pd.DataFrame(room_type.transform(listing_test.room_type.to_numpy().reshape(-1,1)))
room_type_train.columns = 'room_type' + room_type_train.columns.astype(str)
room_type_test.columns = 'room_type' + room_type_test.columns.astype(str)

room_type_train.head()

Unnamed: 0,room_type0,room_type1
0,1.0,0.0
1,0.0,0.0
2,0.0,0.0
3,0.0,0.0
4,0.0,0.0


In [7]:
train_lin = pd.concat([room_type_train, train_list], axis=1)
test_lin = pd.concat([room_type_test, test_list], axis=1)

train_lin.head()

Unnamed: 0,room_type0,room_type1,host_total_listings_count,latitude,longitude,bathrooms,bedrooms,beds,price,security_deposit,cleaning_fee,availability_365,number_of_reviews
0,1.0,0.0,1,42.347956,-71.155196,1.0,1,1,52,1,65,365,26
1,0.0,0.0,85,42.349299,-71.08347,1.0,0,1,110,1,104,107,38
2,0.0,0.0,6,42.341902,-71.073792,1.0,1,1,67,45,56,322,9
3,0.0,0.0,1,42.319235,-71.105016,2.0,2,2,103,8,113,341,49
4,0.0,0.0,1,42.346452,-71.134896,1.0,0,1,8,24,82,41,13


In [8]:
terms = [x for x in train_lin.columns if x != 'price']
terms

['room_type0',
 'room_type1',
 'host_total_listings_count',
 'latitude',
 'longitude',
 'bathrooms',
 'bedrooms',
 'beds',
 'security_deposit',
 'cleaning_fee',
 'availability_365',
 'number_of_reviews']

In [9]:
formula = "+".join(terms)
formula = 'price ~ {}'.format(formula)

reg = sm.ols(formula=formula, data=train_lin).fit()

r_score_train = r2_score(train_lin.price, reg.predict(train_lin))
r_score_test = r2_score(test_lin.price, reg.predict(test_lin))

print('\n###########################################################\n')
print("Linear Regression - Using One Hot Encoding")
print('Training R-Squared = {:0.5f}'.format(r_score_train))
print('Testing R-Squared = {:0.5f}'.format(r_score_test))
print('\n###########################################################\n')
print(reg.summary())



###########################################################

Linear Regression - Using One Hot Encoding
Training R-Squared = 0.24942
Testing R-Squared = 0.18479

###########################################################

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.249
Model:                            OLS   Adj. R-squared:                  0.247
Method:                 Least Squares   F-statistic:                     120.7
Date:                Wed, 06 May 2020   Prob (F-statistic):          1.45e-260
Time:                        14:24:24   Log-Likelihood:                -26307.
No. Observations:                4370   AIC:                         5.264e+04
Df Residuals:                    4357   BIC:                         5.272e+04
Df Model:                          12                                         
Covariance Type:            nonrobust                            

In [10]:
poly = PolynomialFeatures(degree=3, include_bias=False)
train_poly = pd.DataFrame(poly.fit_transform(train_list.drop(labels=['price'], axis=1)))
test_poly = pd.DataFrame(poly.transform(test_list.drop(labels=['price'], axis=1)))
train_poly.columns = 'poly' + train_poly.columns.astype(str)
test_poly.columns = 'poly' + test_poly.columns.astype(str)
train_poly['price'] = train_list.price
test_poly['price'] = test_list.price

train_poly.head()

Unnamed: 0,poly0,poly1,poly2,poly3,poly4,poly5,poly6,poly7,poly8,poly9,...,poly276,poly277,poly278,poly279,poly280,poly281,poly282,poly283,poly284,price
0,1.0,42.347956,-71.155196,1.0,1.0,1.0,1.0,65.0,365.0,26.0,...,1542125.0,109850.0,8659625.0,616850.0,43940.0,48627125.0,3463850.0,246740.0,17576.0,52
1,85.0,42.349299,-71.08347,1.0,0.0,1.0,1.0,104.0,107.0,38.0,...,1157312.0,411008.0,1190696.0,422864.0,150176.0,1225043.0,435062.0,154508.0,54872.0,110
2,6.0,42.341902,-71.073792,1.0,1.0,1.0,45.0,56.0,322.0,9.0,...,1009792.0,28224.0,5806304.0,162288.0,4536.0,33386248.0,933156.0,26082.0,729.0,67
3,1.0,42.319235,-71.105016,2.0,2.0,2.0,8.0,113.0,341.0,49.0,...,4354229.0,625681.0,13139753.0,1888117.0,271313.0,39651821.0,5697769.0,818741.0,117649.0,103
4,1.0,42.346452,-71.134896,1.0,0.0,1.0,24.0,82.0,41.0,13.0,...,275684.0,87412.0,137842.0,43706.0,13858.0,68921.0,21853.0,6929.0,2197.0,8


In [11]:
terms = [x for x in train_poly.columns if x != 'price']
formula = "+".join(terms)
formula = 'price ~ {}'.format(formula)

poly_reg = sm.ols(formula=formula, data=train_poly).fit()

r_score_train = r2_score(train_poly.price, poly_reg.predict(train_poly))
r_score_test = r2_score(test_poly.price, poly_reg.predict(test_poly))

print('\n###########################################################\n')
print("Polynomial Regression - Using Label Encoding")
print('Training R-Squared = {:0.5f}'.format(r_score_train))
print('Testing R-Squared = {:0.5f}'.format(r_score_test))
print('\n###########################################################\n')
print(poly_reg.summary())


###########################################################

Polynomial Regression - Using Label Encoding
Training R-Squared = 0.28444
Testing R-Squared = 0.11183

###########################################################

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.284
Model:                            OLS   Adj. R-squared:                  0.237
Method:                 Least Squares   F-statistic:                     6.011
Date:                Wed, 06 May 2020   Prob (F-statistic):          5.46e-154
Time:                        14:24:27   Log-Likelihood:                -26203.
No. Observations:                4370   AIC:                         5.295e+04
Df Residuals:                    4098   BIC:                         5.469e+04
Df Model:                         271                                         
Covariance Type:            nonrobust                          

## GAM

In [12]:
from pygam import LinearGAM, s, f
from sklearn.preprocessing import LabelEncoder
#your code here

print('Columns of the Dataset: \n   {}'.format("\n   ".join(train_lin.columns)))


Columns of the Dataset: 
   room_type0
   room_type1
   host_total_listings_count
   latitude
   longitude
   bathrooms
   bedrooms
   beds
   price
   security_deposit
   cleaning_fee
   availability_365
   number_of_reviews


In [13]:
airbnb_train = train_lin
airbnb_test = test_lin

airbnb_train.head()

Unnamed: 0,room_type0,room_type1,host_total_listings_count,latitude,longitude,bathrooms,bedrooms,beds,price,security_deposit,cleaning_fee,availability_365,number_of_reviews
0,1.0,0.0,1,42.347956,-71.155196,1.0,1,1,52,1,65,365,26
1,0.0,0.0,85,42.349299,-71.08347,1.0,0,1,110,1,104,107,38
2,0.0,0.0,6,42.341902,-71.073792,1.0,1,1,67,45,56,322,9
3,0.0,0.0,1,42.319235,-71.105016,2.0,2,2,103,8,113,341,49
4,0.0,0.0,1,42.346452,-71.134896,1.0,0,1,8,24,82,41,13


In [14]:
airbnb_train.columns

Index(['room_type0', 'room_type1', 'host_total_listings_count', 'latitude',
       'longitude', 'bathrooms', 'bedrooms', 'beds', 'price',
       'security_deposit', 'cleaning_fee', 'availability_365',
       'number_of_reviews'],
      dtype='object')

In [15]:
smoothings = np.logspace(-3, 6, 9)
cv = 10

kf = KFold(n_splits=cv, random_state=42, shuffle=True)
scores = np.zeros((cv,len(smoothings)))

print(smoothings,'\n')
print(scores.shape, '\n')
print(type(kf))

[1.00000000e-03 1.33352143e-02 1.77827941e-01 2.37137371e+00
 3.16227766e+01 4.21696503e+02 5.62341325e+03 7.49894209e+04
 1.00000000e+06] 

(10, 9) 

<class 'sklearn.model_selection._split.KFold'>


In [16]:
for i, (train_index, test_index) in enumerate(kf.split(airbnb_train)):
    train_df = airbnb_train.iloc[train_index,:]
    test_df = airbnb_train.iloc[test_index,:]
    
    for j, smoothing in enumerate(smoothings):
        cur_model = LinearGAM(f(0) + s(1) + s(2) + s(3) + s(4) + s(5) + s(6) + s(7) + s(8) + s(9) + s(10) + s(11), lam = smoothing).fit(train_df, train_df.price)
        scores[i,j] = r2_score(test_df.price, cur_model.predict(test_df))
                              
best_lambda = smoothings[np.argmax(np.mean(scores, axis = 0))]
best_lambda

0.01333521432163324

In [17]:
airbnb_gam = LinearGAM(f(0) + s(1) + s(2) + s(3) + s(4) + s(5) + s(6) + s(8) + s(9) + s(10) + s(11),
                       lam=best_lambda).fit(airbnb_train, airbnb_train.price)
r_score_train = r2_score(airbnb_train.price, airbnb_gam.predict(airbnb_train))
r_score_test = r2_score(airbnb_test.price, airbnb_gam.predict(airbnb_test))


print('\n###########################################################\n')
print('Lambda = {:0.3f}'.format(best_lambda))
print('Training R-Squared = {:0.5f}'.format(r_score_train))
print('Testing R-Squared = {:0.5f}'.format(r_score_test))
print('\n###########################################################\n')
print(airbnb_gam.summary())


###########################################################

Lambda = 0.013
Training R-Squared = 1.00000
Testing R-Squared = 1.00000

###########################################################

LinearGAM                                                                                                 
Distribution:                        NormalDist Effective DoF:                                    138.9043
Link Function:                     IdentityLink Log Likelihood:                    -1.2257104114387733e+17
Number of Samples:                         4370 AIC:                                 2.451420822877549e+17
                                                AICc:                                2.451420822877549e+17
                                                GCV:                                                   0.0
                                                Scale:                                                 0.0
                                                Pseudo 

 
Please do not make inferences based on these values! 

Collaborate on a solution, and stay up to date at: 
github.com/dswah/pyGAM/issues/163 

  if sys.path[0] == '':
