# DC Housing Random Forest Model

In [101]:
import pandas as pd
import numpy as np
import operator
import geopandas
from geopy.geocoders import Nominatim
from geopandas.tools import geocode
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score, GridSearchCV, cross_validate, train_test_split
from sklearn.ensemble import RandomForestRegressor
from scipy.optimize import minimize_scalar

In [102]:
from sklearn.preprocessing import OneHotEncoder


In [103]:
listings = pd.read_csv("listings-complete.csv", low_memory = False)

In [140]:
col_list = list(listings.columns)
col_list

['zipcode',
 'latitude',
 'longitude',
 'accommodates',
 'bathrooms',
 'bedrooms',
 'beds',
 'amenities',
 'square_feet',
 'price',
 'weekly_price',
 'monthly_price',
 'security_deposit',
 'cleaning_fee',
 'guests_included',
 'extra_people',
 'minimum_nights',
 'maximum_nights',
 'minimum_minimum_nights',
 'maximum_minimum_nights',
 'minimum_maximum_nights',
 'maximum_maximum_nights',
 'minimum_nights_avg_ntm',
 'maximum_nights_avg_ntm',
 'availability_30',
 'availability_60',
 'availability_90',
 'availability_365',
 'is_business_travel_ready',
 'require_guest_profile_picture',
 'require_guest_phone_verification',
 'nearby_crimes',
 'dist_to_closest_metro',
 'nearby_booze',
 'building_permits_nearby',
 'flexible',
 'moderate',
 'strict',
 'strict_14_with_grace_period',
 'super_strict_30',
 'super_strict_60',
 'Entire home/apt',
 'Hotel room',
 'Private room',
 'Shared room',
 'Aparthotel',
 'Apartment',
 'Barn',
 'Bed and breakfast',
 'Boat',
 'Boutique hotel',
 'Bungalow',
 'Camper/R

In [138]:
listings.head(10)

Unnamed: 0,zipcode,latitude,longitude,accommodates,bathrooms,bedrooms,beds,amenities,square_feet,price,...,"Shaw, Logan Circle","Sheridan, Barry Farm, Buena Vista","Southwest Employment Area, Southwest/Waterfront, Fort McNair, Buzzard Point","Spring Valley, Palisades, Wesley Heights, Foxhall Crescent, Foxhall Village, Georgetown Reservoir","Takoma, Brightwood, Manor Park","Twining, Fairlawn, Randle Highlands, Penn Branch, Fort Davis Park, Fort Dupont","Union Station, Stanton Park, Kingman Park","West End, Foggy Bottom, GWU","Woodland/Fort Stanton, Garfield Heights, Knox Hill","Woodridge, Fort Lincoln, Gateway"
0,20001,38.90982,-77.02016,16,3.5,4.0,5.0,"{TV,Internet,Wifi,""Air conditioning"",Kitchen,""...",,433,...,1,0,0,0,0,0,0,0,0,0
1,20011,38.95888,-77.02554,4,3.5,4.0,2.0,"{TV,""Cable TV"",Internet,Wifi,""Air conditioning...",,154,...,0,0,0,0,0,0,0,0,0,0
2,20009,38.91842,-77.0275,2,1.0,1.0,1.0,"{Wifi,""Air conditioning"",""Pets live on this pr...",,75,...,0,0,0,0,0,0,0,0,0,0
3,20020,38.86314,-76.98836,1,1.0,1.0,1.0,"{Internet,Wifi,Kitchen,""Free street parking"",""...",,55,...,0,0,0,0,0,0,0,0,0,0
4,20009,38.9276,-77.03926,2,1.0,1.0,1.0,"{""Cable TV"",""Air conditioning"",Heating,""Smoke ...",,88,...,0,0,0,0,0,0,0,0,0,0
5,20017,38.94008,-76.98936,9,1.0,2.0,2.0,"{TV,""Cable TV"",Internet,Wifi,""Air conditioning...",,120,...,0,0,0,0,0,0,0,0,0,0
6,20003,38.88791,-76.99668,2,1.5,1.0,1.0,"{TV,Wifi,""Air conditioning"",Kitchen,""Pets live...",,83,...,0,0,0,0,0,0,0,0,0,0
7,20001,38.91331,-77.02436,2,2.5,1.0,1.0,"{TV,""Cable TV"",Internet,Wifi,""Air conditioning...",,475,...,1,0,0,0,0,0,0,0,0,0
8,20019,38.90645,-76.94305,4,1.0,1.0,1.0,"{Internet,Wifi,""Air conditioning"",Kitchen,""Fre...",,52,...,0,0,0,0,0,0,0,0,0,0
9,20002,38.91263,-76.99221,1,3.0,1.0,1.0,"{""Cable TV"",Internet,Wifi,""Air conditioning"",K...",,99,...,0,0,0,0,0,0,0,0,0,0


In [104]:
idx = np.r_[1:39, 40:43, 44:48, 50, 57, 75:77, 81:97, 101:106]
listings.drop(listings.columns[idx], axis = 1, inplace = True)
listings['is_business_travel_ready'].replace('f', 0, inplace = True)
listings['is_business_travel_ready'].replace('t', 1, inplace = True)
listings['require_guest_profile_picture'].replace('f', 0, inplace = True)
listings['require_guest_profile_picture'].replace('t', 1, inplace = True)
listings['require_guest_phone_verification'].replace('f', 0, inplace = True)
listings['require_guest_phone_verification'].replace('t', 1, inplace = True)
dummy = pd.get_dummies(listings['cancellation_policy'])
listings = pd.concat([listings, dummy], axis = 1)
listings.drop(columns = ['cancellation_policy'], axis = 1, inplace = True)
dummy = pd.get_dummies(listings['room_type'])
listings = pd.concat([listings, dummy], axis = 1)
listings.drop(columns = ['room_type'], axis = 1, inplace = True)
dummy = pd.get_dummies(listings['property_type'])
listings = pd.concat([listings, dummy], axis = 1)
listings.drop(columns = ['property_type'], axis = 1, inplace = True)
dummy = pd.get_dummies(listings['neighbourhood_cleansed'])
listings = pd.concat([listings, dummy], axis = 1)
listings.drop(columns = ['neighbourhood_cleansed'], axis = 1, inplace = True)
listings.drop(columns = ['id'], axis = 1, inplace = True)
listings.replace([np.inf, -np.inf, "nan"], np.nan, inplace=True)

In [105]:
def cleanup_zipcode(zipcode):
    stripped = str(zipcode).strip("DC ")
    stripped = stripped.split("-", 1)[0]
    return stripped.split(":", 1)[0]
listings['zipcode'] = listings['zipcode'].apply(lambda zipcode: cleanup_zipcode(zipcode))
print(listings.zipcode.unique())

['20001' '20011' '20009' '20020' '20017' '20003' '20019' '20002' '20007'
 '20005' '20010' '20024' '20012' '20016' '20032' '20008' '20004' '20037'
 '20015' '20036' '20018' 'nan' '20006' '20910' '20268' '20815' '20064'
 '20372' '20712' '20781' '20745' '20912' '20052' '22202' '20748' '20743']


In [107]:
def getAddress(coord_string):
    geolocator = Nominatim(user_agent="gatech_goat_team")
    location = geolocator.reverse(coord_string)
    address = location.raw
    return address['address']['postcode']

for ind in range(len(listings)):
    if (listings.loc[ind, 'zipcode'] == "nan"):
        sample = str(listings.loc[ind, 'latitude']) + ", " + str(listings.loc[ind, 'longitude'])
        listings.loc[ind, 'zipcode'] = getAddress(sample)
listings['zipcode'] = listings['zipcode'].apply(lambda zipcode: cleanup_zipcode(zipcode))
print(listings.zipcode.unique())

['20001' '20011' '20009' '20020' '20017' '20003' '20019' '20002' '20007'
 '20005' '20010' '20024' '20012' '20016' '20032' '20008' '20004' '20037'
 '20015' '20036' '20018' '20219' '20006' '20910' '20057' '20540' '20268'
 '20815' '20064' '20566' '20372' '20712' '20781' '20745' '20912' '20052'
 '22202' '20748' '20743']


In [108]:
features = listings.drop(['latitude', 'longitude', 'amenities', 'square_feet', 'is_business_travel_ready', 'weekly_price',
       'monthly_price', 'security_deposit', 'cleaning_fee', 'guests_included',
       'extra_people', 'maximum_nights', 'minimum_minimum_nights', 'maximum_minimum_nights',
       'minimum_maximum_nights', 'maximum_maximum_nights', 'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm',
       'availability_60', 'availability_90', 'availability_365', 'require_guest_profile_picture', 'require_guest_phone_verification'], axis = 1)
features.dropna(subset = ["bathrooms", "bedrooms", "beds"], inplace = True)
features.replace([np.inf, -np.inf], np.nan).dropna(axis = 0, how = 'any', thresh = None, subset = None, inplace = True)

In [109]:
response = 1-features['availability_30']/30
features.drop(['availability_30'], axis = 1, inplace = True)
x_train, x_test, y_train, y_test = train_test_split(features, response, test_size = 0.3, random_state = 111, shuffle = True)
regr = LinearRegression().fit(x_train, y_train)
weightings = []
for i, coef in enumerate(regr.coef_):
    weightings.append((x_train.columns[i], abs(coef)))
weightings.sort(key=operator.itemgetter(1), reverse=True)
for weight in weightings:
    print(f"{weight[0]}: {weight[1]}")

Aparthotel: 0.5015358960383818
Resort: 0.48665644005891834
Tiny house: 0.44820918371535806
Dome house: 0.272476279560089
Woodland/Fort Stanton, Garfield Heights, Knox Hill: 0.25281358217317285
Cottage: 0.24163600850774947
Boat: 0.2395530464360966
Camper/RV: 0.22694106073565484
Shared room: 0.21957104938588015
Entire home/apt: 0.18617260810248562
Townhouse: 0.1798027985767261
Loft: 0.16783211784016577
Bed and breakfast: 0.15428035701081894
House: 0.15121043179113922
Apartment: 0.1504227296095649
super_strict_60: 0.14899885339927496
strict: 0.14884590741981574
River Terrace, Benning, Greenway, Dupont Park: 0.1310967083233028
Condominium: 0.1269797790332018
West End, Foggy Bottom, GWU: 0.12670408923676382
Barn: 0.12641617744406397
Guest suite: 0.11464281850871816
Eastland Gardens, Kenilworth: 0.11225679075906382
Boutique hotel: 0.11154424459920806
Southwest Employment Area, Southwest/Waterfront, Fort McNair, Buzzard Point: 0.11034694244679788
North Cleveland Park, Forest Hills, Van Ness: 

In [135]:

regr = LinearRegression().fit(x_train, y_train)
weightings = []
for i, coef in enumerate(regr.coef_):
    weightings.append((x_train.columns[i], abs(coef)))
weightings.sort(key=operator.itemgetter(1), reverse=True)
for weight in weightings:
    print(f"{weight[0]}: {weight[1]}")

Aparthotel: 0.5015358960383818
Resort: 0.48665644005891834
Tiny house: 0.44820918371535806
Dome house: 0.272476279560089
Woodland/Fort Stanton, Garfield Heights, Knox Hill: 0.25281358217317285
Cottage: 0.24163600850774947
Boat: 0.2395530464360966
Camper/RV: 0.22694106073565484
Shared room: 0.21957104938588015
Entire home/apt: 0.18617260810248562
Townhouse: 0.1798027985767261
Loft: 0.16783211784016577
Bed and breakfast: 0.15428035701081894
House: 0.15121043179113922
Apartment: 0.1504227296095649
super_strict_60: 0.14899885339927496
strict: 0.14884590741981574
River Terrace, Benning, Greenway, Dupont Park: 0.1310967083233028
Condominium: 0.1269797790332018
West End, Foggy Bottom, GWU: 0.12670408923676382
Barn: 0.12641617744406397
Guest suite: 0.11464281850871816
Eastland Gardens, Kenilworth: 0.11225679075906382
Boutique hotel: 0.11154424459920806
Southwest Employment Area, Southwest/Waterfront, Fort McNair, Buzzard Point: 0.11034694244679788
North Cleveland Park, Forest Hills, Van Ness: 

In [136]:
regr.score(x_test, y_test)

0.09237985994326614

In [113]:
rfr = RandomForestRegressor().fit(x_train, y_train)



In [114]:
weightings = []
for i, coef in enumerate(rfr.feature_importances_):
    weightings.append((x_train.columns[i], abs(coef)))
weightings.sort(key=operator.itemgetter(1), reverse=True)
for weight in weightings:
    print(f"{weight[0]}: {weight[1]}")

dist_to_closest_metro: 0.13885604623429904
price: 0.11664676735271526
nearby_crimes: 0.11365744745734176
building_permits_nearby: 0.10102178794228564
nearby_booze: 0.07109489943933954
minimum_nights: 0.05830412886078927
zipcode: 0.041426333248699064
accommodates: 0.03903933167748571
beds: 0.03451035223290476
Shared room: 0.029394064944760788
bathrooms: 0.02483271069114037
bedrooms: 0.018670204662482016
strict_14_with_grace_period: 0.01752013721300974
Apartment: 0.014974006344544237
Entire home/apt: 0.013818103729724284
House: 0.010767185933686278
moderate: 0.010551641649602756
flexible: 0.00980278885738665
Townhouse: 0.009648595145354806
Columbia Heights, Mt. Pleasant, Pleasant Plains, Park View: 0.007184315172570658
Guest suite: 0.006833524721498052
Condominium: 0.0065829148250986964
Union Station, Stanton Park, Kingman Park: 0.0065826934826038385
Edgewood, Bloomingdale, Truxton Circle, Eckington: 0.006180289620340121
Capitol Hill, Lincoln Park: 0.0051294468918407525
Southwest Employm

In [115]:
rfr.score(x_test, y_test)

0.060037094621342546

In [116]:
params = {'n_estimators': [250], 'max_depth': [25], 'oob_score': [True, False]}
rfr = RandomForestRegressor()
model = GridSearchCV(rfr, params, verbose = 10)
model.fit(x_train, y_train)

Fitting 3 folds for each of 2 candidates, totalling 6 fits
[CV] max_depth=25, n_estimators=250, oob_score=True ..................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  max_depth=25, n_estimators=250, oob_score=True, score=0.141, total=   6.9s
[CV] max_depth=25, n_estimators=250, oob_score=True ..................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    6.9s remaining:    0.0s


[CV]  max_depth=25, n_estimators=250, oob_score=True, score=0.136, total=   6.3s
[CV] max_depth=25, n_estimators=250, oob_score=True ..................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   13.2s remaining:    0.0s


[CV]  max_depth=25, n_estimators=250, oob_score=True, score=0.148, total=   6.2s
[CV] max_depth=25, n_estimators=250, oob_score=False .................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   19.3s remaining:    0.0s


[CV]  max_depth=25, n_estimators=250, oob_score=False, score=0.136, total=   6.3s
[CV] max_depth=25, n_estimators=250, oob_score=False .................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   25.6s remaining:    0.0s


[CV]  max_depth=25, n_estimators=250, oob_score=False, score=0.127, total=   6.7s
[CV] max_depth=25, n_estimators=250, oob_score=False .................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   32.3s remaining:    0.0s


[CV]  max_depth=25, n_estimators=250, oob_score=False, score=0.147, total=   7.0s


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   39.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   39.3s finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=RandomForestRegressor(bootstrap=True, criterion='mse',
                                             max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators='warn', n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'max_depth': [25], 'n_esti

In [125]:
print(model.best_params_)

{'max_depth': 25, 'n_estimators': 250, 'oob_score': True}


In [126]:
model.score(x_test, y_test)

0.17866907708196345

In [127]:
model.score(features, response)

0.6596323119471916

In [128]:
x_train_pca = x_train

In [129]:
rfr_model = RandomForestRegressor(max_depth = 25, n_estimators = 250, oob_score = True).fit(x_train, y_train)

In [130]:
rfr_model.score(features, response)

0.658908679910761

In [131]:
weights = []
for i, coef in enumerate(rfr_model.feature_importances_):
    weights.append((x_train.columns[i], abs(coef)))
weights.sort(key = operator.itemgetter(1), reverse = True)
for w in weights:
    print(f"{w[0]}: {w[1]}")

dist_to_closest_metro: 0.13249353773655678
price: 0.12561165714902423
nearby_crimes: 0.11439299377833105
building_permits_nearby: 0.10268316462406088
nearby_booze: 0.06870709582720935
minimum_nights: 0.05551950026346414
zipcode: 0.04132250545082709
accommodates: 0.039848823846308105
beds: 0.03239468627147435
Shared room: 0.0319225903338574
bathrooms: 0.0251300650314867
bedrooms: 0.018902927323434994
strict_14_with_grace_period: 0.015125246440833608
Apartment: 0.013501320258730106
Entire home/apt: 0.013327583067482406
flexible: 0.011240783725264356
Townhouse: 0.009835440093189917
moderate: 0.009552004420765276
House: 0.009144276782940317
Condominium: 0.007469661238460325
Guest suite: 0.006681027217866091
Columbia Heights, Mt. Pleasant, Pleasant Plains, Park View: 0.00655266104181654
Edgewood, Bloomingdale, Truxton Circle, Eckington: 0.005774748602449152
Capitol Hill, Lincoln Park: 0.005385270236561111
Union Station, Stanton Park, Kingman Park: 0.005267458019593512
Brightwood Park, Crest

In [142]:
# need to process inputs for the model
dist_to_closest_metro = 4.3
def objective_function(price):
    return -price*rfr_model.predict([y_train])
res = minimize_scalar(objective_function)
print(res)

ValueError: Number of features of the model must match the input. Model n_features is 83 and input n_features is 6495 

In [None]:
rfr_model.feature_importances_