In [1]:
import pandas as pd
import numpy as np
import folium
import json
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import NMF, PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
import xgboost as xgb
from src.nlp_helpers import *
from src.model_helpers import *

In [12]:
data = pd.read_csv('data/AB_NYC_2019.csv')

In [13]:
data['last_review'].fillna('None', inplace=True)
data['reviews_per_month'].fillna(0.0, inplace=True)
data['name'].fillna('Unknown', inplace=True)
data['host_name'].fillna('Unknown', inplace=True)

In [15]:
data

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,Entire home/apt,Private room,Shared room
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,149,1,9,2018-10-19,0.21,6,365,0,1,0
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,225,1,45,2019-05-21,0.38,2,355,1,0,0
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.94190,150,3,0,,0.00,1,365,0,1,0
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,89,1,270,2019-07-05,4.64,1,194,1,0,0
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,80,10,9,2018-11-19,0.10,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48890,36484665,Charming one bedroom - newly renovated rowhouse,8232441,Sabrina,Brooklyn,Bedford-Stuyvesant,40.67853,-73.94995,70,2,0,,0.00,2,9,0,1,0
48891,36485057,Affordable room in Bushwick/East Williamsburg,6570630,Marisol,Brooklyn,Bushwick,40.70184,-73.93317,40,4,0,,0.00,2,36,0,1,0
48892,36485431,Sunny Studio at Historical Neighborhood,23492952,Ilgar & Aysel,Manhattan,Harlem,40.81475,-73.94867,115,10,0,,0.00,1,27,1,0,0
48893,36485609,43rd St. Time Square-cozy single bed,30985759,Taz,Manhattan,Hell's Kitchen,40.75751,-73.99112,55,1,0,,0.00,6,2,0,0,1


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48895 entries, 0 to 48894
Data columns (total 16 columns):
id                                48895 non-null int64
name                              48895 non-null object
host_id                           48895 non-null int64
host_name                         48895 non-null object
neighbourhood_group               48895 non-null object
neighbourhood                     48895 non-null object
latitude                          48895 non-null float64
longitude                         48895 non-null float64
room_type                         48895 non-null object
price                             48895 non-null int64
minimum_nights                    48895 non-null int64
number_of_reviews                 48895 non-null int64
last_review                       48895 non-null object
reviews_per_month                 48895 non-null float64
calculated_host_listings_count    48895 non-null int64
availability_365                  48895 non-null int64

In [14]:
data = data.merge(pd.get_dummies(data['room_type']), left_index=True, right_index=True)
data.drop(columns = 'room_type', inplace=True)

In [7]:
vectorizer, vocab = text_vectorizer(data['name'], use_tfidf=True, use_stemmer=False)
X = vectorizer(data['name'])

In [None]:
nmf = NMF(n_components=50, max_iter=10, alpha=0.0)
W = nmf.fit_transform(X)
H = nmf.components_
print('reconstruction error: ', nmf.reconstruction_err_)

In [None]:
hand_labels = hand_label_topics(H, vocab)

In [None]:
print(data[data['name'].index==100])
analyze(100, data['name'], W, hand_labels)

In [None]:
labels = analyze_all(data, W, hand_labels)

In [None]:
data['labels'] = labels
#data = data.merge(pd.get_dummies(data['labels']), left_index=True, right_index=True)
#data.drop(columns = 'labels', inplace=True)

In [7]:
X = np.column_stack((data['neighbourhood'], data['minimum_nights'], data['number_of_reviews'], data['reviews_per_month'], data['calculated_host_listings_count'], data['availability_365'], data['Entire home/apt'], data['Private room'], data['Shared room'], data['name']))
y = data['price'].values

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [11]:
#param = {'max_depth':2, 'eta':1, 'objective': 'binary:logistic', 'nthread':4, 'eval_metric':'auc'}
#evallist = [(np.column_stack((X_test,y_test)), 'eval'), (np.column_stack((X_train,y_train)), 'train')]
#xbst = xgb.train(param, np.column_stack((X_train,y_train)), 10, evallist)

In [32]:
gdbr = GradientBoostingRegressor(learning_rate=0.1, loss='ls', n_estimators=100)
rf = RandomForestRegressor(n_estimators=100, n_jobs=-1)
abr = AdaBoostRegressor(DecisionTreeRegressor(), learning_rate=0.1, loss='linear', n_estimators=100)

In [None]:
cross_val(gdbr, X_train, y_train, 10)
cross_val(rf, X_train, y_train, 10)
cross_val(abr, X_train, y_train, 10)

GradientBoostingRegressor Train CV | MSE: 46657.000 | R2: 0.170
RandomForestRegressor     Train CV | MSE: 48236.740 | R2: 0.132
