In [112]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import mpl_toolkits
import matplotlib.mlab
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline

from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.grid_search import GridSearchCV
from sklearn.externals import joblib

# Calculating Average Price

Read the latitude, longitude, price, and weekly price columns, removing the "$" and "," characters so the two price columns can be read as floats instead of strings.

In [113]:
# Load only the four columns this analysis uses. `usecols` keeps the parser
# from reading columns that are never used, and `.copy()` makes `data` an
# independent frame so the column assignments below do not write into a slice
# of another DataFrame (which pandas reports as SettingWithCopyWarning).
listing_columns = ['latitude', 'longitude', 'price', 'weekly_price']
data = pd.read_csv("./data/listings.csv", usecols=listing_columns)[listing_columns].copy()

# The price columns are strings containing "$" and ","; strip both characters
# so the values can be cast to float. Missing entries remain NaN.
for price_column in ('price', 'weekly_price'):
    data[price_column] = (
        data[price_column].str.replace('$', '').str.replace(',', '').astype(float)
    )
data.head(10)

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,latitude,longitude,price,weekly_price
0,37.754184,-122.406514,49.0,300.0
1,37.754166,-122.421534,220.0,1050.0
2,37.758506,-122.406152,100.0,
3,37.756549,-122.422025,117.0,
4,37.760051,-122.421352,200.0,
5,37.759495,-122.424874,162.0,
6,37.760997,-122.413124,230.0,1400.0
7,37.750543,-122.416559,173.0,
8,37.756288,-122.408738,600.0,
9,37.757246,-122.409315,250.0,


Remove all rows that don't have a weekly price. I print the number of remaining rows, and then the number of rows left after also dropping any row that is missing a latitude, longitude, or nightly price. Since the two counts are the same, the only NaNs in the data are in the weekly price column.

In [114]:
# Keep only the listings that report a weekly price.
has_weekly = data[data['weekly_price'].notnull()]

# Row count with a weekly price, then the count after also dropping rows with
# a NaN in any column; equal counts mean no other column has missing values.
rows_with_weekly = len(has_weekly)
rows_fully_populated = len(has_weekly.dropna(how='any'))
print(rows_with_weekly)
print(rows_fully_populated)
has_weekly.head()

2165
2165


Unnamed: 0,latitude,longitude,price,weekly_price
0,37.754184,-122.406514,49.0,300.0
1,37.754166,-122.421534,220.0,1050.0
6,37.760997,-122.413124,230.0,1400.0
10,37.749329,-122.410792,300.0,1900.0
14,37.766521,-122.42466,95.0,550.0


I work out, on average, how many times the nightly price an owner charges for a full week. I figured it was around 7, but it doesn't hurt to get a more accurate number. This is useful for making an educated guess at the weekly price of listings that only have a nightly price.

In [115]:
# Per-listing ratio of weekly price to nightly price, averaged into a single
# multiplier over the listings that report both.
weekly_to_nightly = has_weekly['weekly_price'].div(has_weekly['price'])
avg_multiplier = weekly_to_nightly.mean()
print(avg_multiplier)

6.73920600177


Replace every NaN weekly price with the average multiplier times the nightly price.

In [116]:
# Estimate each missing weekly price as nightly price * average multiplier.
# The filled column is assigned back explicitly: calling
# fillna(inplace=True) on `data.weekly_price` mutates an intermediate Series
# (chained assignment), which pandas warns about and which no longer updates
# `data` at all once Copy-on-Write is in effect.
data['weekly_price'] = data['weekly_price'].fillna(data['price'] * avg_multiplier)
data.head()

Unnamed: 0,latitude,longitude,price,weekly_price
0,37.754184,-122.406514,49.0,300.0
1,37.754166,-122.421534,220.0,1050.0
2,37.758506,-122.406152,100.0,673.9206
3,37.756549,-122.422025,117.0,788.487102
4,37.760051,-122.421352,200.0,1347.8412


Just some information that would be interesting to see.

In [117]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column of `data`.
data.describe()

Unnamed: 0,latitude,longitude,price,weekly_price
count,8706.0,8706.0,8706.0,8706.0
mean,37.767743,-122.430262,250.160579,1655.122398
std,0.02193,0.025466,474.630624,3169.221635
min,37.706928,-122.5115,0.0,0.0
25%,37.753321,-122.442043,100.0,680.0
50%,37.769819,-122.42542,160.0,1050.0
75%,37.785527,-122.412451,250.0,1684.8015
max,37.831093,-122.364759,10000.0,67392.060018


In [118]:
# Show the first five rows of `data` once more before splitting it into features and labels.
data.head()

Unnamed: 0,latitude,longitude,price,weekly_price
0,37.754184,-122.406514,49.0,300.0
1,37.754166,-122.421534,220.0,1050.0
2,37.758506,-122.406152,100.0,673.9206
3,37.756549,-122.422025,117.0,788.487102
4,37.760051,-122.421352,200.0,1347.8412


Splitting the data into its features and labels

In [119]:
# Inputs are the listing coordinates; the target is the weekly price, kept as
# a one-column DataFrame.
features = data.loc[:, ['latitude', 'longitude']]
labels = data.loc[:, ['weekly_price']]

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.20,
    random_state=2,
)

In [None]:
# Interpolate the weekly price onto a regular latitude/longitude grid so it
# can be drawn as a surface.
#
# The grid resolution is capped. Using one grid line per listing (about 8,700
# rows) would give a grid of roughly 76 million cells, which is far too large
# to interpolate or render.
MAX_GRID_POINTS = 200
grid_num = min(len(features), MAX_GRID_POINTS)
xi = np.linspace(features.latitude.min(), features.latitude.max(), num=grid_num)
yi = np.linspace(features.longitude.min(), features.longitude.max(), num=grid_num)

# NOTE(review): matplotlib.mlab.griddata was removed in Matplotlib 3.1. It is
# kept here to match the library versions the rest of this notebook targets;
# on a newer Matplotlib, scipy.interpolate.griddata is the replacement.
zi = matplotlib.mlab.griddata(
    features.latitude.values,
    features.longitude.values,
    labels.values.reshape(-1),
    xi,
    yi,
    interp='linear',
)

In [None]:
%matplotlib inline
fig = plt.figure()
ax = fig.gca(projection='3d')
ax.plot_surface(xi, yi, zi)
plt.show()

# KNN Regressor

In [None]:
# k-nearest-neighbours regressor that uses the 10 closest training points.
reg = KNeighborsRegressor(n_neighbors=10)

A score of 0 is what we would get if we had just predicted the average, so this is definitely not a great model.

In [None]:
# Fit on the training split, then display the score on the held-out split.
reg.fit(x_train, y_train)
knn_test_score = reg.score(x_test, y_test)
knn_test_score

Just looking at the values to make sure everything has worked. You can see even from these 10 rows that the model is not doing a very good job at predicting very high prices.

In [None]:
# First ten rows of the held-out features (latitude, longitude).
x_test.head(10)

In [None]:
# True weekly prices for the same first ten held-out rows.
y_test.head(10)

In [None]:
# Predictions for the first ten held-out rows, to compare with the true prices.
reg.predict(x_test.head(10))

It is curious that the mean of the predictions is about the same as in the actual dataset, while the standard deviation is much lower. This model doesn't really handle the information very well, which is also apparent from the negative R² value.

In [None]:
# Summary statistics of the predictions over the whole held-out split.
knn_predictions = reg.predict(x_test)
pd.DataFrame(knn_predictions).describe()

Saving the information of the model so that it can be used as a function in the actual web app

In [None]:
# Persist the fitted regressor to disk. protocol=2 selects pickle protocol 2
# (presumably so the web app can load it under Python 2 -- confirm).
joblib.dump(reg, 'KNNeighborsRegressor.pkl', protocol=2)

In [None]:
# Debug check: print the container type of x_test (the input type the saved model was exercised with).
print(type(x_test))

In [None]:
# Round-trip check: reload the saved regressor from disk and predict on the
# first ten held-out rows.
test = joblib.load('KNNeighborsRegressor.pkl')
test.predict(x_test.head(10))

# SVR

In [None]:
# Support-vector regressor with the library's default settings.
svr_reg = SVR()

In [None]:
# SVR expects a 1-D target. y_train is a one-column DataFrame, which
# scikit-learn converts with a DataConversionWarning, so flatten it explicitly.
svr_reg.fit(x_train, y_train.values.ravel())
# Score on the held-out split.
svr_reg.score(x_test, y_test)

In [None]:
# Summary statistics of the SVR predictions over the held-out split.
svr_predictions = svr_reg.predict(x_test)
pd.DataFrame(svr_predictions).describe()

# Gradient Boosting

In [None]:
# Small gradient-boosting baseline: two depth-2 trees, one feature considered per split.
GBR = GradientBoostingRegressor(
    n_estimators=2,
    max_depth=2,
    learning_rate=.12,
    random_state=3,
    max_features=1,
)
# The estimator expects a 1-D target. y_train is a one-column DataFrame, which
# scikit-learn converts with a DataConversionWarning, so flatten it explicitly.
GBR.fit(x_train, y_train.values.ravel())
print(GBR.score(x_test, y_test))
# Summary statistics of the predictions over the held-out split.
pd.DataFrame(GBR.predict(x_test)).describe()

In [None]:
# Grid-search tree depth and minimum split size for gradient boosting.
#
# The target (weekly price) is continuous, so the search wraps
# GradientBoostingRegressor and ranks candidates by R^2. A classifier scored
# with 'roc_auc' cannot be used on this target, and GradientBoostingClassifier
# is not among this notebook's imports.
param_test = {
    'max_depth': list(range(5, 16, 2)),
    'min_samples_split': list(range(200, 1001, 200)),
}
grid_search = GridSearchCV(
    estimator=GradientBoostingRegressor(
        learning_rate=0.1,
        n_estimators=60,
        max_features='sqrt',
        subsample=0.8,
        random_state=10,
    ),
    param_grid=param_test,
    scoring='r2',
    n_jobs=4,
    iid=False,
    cv=5,
)
# Flatten the one-column DataFrame target to 1-D, which is what the estimator expects.
grid_search.fit(x_train, y_train.values.ravel());

# Random Forest

In [None]:
# Random-forest baseline with shallow (depth-3) trees and a fixed seed.
RFR = RandomForestRegressor(max_depth=3, random_state=3)
# The estimator expects a 1-D target. y_train is a one-column DataFrame, which
# scikit-learn converts with a DataConversionWarning, so flatten it explicitly.
RFR.fit(x_train, y_train.values.ravel())
print(RFR.score(x_test, y_test))
# Summary statistics of the predictions over the held-out split.
pd.DataFrame(RFR.predict(x_test)).describe()

# Calculating the Best Price

Read in all the data, remove unneeded columns

In [None]:
# Calendar rows for available nights: only the listing id and price are needed.
# `usecols` avoids parsing unused columns, and `.copy()` makes `available_p`
# an independent frame so the price assignment below does not write into a
# slice of another DataFrame (which pandas reports as SettingWithCopyWarning).
calendar_columns = ['listing_id', 'price']
available_p = pd.read_csv(
    "./data/calendar_available_only.csv", usecols=calendar_columns
)[calendar_columns].copy()
# Replace the string value of the price with a readable float value
available_p['price'] = available_p['price'].str.replace('$', '').str.replace(',', '').astype(float)

# Listing coordinates keyed by listing id.
location_columns = ['latitude', 'longitude', 'id']
listings = pd.read_csv("./data/listings.csv", usecols=location_columns)[location_columns]

# Confirm only non sold houses have prices: among the full calendar's rows that
# carry a price, print the distinct values of the `available` flag.
all_p = pd.read_csv("./data/calendar.csv").dropna(subset=['price'])
print(all_p.available.unique())

In [None]:
# First five rows of the cleaned availability data (listing_id, price).
available_p.head()

Take unique IDs, take the average of the prices with that ID, and find the corresponding locations to the ID

In [None]:
# Listing ids in order of first appearance in the availability data.
uniques = available_p.listing_id.unique()

# Mean available price per listing, computed in one grouped pass instead of
# re-scanning the whole frame with a boolean mask for every id.
mean_price_by_id = available_p.groupby('listing_id')['price'].mean()

# Coordinates indexed by listing id. If an id were repeated in `listings`,
# keep its first row, which is what a `.values[0]` lookup would select.
coords_by_id = (
    listings.drop_duplicates(subset='id')
    .set_index('id')[['latitude', 'longitude']]
)

# Fail with an explicit message if a calendar id has no matching listing row.
missing_ids = pd.Index(uniques).difference(coords_by_id.index)
if len(missing_ids) > 0:
    raise ValueError(
        '{} listing ids from the calendar are missing from listings, e.g. {}'.format(
            len(missing_ids), list(missing_ids[:5])
        )
    )

# Build the three parallel lists, all aligned with `uniques`.
coords = coords_by_id.loc[uniques]
lat = coords['latitude'].tolist()
long = coords['longitude'].tolist()
best_price = mean_price_by_id.loc[uniques].tolist()

In [None]:
# Assemble the per-listing coordinates and mean price into one frame.
price_columns = {'lat': lat, 'long': long, 'price': best_price}
price_data = pd.DataFrame(price_columns, columns=['lat', 'long', 'price'])

In [None]:
# First five rows of the per-listing frame (lat, long, mean price).
price_data.head()

In [None]:
# Target is the mean available price (one-column DataFrame); inputs are the coordinates.
new_labels = price_data.loc[:, ['price']]
new_features = price_data.loc[:, ['lat', 'long']]

# Hold out 20% of the listings for evaluation, with a fixed seed for repeatability.
X_train, X_test, Y_train, Y_test = train_test_split(
    new_features,
    new_labels,
    test_size=0.20,
    random_state=2,
)

# KNN Regressor

In [None]:
# k-nearest-neighbours regressor over 137 neighbours for the mean-price target.
# Note that this rebinds `reg`, replacing the weekly-price model defined earlier.
reg = KNeighborsRegressor(n_neighbors=137)
reg.fit(X_train, Y_train)
price_test_score = reg.score(X_test, Y_test)
print(price_test_score)

In [None]:
# First five rows of the held-out coordinates.
X_test.head()

In [None]:
# True mean prices for the same first five held-out rows.
Y_test.head()

In [None]:
# Predict the first five held-out rows and print them for comparison with the true prices.
first_rows = X_test.head(5)
test = reg.predict(first_rows)
print(test)

In [None]:
# Persist the fitted price regressor to disk. protocol=2 selects pickle protocol 2
# (presumably so the web app can load it under Python 2 -- confirm).
joblib.dump(reg, 'KNNeighborsPriceRegressor.pkl', protocol=2)