In [139]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn import preprocessing
from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error

from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.linear_model import LinearRegression as LR
from sklearn.linear_model import SGDRegressor as SGD

from sklearn.neighbors import NearestNeighbors

from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import Imputer

pd.set_option('display.max_colwidth', 100)
pd.set_option('display.precision', 5)

In [140]:
df_nei = pd.read_csv('../data/df_neighborhoods.csv')

In [141]:
df_nei.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26621 entries, 0 to 26620
Data columns (total 23 columns):
X             26621 non-null float64
Y             26621 non-null float64
apa           26610 non-null object
bathrooms     25221 non-null float64
bedrooms      26610 non-null float64
cats_OK       26621 non-null int64
date_time     26621 non-null object
dogs_OK       26621 non-null int64
furnished     26621 non-null int64
latitude      26621 non-null float64
laundry       19599 non-null object
longitude     26621 non-null float64
neighborho    26377 non-null object
no smoking    26621 non-null int64
parking       16221 non-null object
post_id       26621 non-null float64
rent          26621 non-null float64
sq_ft         12765 non-null float64
url           26621 non-null object
wheelchair    26621 non-null int64
name          26621 non-null object
LINK          23487 non-null object
neighbor_2    26621 non-null object
dtypes: float64(9), int64(5), object(9)
memory usage: 4.9+ 

In [142]:
df_nei = df_nei.drop(['X', 'Y', 'url', 'post_id', 'LINK'], axis = 1)
df_nei = df_nei.rename(columns = {'neighbor_2' : 'N_1', 'name' : 'N_2', 'neighborho' : 'N_user'})

In [143]:
for N in ['N_1', 'N_2']:
    df_nei[N] = df_nei[N].str.lower()

In [144]:
df_no_nei = pd.read_csv('../data/df_no_neighborhoods.csv')

In [145]:
df_no_nei.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2995 entries, 0 to 2994
Data columns (total 18 columns):
rent                     2995 non-null float64
sq_ft                    1528 non-null float64
bedrooms                 2993 non-null float64
bathrooms                2837 non-null float64
neighborhood             2961 non-null object
latitude                 2672 non-null float64
longitude                2672 non-null float64
wheelchair accessible    2995 non-null int64
no smoking               2995 non-null int64
furnished                2995 non-null int64
cats_OK                  2995 non-null int64
dogs_OK                  2995 non-null int64
url                      2995 non-null object
post_id                  2995 non-null int64
date_time                2995 non-null object
apa                      2993 non-null object
laundry                  2266 non-null object
parking                  1873 non-null object
dtypes: float64(6), int64(6), object(6)
memory usage: 444.6+ KB


In [146]:
df_no_nei = df_no_nei.drop(['url', 'post_id'], axis = 1)
df_no_nei = df_no_nei[df_no_nei.latitude.notnull()].reset_index(drop = True)
df_no_nei = df_no_nei.rename(columns = {'neighborhood' : 'N_user'})

In [147]:
%%time
np = df_nei[['latitude', 'longitude', 'N_2']]
X = np.iloc[:,:2].values
y = df_nei.N_2
le = preprocessing.LabelEncoder()
y = le.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 1)
est = RF(n_estimators= 5)
est.fit(X_train, y_train)
y_pred = est.predict(X_test)
score = y_pred - y_test == 0
print 'precent inaccurate: ', float(len(score) - sum(score)) / len(score)

precent inaccurate:  0.0174647887324
CPU times: user 190 ms, sys: 140 ms, total: 330 ms
Wall time: 331 ms


In [148]:
def imputeNeighborhoods(df_nei, df_no_nei, neighborhood, estimators = 5):
    X_train = df_nei[['latitude', 'longitude']].values
    y_train = df_nei[neighborhood]
    X_test = df_no_nei[['latitude', 'longitude']].values
    est = RF(n_estimators= estimators)
    est.fit(X_train, y_train)
    y_pred = est.predict(X_test)
    df_no_nei[neighborhood] = y_pred
    return df_no_nei

In [149]:
for N in ['N_1', 'N_2']:
    imputeNeighborhoods(df_nei, df_no_nei, N)

### Filter out non-SF listings by distance

In [150]:
def filterDistance(df_with_Ns, df_new, neighbors = 10, distance = .01):
    X = df_with_Ns[['latitude', 'longitude']].values
    X_new = df_new[['latitude', 'longitude']].values
    nbrs = NearestNeighbors(n_neighbors=10, algorithm='ball_tree').fit(X)
    distances, indices = nbrs.kneighbors(X_new)
    df_new['distances'] = pd.DataFrame(distances).mean(axis = 1)
    df_new = df_new[df_new['distances'] < .05].sort(columns = 'distances', ascending = False)
    return df_new

In [151]:
df_no_nei = filterDistance(df_nei, df_no_nei)

In [152]:
# check distances


In [153]:
# temporary - add to function once the right distance is established
df_no_nei = df_no_nei.drop('distances', axis = 1)

In [154]:
df = pd.concat([df_nei, df_no_nei], axis = 0)

In [155]:
df = df[df.rent < 10000]
print len(df)
df = df[df.rent > 1000]
print len(df)
df = df[(df.sq_ft < 5000) & ((df.sq_ft != 0)) | (df.sq_ft.isnull())]
print len(df)
df = df.reset_index(drop = True)

28665
28452
28444


In [156]:
N_us = df.N_user.value_counts()
N_user_label = N_us[N_us > (N_us.sum() / 1100)].index

In [157]:
N_user_unlabel = set(df.N_user.unique()) - set(N_user_label)

In [158]:
isNullIndex = df[df.N_user.isnull()].index
df.ix[isNullIndex, 'N_user'] = 'nan'

In [159]:
df[df.N_user.isnull()].N_user.astype

<bound method Series.astype of Series([], name: N_user, dtype: object)>

In [160]:
label_d = dict(zip(N_user_label,range(len(N_user_label))))

In [161]:
import numpy as np
unlabel_d = dict(zip(N_user_unlabel, np.zeros(len(N_user_unlabel), dtype = int)))

In [162]:
label_d.update(unlabel_d)

In [163]:
df = df.replace({"N_user": label_d})

### Filter out perfect scores / repeats

In [164]:
df['list_count'] = 1

In [165]:
cols = df.columns[:-1].tolist()

In [180]:
df_uniq = df.groupby(cols).sum().reset_index().sort(columns = 'date_time')

In [167]:
im = Imputer(strategy='median', copy = False)
im.fit_transform(df_uniq.sq_ft)

array([[ 1600.,  4490.,  1086., ...,   605.,   911.,  4490.]])

In [181]:
df_uniq

Unnamed: 0,N_1,N_2,N_user,apa,bathrooms,bedrooms,cats_OK,date_time,dogs_OK,furnished,latitude,laundry,longitude,no smoking,parking,rent,sq_ft,wheelchair,wheelchair accessible,list_count
25831,western addition,lower pacific heights,21,apartment,2.0,3,0,2014-12-03 00:00:00,0,0,37.7856,laundry in bldg,-122.4358,1,valet parking,5795,1600,0,1,1
16685,pacific heights,pacific heights,3,apartment,2.0,2,0,2014-12-04 00:00:00,0,0,37.7942,laundry in bldg,-122.4344,1,street parking,6100,4490,0,1,1
1853,chinatown,financial district,2,condo,1.5,1,0,2014-12-05 00:00:00,0,0,37.7926,laundry in bldg,-122.4044,1,attached garage,4795,1086,1,1,1
22591,south of market,south of market,0,apartment,1.0,1,1,2014-12-05 00:00:00,0,0,37.7789,laundry on site,-122.4072,1,valet parking,2495,4490,0,1,1
12599,nob hill,nob hill,11,condo,1.0,0,1,2014-12-05 00:00:00,1,0,37.7917,laundry in bldg,-122.4186,1,attached garage,3200,4490,1,1,1
4055,downtown/civic center,tenderloin,6,apartment,1.0,1,1,2014-12-05 00:00:00,1,0,37.7859,laundry in bldg,-122.4196,1,street parking,2495,4490,0,1,1
17453,parkside,parkside,8,apartment,1.0,2,1,2014-12-05 00:00:00,0,0,37.7441,laundry in bldg,-122.4863,1,attached garage,2700,4490,0,1,1
4056,downtown/civic center,tenderloin,6,apartment,1.0,1,1,2014-12-05 00:00:00,1,0,37.7859,wd in unit,-122.4196,1,street parking,2795,4490,0,1,1
9697,mission,mission,1,apartment,1.0,0,0,2014-12-05 00:00:00,0,0,37.7656,wd in unit,-122.4210,1,valet parking,1950,4490,0,1,1
8296,marina,cow hollow,5,apartment,2.0,2,1,2014-12-05 00:00:00,1,0,37.7989,wd in unit,-122.4460,1,attached garage,5995,4490,0,1,1


In [168]:
for col in df_uniq:
    if df_uniq[col].dtype == object:
        df_uniq[col] = le.fit_transform(df_uniq[col])

In [169]:
X_cols = df_uniq.drop('rent', axis = 1)
X = X_cols.values

In [170]:
y = df_uniq.rent

In [171]:
X_train = X[0:len(X)/2]
y_train = y[0:len(X)/2]
X_test = X[-len(X)/4:]
y_test = y[-len(X)/4:]

In [172]:
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2)

In [173]:
%%time
est = RF(n_estimators = 20, n_jobs= -1)
est.fit(X_train, y_train)
y_pred = est.predict(X_test)

CPU times: user 8.82 s, sys: 4.64 s, total: 13.5 s
Wall time: 7.19 s


In [174]:
mean_absolute_error(y_pred, y_test)

641.43904660695307

In [175]:
perfect_scores = y_pred == y_test

In [176]:
sum(perfect_scores) / float(len(perfect_scores))

0.086382268103840892

In [117]:
sorted(zip(est.feature_importances_, X_cols.columns))[::-1]

[(0.21861716473085541, 'date_time'),
 (0.12791645832251311, 'longitude'),
 (0.12684508657941856, 'latitude'),
 (0.081831593171715705, 'sq_ft'),
 (0.068492493218115666, 'N_2'),
 (0.059492841832742005, 'N_user'),
 (0.052163818356316408, 'N_1'),
 (0.047043409039529867, 'parking'),
 (0.032122592290381544, 'bedrooms'),
 (0.0317648830359454, 'laundry'),
 (0.029396623323743765, 'no smoking'),
 (0.02582423647463248, 'apa'),
 (0.024581478830143111, 'bathrooms'),
 (0.019610311110495944, 'list_count'),
 (0.015788146476121194, 'cats_OK'),
 (0.014968889992675673, 'dogs_OK'),
 (0.012584797714233206, 'furnished'),
 (0.010955175500420966, 'wheelchair'),
 (0.0, 'wheelchair accessible')]

In [120]:
X_cols

Unnamed: 0,N_1,N_2,N_user,apa,bathrooms,bedrooms,cats_OK,date_time,dogs_OK,furnished,latitude,laundry,longitude,no smoking,parking,sq_ft,wheelchair,wheelchair accessible,list_count
25830,36,55,21,0,2.0,3,0,0,0,0,37.7856,0,-122.4358,1,5,1600,0,1,1
16681,24,77,3,0,2.0,2,0,1,0,0,37.7942,0,-122.4344,1,4,4490,0,1,1
1855,3,30,2,1,1.5,1,0,2,0,0,37.7926,0,-122.4044,1,0,1086,1,1,1
17448,25,80,8,0,1.0,2,1,2,0,0,37.7441,0,-122.4863,1,0,4490,0,1,1
4065,6,105,6,0,1.0,1,1,2,1,0,37.7859,3,-122.4196,1,4,4490,0,1,1
12598,17,69,11,1,1.0,0,1,2,1,0,37.7917,0,-122.4186,1,0,4490,1,1,1
22587,31,97,0,0,1.0,1,1,2,0,0,37.7789,1,-122.4072,1,5,4490,0,1,1
4064,6,105,6,0,1.0,1,1,2,1,0,37.7859,0,-122.4196,1,4,4490,0,1,1
9697,16,63,1,0,1.0,0,0,2,0,0,37.7656,3,-122.4210,1,5,4490,0,1,1
8295,15,20,5,0,2.0,2,1,2,1,0,37.7989,3,-122.4460,1,0,4490,0,1,1
