In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import re
import pickle

# Load the raw train/test listings from CSV.
path = ""
train = pd.read_csv(path + "train.csv")
test = pd.read_csv(path + "test.csv")

# Tag every row with its split so both frames can be encoded together and
# separated again later with `data.dataset == ...`.
train['dataset'] = "train"
test['dataset'] = "test"
# NOTE(review): reset_index() without drop=True keeps the old row labels as an
# extra integer column named 'index', which the numeric-dtype selection further
# down will pick up as a feature. Left unchanged so the feature layout is the
# same as before -- confirm this is intended.
data = pd.concat([train,test], axis = 0).reset_index()

# One-hot-encode categorical variables.
# dtype is pinned to uint8: pandas >= 2.0 emits bool dummy columns by default,
# and bool is not in the `numerics` dtype list used to build the model
# matrices, so the encoded columns would silently drop out of the features.
categorical = ['property_type','room_type','bed_type','cancellation_policy','city']
data = pd.get_dummies(data, columns = categorical, dtype = np.uint8)

#Function to convert amenities string to list: lower-case the text, strip every
#character except commas, letters and digits, split on commas, and keep only
#the tokens longer than one character.
f = lambda x : [r for r in re.sub(r'[^,a-z0-9]','',x.lower()).split(',') if len(r) > 1]
#Amenities list to dummy vars: each (listing, amenity) pair is one-hot encoded
#and the rows are then summed back to one row per listing.
#groupby(level=0).sum() replaces .sum(level=0), which was deprecated in
#pandas 1.3 and removed in 2.0; the result is the same.
amenities = pd.get_dummies(data['amenities'].map(f).apply(pd.Series).stack()).groupby(level=0).sum()
#Append the amenity indicator columns, aligned on the row index of `data`.
data = pd.concat([data,amenities],axis=1)


##Some extra features to create from base data

def _response_rate_to_fraction(raw):
    """Turn a percent string (text before '%') into a fraction; non-strings become 0."""
    if isinstance(raw, str):
        return float(raw.split('%')[0]) / 100
    return 0

data['host_response_rate'] = data['host_response_rate'].map(_response_rate_to_fraction)

# 't' / 'f' flags become 1 / 0; any value missing from the mapping becomes NaN.
flag_codes = {'f': 0, 't': 1}
for flag_column in ('instant_bookable', 'host_has_profile_pic'):
    data[flag_column] = data[flag_column].map(flag_codes)

# Boolean cleaning-fee indicator as 0 / 1.
data['cleaning_fee'] = data['cleaning_fee'].map({False: 0, True: 1})


#add rgb data to dataset
# merge() is called without `how`, so pandas' default inner join applies:
# listings whose id is absent from the side table are dropped.
rgb = pd.read_csv('./data/withRgb.csv',encoding='iso-8859-1')
rgb_columns = rgb[['id','meanG','meanR','meanB']]
data = data.merge(rgb_columns, left_on='id', right_on='id')

#add median income for census tract to dataset
ct_median_income = pd.read_csv('./data/ct_median_income.csv')
income_columns = ct_median_income[['id','ct_median_income']]
data = data.merge(income_columns, left_on='id', right_on='id')
data['ct_median_income'] = pd.to_numeric(data['ct_median_income'])

#add zillow data to dataset
# Only the '2017-12' column is used, keyed by RegionName converted to text so
# it can be looked up from the listing's zipcode value.
zillow_table = pd.read_csv('./data/Zip_MedianRentalPrice_AllHomes.csv',index_col='RegionName')
zillow = zillow_table['2017-12']
zillow.index = list(map(str, zillow.index))
data['home_prices_zillow'] = data['zipcode'].map(zillow)


#load pickled xgboost model
# NOTE: unpickling can execute arbitrary code stored in the file; only load
# model files you produced yourself.
# The context manager closes the file handle (the bare open() never did).
with open('./models/xg_model.dat','rb') as model_file:
    model = pickle.load(model_file)

import xgboost as xgb

# Dtypes that count as model features; other columns (strings, objects) are left out.
numerics = ['uint8','int16', 'int32', 'int64', 'float16', 'float32', 'float64']

# Training features: numeric columns only, target removed, missing values as 0.
train_x = data[data.dataset == "train"] \
    .select_dtypes(include=numerics) \
    .drop("log_price", axis = 1) \
    .fillna(0) \
    .values


# Test features, built exactly the same way.
test_x = data[data.dataset == "test"] \
    .select_dtypes(include=numerics) \
    .drop("log_price", axis = 1) \
    .fillna(0) \
    .values

# Training target.
train_y = data[data.dataset == "train"].log_price.values

dtrain = xgb.DMatrix(train_x, train_y)

# RMSE of the loaded model on the training rows. Arguments are passed in the
# documented (y_true, y_pred) order; the value is unchanged since MSE is symmetric.
print('RMSE:',mean_squared_error(train_y, model.predict(dtrain))**(1/2))


In [None]:
##Best xgboost model so far

import xgboost as xgb
import pickle

# Dtypes that count as model features; other columns (strings, objects) are left out.
numerics = ['uint8','int16', 'int32', 'int64', 'float16', 'float32', 'float64']

# Split the combined frame back into its two parts using the 'dataset' tag.
train_rows = data[data.dataset == "train"]
test_rows = data[data.dataset == "test"]

# Feature matrices: numeric columns only, target removed, missing values as 0.
train_x = train_rows.select_dtypes(include=numerics).drop("log_price", axis = 1).fillna(0).values
test_x = test_rows.select_dtypes(include=numerics).drop("log_price", axis = 1).fillna(0).values

# Target vector for the training rows.
train_y = train_rows.log_price.values


# Mean of the training target; passed to the booster as its base_score.
y_mean = train_y.mean()

# Booster configuration used for the final model in this cell.
xgb_params = {
    'eta': 0.037,
    'max_depth': 10,
    'subsample': 0.80,
    'objective': 'reg:linear',
    'eval_metric': 'mae',
    'lambda': 0.8,
    'alpha': 0.4,
    'base_score': y_mean,
    'silent': 1
}

# Wrap the arrays in xgboost's matrix type; only the training matrix carries labels.
dtrain = xgb.DMatrix(train_x, label=train_y)
dtest = xgb.DMatrix(test_x)

# cross-validation (disabled; the round count below is hard-coded instead)
#print( "Running XGBoost CV ..." )
#cv_result = xgb.cv(xgb_params, 
#                   dtrain, 
#                   nfold=5,
#                   num_boost_round=350,
#                   early_stopping_rounds=50,
#                   verbose_eval=10, 
#                   show_stdv=False
#                  )
#num_boost_rounds = len(cv_result)

# num_boost_rounds = 150
num_boost_rounds = 242
print("num_boost_rounds="+str(num_boost_rounds))

# train model
print( "\nTraining XGBoost ...")
model = xgb.train(dict(xgb_params, silent=1), dtrain, num_boost_round=num_boost_rounds)

#Pickle model object
# The context manager flushes and closes the file; the bare open() handle was
# never closed, so the write was only completed when the object was collected.
# NOTE(review): the first cell loads './models/xg_model.dat' while this writes
# 'xg_model.dat' in the working directory -- confirm which path is intended.
with open('xg_model.dat','wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
##NLP - not currently used 
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
import re

# Fetch the tokenizer model and stop-word list used below.
nltk.download('punkt')
nltk.download('stopwords')

##NLP STUFF HERE

#Find top words and bigrams (2 word pairs) from the listing names
stop_words = set(stopwords.words('english'))

tokens = []
bigrams = []
for name in data['name']:
    # Skip missing / non-text names.
    if not isinstance(name, str):
        continue
    # Lower-case and keep only letters and spaces before tokenizing.
    cleaned = re.sub(r'[^ a-z]','',name.lower())
    words = [word for word in nltk.word_tokenize(cleaned) if word not in stop_words]
    bigrams.extend(nltk.bigrams(words))
    tokens.extend(words)

#Count occurrences of top bigrams and words in listing name
topBigrams = [' '.join(pair) for pair, _count in Counter(bigrams).most_common(25)]
topWords = [word for word, _count in Counter(tokens).most_common(25)]

def countTopWords(x):
    """Number of distinct top words present in a listing name (0 for non-strings)."""
    if not isinstance(x, str):
        return 0
    name_words = set(re.sub(r'[^ a-z]','',x.lower()).split())
    return len(name_words.intersection(set(topWords)))

def countTopBigrams(x):
    """Total substring occurrences of the top bigrams in a listing name (0 for non-strings)."""
    if not isinstance(x, str):
        return 0
    cleaned_name = re.sub(r'[^ a-z]','',x.lower())
    return sum([cleaned_name.count(bigram) for bigram in topBigrams])

#use weight of topwords and topbigrams - for example 25 would be 1 point, 0 would be 25 points


data['num_top_words'] = data['name'].map(countTopWords)
data['num_top_bigrams'] = data['name'].map(countTopBigrams)


In [None]:
from PIL import Image
import requests
import os

##RGB Analysis - see image download script below
##BEWARE: Takes a long time to get mean RGB for dataset

def getBrightnessForImage(id=None,url=None,useImageData=True):
    """Return the mean brightness of a listing image, or None if it is missing.

    Brightness is the average, over all pixels, of (R + G + B) / 3.

    Parameters
    ----------
    id : listing id; with useImageData=True the image is read from
        'images/<id>.jpg'.
    url : image URL, downloaded with requests when useImageData is False.
    useImageData : True to read the local file, False to fetch `url`.

    Returns
    -------
    float, or None when useImageData is True and the local file does not exist.
    """
    if useImageData:
        path = 'images/{}.jpg'.format(id)
        if not os.path.exists(path):
            # Previously `return(row)`: `row` is not defined in this function,
            # so a missing image raised NameError instead of being skipped.
            return None
        image = Image.open(path)
    else:
        image = Image.open(requests.get(url,stream=True).raw)
    # Convert to three channels first so greyscale, palette and RGBA images
    # are handled; with exactly three channels per pixel, the mean over every
    # value equals the mean of the per-pixel (R + G + B) / 3.
    pixels = np.asarray(image.convert('RGB'), dtype=float)
    return float(pixels.mean())

def getRGBForImage(row,useImageData=True):
    """Add the mean red, green and blue values of a listing's image to `row`.

    Parameters
    ----------
    row : mapping-like listing record (indexed with row['id'] /
        row['thumbnail_url']); it is modified in place and returned.
    useImageData : True to read 'images/<id>.jpg' from disk, False to download
        the image from row['thumbnail_url'].

    Returns
    -------
    `row`, with 'meanR', 'meanG' and 'meanB' set. When useImageData is True
    and the local file does not exist, `row` is returned unchanged.
    """
    if useImageData:
        path = 'images/{}.jpg'.format(row['id'])
        if not os.path.exists(path):
            return row
        image = Image.open(path)
    else:
        # The column is spelled 'thumbnail_url' (as in the image download
        # loop); the previous 'thumnail_url' key could never be found.
        image = Image.open(requests.get(row['thumbnail_url'],stream=True).raw)

    # Force three channels so greyscale / palette / RGBA images work, then
    # average each channel over all pixels.
    pixels = np.asarray(image.convert('RGB'), dtype=float)
    meanR, meanG, meanB = pixels.reshape(-1, 3).mean(axis=0)

    row['meanR'] = meanR
    row['meanG'] = meanG
    row['meanB'] = meanB

    return row


# Run the RGB extraction once per listing row (opens one image per row, so this is slow).
data = data.apply(getRGBForImage, axis='columns')
 

In [None]:
import os
import requests
import zipfile
import io
import shapefile
from bs4 import BeautifulSoup
import requests
from shapely.geometry import shape as Shape, Point

###Example of spatial feature extraction
###Finds the median household income of the Census tract from listing lng/lat 
###Takes a while to reload data


#Scrape zip file links to dict {stateName:zipFileURL} for census tract shape files
censusShapeURL = 'https://www.census.gov/geo/maps-data/data/cbf/cbf_tracts.html'
soup = BeautifulSoup(requests.get(censusShapeURL).content,'lxml')
ctShapefiles = {}
for option in soup.find(id='ct2016m').findAll('option'):
    ctShapefiles[option.text.strip()] = option.get('value')

#Could replace this with a less hard-coded method
cityToState = {
    'NYC': 'New York',
    'SF': 'California',
    'DC': 'District of Columbia',
    'LA': 'California',
    'Chicago': 'Illinois',
    'Boston': 'Massachusetts',
}

ctDict = {}
#Download all shapefiles needed, unzip, and add to a shapefile dict {censusTract:shapeFile}
for zipFile in data.city.map(cityToState).map(ctShapefiles).unique():
    # The local .shp file is named after the zip archive (URL basename without extension).
    archiveName = zipFile.split('/')[-1].split('.')[0]
    shapefilePath = './data/census_tract_shapefiles/{}.shp'.format(archiveName)
    if not os.path.exists(shapefilePath):
        print('downloading {} ... '.format(zipFile))
        response = requests.get(zipFile)
        archive = zipfile.ZipFile(io.BytesIO(response.content))
        archive.extractall('./data/census_tract_shapefiles')
    else:
        print('{} already exists, using local copy'.format(shapefilePath))

    shape = shapefile.Reader(shapefilePath)
    # Key each geometry by field 3 of its record.
    # NOTE(review): presumably the tract identifier that the income series is
    # later keyed on -- confirm against the shapefile schema.
    for feature in shape.shapeRecords():
        ctDict[feature.record[3]] = Shape(feature.shape)
  
#Function that will return the census tract for given coordinates
def getFeatureforPoint(shapeDict,lng,lat):
    """Return the key of the first shape in `shapeDict` that contains (lng, lat).

    `shapeDict` maps a feature key to an object exposing `.contains(point)`.
    Shapes are tried in the dict's iteration order; None is returned when no
    shape contains the point.
    """
    location = Point(lng,lat)
    matches = (key for key, geometry in shapeDict.items() if geometry.contains(location))
    return next(matches, None)

        

#Loop through and save census data for each state
# Builds `ctIncome`, a series of B19013_001E values indexed by
# '1400000US' + state + county + tract, either from the local cache or from
# the Census ACS API, then maps every listing to its tract's value.
ctIncomePath = './data/ct_income.csv'
if os.path.exists(ctIncomePath):
    print('{} already exists, using local copy'.format(ctIncomePath))
    # Keep the cached series: the read result used to be discarded, leaving
    # `ctIncome` undefined on this branch. Everything is read as text to match
    # what the API branch produces.
    ctIncome = pd.read_csv(ctIncomePath, index_col='full_tract_name', dtype=str)['B19013_001E']
else:
    stateFrames = []
    # Not every number in 1..56 yields a usable response; those are reported
    # and skipped.
    for i in range(1,57):
        stateCode = str(i).zfill(2)
        url = 'https://api.census.gov/data/2016/acs/acs5?get=NAME,B19013_001E&for=tract:*&in=state:{}'.format(stateCode)
        try:
            rows = requests.get(url).json()
        except (requests.RequestException, ValueError) as err:
            # The old handler was a bare `except: next`; `next` there is just a
            # reference to the builtin (a no-op), so the previous state's frame
            # was appended again instead of the failure being skipped.
            print('no tract data for state {}: {}'.format(stateCode, err))
            continue
        if not isinstance(rows, list) or len(rows) < 2:
            print('no tract data for state {}'.format(stateCode))
            continue
        # The first row of each response holds the column names.
        stateFrames.append(pd.DataFrame(rows[1:], columns=rows[0]))

    # One concat at the end instead of growing a frame inside the loop.
    ctIncome = pd.concat(stateFrames).drop_duplicates()

    #return series of tract to median income
    # (`ct_income` in the old expression was an undefined name.)
    ctIncome['full_tract_name'] = '1400000US' + ctIncome['state'] + ctIncome['county'] + ctIncome['tract']
    ctIncome = ctIncome.set_index('full_tract_name')['B19013_001E']
    # header=True so the cache can be read back by column name above.
    ctIncome.to_csv(ctIncomePath, header=True)

##Should store census tract locations to id mapping also
# Assign the looked-up income to the column written below; the lookup result
# used to be computed and thrown away.
data['ct_median_income'] = data.apply(lambda row:\
           getFeatureforPoint(ctDict,row['longitude'],row['latitude']),axis = 1)\
           .map(ctIncome)

# NOTE(review): the first cell reads './data/ct_median_income.csv' while this
# writes under './data/features/' -- confirm which location is intended.
data[['id','ct_median_income']].to_csv('./data/features/ct_median_income.csv')

In [None]:
##Keras test, not being used

import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Load the raw train/test listings from CSV.
path = ""
train = pd.read_csv(path + "train.csv")
test = pd.read_csv(path + "test.csv")

# Tag every row with its split so both frames can be encoded together.
train['dataset'] = "train"
test['dataset'] = "test"
# NOTE(review): this rebinds `data` to a frame built from the raw CSVs only,
# replacing whatever `data` held before this cell ran.
data = pd.concat([train,test], axis = 0)

# One-hot-encode categorical variables.
# dtype is pinned to uint8: pandas >= 2.0 emits bool dummy columns by default,
# and bool is not in the `numerics` list below, so the encoded columns would
# silently drop out of any numeric-dtype selection.
categorical = ['property_type','room_type','bed_type','cancellation_policy','city']
data = pd.get_dummies(data, columns = categorical, dtype = np.uint8)



# Dtypes treated as numeric features when selecting model inputs.
numerics = ['uint8','int16', 'int32', 'int64', 'float16', 'float32', 'float64']


# define base model
def baseline_model():
    """Build and compile the baseline Keras network.

    One hidden ReLU layer as wide as the input, followed by a single-unit
    output layer with no activation specified; compiled with mean squared
    error loss and the Adam optimizer. The feature count is read from the
    first row of the module-level `train_x`.
    """
    n_features = len(train_x[0])
    layers = [
        Dense(n_features, input_dim=n_features, kernel_initializer='normal', activation='relu'),
        Dense(1, kernel_initializer='normal'),
    ]
    model = Sequential()
    for layer in layers:
        model.add(layer)
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model


# fix random seed for reproducibility
#seed = 7
#np.random.seed(seed)
# evaluate model with standardized dataset
#estimator = KerasRegressor(build_fn=baseline_model, nb_epoch=100, batch_size=5, verbose=0)

#kfold = KFold(n_splits=10, random_state=seed)
#results = cross_val_score(estimator, train_x, train_y, cv=kfold)
# Disabled together with the evaluation above: `results` is only assigned by
# the commented-out cross_val_score call, so printing it raised NameError.
#print("Results: %.2f (%.2f) MSE" % (results.mean(), results.std()))
    

In [None]:
#xgboost test
import xgboost as xgb

import numpy as np
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, mean_squared_error
from sklearn.datasets import load_iris, load_digits, load_boston


# Dtypes that count as model features; other columns (strings, objects) are left out.
numerics = ['uint8','int16', 'int32', 'int64', 'float16', 'float32', 'float64']

# Training features: numeric columns only, target removed, missing values as 0.
train_x = (
    data[data.dataset == "train"]
    .select_dtypes(include=numerics)
    .drop("log_price", axis = 1)
    .fillna(0)
    .values
)

# Test features, built the same way.
test_x = (
    data[data.dataset == "test"]
    .select_dtypes(include=numerics)
    .drop("log_price", axis = 1)
    .fillna(0)
    .values
)

# Training target.
train_y = data[data.dataset == "train"].log_price.values


# 3-fold shuffled cross-validation of a default XGBRegressor; prints one MSE per fold.
rng = np.random.RandomState(31337)
kf = KFold(n_splits=3, shuffle=True, random_state=rng)
for fold_train, fold_holdout in kf.split(train_x):
    xgb_model = xgb.XGBRegressor()
    xgb_model.fit(train_x[fold_train], train_y[fold_train])
    fold_mse = mean_squared_error(train_y[fold_holdout], xgb_model.predict(train_x[fold_holdout]))
    print(fold_mse)
    



In [None]:
##Attempting to create a model based on iterations
# Overrides layered on top of the existing xgb_params.
# NOTE(review): the 'update' key looks like it was meant to be xgboost's
# 'updater' parameter (used together with the commented-out 'process_type')
# -- confirm before relying on this experiment.
refresh_overrides = {
            'learning_rate': 0.007,
            'update':'refresh',
           # 'process_type': 'update',
            'refresh_leaf': True,
            #'reg_lambda': 3,  # L2
            'reg_alpha': 3,  # L1
            'silent': False,
        }
xgb_params.update(refresh_overrides)
batch_size = 5000
iterations = 10
model = None
for i in range(iterations):
    # Walk the training rows in consecutive slices of batch_size.
    for start in range(0, len(train_x), batch_size):
        print('batch..')
        stop = start + batch_size
        batch = xgb.DMatrix(train_x[start:stop], train_y[start:stop])
        # xgb_model=model passes the booster from the previous batch back in
        # (None on the very first call).
        model = xgb.train(xgb_params, batch, num_boost_round=150, xgb_model=model)

        y_pr = model.predict(xgb.DMatrix(train_x))
        #print('    MSE itr@{}: {}'.format(int(start/batch_size), sklearn.metrics.mean_squared_error(y_te, y_pr)))
    # Training-set MSE after the last batch of this pass.
    print('MSE itr@{}: {}'.format(i, mean_squared_error(train_y, y_pr)))

#y_pr = model.predict(xgb.DMatrix(x_te))
#print('MSE at the end: {}'.format(mean_squared_error(test_y, y_pr)))

In [None]:
##Attempt to find optimum depth, child weight (never finished processing)

import xgboost as xgb
from xgboost.sklearn import XGBClassifier, XGBRegressor
from sklearn import metrics   #Additional sklearn functions
# sklearn.cross_validation and sklearn.grid_search were removed in
# scikit-learn 0.20; GridSearchCV comes from sklearn.model_selection, which the
# rest of this notebook already imports from.
from sklearn.model_selection import GridSearchCV   #Performing grid search


# Grid over tree depth and minimum child weight.
param_test1 =   {
 'max_depth':list(range(3,10,2)),
 'min_child_weight':list(range(1,6,2))
}
# The target (log_price) is modelled as a regression everywhere else in this
# notebook (reg:linear objective, RMSE/MSE reporting), so the search uses a
# regressor scored on negative MSE rather than a binary classifier scored on
# ROC AUC. The objective is left at the regressor's default, and the `iid`
# argument is omitted because newer scikit-learn releases no longer accept it.
gsearch1 = GridSearchCV(estimator = XGBRegressor( learning_rate =0.1, n_estimators=140, max_depth=5,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
 nthread=4, seed=27), 
 param_grid = param_test1, scoring='neg_mean_squared_error',n_jobs=4, cv=5)
gsearch1.fit(train_x,train_y)
# cv_results_ replaces the removed grid_scores_ attribute.
gsearch1.cv_results_, gsearch1.best_params_, gsearch1.best_score_



In [None]:
# Predict log_price for the test matrix and write the submission file.
# NOTE(review): `dtest` is built from the test rows of `data`; this assumes
# they are in the same order and number as `test` -- confirm.
final_prediction = model.predict(dtest)
# Build the frame column by column so `id` keeps its own dtype;
# np.column_stack forced ids and predictions into one common array dtype.
submission = pd.DataFrame({'id': test.id.values, 'log_price': final_prediction}, columns = ['id','log_price'])
submission.to_csv("fourth_submission.csv", index = False)

In [None]:
##Download images

import os 
import requests


# Download each test listing's thumbnail to ./images/<id>.jpg, skipping rows
# without a URL and files that are already present.
# NOTE(review): only `test` rows are fetched here -- confirm that images for
# the training rows are obtained elsewhere.

# Create the target directory up front; without it every open() below failed
# and was reported as a 'timeout' by the old catch-all handler.
os.makedirs('./images', exist_ok=True)

for i,row in test.iterrows():
    url = row['thumbnail_url']
    filename = './images/{}.jpg'.format(row['id'])
    if not isinstance(url,str) or os.path.exists(filename):
        continue
    print(i)
    try:
        r = requests.get(url, timeout=1.5)
    except requests.Timeout:
        print('timeout')
        continue
    except requests.RequestException as err:
        # Other request failures are reported as what they are instead of
        # being labelled as timeouts.
        print('download failed: {}'.format(err))
        continue
    if r.status_code == 200:
        # Named image_file rather than f so the module-level `f` helper is
        # not rebound by this loop.
        with open(filename, 'wb') as image_file:
            image_file.write(r.content)


In [None]:
##Correlation matrix

%matplotlib inline
from string import ascii_letters
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

sns.set(style="white")

# Compute the correlation matrix
# Restricted to numeric/bool columns explicitly: pandas >= 2.0 no longer drops
# non-numeric columns inside corr() and raises on string columns such as the
# 'dataset' tag added to `train` earlier.
corr = train.select_dtypes(include=['number', 'bool']).corr()

# Generate a mask for the upper triangle
# The builtin bool replaces np.bool, an alias that was removed in NumPy 1.24.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
# Named fig rather than f so the module-level `f` helper is not rebound.
fig, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)

In [None]:
##With random forest 

#from test script
0.16969142613
0.172199948847
0.171665702104

#with NLP 
0.170133091601
0.172209588905
0.171824024348


#with amenities
0.203165992454
0.205297239503
0.201059075764


#with RGB
0.169998647333
0.171589382807
0.171464521269


#with census median income
0.167652666241
0.169194518538
0.169065016674

#with host response rate
0.166819384931
0.1681722164
0.168567149508

#with instant bookable
0.166553242784
0.168167160006
0.168633984499

0.166422024511
0.168085883256
0.168238200829




