## Preparing the data set

The first step before training the model is to prepare the training and the testing data sets.


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Global seaborn styling for every plot produced later in the notebook.
sns.set(style="whitegrid", color_codes=True)
# Draw the 7-colour "RdBu" palette as a swatch figure.
sns.palplot(sns.color_palette("RdBu", n_colors=7))

# NOTE(review): absolute, machine-specific Windows paths; the notebook only runs
# unchanged on a machine that has this exact directory layout.
fileNameTrain = "C:\\Users\\sevda\\Documents\\Data Lab\\Six sigma rental property\\train.json\\train.json"
# Training listings; later cells read the 'interest_level' column from this frame.
train_df = pd.read_json(fileNameTrain)

fileNameTest = "C:\\Users\\sevda\\Documents\\Data Lab\\Six sigma rental property\\test.json\\test.json"
# Listings to predict; every feature derived for train_df below is derived for test_df too.
test_df = pd.read_json(fileNameTest)



As a next step, we will extract the key words from the description variable. By key words we mean the words that appear in the description of the unit but are not stop words as defined by the nltk.corpus package.

In [2]:
from nltk.corpus import stopwords

cachedStopWords = stopwords.words("english")
# Set copy of the stop words: membership tests are O(1) instead of scanning the
# list once for every word of every description.
_stop_word_set = set(cachedStopWords)


def _description_key_words(description):
    """Return the words of a listing description that are not English stop words.

    The text is lower-cased and any trailing ',', '?', '!' or '.' characters are
    stripped from the end of the whole description (not from each word) before
    it is split on whitespace.

    An empty description, or one consisting only of stop words, yields an empty
    list. The previous join-then-split(" ") implementation returned [''] in that
    case, which was later counted as one key word.
    """
    cleaned = description.lower().rstrip(',?!.')
    return [word for word in cleaned.split() if word not in _stop_word_set]


# Same extraction for the training and the test frame.
for frame in (train_df, test_df):
    frame['description_key_words'] = pd.Series(
        [_description_key_words(text) for text in frame['description']],
        index=frame.index)

We will create two numeric variables which describe the number of features and number of key words in the description section

In [62]:
# Two size features per listing: the number of entries in `features` and the
# number of key words extracted from the description.
for frame in (train_df, test_df):
    frame['num_features'] = frame['features'].apply(len)
    frame['num_key_words_description'] = frame['description_key_words'].apply(len)

From the "created" variable, we will extract into new variables the exact date when the listing was created, the day of year, week of year, weekday and hour.

In [63]:
# Parse the creation timestamp and spread it into calendar features, in the
# same way for the training and the test frame.
# NOTE(review): `.dt.weekofyear` is deprecated/removed in recent pandas releases;
# it is kept here because the rest of the notebook targets an older pandas.
for frame in (train_df, test_df):
    created = pd.to_datetime(frame["created"])
    frame["created"] = created
    frame["date"] = created.dt.date

    for part in ("dayofyear", "weekofyear", "weekday", "hour"):
        frame[part] = getattr(created.dt, part)

We will also add the number of photos of each listing as a new variable in the training and testing dataframe.

In [64]:
# Number of entries in each listing's `photos` value.
for frame in (train_df, test_df):
    frame["num_photos"] = frame["photos"].apply(len)


Another step is to create two variables which describe the price per bathroom and price per bedroom. 

In [7]:
# Price normalised by room count. The denominator is (rooms + 1), so a listing
# with zero bathrooms or bedrooms divides by one rather than by zero.
for frame in (train_df, test_df):
    price = frame["price"]
    frame["price_per_bathroom"] = price / (frame["bathrooms"] + 1)
    frame["price_per_bedroom"] = price / (frame["bedrooms"] + 1)

We will also use the building id as a unique identifier of the building, since the building itself affects the interest level. The ids are label-encoded into integers.

In [8]:
from sklearn import preprocessing

# Encode building_id as integers. The encoder is fitted on the ids of BOTH
# frames so that ids occurring only in the test set can still be transformed.
train_building_ids = list(train_df['building_id'].values)
test_building_ids = list(test_df['building_id'].values)

lbl = preprocessing.LabelEncoder()
lbl.fit(train_building_ids + test_building_ids)

train_df['building_id'] = lbl.transform(train_building_ids)
test_df['building_id'] = lbl.transform(test_building_ids)

We will count how many rental units per building are available for rent. 

In [9]:
# Listings per building, counted on the training data only: count() tallies the
# non-null street_address values of each building_id group, and the suffix turns
# the column into `street_address_count`.
building_id_group_by = (
    train_df[["street_address", "building_id"]]
    .groupby(["building_id"])
    .count()
    .add_suffix('_count')
    .reset_index()
)

# Inner join for train, left join for test: a test building that never occurs
# in the training data ends up with NaN in `street_address_count`.
train_df = train_df.merge(building_id_group_by, on=['building_id'], how='inner')
test_df = test_df.merge(building_id_group_by, on=['building_id'], how='left')

Some buildings are seen in rental ads more often than others. We will introduce a few variables which describe how often a building is seen in rental posts.

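As a next step in the data preparation, we will explore further the description of the rentals. The cell below assigns each listing to a neighbourhood using the bounding boxes from the Zillow neighbourhood shapefile.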

In [10]:
import shapefile

sf = shapefile.Reader("C:\\Users\\sevda\\Documents\\Data Lab\\Six sigma rental property\\ZillowNeighborhoods-NY\\ZillowNeighborhoods-NY.shp")

shapes = sf.shapes()
records = sf.records()

# One row per shapefile record: record fields 2 and 3 are used as town and
# neighbourhood name, and the shape's bounding box supplies the four edges.
n_records = len(records)
towns_values = [records[i][2] for i in range(n_records)]
neighb_values = [records[i][3] for i in range(n_records)]
west_values = [shapes[i].bbox[0] for i in range(n_records)]
south_values = [shapes[i].bbox[1] for i in range(n_records)]
east_values = [shapes[i].bbox[2] for i in range(n_records)]
north_values = [shapes[i].bbox[3] for i in range(n_records)]

# Bounding window: only neighbourhoods lying entirely inside it are kept.
west, south, east, north = -74.02, 40.64, -73.85, 40.86

neighbourhood_pd = pd.DataFrame({'Town' : towns_values,
                                 'Neighbourhood' : neighb_values,
                                 'West' : west_values,
                                 'South' : south_values,
                                 'East' : east_values,
                                 'North' : north_values})

neighbourhood_pd = neighbourhood_pd[neighbourhood_pd.Town == "New York"]
# `.loc` instead of `.ix`: `.ix` has been removed from pandas, and `.loc` accepts
# the same boolean mask.
neighbourhood_pd = neighbourhood_pd.loc[(neighbourhood_pd.West >= west) &
                                        (neighbourhood_pd.East <= east) &
                                        (neighbourhood_pd.South >= south) &
                                        (neighbourhood_pd.North <= north)]

neighbourhood_sorted_pd = neighbourhood_pd.sort_values(['West'])


def _assign_neighbourhoods(longitudes, latitudes, boxes):
    """Return one neighbourhood name per (longitude, latitude) pair.

    A point belongs to a neighbourhood when it lies strictly inside that
    neighbourhood's bounding box. Bounding boxes may overlap; the first match in
    the row order of `boxes` wins. Points inside no box are labelled "Other".
    """
    box_west = boxes.West.values
    box_east = boxes.East.values
    box_south = boxes.South.values
    box_north = boxes.North.values
    names = boxes.Neighbourhood.values

    assigned = []
    for lon, lat in zip(longitudes, latitudes):
        hits = np.flatnonzero((box_west < lon) & (box_east > lon) &
                              (box_south < lat) & (box_north > lat))
        assigned.append(names[hits[0]] if hits.size else "Other")
    return assigned


train_df['neighbourhood'] = pd.Series(
    _assign_neighbourhoods(train_df.longitude.values, train_df.latitude.values,
                           neighbourhood_sorted_pd),
    index=train_df.index)

test_df['neighbourhood'] = pd.Series(
    _assign_neighbourhoods(test_df.longitude.values, test_df.latitude.values,
                           neighbourhood_sorted_pd),
    index=test_df.index)

In [11]:
# Checkpoint the prepared frames. `.copy()` makes the back-ups independent
# objects; a plain assignment would only create a second name for the same
# DataFrame, so a later in-place column assignment would change the "back-up" too.
train_df_back_up = train_df.copy()
test_df_back_up = test_df.copy()

In [12]:
# Point both working frames back at the checkpointed versions.
train_df, test_df = train_df_back_up, test_df_back_up

In [13]:
# Median training price for every (bedrooms, neighbourhood) combination; the
# suffix turns the aggregated column into `price_median`.
group_keys = ["bedrooms", "neighbourhood"]
train_group_by = (
    train_df[group_keys + ["price"]]
    .groupby(group_keys)
    .median()
    .add_suffix('_median')
    .reset_index()
)

# The medians come from the training data only, so a test listing whose
# combination never occurs in train gets NaN for price_median and price_diff.
train_df = train_df.merge(train_group_by, on=group_keys, how='outer')
test_df = test_df.merge(train_group_by, on=group_keys, how='left')

# How far a listing's price is from the median of comparable training listings.
for frame in (train_df, test_df):
    frame["price_diff"] = frame["price"] - frame["price_median"]


We will create a number of new variables which describe how often a certain building is seen in rental posts.

In [14]:
# Number of training listings per building_id.
buildings_count = train_df['building_id'].value_counts()

# "top_N_building" is 1 when the listing's building is at or above the (100 - N)th
# percentile of the per-building listing counts. The tuple order is the order in
# which the columns are added to the frames.
TOP_BUILDING_SHARES = (10, 25, 5, 50, 1, 2, 15, 20, 30)

for top_share in TOP_BUILDING_SHARES:
    # Threshold and qualifying building ids are computed once per column instead
    # of once per row inside a lambda.
    threshold = np.percentile(buildings_count.values, 100 - top_share)
    frequent_buildings = buildings_count.index[buildings_count.values >= threshold]
    column = 'top_%d_building' % top_share

    # The thresholds come from the training counts and are applied to both frames.
    for frame in (train_df, test_df):
        frame[column] = frame['building_id'].isin(frequent_buildings).astype('int64')

We will also use the features variable to create a number of new variables based on the features the customers may be looking for. The idea comes from https://www.kaggle.com/jamestollefson/two-sigma-connect-rental-listing-inquiries/feature-engineering-train-features

In [15]:
def newfeat(name, df, series):
    """Add a 0/1 indicator column called `name` to `df` and return `df`.

    Parameters
    ----------
    name : str
        Value to look for; also the name of the new column.
    df : pandas.DataFrame
        Frame that receives the column. It is modified in place.
    series : iterable
        One container per row of `df`, in row order. Row i is flagged with 1
        when `name in series[i]` is true, so for list entries this is an exact,
        case-sensitive match against the list items. Rows beyond the end of a
        shorter `series` stay 0.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, now carrying the int64 column `name`.
    """
    # Collect the flags in a plain array and build the Series once; writing each
    # hit through `Series.iloc` is far slower on frames of this size.
    flags = np.zeros(len(df), dtype='int64')
    for row, word in enumerate(series):
        if name in word:
            flags[row] = 1
    df[name] = pd.Series(flags, index=df.index, name=name)
    return df
   
# Values that each get their own 0/1 indicator column, listed in the order in
# which the columns are appended. The spelling matters: newfeat tests
# `name in <row's features>`.
indicator_names = [
    'Elevator',
    'Dogs Allowed',
    'Cats Allowed',
    'Hardwood Floors',
    'Swimming Pool',
    'Doorman',
    'Laundry in Unit',
    'Fitness center',
    'gym',
    'Pre-War',
    'private-balcony',
    'balcony',
    'Laundry in Building',
]

for indicator in indicator_names:
    train_df = newfeat(indicator, train_df, train_df.features)

for indicator in indicator_names:
    test_df = newfeat(indicator, test_df, test_df.features)


We will use the listing_id variable to create the listing difference variable, which describes how far a given rental post's listing id is from the minimum listing id of the training posts created on the same day of the year. As we already discovered in our descriptive analysis, the listing difference affects the interest level of a post.

In [16]:
# Smallest training listing_id observed on each day of the year; the suffix
# turns the aggregated column into `listing_id_min`.
dayofyear_group_by = (
    train_df[["listing_id", "dayofyear"]]
    .groupby(["dayofyear"])
    .min()
    .add_suffix('_min')
    .reset_index()
)

# Left joins keep every listing; a test day missing from train yields NaN.
train_df = train_df.merge(dayofyear_group_by, on=['dayofyear'], how='left')
test_df = test_df.merge(dayofyear_group_by, on=['dayofyear'], how='left')

# Offset of each listing_id from that day's minimum.
for frame in (train_df, test_df):
    frame['listing_difference'] = frame['listing_id'] - frame['listing_id_min']

In [40]:
import random  # used for the shuffle below; otherwise the notebook only imports it in later cells

# Target-encode manager_id as the share of that manager's listings with low,
# medium and high interest.
#  * Training rows are encoded out-of-fold: a row's shares are computed only
#    from the other four folds, so its own label never leaks into its features.
#  * Test rows are encoded from all training rows.
# A manager with no counted listings gets NaN in all three columns.
LEVEL_POSITION = {'low': 0, 'medium': 1, 'high': 2}
N_FOLDS = 5
FOLD_SEED = 2016  # fixed so the fold assignment, and therefore the features, are reproducible

# Plain arrays: positional lookups are much cheaper than `train_df.iloc[j]`,
# which builds a full row Series on every access.
train_managers = train_df['manager_id'].values
train_levels = train_df['interest_level'].values
n_train = train_df.shape[0]


def _manager_level_counts(positions):
    """Return {manager_id: [n_low, n_medium, n_high]} counted over the given training row positions."""
    counts = {}
    for pos in positions:
        per_manager = counts.setdefault(train_managers[pos], [0, 0, 0])
        slot = LEVEL_POSITION.get(train_levels[pos])
        if slot is not None:
            per_manager[slot] += 1
    return counts


def _manager_level_shares(managers, counts):
    """Return an (n, 3) float array of low/medium/high shares, one row per manager id.

    Rows stay NaN when the manager is missing from `counts` or has a total of
    zero, which also avoids a division by zero.
    """
    shares = np.full((len(managers), 3), np.nan)
    for pos, manager in enumerate(managers):
        per_manager = counts.get(manager)
        total = sum(per_manager) if per_manager else 0
        if total:
            shares[pos] = [n / float(total) for n in per_manager]
    return shares


# Shuffle with a private, seeded generator so global random state is untouched.
index = list(range(n_train))
random.Random(FOLD_SEED).shuffle(index)

train_shares = np.full((n_train, 3), np.nan)
for i in range(N_FOLDS):
    lo = int((i * n_train) / N_FOLDS)
    hi = int(((i + 1) * n_train) / N_FOLDS)
    held_out = index[lo:hi]
    # Counts come from every row outside the held-out slice.
    fold_counts = _manager_level_counts(index[:lo] + index[hi:])
    train_shares[held_out] = _manager_level_shares(train_managers[held_out], fold_counts)

train_df['manager_level_low'] = train_shares[:, 0]
train_df['manager_level_medium'] = train_shares[:, 1]
train_df['manager_level_high'] = train_shares[:, 2]

# Test rows: shares from the complete training set; unseen managers stay NaN.
all_counts = _manager_level_counts(range(n_train))
test_shares = _manager_level_shares(test_df['manager_id'].values, all_counts)

test_df['manager_level_low'] = test_shares[:, 0]
test_df['manager_level_medium'] = test_shares[:, 1]
test_df['manager_level_high'] = test_shares[:, 2]



NameError: name 'features_to_use' is not defined

In [104]:
from pandas import HDFStore

# Reload previously saved frames from their HDF5 stores. The `with` blocks
# close each store even if the key lookup raises.
# NOTE(review): this replaces the frames built by the cells above with whatever
# was last written to these files; confirm that is intended before running.
with HDFStore('train_df.h5') as store_train_df:
    train_df = store_train_df['train_df']

with HDFStore('test_df.h5') as store_test_df:
    test_df = store_test_df['test_df']


In [17]:
# Sanity check: the dimensions of both frames first, then their column names.
for frame in (train_df, test_df):
    print(frame.shape)

for frame in (train_df, test_df):
    print(frame.columns.values)

(49352, 59)
(74659, 54)
['bathrooms' 'bedrooms' 'building_id' 'created' 'description'
 'display_address' 'features' 'interest_level' 'latitude' 'listing_id'
 'longitude' 'manager_id' 'photos' 'price' 'street_address'
 'description_key_words' 'num_features' 'num_key_words_description'
 'description_tokens' 'compound' 'neg' 'neu' 'pos' 'date' 'dayofyear'
 'weekofyear' 'weekday' 'hour' 'num_photos' 'price_per_bathroom'
 'price_per_bedroom' 'street_address_count' 'neighbourhood' 'price_median'
 'price_diff' 'top_10_building' 'top_25_building' 'top_5_building'
 'top_50_building' 'top_1_building' 'top_2_building' 'top_15_building'
 'top_20_building' 'top_30_building' 'Elevator' 'Dogs Allowed'
 'Cats Allowed' 'Hardwood Floors' 'Swimming Pool' 'Doorman'
 'Laundry in Unit' 'Fitness center' 'gym' 'Pre-War' 'private-balcony'
 'balcony' 'Laundry in Building' 'listing_id_min' 'listing_difference']
['bathrooms' 'bedrooms' 'building_id' 'created' 'description'
 'display_address' 'features' 'latitude'

In [166]:
# Persist both frames to HDF5. The `with` blocks close (and flush) each store
# even if writing raises.
with HDFStore('train_df.h5') as store_train_df:
    store_train_df['train_df'] = train_df  # save it

with HDFStore('test_df.h5') as store_test_df:
    store_test_df['test_df'] = test_df

Before proceeding with the model building, we will replace all NaN and infinite values, since Random Forest is not able to work with missing or infinite values.

In [18]:
# Missing values become 0.
train_df = train_df.replace(np.nan, 0)
test_df = test_df.replace(np.nan, 0)

# Positive and negative infinity become 1. The values have to be listed
# explicitly: `np.isinf` is a function, and passing it as `to_replace` asks
# pandas to look for that function object, so no infinity is ever replaced.
train_df = train_df.replace([np.inf, -np.inf], 1)
test_df = test_df.replace([np.inf, -np.inf], 1)

We will use XGBoost (gradient-boosted trees) to run the model: the `runXGB` function below fits the algorithm on the train data set with the pre-selected variables and the target variable "interest_level".

In [42]:
def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=0, num_rounds=1000):
    """Train a 3-class XGBoost model and predict class probabilities for `test_X`.

    Parameters
    ----------
    train_X, train_y : training features and integer class labels (0, 1, 2).
    test_X : features to predict on.
    test_y : optional labels for `test_X`. When given, `test_X` is used as a
        watch-list evaluation set and training stops early after 20 rounds
        without improvement.
    feature_names : accepted for compatibility with existing callers; not used.
    seed_val : random seed passed to XGBoost.
    num_rounds : maximum number of boosting rounds.

    Returns
    -------
    (pred_test_y, model) : the predictions for `test_X` (objective
        'multi:softprob', i.e. one probability per class) and the trained booster.
    """
    # Imported locally: the notebook's `import xgboost as xgb` lives in a cell
    # below this definition, so this keeps the function usable regardless of
    # the order in which cells are run.
    import xgboost as xgb

    param = {
        'objective': 'multi:softprob',
        'eta': 0.1,
        'max_depth': 6,
        'silent': 1,
        'num_class': 3,
        'eval_metric': "mlogloss",
        'min_child_weight': 1,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'seed': seed_val,
    }

    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is not None:
        # Labelled evaluation set: monitor it and stop early.
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=20)
    else:
        # No labels: train for the full number of rounds.
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)

    pred_test_y = model.predict(xgtest)
    return pred_test_y, model

We will perform cross validation on the training dataset in order to see when the scores will stop decreasing.

In [None]:
from sklearn import model_selection, preprocessing, ensemble

from sklearn.metrics import log_loss  # used below; otherwise only imported by later cells

# NOTE(review): `train_X` and `train_y` are created by the model cell further
# down, so that cell has to be run before this one.
# Work on plain arrays so positional fold indices apply whether `train_X` is a
# DataFrame or already an ndarray (`DataFrame[idx, :]` is not valid indexing).
cv_features = np.asarray(train_X)
cv_target = np.asarray(train_y)

cv_scores = []
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2016)
for dev_index, val_index in kf.split(range(cv_features.shape[0])):
    dev_X, val_X = cv_features[dev_index, :], cv_features[val_index, :]
    dev_y, val_y = cv_target[dev_index], cv_target[val_index]
    # Passing val_y makes runXGB watch the validation loss and stop early.
    preds, model = runXGB(dev_X, dev_y, val_X, val_y)
    cv_scores.append(log_loss(val_y, preds))
    print(cv_scores)
    # The loop stops after the first fold, so only one score is produced.
    break

In [44]:
import xgboost as xgb

# Model 1: every engineered feature, 500 boosting rounds.
selected_vars = [
    "price", "price_diff", "num_features", "num_key_words_description",
    "dayofyear", "weekday", "hour", "num_photos", "latitude", "longitude",
    "price_per_bedroom", "street_address_count", "building_id",
    "Elevator", "Cats Allowed", "Dogs Allowed", 'Hardwood Floors', 'Swimming Pool',
    'Doorman', 'Laundry in Unit', 'Pre-War',
    'Laundry in Building', "listing_difference", "manager_level_low",
    "manager_level_medium", "manager_level_high",
]

train_X, test_X = train_df[selected_vars], test_df[selected_vars]

# Class codes 0/1/2 line up with the high/medium/low output columns below.
target_num_map = {'high':0, 'medium':1, 'low':2}
train_y = np.array(train_df['interest_level'].apply(lambda level: target_num_map[level]))

preds, model = runXGB(train_X, train_y, test_X, num_rounds=500)

out_df_1 = pd.DataFrame(preds, columns=["high", "medium", "low"])
out_df_1["listing_id"] = test_df.listing_id.values
out_df_1.to_csv("xgb_starter_results_v8.csv", index=False)

We will remove the price_diff which is the most important feature in the result and will run again the Random forest model.

In [45]:
from sklearn.ensemble import RandomForestClassifier
# `sklearn.cross_validation` has been removed from scikit-learn; the notebook
# already relies on `sklearn.model_selection`, which provides train_test_split.
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
import random

# Model 2: the full feature list without "price_diff".
selected_vars = [
    "price", "num_features", "num_key_words_description",
    "dayofyear", "weekday", "hour", "num_photos", "latitude", "longitude",
    "price_per_bedroom", "street_address_count", "building_id",
    "Elevator", "Cats Allowed", "Dogs Allowed", 'Hardwood Floors', 'Swimming Pool',
    'Doorman', 'Laundry in Unit', 'Pre-War',
    'Laundry in Building', "listing_difference", "manager_level_low",
    "manager_level_medium", "manager_level_high",
]

train_X = train_df[selected_vars]
test_X = test_df[selected_vars]

# Class codes 0/1/2 line up with the high/medium/low output columns below.
target_num_map = {'high':0, 'medium':1, 'low':2}
train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x]))

preds, model = runXGB(train_X, train_y, test_X, num_rounds=400)
out_df_2 = pd.DataFrame(preds)
out_df_2.columns = ["high", "medium", "low"]
out_df_2["listing_id"] = test_df.listing_id.values


The price_per_bedroom variable ranks the best variable for predicting the interest level so we will remove that one and run again the random forest model.

In [46]:
from sklearn.ensemble import RandomForestClassifier
# `sklearn.cross_validation` has been removed from scikit-learn; the notebook
# already relies on `sklearn.model_selection`, which provides train_test_split.
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
import random

# Model 3: the full feature list without "price_per_bedroom".
selected_vars = [
    "price", "price_diff", "num_features", "num_key_words_description",
    "dayofyear", "weekday", "hour", "num_photos", "latitude", "longitude",
    "street_address_count", "building_id",
    "Elevator", "Cats Allowed", "Dogs Allowed", 'Hardwood Floors', 'Swimming Pool',
    'Doorman', 'Laundry in Unit', 'Pre-War',
    'Laundry in Building', "listing_difference", "manager_level_low",
    "manager_level_medium", "manager_level_high",
]

train_X = train_df[selected_vars]
test_X = test_df[selected_vars]

# Class codes 0/1/2 line up with the high/medium/low output columns below.
target_num_map = {'high':0, 'medium':1, 'low':2}
train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x]))

preds, model = runXGB(train_X, train_y, test_X, num_rounds=400)
out_df_3 = pd.DataFrame(preds)
out_df_3.columns = ["high", "medium", "low"]
out_df_3["listing_id"] = test_df.listing_id.values


Now we will remove price as a variable, since "price" is the most important variable for the latest random forest model prediction.

In [47]:
from sklearn.ensemble import RandomForestClassifier
# `sklearn.cross_validation` has been removed from scikit-learn; the notebook
# already relies on `sklearn.model_selection`, which provides train_test_split.
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
import random

# Model 4: the full feature list without "price".
selected_vars = [
    "price_diff", "price_per_bedroom", "num_features", "num_key_words_description",
    "dayofyear", "weekday", "hour", "num_photos", "latitude", "longitude",
    "street_address_count", "building_id",
    "Elevator", "Cats Allowed", "Dogs Allowed", 'Hardwood Floors', 'Swimming Pool',
    'Doorman', 'Laundry in Unit', 'Pre-War',
    'Laundry in Building', "listing_difference", "manager_level_low",
    "manager_level_medium", "manager_level_high",
]

train_X = train_df[selected_vars]
test_X = test_df[selected_vars]

# Class codes 0/1/2 line up with the high/medium/low output columns below.
target_num_map = {'high':0, 'medium':1, 'low':2}
train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x]))

preds, model = runXGB(train_X, train_y, test_X, num_rounds=400)
out_df_4 = pd.DataFrame(preds)
out_df_4.columns = ["high", "medium", "low"]
out_df_4["listing_id"] = test_df.listing_id.values

Now we will remove "listing_difference"

In [48]:
from sklearn.ensemble import RandomForestClassifier
# `sklearn.cross_validation` has been removed from scikit-learn; the notebook
# already relies on `sklearn.model_selection`, which provides train_test_split.
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
import random

# Model 5: the full feature list without "listing_difference".
selected_vars = [
    "price", "price_diff", "price_per_bedroom", "num_features", "num_key_words_description",
    "dayofyear", "weekday", "hour", "num_photos", "latitude", "longitude",
    "street_address_count", "building_id",
    "Elevator", "Cats Allowed", "Dogs Allowed", 'Hardwood Floors', 'Swimming Pool',
    'Doorman', 'Laundry in Unit', 'Pre-War',
    'Laundry in Building', "manager_level_low",
    "manager_level_medium", "manager_level_high",
]

train_X = train_df[selected_vars]
test_X = test_df[selected_vars]

# Class codes 0/1/2 line up with the high/medium/low output columns below.
target_num_map = {'high':0, 'medium':1, 'low':2}
train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x]))

preds, model = runXGB(train_X, train_y, test_X, num_rounds=400)
out_df_5 = pd.DataFrame(preds)
out_df_5.columns = ["high", "medium", "low"]
out_df_5["listing_id"] = test_df.listing_id.values

We will remove the "num_features"

In [49]:
from sklearn.ensemble import RandomForestClassifier
# `sklearn.cross_validation` has been removed from scikit-learn; the notebook
# already relies on `sklearn.model_selection`, which provides train_test_split.
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
import random

# Model 6: the full feature list without "num_features".
selected_vars = [
    "price", "price_diff", "price_per_bedroom", "num_key_words_description",
    "dayofyear", "weekday", "hour", "num_photos", "latitude", "longitude",
    "street_address_count", "building_id",
    "Elevator", "Cats Allowed", "Dogs Allowed", 'Hardwood Floors', 'Swimming Pool',
    'Doorman', 'Laundry in Unit', 'Pre-War',
    'Laundry in Building', "listing_difference", "manager_level_low",
    "manager_level_medium", "manager_level_high",
]

train_X = train_df[selected_vars]
test_X = test_df[selected_vars]

# Class codes 0/1/2 line up with the high/medium/low output columns below.
target_num_map = {'high':0, 'medium':1, 'low':2}
train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x]))

preds, model = runXGB(train_X, train_y, test_X, num_rounds=400)
out_df_6 = pd.DataFrame(preds)
out_df_6.columns = ["high", "medium", "low"]
out_df_6["listing_id"] = test_df.listing_id.values

Now we will remove "num_key_words_description"

In [50]:
from sklearn.ensemble import RandomForestClassifier
# `sklearn.cross_validation` has been removed from scikit-learn; the notebook
# already relies on `sklearn.model_selection`, which provides train_test_split.
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
import random

# Model 7: the full feature list without "num_key_words_description".
selected_vars = [
    "price", "price_diff", "price_per_bedroom", "num_features",
    "dayofyear", "weekday", "hour", "num_photos", "latitude", "longitude",
    "street_address_count", "building_id",
    "Elevator", "Cats Allowed", "Dogs Allowed", 'Hardwood Floors', 'Swimming Pool',
    'Doorman', 'Laundry in Unit', 'Pre-War',
    'Laundry in Building', "listing_difference", "manager_level_low",
    "manager_level_medium", "manager_level_high",
]

train_X = train_df[selected_vars]
test_X = test_df[selected_vars]

# Class codes 0/1/2 line up with the high/medium/low output columns below.
target_num_map = {'high':0, 'medium':1, 'low':2}
train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x]))

preds, model = runXGB(train_X, train_y, test_X, num_rounds=400)
out_df_7 = pd.DataFrame(preds)
out_df_7.columns = ["high", "medium", "low"]
out_df_7["listing_id"] = test_df.listing_id.values

Now we will remove "latitude"

In [52]:
from sklearn.ensemble import RandomForestClassifier
# `sklearn.cross_validation` has been removed from scikit-learn; the notebook
# already relies on `sklearn.model_selection`, which provides train_test_split.
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
import random

# Model 8: the full feature list without "latitude".
selected_vars = [
    "price", "price_diff", "price_per_bedroom", "num_features", "num_key_words_description",
    "dayofyear", "weekday", "hour", "num_photos", "longitude",
    "street_address_count", "building_id",
    "Elevator", "Cats Allowed", "Dogs Allowed", 'Hardwood Floors', 'Swimming Pool',
    'Doorman', 'Laundry in Unit', 'Pre-War',
    'Laundry in Building', "listing_difference", "manager_level_low",
    "manager_level_medium", "manager_level_high",
]

train_X = train_df[selected_vars]
test_X = test_df[selected_vars]

# Class codes 0/1/2 line up with the high/medium/low output columns below.
target_num_map = {'high':0, 'medium':1, 'low':2}
train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x]))

preds, model = runXGB(train_X, train_y, test_X, num_rounds=400)
out_df_8 = pd.DataFrame(preds)
out_df_8.columns = ["high", "medium", "low"]
out_df_8["listing_id"] = test_df.listing_id.values

Now we will remove "longitude".

In [53]:
from sklearn.ensemble import RandomForestClassifier
# `sklearn.cross_validation` has been removed from scikit-learn; the notebook
# already relies on `sklearn.model_selection`, which provides train_test_split.
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
import random

# Model 9: the full feature list without "longitude".
selected_vars = [
    "price", "price_diff", "price_per_bedroom", "num_features", "num_key_words_description",
    "dayofyear", "weekday", "hour", "num_photos", "latitude",
    "street_address_count", "building_id",
    "Elevator", "Cats Allowed", "Dogs Allowed", 'Hardwood Floors', 'Swimming Pool',
    'Doorman', 'Laundry in Unit', 'Pre-War',
    'Laundry in Building', "listing_difference", "manager_level_low",
    "manager_level_medium", "manager_level_high",
]

train_X = train_df[selected_vars]
test_X = test_df[selected_vars]

# Class codes 0/1/2 line up with the high/medium/low output columns below.
target_num_map = {'high':0, 'medium':1, 'low':2}
train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x]))

preds, model = runXGB(train_X, train_y, test_X, num_rounds=400)
out_df_9 = pd.DataFrame(preds)
out_df_9.columns = ["high", "medium", "low"]
out_df_9["listing_id"] = test_df.listing_id.values

Now we will remove "manager_level_low"

In [55]:
from sklearn.ensemble import RandomForestClassifier
# `sklearn.cross_validation` has been removed from scikit-learn; the notebook
# already relies on `sklearn.model_selection`, which provides train_test_split.
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
import random

# Model 10: the full feature list without "manager_level_low".
selected_vars = [
    "price", "price_diff", "price_per_bedroom", "num_features", "num_key_words_description",
    "dayofyear", "weekday", "hour", "num_photos", "latitude", "longitude",
    "street_address_count", "building_id",
    "Elevator", "Cats Allowed", "Dogs Allowed", 'Hardwood Floors', 'Swimming Pool',
    'Doorman', 'Laundry in Unit', 'Pre-War',
    'Laundry in Building', "listing_difference",
    "manager_level_medium", "manager_level_high",
]

train_X = train_df[selected_vars]
test_X = test_df[selected_vars]

# Class codes 0/1/2 line up with the high/medium/low output columns below.
target_num_map = {'high':0, 'medium':1, 'low':2}
train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x]))

preds, model = runXGB(train_X, train_y, test_X, num_rounds=400)
out_df_10 = pd.DataFrame(preds)
out_df_10.columns = ["high", "medium", "low"]
out_df_10["listing_id"] = test_df.listing_id.values

Now we will remove "manager_level_medium"

In [57]:
from sklearn.ensemble import RandomForestClassifier
# `sklearn.cross_validation` has been removed from scikit-learn; the notebook
# already relies on `sklearn.model_selection`, which provides train_test_split.
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
import random

# Model 11: the full feature list without "manager_level_medium".
selected_vars = [
    "price", "price_diff", "price_per_bedroom", "num_features", "num_key_words_description",
    "dayofyear", "weekday", "hour", "num_photos", "latitude", "longitude",
    "street_address_count", "building_id",
    "Elevator", "Cats Allowed", "Dogs Allowed", 'Hardwood Floors', 'Swimming Pool',
    'Doorman', 'Laundry in Unit', 'Pre-War',
    'Laundry in Building', "listing_difference", "manager_level_low", "manager_level_high",
]

train_X = train_df[selected_vars]
test_X = test_df[selected_vars]

# Class codes 0/1/2 line up with the high/medium/low output columns below.
target_num_map = {'high':0, 'medium':1, 'low':2}
train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x]))

preds, model = runXGB(train_X, train_y, test_X, num_rounds=400)
out_df_11 = pd.DataFrame(preds)
out_df_11.columns = ["high", "medium", "low"]
out_df_11["listing_id"] = test_df.listing_id.values

Now we will remove "manager_level_high"

In [59]:
from sklearn.ensemble import RandomForestClassifier
# `sklearn.cross_validation` has been removed from scikit-learn; the notebook
# already relies on `sklearn.model_selection`, which provides train_test_split.
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
import random

# Model 12: the full feature list without "manager_level_high".
selected_vars = [
    "price", "price_diff", "price_per_bedroom", "num_features", "num_key_words_description",
    "dayofyear", "weekday", "hour", "num_photos", "latitude", "longitude",
    "street_address_count", "building_id",
    "Elevator", "Cats Allowed", "Dogs Allowed", 'Hardwood Floors', 'Swimming Pool',
    'Doorman', 'Laundry in Unit', 'Pre-War',
    'Laundry in Building', "listing_difference", "manager_level_low", "manager_level_medium",
]

train_X = train_df[selected_vars]
test_X = test_df[selected_vars]

# Class codes 0/1/2 line up with the high/medium/low output columns below.
target_num_map = {'high':0, 'medium':1, 'low':2}
train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x]))

preds, model = runXGB(train_X, train_y, test_X, num_rounds=400)
out_df_12 = pd.DataFrame(preds)
out_df_12.columns = ["high", "medium", "low"]
out_df_12["listing_id"] = test_df.listing_id.values

In [60]:
# Stack the prediction frames of all twelve models in a single concat.
# `DataFrame.append` is no longer available in current pandas, and chaining it
# copied the growing frame on every call.
out_df_result = pd.concat([
    out_df_1, out_df_2, out_df_3, out_df_4, out_df_5, out_df_6,
    out_df_7, out_df_8, out_df_9, out_df_10, out_df_11, out_df_12,
])

# Ensemble by simple averaging: mean of the high/medium/low probabilities of
# each listing across the models, with listing_id turned back into a column.
out_df = out_df_result.groupby(["listing_id"]).mean()
out_df = out_df.reset_index()

#print(out_df)

In [61]:
# Write the averaged predictions without the positional index column.
submission_path = "xgb_starter_results_v9.csv"
out_df.to_csv(submission_path, index=False)