In [1]:
import base64
import datetime
import itertools
import json
import math
import operator
import os
import pickle
import random
import re
import time

import scipy
from scipy import ndimage
import numpy as np
import pandas as pd
import sklearn
import matplotlib as mpl
import matplotlib.pyplot as plt
from pylab import rcParams
import seaborn as sns
import statsmodels as sm
from pylab import rcParams
from pylab import *
from matplotlib.dates import date2num , DateFormatter
from PIL import Image

# Seed NumPy's global random generator so random draws repeat between runs.
np.random.seed(1337)

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Plot styling: seaborn at font scale 1.0, an 8x6 default figure size,
# the 'whitegrid' style and the 'muted' colour palette.
# NOTE(review): statement order probably matters — sns.set() is understood to
# reset matplotlib rc settings, so the figsize override is placed after it;
# confirm before reordering.
sns.set(font_scale=1.0)
rcParams['figure.figsize'] = 8, 6
sns.set_style('whitegrid')
sns.set_palette(sns.color_palette('muted'))



In [2]:
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import normalize
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer



# Load the Data

In [3]:
# Stack the train and test id files (train rows first) into one frame
# indexed by 'id'.
train_ids = pd.read_csv('tmp/train_ids.csv', index_col='id')
test_ids = pd.read_csv('tmp/test_ids.csv', index_col='id')
d = pd.concat([train_ids, test_ids])

In [4]:
# Left-join the columns of features_bathrooms.csv onto d, matching on the
# 'id' index so every row of d is kept.
e = pd.read_csv('tmp/features_bathrooms.csv', index_col='id')
d = d.merge(e, how='left', left_index=True, right_index=True)

In [5]:
# Left-join the columns of features_bedrooms.csv onto d, matching on the
# 'id' index so every row of d is kept.
e = pd.read_csv('tmp/features_bedrooms.csv', index_col='id')
d = d.merge(e, how='left', left_index=True, right_index=True)

In [6]:
# Left-join the columns of features_building_id.csv onto d, matching on the
# 'id' index so every row of d is kept.
e = pd.read_csv('tmp/features_building_id.csv', index_col='id')
d = d.merge(e, how='left', left_index=True, right_index=True)

In [7]:
# Left-join the columns of features_created.csv onto d, matching on the
# 'id' index so every row of d is kept.
e = pd.read_csv('tmp/features_created.csv', index_col='id')
d = d.merge(e, how='left', left_index=True, right_index=True)

In [8]:
# Left-join the columns of features_description.csv onto d, matching on the
# 'id' index so every row of d is kept.
e = pd.read_csv('tmp/features_description.csv', index_col='id')
d = d.merge(e, how='left', left_index=True, right_index=True)

In [9]:
# Left-join the columns of features_display_address.csv onto d, matching on
# the 'id' index so every row of d is kept.
e = pd.read_csv('tmp/features_display_address.csv', index_col='id')
d = d.merge(e, how='left', left_index=True, right_index=True)

In [10]:
# Left-join the columns of features_features.csv onto d, matching on the
# 'id' index so every row of d is kept.
e = pd.read_csv('tmp/features_features.csv', index_col='id')
d = d.merge(e, how='left', left_index=True, right_index=True)

In [11]:
# Left-join the columns of features_latlon.csv onto d, matching on the
# 'id' index so every row of d is kept.
e = pd.read_csv('tmp/features_latlon.csv', index_col='id')
d = d.merge(e, how='left', left_index=True, right_index=True)

In [12]:
# Left-join the columns of features_manager_id.csv onto d, matching on the
# 'id' index so every row of d is kept.
e = pd.read_csv('tmp/features_manager_id.csv', index_col='id')
d = d.merge(e, how='left', left_index=True, right_index=True)

In [13]:
# Left-join the columns of features_photos.csv onto d, matching on the
# 'id' index so every row of d is kept.
e = pd.read_csv('tmp/features_photos.csv', index_col='id')
d = d.merge(e, how='left', left_index=True, right_index=True)

In [14]:
# Left-join the columns of features_price.csv onto d, matching on the
# 'id' index so every row of d is kept.
e = pd.read_csv('tmp/features_price.csv', index_col='id')
d = d.merge(e, how='left', left_index=True, right_index=True)

In [15]:
# Left-join the columns of features_street_address.csv onto d, matching on
# the 'id' index so every row of d is kept.
e = pd.read_csv('tmp/features_street_address.csv', index_col='id')
d = d.merge(e, how='left', left_index=True, right_index=True)

In [16]:
# Class balance of the label column (rows without a label are not counted).
d['interest_level'].value_counts()

low       34284
medium    11229
high       3839
Name: interest_level, dtype: int64

In [17]:
# Training rows are the ones that carry a label. The copy keeps the later
# in-place column removals from touching d.
tr = d[d.interest_level.notnull()].copy()
# Call form of print: same output as the Python 2 statement for a single
# argument, and consistent with the print(...) calls used further down.
print(len(tr))

49352


In [18]:
# Keep an independent copy of the labels, then strip the label column from the
# training features. trf is just another name for tr (not a copy), so tr loses
# the column as well.
trl = tr['interest_level'].copy()
trf = tr
trf.drop('interest_level', axis=1, inplace=True)

In [19]:
# Keep only int64/float64 columns in the training features; everything else
# (e.g. object/string columns) is removed from trf in place.
col_dtypes = trf.dtypes
non_numeric = [c for c in trf.columns
               if col_dtypes[c] not in ('int64', 'float64')]
trf.drop(non_numeric, axis=1, inplace=True)

In [20]:
# Test rows are the ones without a label. The copy keeps the later in-place
# column removals from touching d.
te = d[d.interest_level.isnull()].copy()
# Call form of print: same output as the Python 2 statement for a single
# argument, and consistent with the print(...) calls used further down.
print(len(te))

74659


In [21]:
# Strip the (all-null) label column from the test features. tef is just
# another name for te (not a copy), so te loses the column as well.
tef = te
tef.drop('interest_level', axis=1, inplace=True)

In [22]:
# Keep only int64/float64 columns in the test features; everything else
# (e.g. object/string columns) is removed from tef in place.
col_dtypes = tef.dtypes
non_numeric = [c for c in tef.columns
               if col_dtypes[c] not in ('int64', 'float64')]
tef.drop(non_numeric, axis=1, inplace=True)

In [23]:
# Preview the first five test rows, transposed so each feature is a row.
tef.head(5).transpose()

id,0,1,100,1000,100000
bathrooms,1.000000,1.000000,1.000000,1.000000,2.000000
num_bathrooms_bucket,1.000000,1.000000,1.000000,1.000000,2.000000
bedrooms,1.000000,2.000000,1.000000,2.000000,2.000000
num_bedrooms_bucket,1.000000,2.000000,1.000000,2.000000,2.000000
num_apts_in_building,8.000000,,9.000000,2.000000,267.000000
num_apts_in_building_q10,0.100000,,0.100000,0.000000,0.800000
created_month,6.000000,6.000000,6.000000,6.000000,4.000000
created_day_of_month,11.000000,24.000000,3.000000,11.000000,12.000000
created_dow,5.000000,4.000000,4.000000,5.000000,1.000000
created_hour,5.000000,6.000000,4.000000,6.000000,5.000000


In [24]:
# Sanity check: train and test feature frames should have the same number of
# columns. DataFrame.shape gives the same tuple as .values.shape without
# building the dense array, and the call form of print works on Python 2 and 3.
print(trf.shape)
print(tef.shape)

(49352, 90)
(74659, 90)


# Get Script Data

In [25]:
# Read the raw train/test listing JSON files and stack them (train first).
data_path = "input/"
train_file = data_path + "train.json"
test_file = data_path + "test.json"
train_df = pd.read_json(train_file)
# The test frame gets an all-NaN label column so both frames share a schema.
test_df = pd.read_json(test_file).assign(interest_level=np.nan)
s_df = pd.concat([train_df, test_df])
print(s_df.shape)

(124011, 15)


In [26]:
# Columns taken as-is from the raw listings; later cells append engineered
# features to this same list.
features_to_use = [
    "bathrooms",
    "bedrooms",
    "latitude",
    "longitude",
    "price",
]

In [27]:
# Price-per-room and room-count features. (The previous header mentioned
# lat/lng, but no coordinate is touched in this cell.)
# NOTE(review): rows with bedrooms == 0 or bedrooms + bathrooms == 0 divide by
# zero here, presumably producing inf/NaN — confirm downstream handling.
beds = s_df["bedrooms"]
baths = s_df["bathrooms"]
rent = s_df["price"]
rooms = beds + baths

s_df["price_t"] = rent / beds      # price per bedroom
s_df["room_dif"] = beds - baths    # bedrooms minus bathrooms
s_df["room_sum"] = rooms           # bedrooms plus bathrooms
s_df["price_t1"] = rent / rooms    # price per room
s_df["fold_t1"] = beds / rooms     # share of rooms that are bedrooms

In [28]:
# Size-style features: number of photos, number of feature tags, and the
# number of single-space-separated tokens in the description (consecutive
# spaces therefore count empty tokens).
s_df["num_photos"] = s_df["photos"].map(len)
s_df["num_features"] = s_df["features"].map(len)
s_df["num_description_words"] = s_df["description"].map(
    lambda text: len(text.split(" ")))

# Parse the creation timestamp once, then derive the calendar parts and the
# offset from the most recent listing.
created = pd.to_datetime(s_df["created"])
s_df["created"] = created
s_df["passed"] = created.max() - created
s_df["created_year"] = created.dt.year
s_df["created_month"] = created.dt.month
s_df["created_day"] = created.dt.day
s_df["created_hour"] = created.dt.hour

In [29]:
# Register the engineered columns (plus listing_id) as model features.
# The list is grown in place, so running this cell twice adds them twice.
features_to_use += [
    "price_t", "num_photos", "num_features", "num_description_words",
    "created_year", "created_month", "created_day", "created_hour",
    "listing_id", 'room_dif', 'room_sum', 'price_t1',
]

In [30]:
# Replace each string-valued id/address column with integer codes and register
# it as a model feature. Columns that are no longer of object dtype are
# skipped, so a second run of this cell leaves already-encoded columns alone.
categorical = ["display_address", "manager_id", "building_id", "street_address"]
for f in categorical:
    if s_df[f].dtype == 'object':
        lbl = sklearn.preprocessing.LabelEncoder()
        # Fit and encode in one pass. The previous version fitted on the
        # column concatenated with itself, which yields the same classes
        # while doubling the work.
        s_df[f] = lbl.fit_transform(list(s_df[f].values))
        features_to_use.append(f)

In [31]:
# Turn each listing's tag list into one space-separated string, with the
# spaces inside a tag replaced by underscores so a multi-word tag stays one
# token. This overwrites the list column, so the cell must run after anything
# that needs the original lists and must not be run twice.
s_df['features'] = s_df["features"].apply(
    lambda tags: " ".join(tag.replace(" ", "_") for tag in tags))
print(s_df["features"].head())

# Bag-of-words counts over the tag strings, capped at 400 columns.
# Despite the variable name this is a CountVectorizer, not a tf-idf weighting.
tfidf = CountVectorizer(max_features=400, stop_words='english')
s_df_sparse = tfidf.fit_transform(s_df["features"])

10                                                         
10000     Doorman Elevator Fitness_Center Cats_Allowed D...
100004    Laundry_In_Building Dishwasher Hardwood_Floors...
100007                               Hardwood_Floors No_Fee
100013                                              Pre-War
Name: features, dtype: object


In [32]:
# Place the selected listing columns and the tag-count matrix side by side,
# stored as a CSR sparse matrix.
s_X = scipy.sparse.hstack(
    (s_df[features_to_use], s_df_sparse),
    format='csr',
)

In [33]:
# Densify the stacked feature matrix into a DataFrame keyed by id.
# Rows are labelled with s_df's own index (the frame s_X was built from)
# instead of d.index: labelling by d.index only works if both frames happen to
# be in the same row order, and a mismatch would silently attach features to
# the wrong id. The index keeps d's index name so the later index-based merges
# and the CSV header are unchanged.
# NOTE(review): assumes s_df's index holds the same id values as d — confirm.
sd = pd.DataFrame(s_X.toarray(),
                  index=pd.Index(s_df.index, name=d.index.name))

In [34]:
# Sanity check on the dense script-feature frame. DataFrame.shape equals
# .values.shape, and the call form of print works on Python 2 and 3.
print(sd.shape)

(124011, 421)


In [35]:
# Column labels for sd: the selected listing columns followed by one label per
# vectorizer column.
# A fresh list is built so features_to_use is left untouched. The previous
# alias plus "+=" grew features_to_use in place, which broke a re-run of this
# cell and of the s_df[features_to_use] selection.
# The number of generated labels follows the vectorizer output width instead
# of a hard-coded 400, so a smaller vocabulary no longer causes a length
# mismatch. The 'tfidf_' prefix is kept for downstream column names even
# though the values are raw counts.
n_count_cols = s_df_sparse.shape[1]
s_feature_names = list(features_to_use) + [
    'tfidf_{0:03}'.format(i) for i in range(n_count_cols)]
sd.columns = s_feature_names

In [36]:
# Preview the first five rows of the script-feature frame.
sd.head(5)

Unnamed: 0_level_0,bathrooms,bedrooms,latitude,longitude,price,price_t,num_photos,num_features,num_description_words,created_year,...,tfidf_390,tfidf_391,tfidf_392,tfidf_393,tfidf_394,tfidf_395,tfidf_396,tfidf_397,tfidf_398,tfidf_399
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10,1.5,3.0,40.7145,-73.9425,3000.0,1000.0,5.0,0.0,95.0,2016.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10000,1.0,2.0,40.7947,-73.9667,5465.0,2732.5,11.0,5.0,9.0,2016.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100004,1.0,1.0,40.7388,-74.0018,2850.0,2850.0,8.0,4.0,94.0,2016.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100007,1.0,1.0,40.7539,-73.9677,3275.0,3275.0,3.0,2.0,80.0,2016.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100013,1.0,4.0,40.8241,-73.9493,3350.0,837.5,3.0,1.0,68.0,2016.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Combine Data Sets

In [37]:
# Remove these script-side columns from sd in place before merging.
# NOTE(review): presumably dropped because the per-feature CSV frames already
# carry equivalent columns (bathrooms, bedrooms, created_month and
# created_hour are visible there) — confirm for the remaining names.
sd.drop(
    ['bathrooms', 'bedrooms', 'created_month', 'created_hour',
     'num_features', 'latitude', 'longitude', 'num_photos', 'price'],
    axis=1,
    inplace=True,
)

In [38]:
# Integer class ids for the three interest levels. An unexpected label raises
# KeyError rather than being silently mapped.
target_num_map = {
    'high': 0,
    'medium': 1,
    'low': 2,
}
trln = np.array(trl.apply(target_num_map.__getitem__))

In [39]:
# Inner-join the CSV-derived training features with the script features on
# the shared id index.
combined_tr_f = trf.merge(sd, how='inner', left_index=True, right_index=True)

In [40]:
# Inner-join the CSV-derived test features with the script features on the
# shared id index.
combined_te_f = tef.merge(sd, how='inner', left_index=True, right_index=True)

In [41]:
# Attach the numeric labels to the training frame by id, not by row position.
# Assigning the bare array only gives correct labels if the merge preserved
# both the order and the number of training rows; wrapping it in a Series
# indexed like trl lets pandas align each label with its own id.
# NOTE(review): relies on the id index being unique — confirm.
combined_tr_f['interest_level'] = pd.Series(trln, index=trl.index)
# Test rows have no label; keep the column so both files share a schema.
combined_te_f['interest_level'] = np.nan

In [42]:
# Sanity check: both combined frames should have the same number of columns.
# DataFrame.shape equals .values.shape without building the dense array, and
# the call form of print works on Python 2 and 3.
print(combined_tr_f.shape)
print(combined_te_f.shape)

(49352, 503)
(74659, 503)


In [43]:
# Persist the combined train and test frames (index included) as CSV.
for frame, path in ((combined_tr_f, 'tmp/combined_train.csv'),
                    (combined_te_f, 'tmp/combined_test.csv')):
    frame.to_csv(path)