In [244]:
import base64
import datetime
import json
import os
import random
import time

import scipy
from scipy import ndimage
import numpy as np
import pandas as pd
import sklearn
import matplotlib as mpl
import matplotlib.pyplot as plt
from pylab import rcParams
import seaborn as sns
import statsmodels as sm
from pylab import rcParams
from pylab import *
from matplotlib.dates import date2num , DateFormatter
from PIL import Image

# Seed NumPy's global random generator so NumPy-based randomness repeats between runs.
# (The xgboost seed is set separately through the 'seed' training parameter.)
np.random.seed(1337)

# Render matplotlib figures inside the notebook output.
%matplotlib inline
# Seaborn defaults first; the figure size is assigned afterwards so it is not
# overwritten by sns.set().
sns.set(font_scale=1.0)
rcParams['figure.figsize'] = 8, 6
sns.set_style('whitegrid')
sns.set_palette(sns.color_palette('muted'))

# Show long text columns (up to 1000 characters) when displaying DataFrames.
pd.options.display.max_colwidth = 1000

In [245]:
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import normalize
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import confusion_matrix

from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from scipy import sparse

# Load the Data

In [246]:
# Feature tables stored as tmp/features_<name>.csv, listed in the order they
# are joined onto the id frame. Add or remove a table here instead of
# copy-pasting another read/merge cell.
FEATURE_TABLES = [
    'bathrooms',
    'bedrooms',
    'building_id',
    'created',
    'description',
    'display_address',
    'features',
    'latlon',
    'manager_id',
    'photos',
    'price',
    'street_address',
    'others',
]

# Train and test ids are stacked into a single frame indexed by 'id' so every
# feature table is joined once for both sets.
d = pd.concat([pd.read_csv('tmp/train_ids.csv', index_col='id'),
               pd.read_csv('tmp/test_ids.csv', index_col='id')])

# Left join on the index keeps every row of `d`, even when a feature table has
# no entry for that id.
for table in FEATURE_TABLES:
    e = pd.read_csv('tmp/features_{}.csv'.format(table), index_col='id')
    d = pd.merge(d, e, how='left', left_index=True, right_index=True)

In [260]:
# Class balance of the target. value_counts() skips missing values by default,
# so rows without an interest_level are not counted here.
d.interest_level.value_counts()

low       34284
medium    11229
high       3839
Name: interest_level, dtype: int64

In [261]:
# Remove the two apartments-per-building columns from the merged frame, in place.
for unwanted_col in ('num_apts_in_building_q10', 'num_apts_in_building'):
    del d[unwanted_col]

In [262]:
# Disabled step: `tf_idfs` is not defined anywhere in the visible notebook, so
# this concat cannot be re-enabled as-is.
# d = pd.concat([d, tf_idfs], axis=1)
# Summary of the merged frame: index range, column count, dtypes, memory usage.
d.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 124011 entries, 0 to 124010
Columns: 109 entries, listing_id to hours_since_managers_last_listing
dtypes: float64(53), int64(44), object(12)
memory usage: 104.1+ MB


In [263]:
# Rows with a known interest_level form the training set. The copy keeps later
# edits to `tr` from touching `d`.
tr = d[d.interest_level.notnull()].copy()
# print(...) with a single argument prints the same on Python 2 and Python 3.
print(len(tr))

49352


In [264]:
# Separate the target from the training features. drop() returns a new frame,
# so `tr` keeps its interest_level column and this cell can be re-run without
# failing on a column that was already deleted.
trl = tr['interest_level'].copy()
trf = tr.drop('interest_level', axis=1)

In [265]:
# Rows without an interest_level form the test set. The copy keeps later edits
# to `te` from touching `d`.
te = d[d.interest_level.isnull()].copy()
# print(...) with a single argument prints the same on Python 2 and Python 3.
print(len(te))

74659


In [266]:
# Test features: same columns as `te` minus the (all-missing) target. drop()
# returns a new frame, so `te` is left intact and the cell can be re-run.
tef = te.drop('interest_level', axis=1)

In [267]:

# Both text columns use the same bag-of-words settings. Despite the `tfidf_`
# prefix these are plain CountVectorizers (raw term counts, no IDF weighting).
bow_settings = dict(stop_words='english', max_features=400)

tfidf_features = CountVectorizer(**bow_settings)
tfidf_desc = CountVectorizer(**bow_settings)

# Vocabularies are learned on the training rows only ...
tr_sparse_features = tfidf_features.fit_transform(trf["features"])
tr_sparse_desc = tfidf_desc.fit_transform(trf["desc_clean"].fillna(""))

# ... and re-used unchanged to encode the test rows.
te_sparse_features = tfidf_features.transform(tef["features"])
te_sparse_desc = tfidf_desc.transform(tef["desc_clean"].fillna(""))

In [268]:
# Only int64/float64 columns are kept as model inputs; everything else
# (e.g. object/text columns) is deleted in place, first from the training
# frame and then from the test frame.
numeric_dtypes = ('int64', 'float64')
for frame in (trf, tef):
    column_dtypes = frame.dtypes
    non_numeric = [col for col in frame.columns
                   if column_dtypes[col] not in numeric_dtypes]
    for col in non_numeric:
        del frame[col]

# Run Classification

In [269]:
def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=0, num_rounds=1000):
    """Train a 3-class XGBoost model and predict on ``test_X``.

    Parameters
    ----------
    train_X, train_y : training features and integer class labels.
    test_X : features to predict on.
    test_y : optional labels for ``test_X``. When given, ``test_X`` is used as
        a validation set and training stops early after 50 rounds without
        improvement of the evaluation metric.
    feature_names : optional list of column names attached to the DMatrix
        objects (``None`` keeps xgboost's default names).
    seed_val : random seed passed to xgboost.
    num_rounds : maximum number of boosting rounds.

    Returns
    -------
    (pred_test_y, model) : the output of ``model.predict`` on ``test_X`` and
        the trained booster.
    """
    param = {
        'objective': 'multi:softprob',
        'eta': 0.1,
        'max_depth': 4,
        'silent': 1,
        'num_class': 3,
        'eval_metric': "mlogloss",
        'min_child_weight': 1,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'seed': seed_val,
    }
    plst = list(param.items())

    # feature_names used to be accepted but silently ignored; it is now
    # forwarded so importances can be reported by column name.
    xgtrain = xgb.DMatrix(train_X, label=train_y, feature_names=feature_names)

    best_limit = None
    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y, feature_names=feature_names)
        watchlist = [(xgtrain, 'train'), (xgtest, 'test')]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=50)
        # With early stopping the booster still holds the rounds trained after
        # the best one. When the installed xgboost exposes best_ntree_limit,
        # predict with the best iteration only.
        best_limit = getattr(model, 'best_ntree_limit', None)
    else:
        xgtest = xgb.DMatrix(test_X, feature_names=feature_names)
        model = xgb.train(plst, xgtrain, num_rounds)

    if best_limit:
        pred_test_y = model.predict(xgtest, ntree_limit=best_limit)
    else:
        pred_test_y = model.predict(xgtest)
    return pred_test_y, model

In [270]:
# Model input = numeric columns followed by the two bag-of-words blocks,
# stacked side by side; train and test use the same block order.
train_blocks = [trf, tr_sparse_features, tr_sparse_desc]
test_blocks = [tef, te_sparse_features, te_sparse_desc]
train_X = sparse.hstack(train_blocks).tocsr()
test_X = sparse.hstack(test_blocks).tocsr()

# Integer class codes for the three interest levels.
target_num_map = {'high':0, 'medium':1, 'low':2}
train_y = np.array([target_num_map[level] for level in trl], dtype=np.int64)
train_y

array([1, 2, 1, ..., 1, 1, 0])

In [271]:
# Fit on the full training set and predict on the test set. No test_y is
# passed, so runXGB trains for all num_rounds without early stopping.
preds, model = runXGB(train_X, train_y, test_X, num_rounds=1000)

In [281]:
# Number of numeric feature columns fed to the model.
# print(...) with a single argument prints the same on Python 2 and Python 3.
print(len(trf.columns))

97


In [282]:
from collections import OrderedDict

#xgb.plot_importance(model)
# Raw importance scores as returned by the booster, keyed by feature id.
feature_importance_score = model.get_fscore()
# Sorted ascending by score. An OrderedDict is used because converting a sorted
# Series to a plain dict discards the ordering on Python 2.
feature_importance = OrderedDict(
    sorted(feature_importance_score.items(), key=lambda item: item[1]))

In [None]:
# Map importance scores back to our own column names. The columns of `trf` are
# the first block stacked into train_X, so column i is assumed to be booster
# feature 'f<i>' (xgboost's default naming when no feature names are given).
# Features missing from feature_importance_score get a score of 0.
our_features_imp = {}
for i, f in enumerate(trf.columns):
    f_id = 'f{}'.format(i)
    our_features_imp[f] = feature_importance_score.get(f_id, 0)
    

In [274]:
# Probability columns are named in the class-code order used for training
# (high=0, medium=1, low=2).
class_columns = ["high", "medium", "low"]
out_df = pd.DataFrame(preds, columns=class_columns)
# NOTE(review): listing ids come from the sample submission, which assumes its
# row order matches the row order of the test frame - confirm.
submission_ids = pd.read_csv('input/sample_submission.csv').listing_id.values
out_df["listing_id"] = submission_ids
out_df.to_csv("output/xgb_starter_with_other_features.csv", index=False)

In [275]:
def cvXGB(train_X, train_y, test_X, seed_val=0, num_rounds=1000, nfold=3):
    """Cross-validate the 3-class XGBoost setup on the training data.

    Parameters
    ----------
    train_X, train_y : training features and integer class labels.
    test_X : unused; kept so existing calls keep working.
    seed_val : random seed, used for the booster and for the fold split.
    num_rounds : number of boosting rounds to evaluate.
    nfold : number of cross-validation folds.

    Returns
    -------
    The evaluation history returned by ``xgb.cv``.
    """
    # Same training parameters as runXGB.
    param = {
        'objective': 'multi:softprob',
        'eta': 0.1,
        'max_depth': 4,
        'silent': 1,
        'num_class': 3,
        'eval_metric': "mlogloss",
        'min_child_weight': 1,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'seed': seed_val,
    }
    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)

    # seed_val is also passed to the fold generator so a non-default seed
    # changes the split as well as the booster. The default of 0 matches
    # xgb.cv's own default, so existing results are unchanged.
    eval_hist = xgb.cv(plst, xgtrain, num_rounds, nfold=nfold, seed=seed_val)
    return eval_hist


In [276]:
# Cross-validate with cvXGB's default of 3 folds. test_X is passed only
# because the signature requires it; cvXGB never uses it.
eval_history = cvXGB(train_X, train_y, test_X, num_rounds=1000)
eval_history

Unnamed: 0,test-mlogloss-mean,test-mlogloss-std,train-mlogloss-mean,train-mlogloss-std
0,1.030193,0.000925,1.029532,0.000742
1,0.974447,0.001443,0.973316,0.001046
2,0.924540,0.001238,0.922869,0.000825
3,0.882113,0.001055,0.879926,0.000503
4,0.845211,0.000975,0.842468,0.000359
5,0.814060,0.000081,0.810897,0.000826
6,0.787687,0.000632,0.784104,0.000726
7,0.762934,0.000456,0.758865,0.000846
8,0.741165,0.000244,0.736627,0.001018
9,0.721896,0.000363,0.716915,0.000662


In [None]:
# Same cross-validation with 10 folds; the result is displayed but not stored.
cvXGB(train_X, train_y, test_X, num_rounds=1000, nfold=10)