In [72]:
import pandas as pd
import numpy as np
import json, os
from pandas.io.json import json_normalize
import missingno as msno
import seaborn as sn
from scipy import stats, sparse
import datetime as dt
import matplotlib.pyplot as plt
from sklearn import preprocessing
%matplotlib inline

In [73]:
os.chdir('D:/Kaggle/Two Sigma Connect')
pd.set_option('max_columns', 200)
pd.set_option('max_colwidth', 200)

In [74]:
tr_df = pd.read_json('train.json')
te_df = pd.read_json('test.json')

In [75]:
# count number of photos

tr_df['num_photos'] = tr_df['photos'].apply(lambda x: len(x))
te_df['num_photos'] = te_df['photos'].apply(lambda x: len(x))

tr_df['num_features'] = tr_df['features'].apply(lambda x: len(x))
te_df['num_features'] = te_df['features'].apply(lambda x: len(x))

In [76]:
feature_to_use = ['price', 'num_photos', 'num_features', 'bedrooms', 'bathrooms']

In [77]:
fs = ['manager_id', 'building_id', 'street_address', 'display_address']

for f in fs:
    if tr_df[f].dtype == 'object':
        le = preprocessing.LabelEncoder()
        le.fit(list(tr_df[f].values) + list(te_df[f].values))
        tr_df[f] = le.transform(list(tr_df[f].values))
        te_df[f] = le.transform(list(te_df[f].values))
        feature_to_use.append(f)

In [79]:
#print tr_df.columns
feature_to_use.append('price_diff')

In [80]:
print feature_to_use

['price', 'num_photos', 'num_features', 'bedrooms', 'bathrooms', 'manager_id', 'building_id', 'street_address', 'display_address', 'price_diff']


In [81]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from scipy.sparse import hstack

In [83]:
tr_df['features'] = tr_df['features'].apply(lambda x: " ".join(["_".join(i.split(" ")) for i in x]))
te_df['features'] = te_df['features'].apply(lambda x: " ".join(["_".join(i.split(" ")) for i in x]))

In [84]:
tfidf = CountVectorizer(stop_words = 'english', max_features = 200)
tr_sparse = tfidf.fit_transform(tr_df['features'])
te_sparse = tfidf.fit_transform(te_df['features'])

In [95]:
print feature_to_use

['price', 'num_photos', 'num_features', 'bedrooms', 'bathrooms', 'manager_id', 'building_id', 'street_address', 'display_address', 'price_diff']


In [87]:
tr_X = sparse.hstack([tr_df[feature_to_use], tr_sparse]).tocsr()
#te_X = sparse.hstack([te_df[feature_to_use], te_sparse]).tocsr()

In [88]:
# numbering target variable 

target_var_map = {'high':0, 'medium':1, 'low':2}
tr_y = np.array(tr_df['interest_level'].apply(lambda x : target_var_map[x]))
print (tr_X.shape, tr_y.shape)

((49352, 210), (49352L,))


In [99]:
tr_X_df = pd.SparseDataFrame([ pd.SparseSeries(tr_X[i].toarray().ravel()) for i in np.arange(tr_X.shape[0])])

In [25]:
# test set

te_X_df = pd.SparseDataFrame([ pd.SparseSeries(te_X[i].toarray().ravel()) for i in np.arange(te_X.shape[0])])
te_X_df = te_X_df.iloc[:,0:9]
te_X_df.columns = feature_to_use

In [100]:
tr_y_df = pd.DataFrame(tr_y.ravel())
tr_y_df.columns = ['target']

In [101]:
tr_X_df = tr_X_df.iloc[:,0:10]
tr_X_df.columns = feature_to_use

In [102]:
print tr_X_df.head()

    price  num_photos  num_features  bedrooms  bathrooms  manager_id  \
0  3000.0         5.0           0.0       3.0        1.5      1568.0   
1  5465.0        11.0           5.0       2.0        1.0      1988.0   
2  2850.0         8.0           4.0       1.0        1.0      3733.0   
3  3275.0         3.0           2.0       1.0        1.0       282.0   
4  3350.0         3.0           1.0       4.0        1.0      2618.0   

   building_id  street_address  display_address  price_diff  
0       3797.0         23484.0          12282.0      -150.0  
1       8986.0         23680.0           9080.0      1295.0  
2       8889.0          9827.0          13719.0       150.0  
3       1848.0         14237.0          10866.0      -895.0  
4          0.0         19227.0          15072.0      -820.0  


In [103]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

In [275]:
est = GradientBoostingClassifier(n_estimators = 100)
est.fit(tr_X_df, tr_y_df)

  y = column_or_1d(y, warn=True)


GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [279]:
pred_tr_X = est.predict(tr_X_df)

In [105]:
from sklearn.grid_search import GridSearchCV

In [30]:
# convert to array

tr_X_arr = tr_X.toarray()

In [49]:
# Perform a grid search to fine tune with the parameters 

param_grid = {'max_depth':[4, 6, 8]}

est = RandomForestClassifier(n_estimators = 500)
gs_cv = GridSearchCV(est, param_grid).fit(tr_X_arr, tr_y)
print gs_cv.best_params_

{'max_depth': 8}


In [None]:
# apply model based on GridSearch 

est = RandomForestClassifier(n_estimators = 500, max_depth = 8)
rfc = est.fit(tr_X_arr, tr_y)
print rfc.feature_importances_

In [51]:
pred_X = rfc.predict(tr_X_arr)

In [39]:
from sklearn.metrics import confusion_matrix

In [44]:
print confusion_matrix(pred_X, tr_y)

[[    0     0     0]
 [   15    58    23]
 [ 3824 11171 34261]]


In [52]:
print confusion_matrix(pred_X, tr_y)

[[    2     0     0]
 [   17    61    24]
 [ 3820 11168 34260]]


In [53]:
pred_te = rfc.predict_proba(te_X)

In [54]:
pred_df = pd.DataFrame(pred_te, columns = ('high', 'medium', 'low'))
print pred_df.head()

       high    medium       low
0  0.077023  0.232660  0.690317
1  0.049802  0.150708  0.799490
2  0.091516  0.231357  0.677127
3  0.096610  0.254462  0.648928
4  0.059139  0.219749  0.721112


In [55]:
pred_df['listing_id'] = te_df.listing_id.values
pred_df.to_csv('submission4_RF_500_8.csv', index = False)