In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import scipy

from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output
# Show what is available in the input directory (shells out to `ls`).
input_listing = check_output(["ls", "../input"])
print(input_listing.decode("utf8"))

# Any results you write to the current directory are saved as output.

# Load the training listings and derive simple numeric features from them.
df = pd.read_json("../input/train.json")

# Price per bedroom: price is capped at 7000 and the bedroom count is floored
# at 1 so listings with zero bedrooms do not divide by zero.
df['priceperbed'] = df['price'].clip(upper=7000) / df['bedrooms'].clip(lower=1)

# Parse the timestamps with pd.to_datetime; casting a column to the unit-less
# np.datetime64 dtype is rejected by current pandas releases.
df['created'] = pd.to_datetime(df['created'])
created = df['created'].values

# Truncate to day / week / hour resolution and read the result as a count
# since the Unix epoch. The modulo folds days into a 7-step cycle and hours
# into a 24-step cycle; weeks are kept as a running counter.
df['created_day'] = created.astype('datetime64[D]').astype(np.float32) % 7
df['created_week'] = created.astype('datetime64[W]').astype(np.float32)
df['created_hour'] = created.astype('datetime64[h]').astype(np.float32) % 24

# Size features: words in the description (capped at 150), number of feature
# tags and number of photos.
df['desc_count'] = df['description'].apply(lambda text: len(text.split())).clip(upper=150)
df['features_count'] = df['features'].apply(len)
df['photos_count'] = df['photos'].apply(len)

# Integer-encode the string identifier columns so they can be used as numeric
# model inputs. `categorical` is reused further down when the list of feature
# columns is assembled.
categorical = ["display_address", "building_id", "street_address"]
for f in categorical:
    if df[f].dtype == 'object':
        lbl = preprocessing.LabelEncoder()
        # Fit on the column once; concatenating the column with itself before
        # fitting only doubled the work without changing the learned classes.
        df[f] = lbl.fit_transform(df[f].values)

# manager_id is encoded unconditionally and kept out of `categorical`.
lbl = preprocessing.LabelEncoder()
df['manager_id'] = lbl.fit_transform(df['manager_id'].values)

# Lower-case every feature tag so the membership tests below ignore case.
df['features'] = df['features'].apply(lambda tags: [tag.lower() for tag in tags])

# One boolean indicator column per tag of interest (exact match on the tag).
feature_list = ['no fee', 'hardwood floors', 'laundry in building']
for feature in feature_list:
    df[feature] = df['features'].apply(lambda tags: feature in tags)

# TF-IDF model of the listing descriptions, fitted on every row of df.
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.5, sublinear_tf=True)
vectorizer.fit(df.description.values)


In [None]:
# Two-stage split with one fixed seed: first hold out a test partition, then
# divide the remainder into training and validation partitions.
split_seed = 0
df_tv, df_test = train_test_split(df, random_state=split_seed)
df_train, df_val = train_test_split(df_tv, random_state=split_seed)

In [None]:
# Per-manager target statistics, computed from the training partition only and
# then joined onto the training, validation and test partitions.
# NOTE(review): the statistics are merged back onto the same training rows they
# were computed from, so each training row's manager features include its own
# label -- consider out-of-fold encoding.
stat_cols = ['high_frac', 'low_frac', 'medium_frac', 'manager_skill']
min_listings = 20  # managers with fewer training listings are treated as unranked


def _merge_manager_stats(frame, stats):
    """Left-join the per-manager statistics in `stats` onto `frame`.

    Columns of `stats` that `frame` already carries are dropped first, so
    running this cell twice does not produce duplicated `_x` / `_y` columns.
    Returns a new frame; managers absent from `stats` get NaN statistics.
    """
    stale = [c for c in stats.columns if c in frame.columns]
    return frame.drop(stale, axis=1).merge(
        stats.reset_index(), how='left', on='manager_id')


# Share of each interest level among a manager's training listings. The dummy
# columns are selected and renamed by name rather than by position.
level_flags = pd.get_dummies(df_train.interest_level).reindex(
    columns=['high', 'low', 'medium'], fill_value=0)
temp = pd.concat([df_train.manager_id, level_flags], axis=1
                ).groupby('manager_id').mean()
temp = temp.rename(columns={'high': 'high_frac', 'low': 'low_frac',
                            'medium': 'medium_frac'})
# Number of training listings per manager; size() counts rows and does not
# depend on the null pattern of any particular column.
temp['count'] = df_train.groupby('manager_id').size()

# Skill score: 2 points per "high" share plus 1 point per "medium" share.
temp['manager_skill'] = temp['high_frac']*2 + temp['medium_frac']

# Managers below the listing threshold receive the average statistics of the
# ranked managers instead of their own.
unranked_managers_ixes = temp['count'] < min_listings
ranked_managers_ixes = ~unranked_managers_ixes
mean_values = temp.loc[ranked_managers_ixes, stat_cols].mean()
temp.loc[unranked_managers_ixes, stat_cols] = mean_values.values

df_train = _merge_manager_stats(df_train, temp)

# Managers that never appear in the training partition have no statistics
# after the join; give them the ranked-manager averages as well ('count' is
# left as NaN for them).
df_val = _merge_manager_stats(df_val, temp)
new_manager_ixes = df_val['high_frac'].isnull()
df_val.loc[new_manager_ixes, stat_cols] = mean_values.values

df_test = _merge_manager_stats(df_test, temp)
new_manager_ixes = df_test['high_frac'].isnull()
df_test.loc[new_manager_ixes, stat_cols] = mean_values.values

In [None]:
# Feature matrices: SVD components of the TF-IDF description vectors come
# first, followed by the tabular columns listed in `cols`.
n_svd_components = 5
# Labels for the SVD columns that lead each matrix.
derived_cols = ['derived_'+str(i) for i in range(n_svd_components)]
cols=['price', 'bathrooms', 'bedrooms', 'latitude', 'longitude', 'priceperbed','created_hour', 
      'desc_count', 'photos_count', 'features_count', 'no fee', 'hardwood floors', 
      'laundry in building', 'manager_skill', 'listing_id']+categorical

svd = TruncatedSVD(n_components=n_svd_components, n_iter=7, random_state=42)


def _design_matrix(frame, fit=False):
    """Return the 2-D float feature array for `frame`.

    With fit=True the SVD is fitted on this frame's TF-IDF matrix before
    projecting it; otherwise the already fitted SVD is applied.
    """
    tfidf = vectorizer.transform(frame.description)
    text_part = svd.fit_transform(tfidf) if fit else svd.transform(tfidf)
    # Cast explicitly: `cols` mixes boolean flags with integer and float
    # columns, which `.values` alone would return as an object-dtype array.
    tabular_part = frame[cols].astype(np.float64).values
    return np.hstack([text_part, tabular_part])


# The SVD is fitted on the training partition only and reused for the others.
X_train = _design_matrix(df_train, fit=True)
X_val = _design_matrix(df_val)
X_test = _design_matrix(df_test)

# Integer class ids; the order here fixes the column order of the predicted
# class probabilities.
target_num_map = {'high':0, 'low':1, 'medium':2}


def _encode_target(frame):
    """Map the interest_level strings of `frame` to their integer class ids."""
    return np.array(frame['interest_level'].apply(lambda level: target_num_map[level]))


y_train = _encode_target(df_train)
y_test = _encode_target(df_test)
y_val = _encode_target(df_val)

In [None]:
# xgboost is imported here because this is the first cell that uses it;
# without the import a fresh-kernel "Run All" stops on this cell with a
# NameError.
import xgboost as xgb

# Wrap the feature matrices and integer targets in xgboost's DMatrix container.
dtrain = xgb.DMatrix(data=X_train, label=y_train)
xgval = xgb.DMatrix(data=X_val, label=y_val)

In [None]:
import xgboost as xgb
SEED = 0

# Booster configuration for 3-class probability output, scored with
# multi-class log loss.
# NOTE(review): newer xgboost releases appear to replace `silent` with
# `verbosity` -- confirm against the installed version.
params = {
    'eta':.15,
    'max_depth':6,
    'min_child_weight':3,
    'colsample_bytree':.8,
    'subsample':.8,
    'seed':SEED,
    'nthread':16,
    'objective':'multi:softprob',
    'eval_metric':'mlogloss',
    'num_class':3,
    'silent':1
}

# verbose_eval only produces output when an evaluation watchlist is supplied,
# so pass the training and validation sets to get a progress line every 25
# rounds. The watchlist is for monitoring only; no early stopping is used.
watchlist = [(dtrain, 'train'), (xgval, 'val')]
bst = xgb.train(params, dtrain, 130, evals=watchlist, verbose_eval=25)

# Score with the integer-encoded targets and an explicit label list, so the
# match between probability columns and classes does not depend on how the
# metric happens to sort string labels.
class_ids = [0, 1, 2]

# Log loss on the training partition.
y_pred = bst.predict(dtrain)
score = log_loss(y_train, y_pred, labels=class_ids)
print(score)

# Log loss on the validation partition.
y_pred = bst.predict(xgval)
score = log_loss(y_val, y_pred, labels=class_ids)
print(score)

In [None]:
#pd.Series(index = derived_cols + cols, data = clf.feature_importances_).sort_values().plot(
#    kind = 'bar')