In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import scipy

from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output

# Show what is available in the input directory by shelling out to `ls`
# and printing its (UTF-8 decoded) output.
raw_listing = check_output(["ls", "../input"])
print(raw_listing.decode("utf8"))

# Any results you write to the current directory are saved as output.

In [None]:
# Load the training listings and derive the numeric features used further down.
df = pd.read_json("../input/train.json")

# Price per bedroom. Price is capped at 7000 and the bedroom count is floored at 1,
# so a bedroom count of 0 cannot cause a division by zero.
df['priceperbed'] = df['price'].clip(upper=7000) / df['bedrooms'].clip(lower=1)

# Parse the creation timestamp. pd.to_datetime is used instead of
# astype(np.datetime64) because recent pandas releases refuse to cast to a
# datetime64 dtype that carries no unit.
df['created'] = pd.to_datetime(df['created'])

# Calendar features, computed by truncating the timestamp to a coarser unit and
# reading it as a count since the Unix epoch:
#   created_day  - whole days since the epoch, modulo 7 (a day-of-week code)
#   created_week - whole weeks since the epoch
#   created_hour - whole hours since the epoch, modulo 24 (hour of day)
df['created_day'] = df['created'].values.astype('datetime64[D]').astype(np.float32) % 7
df['created_week'] = df['created'].values.astype('datetime64[W]').astype(np.float32)
df['created_hour'] = df['created'].values.astype('datetime64[h]').astype(np.float32) % 24

# Size-of-content features: whitespace-separated word count of the description
# (capped at 150), number of listed features, and number of photos.
df['desc_count'] = df['description'].apply(lambda text: len(text.split())).clip(upper=150)
df['features_count'] = df['features'].apply(len)
df['photos_count'] = df['photos'].apply(len)

# Replace the manager_id values with integer codes.
lbl = preprocessing.LabelEncoder()
df['manager_id'] = lbl.fit_transform(df['manager_id'].values)

In [None]:
# Amenities that each get their own boolean indicator column.
feature_list = ['no fee', 'hardwood floors', 'laundry in building']

# Lower-case every entry of each listing's feature list so the membership test
# below is case-insensitive. The test is an exact match against whole entries.
df['features'] = df['features'].apply(lambda items: list(map(str.lower, items)))
for feature in feature_list:
    # The new column is named after the feature string itself.
    df[feature] = df['features'].apply(lambda items: feature in items)

# Tabular columns passed to the model. 'manager_id' and 'priceperbed' are two
# separate columns; they were previously fused into the single, non-existent
# name 'manager_idpriceperbed' by a missing comma.
cols = ['price', 'bathrooms', 'bedrooms', 'latitude', 'longitude',
        'manager_id', 'priceperbed', 'created_hour', 'desc_count',
        'photos_count', 'features_count'] + feature_list
        

In [None]:
# TF-IDF over the listing descriptions: sublinear term-frequency scaling, English
# stop words removed, and terms present in more than half of the documents dropped.
# NOTE(review): the vocabulary/IDF weights are fitted on the full frame `df`, i.e.
# on every row regardless of any later partitioning - confirm that is intended.
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.5, sublinear_tf=True)
vectorizer.fit(df['description'].values)


In [None]:
# Two-stage split with a fixed seed: first hold out a test partition, then carve a
# validation partition out of what remains. 0.25 is train_test_split's default
# hold-out fraction, written out here so the proportions are visible.
df_tv, df_test = train_test_split(df, test_size=0.25, random_state=0)
df_train, df_val = train_test_split(df_tv, test_size=0.25, random_state=0)

In [None]:
# Build the model matrices: a 5-component truncated SVD of the TF-IDF description
# vectors, followed by the tabular columns listed in `cols`.
svd = TruncatedSVD(n_components=5, n_iter=7, random_state=42)

# The SVD is fitted on the training partition only; the validation partition is
# projected with the already-fitted components.
X_train = svd.fit_transform(vectorizer.transform(df_train.description))
# `cols` mixes boolean indicator columns with numeric ones, so a bare `.values`
# would produce an object-dtype array; cast to float64 to keep the matrix numeric.
X_train = np.hstack([X_train, df_train[cols].astype(np.float64).values])

X_val = svd.transform(vectorizer.transform(df_val.description))
X_val = np.hstack([X_val, df_val[cols].astype(np.float64).values])

In [None]:
# Fit an extra-trees classifier on the training matrix and report the log loss on
# the training partition and on the validation partition.
clf = ExtraTreesClassifier(n_estimators=1000, max_depth=23,
                           min_samples_split=10, random_state=0)
clf.fit(X_train, df_train['interest_level'])

# predict_proba orders its columns by clf.classes_. Passing that order to log_loss
# keeps the columns matched to the right labels even if a partition happens to be
# missing one of the classes (otherwise log_loss infers the labels from y_true).
y_pred = clf.predict_proba(X_train)
score = log_loss(df_train['interest_level'].values, y_pred, labels=clf.classes_)

y_pred = clf.predict_proba(X_val)
score2 = log_loss(df_val['interest_level'].values, y_pred, labels=clf.classes_)

# Printed as: <train log loss> <validation log loss>
print("%.6f %.6f" % (score, score2))

In [None]:
# Score the held-out test partition with the already-fitted vectorizer, SVD and
# classifier.
X_test = svd.transform(vectorizer.transform(df_test.description))
# `cols` mixes boolean indicator columns with numeric ones, so a bare `.values`
# would produce an object-dtype array; cast to float64 to keep the matrix numeric.
X_test = np.hstack([X_test, df_test[cols].astype(np.float64).values])

y_pred = clf.predict_proba(X_test)
# Pass the classifier's class order explicitly so the probability columns are
# matched to labels even if the test partition lacks one of the classes.
score3 = log_loss(df_test['interest_level'].values, y_pred, labels=clf.classes_)
print("%.6f" % (score3))