In [None]:
import time; start_time = time.time()
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import preprocessing, model_selection
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import FeatureUnion
from sklearn.metrics import log_loss
from sklearn import pipeline
import pandas as pd
import numpy as np
from nltk.stem.porter import *
stemmer = PorterStemmer()
from bs4 import BeautifulSoup
import random; random.seed(7)
import xgboost as xgb
import datetime as dt
from scipy import sparse

In [None]:
# Load the raw train/test listings. The context managers close each file
# handle as soon as pandas has parsed it, instead of leaving both handles open
# for the lifetime of the kernel.
with open("../input/train.json", "r") as train_file:
    train_df = pd.read_json(train_file) #limit
with open("../input/test.json", "r") as test_file:
    test_df = pd.read_json(test_file) #limit

In [None]:
def fix_con(df):
    """Add count, price-ratio, calendar and rounded-coordinate features.

    The frame is modified in place and is also returned. The function is not
    idempotent: ``photos`` and ``created`` are overwritten with derived
    values, so it must be called exactly once per raw frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``photos`` (a sized object per row),
        ``price``, ``bedrooms``, ``bathrooms``, ``created`` (anything
        ``pd.to_datetime`` can parse), ``latitude`` and ``longitude``.

    Returns
    -------
    pandas.DataFrame
        The same object, with the extra columns added.
    """
    # Keep only the number of photos attached to each listing.
    df['photos'] = df.photos.apply(len)

    # Price per bedroom / bathroom. A zero denominator produces +/-inf (or NaN
    # for 0/0); that is not cleaned up here.
    df["price_be"] = df["price"].divide(df["bedrooms"])
    df["price_ba"] = df["price"].divide(df["bathrooms"])

    df["created"] = pd.to_datetime(df["created"])
    created = df["created"]
    df["created_year"] = created.dt.year
    df["created_month"] = created.dt.month
    df["created_day"] = created.dt.day
    df['created_hour'] = created.dt.hour
    df['created_weekday'] = created.dt.weekday
    # Series.dt.week was removed in pandas 2.0; isocalendar().week yields the
    # same ISO week number. Cast away the nullable UInt32 dtype it returns.
    df['created_week'] = created.dt.isocalendar().week.astype('int64')
    df['created_quarter'] = created.dt.quarter
    # weekday 5 = Saturday, 6 = Sunday. The previous expression AND-ed the two
    # equality tests, which can never both hold, so the flag was always False.
    df['created_weekend'] = df['created_weekday'].isin([5, 6])
    df['created_wd'] = ~df['created_weekend']

    # Replace the timestamp by a fractional day count since 1899-12-30
    # (whole days plus the seconds-of-day fraction; sub-second part ignored).
    epoch = dt.datetime(1899, 12, 30)

    def _serial_day(stamp):
        delta = stamp - epoch
        return float(delta.days) + (float(delta.seconds) / 86400)

    df['created'] = df['created'].map(_serial_day)

    # Coordinates rounded to 5..2 decimals: x<n> from latitude, y<n> from
    # longitude, giving progressively coarser location buckets.
    for digits in (5, 4, 3, 2):
        df['x%d' % digits] = df['latitude'].map(lambda v, nd=digits: round(v, nd))
        df['y%d' % digits] = df['longitude'].map(lambda v, nd=digits: round(v, nd))

    return df

In [None]:
# Engineer the same set of columns on both frames (train first, then test).
train_df, test_df = fix_con(train_df), fix_con(test_df)

In [None]:
## replacing inf values
def handle_inf(df, col_filter, to_fill_col):
    """Return a copy of ``df`` with infinities removed and one column filled.

    Every +/-inf in the frame becomes NaN. Missing values in ``to_fill_col``
    are then replaced by the mean ``price`` of the rows where ``col_filter``
    equals 0. The input frame is left untouched.
    """
    cleaned = df.replace([np.inf, -np.inf], np.nan)
    zero_rows = cleaned[col_filter] == 0
    replacement = cleaned[zero_rows].price.mean()
    cleaned[to_fill_col] = cleaned[to_fill_col].fillna(replacement)
    return cleaned
# Clean both price ratios on both frames. Each frame is filled from its own
# rows, so the train and test calls are independent of each other.
for zero_col, ratio_col in (('bedrooms', 'price_be'), ('bathrooms', 'price_ba')):
    train_df = handle_inf(train_df, zero_col, ratio_col)
    test_df = handle_inf(test_df, zero_col, ratio_col)

In [None]:
categorical = ["display_address", "manager_id", "building_id", "street_address",'created_weekend','created_wd']
# Integer-encode each column with an encoder fitted on the train and test
# values together, so both frames share one code per distinct value.
for col in categorical:
    encoder = preprocessing.LabelEncoder()
    combined_values = list(train_df[col].values) + list(test_df[col].values)
    encoder.fit(combined_values)
    train_df[col] = encoder.transform(list(train_df[col].values))
    test_df[col] = encoder.transform(list(test_df[col].values))

In [None]:
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin

class manager_skill(BaseEstimator, TransformerMixin):
    """
    Adds per-manager interest statistics to the dataset, based on the Kaggle
    kernel "Improve Perfomances using Manager features" by den3b. The class
    should be usable in scikit-learn pipelines.

    ``transform`` appends four columns: ``low_frac``, ``medium_frac``,
    ``high_frac`` (share of the manager's training listings per interest
    level) and ``manager_skill`` (``2 * high_frac + medium_frac``).

    Parameters
    ----------
    threshold : Minimum count of rental listings a manager must have in order
                to get his "own" score, otherwise the mean is assigned.

    Attributes
    ----------
    mapping_ : pandas dataframe
        Indexed by manager id; contains the three fractions and the
        manager_skill per manager.

    mean_skill_ : float
        The mean skill of managers with at least as many listings as the
        threshold. NaN when no manager reaches the threshold.
    """
    # Accepted spellings of (low, medium, high): raw strings or numeric codes.
    _TEXT_LEVELS = ('low', 'medium', 'high')
    _NUMERIC_LEVELS = (0, 1, 2)

    def __init__(self, threshold = 5):

        self.threshold = threshold

    def _reset(self):
        """Drop the state learned by a previous ``fit`` call, if any.

        __init__ parameters are not touched.
        """
        # Checking one attribute is enough, because they are all set together
        # in fit. Deleting (rather than blanking) leaves the object in a clean
        # "not fitted" state if the following fit fails part-way.
        if hasattr(self, 'mapping_'):
            del self.mapping_
            del self.mean_skill_

    def fit(self, X,y):
        """Compute the skill values per manager for later use.

        Parameters
        ----------
        X : pandas dataframe, shape [n_samples, n_features]
            The rental data. It has to contain a column named "manager_id".

        y : pandas series or numpy array, shape [n_samples]
            The corresponding target values, either as the strings
            'low' / 'medium' / 'high' or with the encoding:
            low: 0.0
            medium: 1.0
            high: 2.0
            A non-Series ``y`` is taken to be in the same row order as ``X``.

        Returns
        -------
        self
        """
        self._reset()

        # Give array-like targets X's index so rows line up by position; a
        # Series keeps its own index and is aligned by label.
        if isinstance(y, pd.Series):
            labels = y
        else:
            labels = pd.Series(np.asarray(y), index = X.index)

        if labels.isin(self._TEXT_LEVELS).any():
            low, medium, high = self._TEXT_LEVELS
        else:
            low, medium, high = self._NUMERIC_LEVELS

        # Build each indicator from an explicit comparison. Renaming the
        # output of pd.get_dummies by position is wrong for string labels,
        # whose dummy columns come out alphabetically (high, low, medium).
        indicators = pd.DataFrame({
            'manager_id': X['manager_id'],
            'low_frac': (labels == low).astype(float),
            'medium_frac': (labels == medium).astype(float),
            'high_frac': (labels == high).astype(float),
        })
        by_manager = indicators.groupby('manager_id')
        temp = by_manager[['low_frac', 'medium_frac', 'high_frac']].mean()
        # size() counts rows per manager regardless of which other columns X
        # happens to have or how many nulls they contain.
        temp['count'] = by_manager.size()

        temp['manager_skill'] = temp['high_frac']*2 + temp['medium_frac']

        mean = temp.loc[temp['count'] >= self.threshold, 'manager_skill'].mean()

        # Managers with too few listings get the average skill instead of a
        # score estimated from a handful of rows.
        temp.loc[temp['count'] < self.threshold, 'manager_skill'] = mean

        self.mapping_ = temp[['low_frac', 'medium_frac', 'high_frac','manager_skill']]
        self.mean_skill_ = mean

        return self

    def transform(self, X):
        """Add manager skill to a new matrix.

        Parameters
        ----------
        X : pandas dataframe, shape [n_samples, n_features]
            Input data, has to contain "manager_id".

        Returns
        -------
        pandas dataframe
            A new frame: ``X`` left-joined with the per-manager columns.
            Managers unseen during ``fit`` get ``mean_skill_`` as their
            ``manager_skill``; their three ``*_frac`` columns stay NaN.
        """
        X = pd.merge(left = X, right = self.mapping_, how = 'left', left_on = 'manager_id', right_index = True)
        # Assign the result instead of calling fillna(inplace=True) on a
        # column selection, which is not guaranteed to write back to X.
        X['manager_skill'] = X['manager_skill'].fillna(self.mean_skill_)

        return X

In [None]:
# Learn the per-manager statistics from the training labels, then attach them
# to both frames.
# NOTE(review): the training rows are scored with statistics that include
# their own labels - confirm this leakage is acceptable for validation.
trans = manager_skill()
trans.fit(train_df, train_df['interest_level'])
train_df = trans.transform(train_df)
test_df = trans.transform(test_df)

In [None]:
# Number of entries in each listing's `features` value, for both frames.
for frame in (train_df, test_df):
    frame['features_len'] = frame['features'].apply(len)

In [None]:
def str_stem(s):
    """Lower-case ``s``, strip its markup and stem every space-separated token.

    Anything that is not a ``str`` yields the empty string.
    """
    if not isinstance(s, str):
        return ""

    text = s.lower().replace("  "," ")
    # Parse as HTML and keep only the text nodes, joined by single spaces.
    text = BeautifulSoup(text, "lxml").get_text(" ").strip()
    stemmed = " ".join(stemmer.stem(token) for token in text.split(" "))
    return stemmed.lower().strip()

In [None]:
def fix_description(df):
    """Add a stemmed and a raw copy of ``description`` to ``df`` and return it.

    ``x_description`` holds ``str_stem`` applied to every description;
    ``y_description`` holds the untouched values. ``df`` is modified in place.
    """
    descriptions = df['description']
    df['x_description'] = descriptions.map(str_stem)
    df['y_description'] = descriptions.values
    return df
# Add the description columns to both frames (train first, then test).
train_df, test_df = fix_description(train_df), fix_description(test_df)

In [None]:
# Word count of the stemmed description (whitespace-separated tokens).
for frame in (train_df, test_df):
    frame['description_len'] = frame['x_description'].apply(lambda text: len(text.split()))

In [None]:
# Collapse each listing's features into one string: spaces inside a feature
# become underscores so it stays a single token, and the features themselves
# are separated by spaces.
# NOTE: this overwrites `features`, so the cell must not be run twice.
for frame in (train_df, test_df):
    frame['features'] = frame["features"].apply(
        lambda tags: " ".join(tag.replace(" ", "_") for tag in tags)
    )
print(train_df["features"].head())

# Despite its name, `tfidf` holds raw token counts (CountVectorizer) limited
# to 200 vocabulary entries; the vocabulary is learned from train only.
tfidf = CountVectorizer(stop_words='english', max_features=200)
tr_sparse = tfidf.fit_transform(train_df["features"])
te_sparse = tfidf.transform(test_df["features"])

In [None]:
# Class codes used for training: high -> 0, medium -> 1, low -> 2.
target_num_map = {'high':0, 'medium':1, 'low':2}
train_y = np.array(train_df['interest_level'].apply(target_num_map.__getitem__))

# Remove the text columns (and, for train, the target) before the matrices
# are assembled.
train_df = train_df.drop(
    columns=['description','features','interest_level','x_description','y_description']
)
test_df = test_df.drop(
    columns=['description','features','x_description','y_description']
)

In [None]:
# `features_to_use` is not assigned anywhere above this cell, so a fresh
# "Restart & Run All" stopped here with a NameError. Keep a list that is
# already defined; otherwise fall back to every non-boolean numeric column the
# two frames have in common.
# NOTE(review): the fallback is a guess at the intended feature set - replace
# it with an explicit list once that is settled.
if 'features_to_use' not in globals():
    features_to_use = [
        col for col in train_df.columns
        if col in test_df.columns
        and pd.api.types.is_numeric_dtype(train_df[col])
        and not pd.api.types.is_bool_dtype(train_df[col])
    ]

# Dense engineered columns first, then the token-count columns.
train_X = sparse.hstack([train_df[features_to_use], tr_sparse]).tocsr()
test_X = sparse.hstack([test_df[features_to_use], te_sparse]).tocsr()
print(train_X.shape, test_X.shape)

In [None]:
def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=0, num_rounds=1000):
    """Train a 3-class XGBoost model and predict class probabilities.

    Parameters
    ----------
    train_X, train_y : training matrix and integer class labels.
    test_X : matrix to predict on.
    test_y : optional labels for ``test_X``. When given, ``test_X`` is used as
        a watch set and training stops early after 20 rounds without
        improvement of its mlogloss.
    feature_names : optional list of column names attached to both matrices
        (previously accepted but silently ignored).
    seed_val : random seed handed to XGBoost.
    num_rounds : maximum number of boosting rounds.

    Returns
    -------
    (pred_test_y, model) : the predictions for ``test_X`` and the trained
        booster.
    """
    # NOTE(review): newer XGBoost releases replace 'silent' with 'verbosity';
    # the key is kept unchanged here - confirm against the installed version.
    param = {
        'objective': 'multi:softprob',
        'eta': 0.1,
        'max_depth': 6,
        'silent': 1,
        'num_class': 3,
        'eval_metric': "mlogloss",
        'min_child_weight': 1,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'seed': seed_val,
    }

    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y, feature_names=feature_names)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y, feature_names=feature_names)
        watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=20)
    else:
        # The test matrix must carry the same names as the training matrix.
        xgtest = xgb.DMatrix(test_X, feature_names=feature_names)
        model = xgb.train(plst, xgtrain, num_rounds)

    pred_test_y = model.predict(xgtest)
    return pred_test_y, model

In [None]:
cv_scores = []
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2016)
for dev_index, val_index in kf.split(range(train_X.shape[0])):
    dev_X = train_X[dev_index, :]
    val_X = train_X[val_index, :]
    dev_y = train_y[dev_index]
    val_y = train_y[val_index]

    # The validation fold doubles as the early-stopping watch set.
    preds, model = runXGB(dev_X, dev_y, val_X, val_y)
    cv_scores.append(log_loss(val_y, preds))
    print(cv_scores)

    # Only the first of the five folds is evaluated.
    break

In [None]:
#dic = model.get_score()
#sorted(dic.items(), key=lambda kv: kv[1], reverse=True)[:20]
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV

In [None]:
#train_df.info()
#features_to_use = ['building_id','price', 'price_be','latitude','longitude','display_address','street_address','created','price_ba','listing_id','manager_id']
# RFECV clones and refits its estimator, so it needs a scikit-learn style
# estimator that exposes feature_importances_ (or coef_). `model` from the CV
# cell is the raw booster returned by xgb.train and cannot be used here, so
# build an XGBClassifier with the same tree hyper-parameters as runXGB.
# The number of boosting rounds is left at the library default.
rfe_estimator = xgb.XGBClassifier(
    objective='multi:softprob',
    learning_rate=0.1,
    max_depth=6,
    min_child_weight=1,
    subsample=0.7,
    colsample_bytree=0.7,
)
rfecv = RFECV(estimator=rfe_estimator, step=1, cv=StratifiedKFold(2),
              scoring='accuracy')
rfecv.fit(train_X, train_y)

print("Optimal number of features : %d" % rfecv.n_features_)