In [None]:
import numpy as np
import pandas as pd
from scipy import sparse
import xgboost as xgb
import random
from sklearn import model_selection, preprocessing, ensemble
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [None]:
# Read the train and test listings from the JSON files under ../input.
train_path, test_path = "../input/train.json", "../input/test.json"
train_df = pd.read_json(train_path)
test_df = pd.read_json(test_path)

In [None]:
# Helper that trains an XGBoost multi-class model and predicts on a second matrix.
def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=0, num_rounds=2000):
    """Train a 3-class ``multi:softprob`` XGBoost model and predict on ``test_X``.

    Parameters
    ----------
    train_X, train_y
        Training features and labels, handed to ``xgb.DMatrix``.
    test_X
        Features to predict on.
    test_y
        Optional labels for ``test_X``. When given, ``test_X`` is also used as
        the evaluation set in the watchlist and training runs with
        ``early_stopping_rounds=20``.
    feature_names
        Optional list of feature names attached to both DMatrix objects.
        ``None`` (the default) lets xgboost pick its own names.
    seed_val
        Value for the ``seed`` booster parameter.
    num_rounds
        Maximum number of boosting rounds.

    Returns
    -------
    (pred_test_y, model)
        The output of ``model.predict`` on ``test_X`` and the trained booster.
    """
    param = {
        'objective': 'multi:softprob',
        'eta': 0.02,
        'max_depth': 6,
        # NOTE(review): newer xgboost releases replace `silent` with
        # `verbosity` -- confirm against the installed version.
        'silent': 1,
        'num_class': 3,
        'eval_metric': "mlogloss",
        'min_child_weight': 1,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'seed': seed_val,
    }
    plst = list(param.items())

    # feature_names used to be accepted but ignored; it is now forwarded so the
    # trained model carries readable names (train and test must agree).
    xgtrain = xgb.DMatrix(train_X, label=train_y, feature_names=feature_names)

    if test_y is not None:
        # Labelled hold-out: monitor it and stop once it stops improving.
        xgtest = xgb.DMatrix(test_X, label=test_y, feature_names=feature_names)
        watchlist = [(xgtrain, 'train'), (xgtest, 'test')]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=20)
    else:
        # No labels available: train for the full number of rounds.
        xgtest = xgb.DMatrix(test_X, feature_names=feature_names)
        model = xgb.train(plst, xgtrain, num_rounds)

    pred_test_y = model.predict(xgtest)
    return pred_test_y, model


#Feature Engineering

In [None]:
# Bedroom and bathroom features
for frame in (train_df, test_df):
    # Rows with bedrooms == 0 would make the ratio +inf; mapping the zero to
    # NaN leaves the ratio missing for those rows instead.
    bedrooms_nonzero = frame["bedrooms"].replace(0, np.nan)
    frame["price_per_bedroom"] = frame["price"] / bedrooms_nonzero

    # Total number of rooms (bedrooms plus bathrooms).
    frame["room_sum"] = frame["bedrooms"] + frame["bathrooms"]


In [None]:
# Number of entries in each listing's "photos" value
for frame in (train_df, test_df):
    frame["num_photos"] = frame["photos"].map(len)

In [None]:
# Number of entries in each listing's "features" value
for frame in (train_df, test_df):
    frame["num_features"] = frame["features"].map(len)


In [None]:
# Number of single-space-separated pieces in the description column.
# Splitting on " " yields one more piece than there are spaces, so an empty
# description counts as 1.
for frame in (train_df, test_df):
    frame["num_description_words"] = frame["description"].map(
        lambda text: text.count(" ") + 1
    )


In [None]:
# Parse the creation timestamp and derive calendar features from it.
for frame in (train_df, test_df):
    created = pd.to_datetime(frame["created"])
    frame["created"] = created

    # Whole days between each listing and the most recent listing in the SAME
    # frame.
    # NOTE(review): train and test therefore use different reference dates --
    # confirm this is intended before using "passed" as a model feature.
    frame["passed"] = (created.max() - created).dt.days

    stamp = created.dt
    frame["created_year"] = stamp.year
    frame["created_month"] = stamp.month
    frame["created_day"] = stamp.day
    frame["created_hour"] = stamp.hour


In [None]:
# Frequency encoding of manager_id on the training set: the number of rows per
# manager (non-null interest_level) and the log-odds of that manager's share of
# all training rows. The column is called "manager_id_odds" but holds
# log(p / (1 - p)).
nrow = train_df.shape[0]
manager = train_df.groupby(['manager_id'])['interest_level'].count()
manager2 = manager.rename("manager_id_count").reset_index()

manager_prob = manager2["manager_id_count"] / nrow
manager2["manager_id_odds"] = np.log(manager_prob / (1 - manager_prob))

# Drop columns left by a previous run of this cell; merging onto them again
# would produce manager_id_count_x / _y instead of replacing the values.
train_df = train_df.drop(columns=["manager_id_count", "manager_id_odds"], errors="ignore")
train_df = pd.merge(train_df, manager2, on='manager_id', how='left')
train_df.head()

In [None]:
# Frequency encoding of manager_id on the test set: the number of rows per
# manager (non-null listing_id) and the log-odds of that manager's share of all
# test rows. Counts and shares are computed from the test frame only.
nrow = test_df.shape[0]
manager = test_df.groupby(['manager_id'])['listing_id'].count()
manager2 = manager.rename("manager_id_count").reset_index()

manager_prob = manager2["manager_id_count"] / nrow
manager2["manager_id_odds"] = np.log(manager_prob / (1 - manager_prob))

# Drop columns left by a previous run of this cell; merging onto them again
# would produce manager_id_count_x / _y instead of replacing the values.
test_df = test_df.drop(columns=["manager_id_count", "manager_id_odds"], errors="ignore")
test_df = pd.merge(test_df, manager2, on='manager_id', how='left')
test_df.head()

In [None]:
# Frequency encoding of building_id on the training set: the number of rows per
# building (non-null interest_level) and the log-odds of that building's share
# of all training rows. The column is called "building_id_odds" but holds
# log(p / (1 - p)).
nrow = train_df.shape[0]
building = train_df.groupby(['building_id'])['interest_level'].count()
building2 = building.rename("building_id_count").reset_index()

building_prob = building2["building_id_count"] / nrow
building2["building_id_odds"] = np.log(building_prob / (1 - building_prob))

# Drop columns left by a previous run of this cell; merging onto them again
# would produce building_id_count_x / _y instead of replacing the values.
train_df = train_df.drop(columns=["building_id_count", "building_id_odds"], errors="ignore")
train_df = pd.merge(train_df, building2, on='building_id', how='left')
train_df.head()

In [None]:
# Frequency encoding of building_id on the test set: the number of rows per
# building (non-null listing_id) and the log-odds of that building's share of
# all test rows. Counts and shares are computed from the test frame only.
nrow = test_df.shape[0]
building = test_df.groupby(['building_id'])['listing_id'].count()
building2 = building.rename("building_id_count").reset_index()

building_prob = building2["building_id_count"] / nrow
building2["building_id_odds"] = np.log(building_prob / (1 - building_prob))

# Drop columns left by a previous run of this cell; merging onto them again
# would produce building_id_count_x / _y instead of replacing the values.
test_df = test_df.drop(columns=["building_id_count", "building_id_odds"], errors="ignore")
test_df = pd.merge(test_df, building2, on='building_id', how='left')
test_df.head()

In [None]:
# Base feature columns for the model.
features_to_use = [
    "bathrooms", "bedrooms", "latitude", "longitude", "price", "price_per_bedroom",
    "num_photos", "num_features", "num_description_words", "listing_id",
    "created_year", "created_month", "created_day", "created_hour",
    "manager_id_count", "manager_id_odds", "building_id_count", "building_id_odds",
]

# Set-up for a cross-validated estimate of the posterior
# P(interest_level = low / medium / high | manager_id).
# The original author notes (citing Barreca's paper) that a manager with very
# few listings gives an unreliable estimate and that blending with the prior
# would help; that blending is not implemented here.
# a / b / c start as all-NaN, so any row that is never filled in stays NaN
# (missing), not 0.
FOLD_SEED = 0  # fixed so the shuffled row order is the same on every run
index = list(range(train_df.shape[0]))
# A private Random instance keeps the shuffle reproducible without touching the
# global random state.
random.Random(FOLD_SEED).shuffle(index)
a = [np.nan] * len(train_df)
b = [np.nan] * len(train_df)
c = [np.nan] * len(train_df)

In [None]:
# Out-of-fold target encoding of manager_id: for each of 5 folds, tally the
# interest levels per manager on the other folds and write the resulting
# low / medium / high fractions into a / b / c for the held-out rows.
LEVEL_SLOT = {'low': 0, 'medium': 1, 'high': 2}
n_folds = 5
n_rows = train_df.shape[0]

# Pull the two columns out once. Looking rows up with train_df.iloc[j] inside
# the loops builds a full Series per row, which dominates the runtime.
manager_ids = train_df['manager_id'].values
interest_levels = train_df['interest_level'].values

for i in range(n_folds):
    # The i-th slice of the shuffled positions is the held-out part.
    test_index = index[int((i * n_rows) / n_folds):int(((i + 1) * n_rows) / n_folds)]
    held_out = set(test_index)

    # Per-manager [low, medium, high] counts over the remaining rows.
    manager_level_counts = {}
    for j in index:
        if j in held_out:
            continue
        slot = LEVEL_SLOT.get(interest_levels[j])
        if slot is not None:
            manager_level_counts.setdefault(manager_ids[j], [0, 0, 0])[slot] += 1

    # Managers with no counted rows in the other folds keep the NaN default.
    for j in test_index:
        counts = manager_level_counts.get(manager_ids[j])
        if counts is not None:
            total = sum(counts)
            a[j] = counts[0] * 1.0 / total
            b[j] = counts[1] * 1.0 / total
            c[j] = counts[2] * 1.0 / total

In [None]:
# Attach the per-row manager level fractions (a = low, b = medium, c = high)
# to the training frame.
level_columns = ['manager_level_low', 'manager_level_medium', 'manager_level_high']
for column_name, column_values in zip(level_columns, (a, b, c)):
    train_df[column_name] = column_values

In [None]:
# Display the first rows of the training frame for a quick visual check.
train_df.head()

In [None]:
# Estimate P(interest_level | manager_id) for the test set from the full
# training set. Managers that cannot be estimated from the training data
# (unseen, or no row with a recognised level) get NaN.
LEVEL_SLOT = {'low': 0, 'medium': 1, 'high': 2}

# Per-manager [low, medium, high] counts, built in one pass over the two
# columns instead of materialising every row with train_df.iloc[j].
manager_level_counts = {}
for manager, level in zip(train_df['manager_id'].values, train_df['interest_level'].values):
    slot = LEVEL_SLOT.get(level)
    if slot is not None:
        manager_level_counts.setdefault(manager, [0, 0, 0])[slot] += 1

a = []
b = []
c = []
for manager in test_df['manager_id'].values:
    counts = manager_level_counts.get(manager)
    if counts is None:
        # Only managers with at least one counted row are stored, so this
        # branch also avoids a division by zero.
        a.append(np.nan)
        b.append(np.nan)
        c.append(np.nan)
    else:
        total = sum(counts)
        a.append(counts[0] * 1.0 / total)
        b.append(counts[1] * 1.0 / total)
        c.append(counts[2] * 1.0 / total)


In [None]:
# Attach the manager level fractions (a = low, b = medium, c = high) to the
# test frame.
test_df['manager_level_low'] = a
test_df['manager_level_medium'] = b
test_df['manager_level_high'] = c

# Register the new columns once; a plain append would add duplicate names
# (and duplicate model columns) every time this cell is re-run.
for name in ('manager_level_low', 'manager_level_medium', 'manager_level_high'):
    if name not in features_to_use:
        features_to_use.append(name)

# Convert the categorical variables to integer labels.
categorical = ["display_address", "manager_id", "building_id"]
for f in categorical:
    if train_df[f].dtype == 'object':
        # Fit on train + test together so every value seen in either frame
        # has a label.
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(train_df[f].values) + list(test_df[f].values))
        train_df[f] = lbl.transform(list(train_df[f].values))
        test_df[f] = lbl.transform(list(test_df[f].values))
    # Register the column even when it was already encoded by an earlier run;
    # otherwise it would silently disappear from the feature list.
    if f not in features_to_use:
        features_to_use.append(f)


In [None]:
# Turn each listing's feature tags into a bag-of-words count matrix, keeping
# the 200 most frequent tokens.
# NOTE: the vectorizer is a CountVectorizer (raw counts, no TF-IDF weighting)
# even though the variable is called `tfidf`; the name is kept so that any
# other cell referring to it keeps working.
def _features_to_text(tags):
    """Join a listing's feature tags into one string, one token per tag.

    Spaces inside a tag become underscores so a multi-word tag stays a single
    token. A value that is already a string is returned unchanged: that is the
    state after a previous run of this cell, and iterating it again would
    split it into single characters.
    """
    if isinstance(tags, str):
        return tags
    return " ".join(tag.replace(" ", "_") for tag in tags)


train_df['features'] = train_df["features"].apply(_features_to_text)
test_df['features'] = test_df["features"].apply(_features_to_text)
print(train_df["features"].head())

# Fit the vocabulary on the training text only, then reuse it for the test text.
tfidf = CountVectorizer(stop_words='english', max_features=200)
tr_sparse = tfidf.fit_transform(train_df["features"])
te_sparse = tfidf.transform(test_df["features"])

In [None]:
# Display the current list of feature column names.
features_to_use

In [None]:
# Place the engineered columns and the bag-of-words counts side by side and
# convert the result to CSR format.
stacked_train = sparse.hstack([train_df[features_to_use], tr_sparse])
stacked_test = sparse.hstack([test_df[features_to_use], te_sparse])
train_X = stacked_train.tocsr()
test_X = stacked_test.tocsr()

# Integer class ids for the target; an unknown level raises KeyError.
target_num_map = {'high':0, 'medium':1, 'low':2}
encoded_levels = train_df['interest_level'].apply(lambda level: target_num_map[level])
train_y = np.array(encoded_levels)
print(train_X.shape, test_X.shape)

In [None]:
# Final prediction: train on the full training matrix, predict the test matrix
# and write the result to CSV.
preds, model = runXGB(train_X, train_y, test_X, num_rounds=2000)

# Column order follows target_num_map (high=0, medium=1, low=2).
out_df = pd.DataFrame(preds, columns=["high", "medium", "low"])
out_df["listing_id"] = test_df["listing_id"].values
out_df.to_csv("xgb_v2.csv", index=False)

