In [1]:
import numpy as np
import operator
import os
import pandas as pd
import random
import re
from scipy import sparse
from sklearn import model_selection, ensemble
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import StratifiedKFold
import sys
import xgboost as xgb

In [2]:
# Read the train and test listing sets from JSON files under the relative
# "Downloads/" directory (resolved against the kernel's working directory).
train_df = pd.read_json("Downloads/Kaggle_TwoSigma_train.json")
test_df = pd.read_json("Downloads/Kaggle_TwoSigma_test.json")

In [3]:
# Load the "magic" photo-metadata table and attach it to every listing.
# A left join is used so that a listing with no row in photo_meta is kept
# (its metadata columns become NaN) instead of being silently dropped, which
# would shrink the submission and shift the positional row indices used by
# later cells.
photo_meta = pd.read_csv("Downloads/listing_image_time.csv")
train_df = pd.merge(train_df, photo_meta, on='listing_id', how='left')
test_df = pd.merge(test_df, photo_meta, on='listing_id', how='left')

### Basic features

In [7]:
# naive features

# Cap extreme rents at 13000 -- applied to the training set only.
train_df["price"] = train_df["price"].clip(upper=13000)

for frame in (train_df, test_df):
    # Price per bedroom and log-price.
    frame["price_t"] = frame["price"] / frame["bedrooms"]
    frame["logprice"] = np.log(frame["price"])

    # Number of photos attached to the listing.
    frame["num_photos"] = frame["photos"].apply(len)

# Outlier removal for number of bathrooms (test set, three rows by position).
# This runs BEFORE the room-based features below so that room_sum and
# price_per_room are derived from the corrected values.  A single positional
# .iloc assignment on the frame is used instead of chained
# `df[col].iloc[i] = v`, which may write to a temporary copy.
bathrooms_col = test_df.columns.get_loc("bathrooms")
test_df.iloc[[19671, 22977, 63719], bathrooms_col] = 2.0

# price per room
train_df["room_sum"] = train_df["bedrooms"] + train_df["bathrooms"]
test_df["room_sum"] = test_df["bedrooms"] + test_df["bathrooms"]
train_df['price_per_room'] = train_df['price'] / train_df['room_sum']
test_df['price_per_room'] = test_df['price'] / test_df['room_sum']

# count features
train_df["num_features"] = train_df["features"].apply(len)
test_df["num_features"] = test_df["features"].apply(len)

# count of space-separated tokens in the description column
train_df["num_description_words"] = train_df["description"].apply(lambda x: len(x.split(" ")))
test_df["num_description_words"] = test_df["description"].apply(lambda x: len(x.split(" ")))

# Creation timestamp: parse it, then expose month, day and hour as separate columns.
for frame in (train_df, test_df):
    frame["created"] = pd.to_datetime(frame["created"])
    created = frame["created"]
    frame["created_month"] = created.dt.month
    frame["created_day"] = created.dt.day
    frame["created_hour"] = created.dt.hour

# Note: tried using strftime, creating an integer field of month, date, hour. 
# Code below - commented out. Prediction was better using month, day, hour as 3 separate fields. 
# train_df["created2"] = train_df["created"].apply(lambda x: x.strftime('%m%d%H'))
# test_df["created2"] = test_df["created"].apply(lambda x: x.strftime('%m%d%H'))
# test_df["created2"] = test_df["created2"].astype(int)
# train_df["created2"] = train_df["created2"].astype(int)

In [None]:
# Columns fed to the model; later cells append further engineered columns.
features_to_use = (
    ["bathrooms", "bedrooms", "latitude", "longitude"]              # raw listing attributes
    + ["price", "logprice", "price_t", "price_per_room"]            # price-derived
    + ["num_photos", "num_features", "num_description_words"]       # simple counts
    + ["listing_id"]
    + ["created_month", "created_day", "created_hour"]              # creation time parts
    + ["time_stamp"]                                                # from the photo-metadata merge
)

### The XGB model

The xgboost function below comes from the [XGB starter notebook by SRK](https://www.kaggle.com/sudalairajkumar/two-sigma-connect-rental-listing-inquiries/xgb-starter-in-python). After some tuning, and based on cross-validation scores, I increased `num_rounds` to 2000 and decreased the learning rate *(param['eta'] = 0.02)*. 

In [10]:
def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=300, num_rounds=2000):
    """Train a 3-class ``multi:softprob`` XGBoost model and predict on ``test_X``.

    Parameters
    ----------
    train_X, train_y : training features and integer class labels.
    test_X : features to predict on.
    test_y : optional labels for ``test_X``. When given, ``test_X`` is used
        as a watchlist set and training stops early after 20 rounds without
        improvement on it.
    feature_names : optional list of column names attached to both DMatrix
        objects (previously accepted but ignored).
    seed_val : value passed as the booster ``seed`` parameter.
    num_rounds : maximum number of boosting rounds.

    Returns
    -------
    (pred_test_y, model) : the output of ``model.predict`` on ``test_X`` and
        the trained booster.
    """
    param = {
        'objective': 'multi:softprob',
        'eta': 0.02,
        'max_depth': 6,
        'silent': 1,
        'num_class': 3,
        'eval_metric': "mlogloss",
        'min_child_weight': 1,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'seed': seed_val,
    }
    plst = list(param.items())

    xgtrain = xgb.DMatrix(train_X, label=train_y, feature_names=feature_names)

    if test_y is not None:
        # Labelled hold-out set: monitor it and stop early.
        xgtest = xgb.DMatrix(test_X, label=test_y, feature_names=feature_names)
        watchlist = [(xgtrain, 'train'), (xgtest, 'test')]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=20)
    else:
        xgtest = xgb.DMatrix(test_X, feature_names=feature_names)
        model = xgb.train(plst, xgtrain, num_rounds)

    # NOTE(review): whether predict() uses the best iteration after early
    # stopping depends on the installed xgboost version -- confirm.
    pred_test_y = model.predict(xgtest)
    return pred_test_y, model

### Preprocessing High Cardinality Data - manager_id

The treatment of manager_id below comes from the ["CV statistics" notebook by @gdy5](https://www.kaggle.com/guoday/two-sigma-connect-rental-listing-inquiries/cv-statistics-better-parameters-and-explaination). 


In [11]:
# Out-of-fold target statistics for manager_id (5 folds).
# For each training row, the share of low / medium / high listings of its
# manager is computed from the other four folds only, so a row never
# contributes its own label.  Rows whose manager has no labelled listing in
# the other folds keep NaN.
level_slot = {'low': 0, 'medium': 1, 'high': 2}
n_rows = train_df.shape[0]

# Plain arrays: avoids building a Series per row with .iloc inside the loops.
manager_ids = train_df['manager_id'].values
interest_levels = train_df['interest_level'].values

index = list(range(n_rows))
# Dedicated seeded generator so the fold assignment (and therefore the
# resulting features) is reproducible across kernel restarts.
random.Random(300).shuffle(index)

share_low = [np.nan] * n_rows
share_medium = [np.nan] * n_rows
share_high = [np.nan] * n_rows

for i in range(5):
    test_index = index[int((i * n_rows) / 5):int(((i + 1) * n_rows) / 5)]
    held_out = set(test_index)

    # Per-manager [low, medium, high] counts over the rows outside this fold.
    manager_counts = {}
    for j in index:
        if j in held_out:
            continue
        slot = level_slot.get(interest_levels[j])
        if slot is None:
            continue  # label outside low/medium/high: not counted
        manager_counts.setdefault(manager_ids[j], [0, 0, 0])[slot] += 1

    for j in test_index:
        tally = manager_counts.get(manager_ids[j])
        if tally is None:
            continue  # manager unseen outside this fold -> stays NaN
        total = sum(tally)
        share_low[j] = tally[0] * 1.0 / total
        share_medium[j] = tally[1] * 1.0 / total
        share_high[j] = tally[2] * 1.0 / total

train_df['manager_level_low'] = share_low
train_df['manager_level_medium'] = share_medium
train_df['manager_level_high'] = share_high

In [12]:
# Manager statistics for the test set, computed from the full training set.
# Per-manager [low, medium, high] listing counts:
manager_counts = {manager: [0, 0, 0] for manager in train_df['manager_id'].values}
for manager, level in zip(train_df['manager_id'].values, train_df['interest_level'].values):
    if level == 'low':
        manager_counts[manager][0] += 1
    if level == 'medium':
        manager_counts[manager][1] += 1
    if level == 'high':
        manager_counts[manager][2] += 1

share_low, share_medium, share_high = [], [], []
for manager in test_df['manager_id'].values:
    if manager in manager_counts:
        tally = manager_counts[manager]
        share_low.append(tally[0] * 1.0 / sum(tally))
        share_medium.append(tally[1] * 1.0 / sum(tally))
        share_high.append(tally[2] * 1.0 / sum(tally))
    else:
        # Manager never seen in training: leave the statistics missing.
        share_low.append(np.nan)
        share_medium.append(np.nan)
        share_high.append(np.nan)

test_df['manager_level_low'] = share_low
test_df['manager_level_medium'] = share_medium
test_df['manager_level_high'] = share_high

features_to_use.extend(['manager_level_low', 'manager_level_medium', 'manager_level_high'])

### Geographical features

In [None]:
# Cluster centres, stored as [longitude, latitude].
location_dict = {
    'cluster1': [-73.98356655,  40.75323318],
    'cluster2': [-73.94459474,  40.82751759],
    'cluster3': [-73.95648445,  40.77901614],
    'cluster4': [-73.99855045,  40.71993652],
    'cluster5': [-73.95348287,  40.688674]
                }

# Approximate distance from every listing to every cluster centre.
# Fix: the centre's longitude (index 0) was previously subtracted from the
# listing latitude and vice versa; each coordinate is now paired with its own
# axis.
for frame in (train_df, test_df):
    for location, (center_lon, center_lat) in location_dict.items():
        dlat = center_lat - frame['latitude']
        dlon = (center_lon - frame['longitude']) * np.cos(np.deg2rad(41))  # adjust for NYC latitude
        frame['dist_' + location] = np.sqrt(dlat ** 2 + dlon ** 2) * 60     # distance in nautical miles

features_to_use.append("dist_cluster1")
features_to_use.append("dist_cluster2")
features_to_use.append("dist_cluster3")
features_to_use.append("dist_cluster4")
features_to_use.append("dist_cluster5")

In [None]:
# note: tried removing outliers 
# train_df=train_df[(train_df.longitude>train_df.longitude.quantile(0.005))
#                   &(train_df.longitude<train_df.longitude.quantile(0.995))
#                   &(train_df.latitude>train_df.latitude.quantile(0.005))                           
#                   &(train_df.latitude<train_df.latitude.quantile(0.995))]

# Grid cell key: longitude and latitude rounded to 3 decimals, joined by "_".
for frame in (train_df, test_df):
    lon_key = frame.longitude.round(3).astype(str)
    lat_key = frame.latitude.round(3).astype(str)
    frame["position"] = lon_key + '_' + lat_key

# Number of training listings per grid cell; cells not present in the
# training set fall back to the smallest observed count.
vals = train_df["position"].value_counts()
dvals = vals.to_dict()
fallback_count = vals.min()
for frame in (train_df, test_df):
    frame["density"] = frame["position"].apply(lambda key: dvals.get(key, fallback_count))

features_to_use.append("density")

### Text features

In [None]:
# text preprocessing for "features" 

def create_binary_features(df):
    """Add 0/1 keyword-indicator columns derived from the ``features`` lists.

    Each listing's ``features`` list is joined with spaces and lower-cased;
    for every keyword group below, ``feature_<group>`` is 1 when any of the
    group's keywords occurs as a substring of that text, else 0.

    The frame is modified in place and also returned.
    """
    # Every value must be a tuple of keywords.  Single-keyword groups need the
    # trailing comma: ("furnished") is just the string "furnished", and
    # iterating it would test individual characters instead of the keyword.
    bows = {
        "pet" : ("dogs_allowed", "cats_allowed", "pets_ok", "dogs allowed", "cats allowed", "pets allowed"),
        "nofee": ("no fee", "no-fee", "no  fee", "nofee", "no_fee"),
        "lowfee": ("reduced_fee", "low_fee", "reduced fee", "low fee"),
        "furnished": ("furnished",),
        "concierge": ("concierge", "doorman", "housekeep", "in_super"),
        "prewar": ("prewar", "pre_war", "pre war", "pre-war"),
        "laundry": ("laundry", "lndry"),
        "health": ("health", "gym", "fitness", "training"),
        "transport": ("train", "subway", "transport"),
        "parking": ("parking",),
        "utilities": ("utilities", "heat water", "water included"),
    }

    def indicator(bow):
        # Returns a function mapping a text to 1 if it contains any keyword.
        return lambda s: int(any(x in s for x in bow))

    features = df["features"].apply(lambda f: " ".join(f).lower())   # convert features to string
    for key in bows:
        df["feature_" + key] = features.apply(indicator(bows[key]))

    return df

# Adds the "feature_<group>" indicator columns to both frames in place.
# NOTE(review): none of these columns is appended to features_to_use in the
# visible cells, so the model does not receive them -- confirm whether that
# is intended.
create_binary_features(train_df)
create_binary_features(test_df)

In [None]:
# feature engineering on "description"

def cap_share(x):
    """Return the number of upper-case characters in ``x`` divided by ``len(x) + 1``."""
    upper_count = len([ch for ch in x if ch.isupper()])
    # The +1 keeps the denominator non-zero for an empty string.
    return upper_count / float(len(x) + 1)

# Phone-number pattern: 3 digits, 3 digits, 4 digits, each group separated by
# up to three non-digit characters, optionally preceded by "(".
# Compiled once (it was rebuilt on every loop pass) and written as a raw
# string so the backslash escapes are not interpreted by Python itself.
reg = re.compile(r".*?(\(?\d{3}\D{0,3}\d{3}\D{0,3}\d{4}).*?", re.S)

for df in [train_df, test_df]:
    # Share of upper-case characters in the description
    df['num_caps'] = df['description'].apply(cap_share)

    # Number of '<br /><br />' breaks in the description
    df['num_lines'] = df['description'].apply(lambda x: x.count('<br /><br />'))

    # Email address included: 1 when the description contains '@', else 0.
    # Computed in one step instead of `.ix` (removed from pandas) on a
    # chained selection.
    df['num_email'] = df['description'].str.contains('@', regex=False).astype(int)

def try_and_find_nr(description):
    """Return 1 if the module-level ``reg`` pattern matches ``description``, else 0."""
    return 0 if reg.match(description) is None else 1

# Flag descriptions that contain something shaped like a phone number.
for frame in (train_df, test_df):
    frame["num_phone_nr"] = frame['description'].apply(try_and_find_nr)

# Register the description-derived columns for the model.
features_to_use.extend(["num_caps", "num_lines", "num_phone_nr", "num_email"])

In [13]:
# transforming categorical variables into numerical continuous variables. plain old LabelEncoder.
# I tried get_dummies / OneHotEncoder but dimensions for "display_address" got out of control. 
# In retrospect, more text cleanup was needed to link duplicate addresses. 

categorical = ["display_address", "manager_id", "building_id"]

for f in categorical:
    # Only object-typed columns are encoded (and registered as model features).
    if train_df[f].dtype != 'object':
        continue
    # Fit on train + test together so every value seen in either set has a code.
    all_values = list(train_df[f].values) + list(test_df[f].values)
    lbl = LabelEncoder()
    lbl.fit(all_values)
    train_df[f] = lbl.transform(list(train_df[f].values))
    test_df[f] = lbl.transform(list(test_df[f].values))
    features_to_use.append(f)

In [14]:
# Bag-of-words counts (CountVectorizer) for "features".
# Each feature phrase becomes a single token by replacing its spaces with
# underscores; the tokens of one listing are then joined with spaces.
for frame in (train_df, test_df):
    frame['features'] = frame["features"].apply(
        lambda phrases: " ".join(phrase.replace(" ", "_") for phrase in phrases)
    )
print(train_df["features"].head())

# Vocabulary (at most 200 terms) is learned on the training set only.
tfidf = CountVectorizer(stop_words='english', max_features=200)
tr_sparse = tfidf.fit_transform(train_df["features"])
te_sparse = tfidf.transform(test_df["features"])

0                                                     
1    Doorman Elevator Fitness_Center Cats_Allowed D...
2    Laundry_In_Building Dishwasher Hardwood_Floors...
3                               Hardwood_Floors No_Fee
4                                              Pre-War
Name: features, dtype: object


In [15]:
# Design matrices: the selected dense columns followed by the bag-of-words counts, in CSR format.
train_blocks = [train_df[features_to_use], tr_sparse]
test_blocks = [test_df[features_to_use], te_sparse]
train_X = sparse.hstack(train_blocks).tocsr()
test_X = sparse.hstack(test_blocks).tocsr()

# Integer class labels for the target.
target_num_map = {'high':0, 'medium':1, 'low':2}
encoded_levels = train_df['interest_level'].apply(lambda level: target_num_map[level])
train_y = np.array(encoded_levels)
print(train_X.shape, test_X.shape)

((49352, 242), (74659, 242))


In [None]:
# Note on XGBoost: NaN and infinity are treated as specific values, accepted as input.
# As a result, there was no need to fillna() or replace() inf in this script. 

### Cross validation

In [None]:
# Hold-out check: the loop stops after the first of the 5 shuffled folds.
cv_scores = []
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=300)
for dev_index, val_index in kf.split(range(train_X.shape[0])):
    dev_X = train_X[dev_index, :]
    val_X = train_X[val_index, :]
    dev_y = train_y[dev_index]
    val_y = train_y[val_index]

    # Fewer rounds than the final fit to keep the check short.
    preds, model = runXGB(dev_X, dev_y, val_X, val_y, num_rounds=300)

    cv_scores.append(log_loss(val_y, preds))
    print(cv_scores)
    break

### Run model and save predictions

In [None]:
# Fit on the full training set and predict class probabilities for the test set.
preds, model = runXGB(train_X, train_y, test_X, num_rounds=2000)

# Column order follows target_num_map: 0 -> high, 1 -> medium, 2 -> low.
out_df = pd.DataFrame(preds, columns=["high", "medium", "low"])
out_df["listing_id"] = test_df.listing_id.values
out_df.to_csv("predictions.csv", index=False)