In [1]:
%matplotlib inline

from model import pipeline
from scipy.cluster.hierarchy import dendrogram, linkage
import model.pandas as mpd
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import fbeta_score
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression, LinearRegression
sns.set()



In [2]:
def quantileEncoder(cols, target, qs, qnames = None):
    """Build a join step that adds per-group quantiles of `target` as columns.

    Parameters
    ----------
    cols : list of str
        Columns to group by; also used as the join keys.
    target : str
        Column whose quantiles are computed within each group.
    qs : sequence of float
        Quantile levels passed to ``GroupBy.quantile``.
    qnames : sequence of str, optional
        Output column names, one per entry of `qs`. Defaults to
        ``"<cols joined by _>_encode_<q * 100>%"``.

    Returns
    -------
    Whatever ``mpd.joinFun`` returns for the quantile function, with
    ``on=cols`` and ``how='left'``.
    """
    if qnames is None:
        prefix = '_'.join(cols)
        qnames = [f"{prefix}_encode_{q * 100}%" for q in qs]

    def _group_quantiles(df):
        # Long (group, quantile) series -> one row per group, one column per quantile.
        per_group = df.groupby(cols)[target].quantile(qs)
        wide = per_group.unstack()
        return wide.rename(dict(zip(qs, qnames)), axis = 1)

    return mpd.joinFun(_group_quantiles, on = cols, how = 'left')

In [3]:
def dateDecoder(date_col):
    """Return a pipeline that derives 'dow' and 'month' from a datetime column.

    `date_col` must hold datetime values, since both mappers go through the
    ``.dt`` accessor. The mappers are applied in the order dow, then month.
    """
    extractors = [
        (lambda x: x.dt.dayofweek, 'dow'),
        (lambda x: x.dt.month, 'month'),
    ]

    return pipeline([mpd.columnMapper(fn, date_col, name) for fn, name in extractors])

def timeDecoder(time_col, include_minute = False):
    """Derive integer time-of-day features from an ``H:M`` string column.

    Parameters
    ----------
    time_col : str
        Name of the column holding strings such as ``"13:45"``.
    include_minute : bool, default False
        When False (the default, and the historical behaviour) only the
        'hour' mapper is returned. When True, a pipeline producing both
        'hour' and 'minute' is returned.

    Returns
    -------
    The 'hour' ``mpd.columnMapper`` alone, or ``pipeline([hour, minute])``
    when `include_minute` is True.

    Notes
    -----
    ``astype('int')`` raises if any value does not match the ``digits:digits``
    pattern, because the failed extraction yields NaN.
    """
    def _capture_int(pattern):
        # Pull the single capture group out of each string and cast it to int.
        return lambda x: x.str.extract(pattern, expand = False).astype('int')

    hour = mpd.columnMapper(_capture_int('([0-9]+):[0-9]+'), time_col, 'hour')
    if not include_minute:
        return hour

    # Previously this mapper was always built and then discarded; it is now
    # only constructed when the caller asks for it.
    minute = mpd.columnMapper(_capture_int('[0-9]+:([0-9]+)'), time_col, 'minute')
    return pipeline([hour, minute])
    

In [4]:
class E(mpd.Estimator):
    """Estimator adapter around a single fit-and-transform function.

    `fit_trans` is a callable taking the training frame and returning a
    2-tuple ``(fitted_transformer, transformed_training_frame)``.
    """

    def __init__(self, fit_trans):
        self.fit_trans = fit_trans

    def fit(self, df):
        """Fit on `df` and return only the fitted transformer."""
        # The callable has to be invoked: unpacking the function object itself
        # (as the previous code did) raises TypeError.
        t, _ = self.fit_trans(df)
        return t

    def fit_transform(self, df):
        """Fit on `df` and return ``(fitted_transformer, transformed_df)``."""
        return self.fit_trans(df)

def regularizedQuanitleEncoding(cols, target, qs, qnames = None, splits = 5):
    """Out-of-fold quantile encoding of `target` grouped by `cols`.

    On the training frame, each row's quantile columns are computed from the
    other K-1 folds only (unshuffled ``KFold``). The returned transformer,
    used for any later frame, joins quantiles computed on the full training
    frame.

    Parameters
    ----------
    cols : list of str
        Group-by columns, also the join keys.
    target : str
        Column whose quantiles are encoded.
    qs : sequence of float
        Quantile levels.
    qnames : sequence of str, optional
        Output column names; defaults to ``"<cols joined by _>_encode_<q * 100>%"``.
    splits : int, default 5
        Number of folds.

    Returns
    -------
    E
        Estimator whose fit function returns ``(mpd.MapT, encoded_train)``.
    """
    if qnames is None:
        prefix = '_'.join(cols)
        qnames = [f"{prefix}_encode_{q * 100}%" for q in qs]

    def quantile_table(frame):
        # One row per group, one named column per quantile level.
        wide = frame.groupby(cols)[target].quantile(qs).unstack()
        return wide.rename(dict(zip(qs, qnames)), axis = 1)

    def fit_qenc(train):
        encoded_folds = []

        for fit_idx, holdout_idx in KFold(n_splits=splits).split(train):
            holdout = train.iloc[holdout_idx]
            # Quantiles come from the other folds, so a row never sees its own target.
            fold_table = quantile_table(train.iloc[fit_idx])
            encoded_folds.append(holdout.join(fold_table, on = cols, how = 'left'))

        # Full-train table, applied to validation / test frames.
        means = quantile_table(train)

        return mpd.MapT(lambda df: df.join(means, on = cols, how = 'left')), pd.concat(encoded_folds)

    return E(fit_qenc)

In [5]:
def regularizedStuff(cols, target, rsuffix = None, splits = 5):
    """Out-of-fold mean / std / count encoding of `target` grouped by `cols`.

    On the training frame, each row receives statistics computed from the
    other K-1 folds only (unshuffled ``KFold``). The returned transformer
    joins statistics computed on the full training frame.

    New columns are named ``"<cols joined by _>_<target>_<stat><rsuffix>"``
    for stat in mean, std, count.

    Parameters
    ----------
    cols : list of str
        Group-by columns, also the join keys.
    target : str
        Column being summarised.
    rsuffix : str, optional
        Suffix appended to each new column name; ``None`` means no suffix.
    splits : int, default 5
        Number of folds.

    Returns
    -------
    E
        Estimator whose fit function returns ``(mpd.MapT, encoded_train)``.
    """
    if rsuffix is None:
        rsuffix = ''

    def group_stats(frame):
        stats = frame.groupby(cols)[target].agg(['mean', 'std', 'count'])
        prefix = '_'.join(cols) + '_' + target + '_'
        return stats.rename({stat : prefix + stat + rsuffix for stat in stats.columns}, axis = 1)

    def fit_qenc(train):
        encoded_folds = []

        for fit_idx, holdout_idx in KFold(n_splits=splits).split(train):
            holdout = train.iloc[holdout_idx]
            # Statistics come from the other folds only.
            fold_stats = group_stats(train.iloc[fit_idx])
            encoded_folds.append(holdout.join(fold_stats, on = cols, how = 'left'))

        # Full-train statistics, applied to validation / test frames.
        means = group_stats(train)
        return mpd.MapT(lambda df: df.join(means, on = cols, how = 'left')), pd.concat(encoded_folds)

    return E(fit_qenc)

In [6]:
# Training data, ordered chronologically so that the later unshuffled split is time-based.
data = pd.read_csv('../data/train-parking.csv', parse_dates=['Date'])
data = data.sort_values('Date')

# Test file (no labels, per its filename); read in file order, not sorted.
test = pd.read_csv('../data/test-no-labels-with-id.csv', parse_dates=['Date'])

In [7]:
# Intersection lookup table, keyed by the (Street1, Street2) pair.
intersection_info = pd.read_csv('../intersection_locations2.csv')
intersection_info = intersection_info.set_index(['Street1', 'Street2'])

In [82]:
# No shuffling: `data` is sorted by Date, so `val` is the most recent 30% of rows.
train, val = train_test_split(data, shuffle = False, test_size = 0.3)

In [88]:
# Row filter wrapped in mpd.trainOnly (presumably skipped when transforming
# non-training frames - confirm in model.pandas): keeps rows with fewer than
# 30 real spots and drops Redwood Street.
outlier_removal = mpd.trainOnly(
    lambda df: df[(df['Real.Spots'] < 30) & (df['Street'] != 'Redwood Street')]
)

# Calendar and time-of-day features.
de = dateDecoder('Date')
te = timeDecoder('Time')

# Out-of-fold target encodings. NOTE: rs1 is built here but is not part of final_pipe.
rs1 = regularizedStuff(['Street'], 'Real.Spots')
qe = regularizedQuanitleEncoding(['Street'], 'Real.Spots', [0,0.25,0.5,0.75,1])
qe2 = regularizedQuanitleEncoding(['hour'], 'Real.Spots', [0,0.25,0.5,0.75,1])

le = mpd.labelEncoder(['Street', 'From', 'To'])

# Joins the intersection table by matching ['Street', 'From'] against its index.
add_latlng = mpd.MapE(lambda df: df.join(intersection_info, on = ['Street', 'From'], how = 'left'))

# Extra columns to drop can be listed in `remove`.
remove = []
dropper = mpd.dropCols(['Date', 'Time'] + remove)

# Order matters: `te` creates 'hour' before `qe2` encodes on it, and the
# quantile encoders see the raw 'Street' values before `le` label-encodes them.
final_pipe = outlier_removal * pipeline([
    de,
    add_latlng,
    te,
    qe,
    qe2,
    le,
    dropper,
    mpd.MapE(lambda df: df.fillna(-1)),
])

In [97]:
# Fit the pipeline on the training split; `t` is the fitted transformer.
t, train2 = final_pipe.fit_transform(train)

# Apply the fitted transformer, first to validation and then to test.
val2, test2 = (t.transform(frame) for frame in (val, test))

In [105]:
# Features are every column except the two targets; the booster is trained
# as a regressor on the spot count ('Real.Spots').
dtrain = xgb.DMatrix(train2.drop(['Real.Spots', 'any_spot'], axis = 1), 
                     label = train2['Real.Spots'])
dval = xgb.DMatrix(val2.drop(['Real.Spots', 'any_spot'], axis = 1), 
                   label = val2['Real.Spots'])
# specify parameters via map
param = {'max_depth':4, 'eta':.01, 'colsample_bytree' : .8, 
         'subsample' : .8}
num_round = 400
bst = xgb.train(param, dtrain, num_round)
# make prediction
preds = bst.predict(dval)

# F0.5 of the thresholded regression output against the binary 'any_spot' label.
# `beta` is passed by keyword: it is keyword-only in current scikit-learn, so
# the positional form raises TypeError there.
# NOTE(review): the cut-off here is 1.1, while the importance and submission
# cells threshold the same predictions at 0.5 - confirm which one is intended.
print(fbeta_score(train2['any_spot'], (bst.predict(dtrain) > 1.1), beta = 0.5))
print(fbeta_score(val2['any_spot'], (preds > 1.1), beta = 0.5))

0.658838878017
0.51724137931


In [86]:
# Permutation importance on the validation frame: shuffle one feature at a
# time and record the relative drop in F0.5 against the unshuffled baseline.
df = val2
base_score = fbeta_score(
    df['any_spot'],
    (bst.predict(xgb.DMatrix(df.drop(['Real.Spots', 'any_spot'], axis = 1))) > 0.5),
    beta = 0.5,  # keyword-only in current scikit-learn
)
importances = []
# Fixed seed so the importance table is reproducible across re-runs; the same
# row permutation is reused for every column.
msk = np.random.RandomState(0).permutation(df.shape[0])
for col in df.drop(['Real.Spots', 'any_spot'], axis = 1).columns:
    # Replace just this column with its shuffled values; all others stay aligned.
    x = df.assign(**{col : df[col].iloc[msk].values})
    dx = xgb.DMatrix(x.drop(['Real.Spots', 'any_spot'], axis = 1), 
                         label = x['any_spot'])
    
    score = fbeta_score(x['any_spot'], bst.predict(dx) > 0.5, beta = 0.5)
    # Positive value: shuffling hurt the score, i.e. the feature carries signal.
    importances.append((col, (base_score - score) /base_score))
    
res = pd.DataFrame(importances, columns=['col', 'imp']).sort_values('imp', ascending =False)
res

Unnamed: 0,col,imp
0,Street.Length,0.13541
4,longitude,0.11034
3,latitude,0.055556
1,dow,0.031805
5,hour,0.022405
16,Street,0.02078
9,Street_encode_75.0%,0.019231
15,hour_encode_100%,0.017636
14,hour_encode_75.0%,0.014493
13,hour_encode_50.0%,0.009709


In [31]:
# Features whose shuffling improved the score (negative importance).
res.loc[res['imp'] < 0, 'col'].values

array(['d', 'Street_encode_100%', 'hour_encode_75.0%',
       'Street_Real.Spots_count', 'Street_Real.Spots_std',
       'hour_Real.Spots_mean', 'hour_Real.Spots_count', 'dow',
       'hour_Real.Spots_std', 'latitude', 'To', 'Street_Real.Spots_mean',
       'From'], dtype=object)

In [203]:
# Boolean prediction per test row: regression output thresholded at 0.5,
# with the 'id' column excluded from the feature matrix.
test_preds = bst.predict(
    xgb.DMatrix(test2.drop(['id'], axis = 1))
) > 0.5

In [206]:
# Write the submission file: one 'any_spot' column indexed by the test ids.
pd.DataFrame(
    {'any_spot' : test_preds},
    index = test2['id'],
).to_csv('../predictions/xgboost_new_pipeline_test.csv')