In [44]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [59]:
train = pd.read_csv('../data/train-parking.csv', parse_dates=[[3,4]], infer_datetime_format=True)
test=pd.read_csv('../data/test-no-label-parking.csv', parse_dates=[[3,4]], infer_datetime_format=True)
valid_dow = pd.read_csv('../data/valid_dow.csv', infer_datetime_format=True, index_col='index').set_index('index.1')
valid_hg = pd.read_csv('../data/valid_hg.csv', infer_datetime_format=True, index_col='index').set_index('index.1')
valid_swhg = pd.read_csv('../data/valid_swhg.csv', infer_datetime_format=True, index_col='index').set_index('index.1')

In [60]:
train.head()

Unnamed: 0,Date_Time,Street,From,To,Real.Spots,Street.Length,any_spot
0,2014-01-07 16:19:00,Mission Street,25th Street,26th Street,4,179.13297,1
1,2014-01-18 20:42:00,Polk Street,Ellis Street,Olive Street,0,52.74021,0
2,2014-01-18 20:39:00,Van Ness Avenue,Geary Boulevard,Myrtle Street,0,52.51784,0
3,2014-01-18 20:38:00,Van Ness Avenue,Bush Street,Fern Street,0,52.405315,0
4,2014-01-18 20:38:00,Van Ness Avenue,Daniel Burnham Court,Post Street,0,52.191193,0


##### Get the proper train and validation set

In [61]:
train_dow = train[~train.index.isin(valid_dow.index)]
train_hg = train[~train.index.isin(valid_hg.index)]
train_swhg = train[~train.index.isin(valid_swhg.index)]

## Select validation set - round 1

In [76]:
# get long and lat for train street
st_latlng = pd.read_csv("../data/train_longlat.csv", index_col = False,  dtype = {'lat':np.float64,'lng':np.float64})

In [77]:
train_dow = pd.merge(train_dow,st_latlng,how ='left',left_on=["Street","From","To"], right_on=["Street","From","To"])
valid_dow= pd.merge(valid_dow,st_latlng,how ='left',left_on=["Street","From","To"], right_on=["Street","From","To"])

In [78]:
# sensor related data
sensor = pd.read_csv('../data/aggregated_sensor.csv')

In [79]:
sensor['TIME_OF_DAY'] = sensor['TIME_OF_DAY']/100
sensor['TIME_OF_DAY'] = sensor['TIME_OF_DAY'].astype('int')

In [81]:
# Clustering
from sklearn.cluster import KMeans
spots = np.vstack(train_dow[['lat','lng']].values)
kmeans = KMeans().fit(spots)
train_dow['street_cluster'] = kmeans.predict(train_dow[['lat', 'lng']])
valid_dow['street_cluster'] = kmeans.predict(valid_dow[['lat', 'lng']])

In [88]:
# Time features
train_dow['Minute'] = train_dow.Date_Time.dt.minute
valid_dow['Minute'] = pd.to_datetime(valid_dow['Date_Time']).dt.minute
train_dow['Hour'] = train_dow.Date_Time.dt.hour
valid_dow['Hour'] = pd.to_datetime(valid_dow['Date_Time']).dt.hour
train_dow['Dow'] = train_dow.Date_Time.dt.weekday
valid_dow['Dow'] = pd.to_datetime(valid_dow['Date_Time']).dt.weekday

In [90]:
# Convert the time (Hour, Minute) to a numerical value XXXX
train_dow['Time'] = train_dow["Hour"]*100 + train_dow['Minute']
valid_dow['Time'] = valid_dow["Hour"]*100 + valid_dow['Minute']

In [91]:
def day_type(x):
    if x == 6 or x == 0:
        return 1
    else:
        return 0
    
train_dow['isweekend'] = train_dow['Dow'].apply(day_type)
valid_dow['isweekend'] = valid_dow['Dow'].apply(day_type)

In [92]:
# Single mean encoding
from sklearn.model_selection import KFold

def multi_reg_mean_encoding(train, cols, splits =5):
    kf = KFold(n_splits = splits)
    global_mean = train.any_spot.mean()
    
    for col in cols:
        means = train.groupby(col).any_spot.mean()
        train[col+'_mean_enc'] = train[col].map(means)        
        for tr_ind,val_ind in kf.split(train):
            tr,val = train.iloc[tr_ind],train.iloc[val_ind]
            foldmeans = tr.groupby(col).any_spot.mean()
            train.loc[val_ind,col+"_mean_enc"] = train.loc[val_ind,col].map(foldmeans)    
        train[col+"_mean_enc"].fillna(global_mean,inplace=True)   
        
#mean encoding for validation and test data
def multi_test_mean_encoding(test, train, cols):
    for col in cols:
        global_mean = train.any_spot.mean()
        means = train.groupby(col).any_spot.mean()
        test[col+"_mean_enc"] = test[col].map(means)
        test[col+"_mean_enc"].fillna(global_mean, inplace=True)

In [93]:
#create a column for From_To pair
train_dow['From_To'] = train_dow['From'] + '_' + train_dow['To']
valid_dow['From_To'] = valid_dow['From'] + '_' + valid_dow['To']

In [94]:
cols = ['Dow','isweekend','From_To','Street','Hour', 'street_cluster' ]
multi_reg_mean_encoding(train_dow, cols, splits =5)
multi_test_mean_encoding(valid_dow, train_dow, cols)

In [95]:
# Double variable mean encoding
colpairs = [('street_cluster','Dow'),('Dow','Hour'), ('street_cluster','Hour'),
            ('street_cluster','isweekend'),('From_To','Dow'),('From_To','Hour')]
pairlist = []
for pair in colpairs:
    colname = pair[0] + '_' + pair[1]
    pairlist.append(colname)
    train_dow[colname] = list(zip(train_dow[pair[0]], train_dow[pair[1]]))
    valid_dow[colname] = list(zip(valid_dow[pair[0]], valid_dow[pair[1]]))

In [96]:
multi_reg_mean_encoding(train_dow, pairlist, splits =5)
multi_test_mean_encoding(valid_dow, train_dow, pairlist)

In [100]:
#drop Data_Time, Real.Spots, street_cluster_Dow, Dow_Hour, street_cluster_Hour, street_cluster_isweekend, From_To_Dow, From_To_Hour
train_dow = train_dow.drop(['Date_Time','Real.Spots', 'street_cluster_Dow', 'Dow_Hour', 'street_cluster_Hour', 'street_cluster_isweekend', 'From_To_Dow', 'From_To_Hour'], axis =1)
valid_dow = valid_dow.drop(['Date_Time', 'Real.Spots', 'street_cluster_Dow', 'Dow_Hour', 'street_cluster_Hour', 'street_cluster_isweekend', 'From_To_Dow', 'From_To_Hour'], axis = 1)

In [101]:
train_dow['Street'] = train_dow.Street.str.lower()
train_dow['From'] = train_dow.From.str.lower()
train_dow['To'] = train_dow.To.str.lower()
valid_dow['Street'] = valid_dow.Street.str.lower()
valid_dow['From'] = valid_dow.From.str.lower()
valid_dow['To'] = valid_dow.To.str.lower()

In [102]:
# Join sensor data:
train_dow = pd.merge(train_dow,sensor, how='left',left_on=['Street','Dow','Hour'], right_on=['STREET_NAME','DOW','TIME_OF_DAY'])
valid_dow = pd.merge(valid_dow,sensor, how='left',left_on=['Street','Dow','Hour'], right_on=['STREET_NAME','DOW','TIME_OF_DAY'])

train_dow = train_dow.drop(['STREET_NAME','DOW','TIME_OF_DAY'],axis =1)
valid_dow = valid_dow.drop(['STREET_NAME','DOW','TIME_OF_DAY'],axis =1)

In [104]:
# convert categorical variables to code
def process_dfs(train_df, test_df, cols):
    # transfer categorical 
    for name in cols:
        train_df[name]=train_df[name].astype('category').cat.as_ordered()
        test_df[name] = pd.Categorical(test_df[name], categories=train_df[name].cat.categories, ordered=True)
    return train_df, test_df
  
def cat2code(train_df, test_df, cols):
    # transfer categorical 
    for name in cols:
        train_df[name]= train_df[name].cat.codes
        test_df[name] = test_df[name].cat.codes
    return train_df, test_df

In [105]:
catcols = ['Street','From','To', 'From_To']
train_dow,valid_dow = process_dfs(train_dow.copy(),valid_dow.copy(),catcols)
train_dow,valid_dow = cat2code(train_dow,valid_dow,catcols)

In [108]:
# split X and y
def split_response(df, response):
    y = df[response].values
    df.drop([response],axis=1,inplace=True)
    return df,y

In [109]:
X_train_dow,y_train_dow = split_response(train_dow.copy(),'any_spot')
X_valid_dow,y_valid_dow = split_response(valid_dow.copy(),'any_spot')

In [116]:
def f05_score_soft(labels, preds):
    tp = np.sum((labels==(preds>0.5)) & (labels==1))
    tn = np.sum((labels==(preds<0.5)) & (labels==0))
    fp = np.sum((preds>0.5))-tp
    fn = np.sum(preds<0.5)-tn
    p = tp/(tp+fp)
    r = tp/(tp+fn)
    score = 1.25*p*r/(0.25*p+r)
    return score

def f05_score_hard(labels, preds):
    tp = np.sum((labels==preds) & (labels==1))
    tn = np.sum((labels==preds) & (labels==0))
    fp = np.sum(preds==1)-tp
    fn = np.sum(preds==0)-tn
#    print(tp, tn, fp, fn)
    p = tp*1.0/(tp+fp)
    r = tp*1.0/(tp+fn)
    score = 1.25*p*r/(0.25*p+r)
    return score

In [148]:
# Manual gridsearch using validation set
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, accuracy_score
import itertools

n_estimators = [20, 40, 60, 80]
max_depth = [3,5,10,15,20,25,30]
max_features = [0.2,0.4,0.5,0.6,0.8,0.9,1]
weight = [0.7, 0.8, 0.9, 1, 1.1, 1.2, 1.3, 1.4, 1.5]
para_lists = [n_estimators, max_depth, max_features, weight]
combo = list(itertools.product(*para_lists))

In [150]:
parameters = []
f05 = []
precision = []
recall = []

for p in combo:
    np.random.seed(42)
    m = RandomForestClassifier(n_estimators = p[0], max_depth =p[1], max_features=p[2],class_weight={0:1,1:p[3]}, min_samples_split = 2, n_jobs=-1)
    m.fit(X_train_dow,y_train_dow)
    pred = m.predict(X_valid_dow)
    prec = precision_score(y_valid_dow, pred)
    rec = recall_score(y_valid_dow, pred)
    f = f05_score_hard(y_valid_dow, pred)
    parameters.append(p)
    f05.append(f)
    precision.append(prec)
    recall.append(rec)

table1 = pd.DataFrame({'Parameters': parameters, 'f0.5': f05, 'precision':precision, 'recall':recall})

In [151]:
table1.sort_values('f0.5', ascending=False, inplace=True)

In [152]:
table1.head()

Unnamed: 0,Parameters,f0.5,precision,recall
514,"(40, 5, 0.4, 0.8)",0.616883,0.655172,0.5
19,"(20, 3, 0.5, 0.8)",0.615672,0.6875,0.434211
531,"(40, 5, 0.6, 0.7)",0.607639,0.660377,0.460526
75,"(20, 5, 0.4, 1)",0.603933,0.614286,0.565789
943,"(60, 3, 1, 1.4)",0.600649,0.637931,0.486842


In [153]:
# use the best hyperparameters (40, 5, 0.4, 0.8) to get feature importance:
np.random.seed(42)
m2 = RandomForestClassifier(n_estimators = 40, max_depth =5, max_features=0.4, class_weight={0:1,1:0.8}, min_samples_split = 2, n_jobs=-1)
m2.fit(X_train_dow,y_train_dow)

feat_imp_dow = pd.DataFrame({
    'features': X_train_dow.columns,
    'imp' :m2.feature_importances_
})
feat_imp_dow.sort_values(by='imp', inplace=True, ascending=False)

In [178]:
feat_imp_dow

Unnamed: 0,features,imp
15,From_To_mean_enc,0.19073
3,Street.Length,0.0827
20,Dow_Hour_mean_enc,0.069543
36,GMP_UNKNOWN_TIME_pct,0.038704
10,Time,0.037968
5,lng,0.035539
17,Hour_mean_enc,0.031345
12,From_To,0.027748
4,lat,0.027581
2,To,0.025569


In [170]:
to_drop_dow = feat_imp_dow[feat_imp_dow['imp'] <= 0.01].features

### DOW-retrain entire model - remember to drop insignificant columns in the end

In [156]:
train = pd.merge(train,st_latlng,how ='left',left_on=["Street","From","To"], right_on=["Street","From","To"])
test= pd.merge(test,st_latlng,how ='left',left_on=["Street","From","To"], right_on=["Street","From","To"])

spots = np.vstack(train[['lat','lng']].values)
kmeans = KMeans().fit(spots)
train['street_cluster'] = kmeans.predict(train[['lat', 'lng']])
test['street_cluster'] = kmeans.predict(test[['lat', 'lng']])

In [157]:
train['Minute'] = train.Date_Time.dt.minute
test['Minute'] = pd.to_datetime(test['Date_Time']).dt.minute
train['Hour'] = train.Date_Time.dt.hour
test['Hour'] = pd.to_datetime(test['Date_Time']).dt.hour
train['Dow'] = train.Date_Time.dt.weekday
test['Dow'] = pd.to_datetime(test['Date_Time']).dt.weekday

In [158]:
train['Time'] = train["Hour"]*100 + train['Minute']
test['Time'] = test["Hour"]*100 + test['Minute']
train['isweekend'] = train['Dow'].apply(day_type)
test['isweekend'] = test['Dow'].apply(day_type)
train['From_To'] = train['From'] + '_' + train['To']
test['From_To'] = test['From'] + '_' + test['To']

In [159]:
cols = ['Dow','isweekend','From_To','Street','Hour', 'street_cluster' ]
multi_reg_mean_encoding(train, cols, splits =5)
multi_test_mean_encoding(test, train, cols)

In [160]:
colpairs = [('street_cluster','Dow'),('Dow','Hour'), ('street_cluster','Hour'),
            ('street_cluster','isweekend'),('From_To','Dow'),('From_To','Hour')]
pairlist = []
for pair in colpairs:
    colname = pair[0] + '_' + pair[1]
    pairlist.append(colname)
    train[colname] = list(zip(train[pair[0]], train[pair[1]]))
    test[colname] = list(zip(test[pair[0]], test[pair[1]]))

In [161]:
multi_reg_mean_encoding(train, pairlist, splits =5)
multi_test_mean_encoding(test, train, pairlist)

In [162]:
train = train.drop(['Date_Time','Real.Spots', 'street_cluster_Dow', 'Dow_Hour', 'street_cluster_Hour', 'street_cluster_isweekend', 'From_To_Dow', 'From_To_Hour'], axis =1)
test = test.drop(['Date_Time', 'street_cluster_Dow', 'Dow_Hour', 'street_cluster_Hour', 'street_cluster_isweekend', 'From_To_Dow', 'From_To_Hour'], axis = 1)

In [163]:
train['Street'] = train.Street.str.lower()
train['From'] = train.From.str.lower()
train['To'] = train.To.str.lower()
test['Street'] = test.Street.str.lower()
test['From'] = test.From.str.lower()
test['To'] = test.To.str.lower()

In [164]:
train = pd.merge(train,sensor, how='left',left_on=['Street','Dow','Hour'], right_on=['STREET_NAME','DOW','TIME_OF_DAY'])
test = pd.merge(test,sensor, how='left',left_on=['Street','Dow','Hour'], right_on=['STREET_NAME','DOW','TIME_OF_DAY'])

train = train.drop(['STREET_NAME','DOW','TIME_OF_DAY'],axis =1)
test = test.drop(['STREET_NAME','DOW','TIME_OF_DAY'],axis =1)

In [165]:
catcols = ['Street','From','To', 'From_To']
train,test = process_dfs(train.copy(),test.copy(),catcols)
train,test = cat2code(train,test,catcols)

In [167]:
X_train,y_train = split_response(train.copy(),'any_spot')

In [172]:
# drop unimportant features
X_train.drop(to_drop_dow, axis = 1, inplace=True)
test.drop(to_drop_dow, axis = 1, inplace=True)

In [175]:
# Fit rf
np.random.seed(42)
m_dow = RandomForestClassifier(n_estimators = 40, max_depth =5, max_features=0.4, class_weight={0:1,1:0.8}, min_samples_split = 2, n_jobs=-1)
m_dow.fit(X_train,y_train)
pred_dow = m_dow.predict(test)

In [176]:
result = pd.DataFrame({"id": np.arange(1,len(test)+1),"any_spot":pred_dow})

In [177]:
result[['id','any_spot']].to_csv("../submissions/32_dow_test.csv", index =False)

By DOW: 

Result from leaderboard: 0.55749

on validation set: 0.616883