In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [2]:
train = pd.read_csv('../data/train-parking.csv', parse_dates=[[3,4]], infer_datetime_format=True)
test=pd.read_csv('../data/test-no-label-parking.csv', parse_dates=[[3,4]], infer_datetime_format=True)
# valid_dow = pd.read_csv('../data/valid_dow.csv', infer_datetime_format=True, index_col='index').set_index('index.1')
# valid_hg = pd.read_csv('../data/valid_hg.csv', infer_datetime_format=True, index_col='index').set_index('index.1')
valid_swhg = pd.read_csv('../data/valid_swhg.csv', infer_datetime_format=True, index_col='index').set_index('index.1')

In [3]:
train.head()

Unnamed: 0,Date_Time,Street,From,To,Real.Spots,Street.Length,any_spot
0,2014-01-07 16:19:00,Mission Street,25th Street,26th Street,4,179.13297,1
1,2014-01-18 20:42:00,Polk Street,Ellis Street,Olive Street,0,52.74021,0
2,2014-01-18 20:39:00,Van Ness Avenue,Geary Boulevard,Myrtle Street,0,52.51784,0
3,2014-01-18 20:38:00,Van Ness Avenue,Bush Street,Fern Street,0,52.405315,0
4,2014-01-18 20:38:00,Van Ness Avenue,Daniel Burnham Court,Post Street,0,52.191193,0


##### Get the proper train and validation set

In [4]:
# train_dow = train[~train.index.isin(valid_dow.index)]
# train_hg = train[~train.index.isin(valid_hg.index)]
train_swhg = train[~train.index.isin(valid_swhg.index)]

## Select validation set - round 1

In [5]:
# get long and lat for train street
st_latlng = pd.read_csv("../data/train_longlat.csv", index_col = False,  dtype = {'lat':np.float64,'lng':np.float64})

In [6]:
train_swhg = pd.merge(train_swhg,st_latlng,how ='left',left_on=["Street","From","To"], right_on=["Street","From","To"])
valid_swhg= pd.merge(valid_swhg,st_latlng,how ='left',left_on=["Street","From","To"], right_on=["Street","From","To"])

In [7]:
# sensor related data
sensor = pd.read_csv('../data/aggregated_sensor.csv')

In [8]:
sensor['TIME_OF_DAY'] = sensor['TIME_OF_DAY']/100
sensor['TIME_OF_DAY'] = sensor['TIME_OF_DAY'].astype('int')

In [9]:
valid_swhg

Unnamed: 0,Date_Time,Street,From,To,Real.Spots,Street.Length,any_spot,lat,lng
0,2014-01-21 17:35:00,23rd Street,Mission Street,Bartlett Street,1,96.037110,1,37.753846,-122.418619
1,2014-02-15 16:50:00,23rd Street,South Van Ness Avenue,Capp Street,1,96.301300,1,37.753976,-122.416441
2,2014-03-01 16:59:00,23rd Street,Mission Street,Bartlett Street,3,96.037110,1,37.753846,-122.418619
3,2014-03-15 08:15:00,Battery Street,Halleck Street,California Street,0,54.143090,0,37.793702,-122.400085
4,2014-02-24 11:10:00,Battery Street,California Street,Pine Street,0,107.501144,0,37.793224,-122.399989
5,2014-03-22 15:24:00,Bush Street,Taylor Street,Mason Street,4,146.525740,1,37.789758,-122.412102
6,2014-03-15 21:27:00,Bush Street,Taylor Street,Mason Street,1,146.525740,1,37.789758,-122.412102
7,2014-03-21 22:06:00,Bush Street,Taylor Street,Mason Street,1,146.525740,1,37.789758,-122.412102
8,2014-03-15 20:32:00,Bush Street,Taylor Street,Mason Street,2,146.525740,1,37.789758,-122.412102
9,2014-03-22 13:16:00,Geary Street,Leavenworth Street,Hyde Street,9,146.586320,1,37.786532,-122.414834


In [10]:
# Clustering
from sklearn.cluster import KMeans
spots = np.vstack(train_swhg[['lat','lng']].values)
kmeans = KMeans().fit(spots)
train_swhg['street_cluster'] = kmeans.predict(train_swhg[['lat', 'lng']])
valid_swhg['street_cluster'] = kmeans.predict(valid_swhg[['lat', 'lng']])

In [11]:
# Time features
train_swhg['Minute'] = train_swhg.Date_Time.dt.minute
valid_swhg['Minute'] = pd.to_datetime(valid_swhg['Date_Time']).dt.minute
train_swhg['Hour'] = train_swhg.Date_Time.dt.hour
valid_swhg['Hour'] = pd.to_datetime(valid_swhg['Date_Time']).dt.hour
train_swhg['Dow'] = train_swhg.Date_Time.dt.weekday
valid_swhg['Dow'] = pd.to_datetime(valid_swhg['Date_Time']).dt.weekday

In [12]:
# Convert the time (Hour, Minute) to a numerical value XXXX
train_swhg['Time'] = train_swhg["Hour"]*100 + train_swhg['Minute']
valid_swhg['Time'] = valid_swhg["Hour"]*100 + valid_swhg['Minute']

In [13]:
def day_type(x):
    if x == 6 or x == 0:
        return 1
    else:
        return 0
    
train_swhg['isweekend'] = train_swhg['Dow'].apply(day_type)
valid_swhg['isweekend'] = valid_swhg['Dow'].apply(day_type)

In [14]:
# Single mean encoding
from sklearn.model_selection import KFold

def multi_reg_mean_encoding(train, cols, splits =5):
    kf = KFold(n_splits = splits)
    global_mean = train.any_spot.mean()
    
    for col in cols:
        means = train.groupby(col).any_spot.mean()
        train[col+'_mean_enc'] = train[col].map(means)        
        for tr_ind,val_ind in kf.split(train):
            tr,val = train.iloc[tr_ind],train.iloc[val_ind]
            foldmeans = tr.groupby(col).any_spot.mean()
            train.loc[val_ind,col+"_mean_enc"] = train.loc[val_ind,col].map(foldmeans)    
        train[col+"_mean_enc"].fillna(global_mean,inplace=True)   
        
#mean encoding for validation and test data
def multi_test_mean_encoding(test, train, cols):
    for col in cols:
        global_mean = train.any_spot.mean()
        means = train.groupby(col).any_spot.mean()
        test[col+"_mean_enc"] = test[col].map(means)
        test[col+"_mean_enc"].fillna(global_mean, inplace=True)

In [15]:
#create a column for From_To pair
train_swhg['From_To'] = train_swhg['From'] + '_' + train_swhg['To']
valid_swhg['From_To'] = valid_swhg['From'] + '_' + valid_swhg['To']

In [16]:
cols = ['Dow','isweekend','From_To','Street','Hour', 'street_cluster' ]
multi_reg_mean_encoding(train_swhg, cols, splits =5)
multi_test_mean_encoding(valid_swhg, train_swhg, cols)

In [17]:
# Double variable mean encoding
colpairs = [('street_cluster','Dow'),('Dow','Hour'), ('street_cluster','Hour'),
            ('street_cluster','isweekend'),('From_To','Dow'),('From_To','Hour')]
pairlist = []
for pair in colpairs:
    colname = pair[0] + '_' + pair[1]
    pairlist.append(colname)
    train_swhg[colname] = list(zip(train_swhg[pair[0]], train_swhg[pair[1]]))
    valid_swhg[colname] = list(zip(valid_swhg[pair[0]], valid_swhg[pair[1]]))

In [18]:
multi_reg_mean_encoding(train_swhg, pairlist, splits =5)
multi_test_mean_encoding(valid_swhg, train_swhg, pairlist)

In [19]:
#drop Data_Time, Real.Spots, street_cluster_Dow, Dow_Hour, street_cluster_Hour, street_cluster_isweekend, From_To_Dow, From_To_Hour
train_swhg = train_swhg.drop(['Date_Time','Real.Spots', 'street_cluster_Dow', 'Dow_Hour', 'street_cluster_Hour', 'street_cluster_isweekend', 'From_To_Dow', 'From_To_Hour'], axis =1)
valid_swhg = valid_swhg.drop(['Date_Time', 'Real.Spots', 'street_cluster_Dow', 'Dow_Hour', 'street_cluster_Hour', 'street_cluster_isweekend', 'From_To_Dow', 'From_To_Hour'], axis = 1)

In [20]:
train_swhg['Street'] = train_swhg.Street.str.lower()
train_swhg['From'] = train_swhg.From.str.lower()
train_swhg['To'] = train_swhg.To.str.lower()
valid_swhg['Street'] = valid_swhg.Street.str.lower()
valid_swhg['From'] = valid_swhg.From.str.lower()
valid_swhg['To'] = valid_swhg.To.str.lower()

In [21]:
# Join sensor data:
train_swhg = pd.merge(train_swhg,sensor, how='left',left_on=['Street','Dow','Hour'], right_on=['STREET_NAME','DOW','TIME_OF_DAY'])
valid_swhg = pd.merge(valid_swhg,sensor, how='left',left_on=['Street','Dow','Hour'], right_on=['STREET_NAME','DOW','TIME_OF_DAY'])

train_swhg = train_swhg.drop(['STREET_NAME','DOW','TIME_OF_DAY','Minute'],axis =1)
valid_swhg = valid_swhg.drop(['STREET_NAME','DOW','TIME_OF_DAY','Minute'],axis =1)

In [22]:
# convert categorical variables to code
def process_dfs(train_df, test_df, cols):
    # transfer categorical 
    for name in cols:
        train_df[name]=train_df[name].astype('category').cat.as_ordered()
        test_df[name] = pd.Categorical(test_df[name], categories=train_df[name].cat.categories, ordered=True)
    return train_df, test_df
  
def cat2code(train_df, test_df, cols):
    # transfer categorical 
    for name in cols:
        train_df[name]= train_df[name].cat.codes
        test_df[name] = test_df[name].cat.codes
    return train_df, test_df

In [23]:
catcols = ['Street','From','To', 'From_To']
train_swhg,valid_swhg = process_dfs(train_swhg.copy(),valid_swhg.copy(),catcols)
train_swhg,valid_swhg = cat2code(train_swhg,valid_swhg,catcols)

In [24]:
# split X and y
def split_response(df, response):
    y = df[response].values
    df.drop([response],axis=1,inplace=True)
    return df,y

In [25]:
X_train_swhg,y_train_swhg = split_response(train_swhg.copy(),'any_spot')
X_valid_swhg,y_valid_swhg = split_response(valid_swhg.copy(),'any_spot')

In [26]:
def f05_score_soft(labels, preds):
    tp = np.sum((labels==(preds>0.5)) & (labels==1))
    tn = np.sum((labels==(preds<0.5)) & (labels==0))
    fp = np.sum((preds>0.5))-tp
    fn = np.sum(preds<0.5)-tn
    p = tp/(tp+fp)
    r = tp/(tp+fn)
    score = 1.25*p*r/(0.25*p+r)
    return score

def f05_score_hard(labels, preds):
    tp = np.sum((labels==preds) & (labels==1))
    tn = np.sum((labels==preds) & (labels==0))
    fp = np.sum(preds==1)-tp
    fn = np.sum(preds==0)-tn
#    print(tp, tn, fp, fn)
    p = tp*1.0/(tp+fp)
    r = tp*1.0/(tp+fn)
    score = 1.25*p*r/(0.25*p+r)
    return score

In [28]:
# Manual gridsearch using validation set
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, accuracy_score
import itertools

n_estimators = [20, 40, 60, 80]
max_depth = [3,5,10,15,20,25,30]
max_features = [0.2,0.4,0.5,0.6,0.8,0.9,1]
weight = [0.7, 0.8, 0.9, 1, 1.1, 1.2, 1.3, 1.4, 1.5]
para_lists = [n_estimators, max_depth, max_features, weight]
combo = list(itertools.product(*para_lists))

In [78]:
parameters = []
f05 = []
precision = []
recall = []

for p in combo:
    np.random.seed(1)
    m = RandomForestClassifier(n_estimators = p[0], max_depth =p[1], max_features=p[2],class_weight={0:1,1:p[3]}, min_samples_split = 2, n_jobs=-1)
    m.fit(X_train_swhg,y_train_swhg)
    pred = m.predict(X_valid_swhg)
    prec = precision_score(y_valid_swhg, pred)
    rec = recall_score(y_valid_swhg, pred)
    f = f05_score_hard(y_valid_swhg, pred)
    parameters.append(p)
    f05.append(f)
    precision.append(prec)
    recall.append(rec)

table3 = pd.DataFrame({'Parameters': parameters, 'f0.5': f05, 'precision':precision, 'recall':recall})

In [79]:
table3.sort_values('f0.5', ascending=False, inplace=True)

In [80]:
table3.head()

Unnamed: 0,Parameters,f0.5,precision,recall
687,"(40, 15, 1, 1)",0.59375,0.703704,0.365385
346,"(20, 25, 0.6, 1.1)",0.592105,0.72,0.346154
283,"(20, 20, 0.6, 1.1)",0.592105,0.72,0.346154
409,"(20, 30, 0.6, 1.1)",0.592105,0.72,0.346154
59,"(20, 3, 1, 1.25)",0.576923,0.923077,0.230769


In [37]:
# use the best hyperparameters (80, 25, 1, 0.9) to get feature importance:
#(20,15,1,1.2)42
#(20,10,1,1.1)1
#(20,25,0.9,1.4)1

best = (20,15,1,1.2)
np.random.seed(42)
m2 = RandomForestClassifier(n_estimators = best[0], max_depth =best[1], max_features=best[2], class_weight={0:1,1:best[3]}, min_samples_split = 2, n_jobs=-1)
m2.fit(X_train_swhg,y_train_swhg)

feat_imp_swhg = pd.DataFrame({
    'features': X_train_swhg.columns,
    'imp' :m2.feature_importances_
})
feat_imp_swhg.sort_values(by='imp', inplace=True, ascending=False)

In [38]:
pred2 = m2.predict(X_valid_swhg)
prec2 = precision_score(y_valid_swhg, pred2)
rec2 = recall_score(y_valid_swhg, pred2)
f2 = f05_score_hard(y_valid_swhg, pred2)
f2

0.49479166666666657

In [39]:
feat_imp_swhg

Unnamed: 0,features,imp
9,Time,0.046287
3,Street.Length,0.045467
14,From_To_mean_enc,0.043144
4,lat,0.036671
5,lng,0.036631
2,To,0.036368
11,From_To,0.035328
1,From,0.032855
18,street_cluster_Dow_mean_enc,0.027303
7,Hour,0.026811


In [88]:
to_drop_swhg = feat_imp_swhg[feat_imp_swhg['imp'] <= 0.015].features

### DOW-retrain entire model - remember to drop insignificant columns in the end

In [93]:
train = pd.merge(train,st_latlng,how ='left',left_on=["Street","From","To"], right_on=["Street","From","To"])
test= pd.merge(test,st_latlng,how ='left',left_on=["Street","From","To"], right_on=["Street","From","To"])

spots = np.vstack(train[['lat','lng']].values)
kmeans = KMeans().fit(spots)
train['street_cluster'] = kmeans.predict(train[['lat', 'lng']])
test['street_cluster'] = kmeans.predict(test[['lat', 'lng']])

In [94]:
train['Minute'] = train.Date_Time.dt.minute
test['Minute'] = pd.to_datetime(test['Date_Time']).dt.minute
train['Hour'] = train.Date_Time.dt.hour
test['Hour'] = pd.to_datetime(test['Date_Time']).dt.hour
train['Dow'] = train.Date_Time.dt.weekday
test['Dow'] = pd.to_datetime(test['Date_Time']).dt.weekday

In [95]:
train['Time'] = train["Hour"]*100 + train['Minute']
test['Time'] = test["Hour"]*100 + test['Minute']
train['isweekend'] = train['Dow'].apply(day_type)
test['isweekend'] = test['Dow'].apply(day_type)
train['From_To'] = train['From'] + '_' + train['To']
test['From_To'] = test['From'] + '_' + test['To']

In [96]:
cols = ['Dow','isweekend','From_To','Street','Hour', 'street_cluster' ]
multi_reg_mean_encoding(train, cols, splits =5)
multi_test_mean_encoding(test, train, cols)

In [97]:
colpairs = [('street_cluster','Dow'),('Dow','Hour'), ('street_cluster','Hour'),
            ('street_cluster','isweekend'),('From_To','Dow'),('From_To','Hour')]
pairlist = []
for pair in colpairs:
    colname = pair[0] + '_' + pair[1]
    pairlist.append(colname)
    train[colname] = list(zip(train[pair[0]], train[pair[1]]))
    test[colname] = list(zip(test[pair[0]], test[pair[1]]))

In [98]:
multi_reg_mean_encoding(train, pairlist, splits =5)
multi_test_mean_encoding(test, train, pairlist)

In [99]:
train = train.drop(['Date_Time','Real.Spots', 'street_cluster_Dow', 'Dow_Hour', 'street_cluster_Hour', 'street_cluster_isweekend', 'From_To_Dow', 'From_To_Hour'], axis =1)
test = test.drop(['Date_Time', 'street_cluster_Dow', 'Dow_Hour', 'street_cluster_Hour', 'street_cluster_isweekend', 'From_To_Dow', 'From_To_Hour'], axis = 1)

In [100]:
train['Street'] = train.Street.str.lower()
train['From'] = train.From.str.lower()
train['To'] = train.To.str.lower()
test['Street'] = test.Street.str.lower()
test['From'] = test.From.str.lower()
test['To'] = test.To.str.lower()

In [101]:
train = pd.merge(train,sensor, how='left',left_on=['Street','Dow','Hour'], right_on=['STREET_NAME','DOW','TIME_OF_DAY'])
test = pd.merge(test,sensor, how='left',left_on=['Street','Dow','Hour'], right_on=['STREET_NAME','DOW','TIME_OF_DAY'])

train = train.drop(['STREET_NAME','DOW','TIME_OF_DAY','Minute'],axis =1)
test = test.drop(['STREET_NAME','DOW','TIME_OF_DAY','Minute'],axis =1)

In [102]:
catcols = ['Street','From','To', 'From_To']
train,test = process_dfs(train.copy(),test.copy(),catcols)
train,test = cat2code(train,test,catcols)

In [103]:
X_train,y_train = split_response(train.copy(),'any_spot')

In [104]:
# drop unimportant features
X_train.drop(to_drop_swhg, axis = 1, inplace=True)
test.drop(to_drop_swhg, axis = 1, inplace=True)

In [105]:
# Fit rf
np.random.seed(1)
m_swhg = RandomForestClassifier(n_estimators = 20, max_depth =25, max_features=0.6, class_weight={0:1,1:1.1}, min_samples_split = 2, n_jobs=-1)
m_swhg.fit(X_train,y_train)
pred_swhg = m_swhg.predict(test)

In [106]:
result = pd.DataFrame({"id": np.arange(1,len(test)+1),"any_spot":pred_swhg})

In [107]:
result[['id','any_spot']].to_csv("../submissions/32_swhg_droptime1.csv", index =False)

By Street-dow-hour_group:

Result from leaderboard: 0.56851

on validation set: 0.578035

v2 validationset
Seed 42
LB: 0.568
VAL: 0.568


Seed 1
LB: 0.60
VL: 0.59
