In [1]:
import numpy as np
import pandas as pd

In [2]:
test_raw = pd.read_csv('../data/test-no-label-parking.csv', parse_dates = [[3,4]], infer_datetime_format=True)

In [3]:
test_raw['Minute'] = test_raw.Date_Time.dt.minute
test_raw['Hour'] = test_raw.Date_Time.dt.hour
test_raw['Dow'] = test_raw.Date_Time.dt.weekday
test_raw['Year'] = test_raw.Date_Time.dt.year
test_raw['Month'] = test_raw.Date_Time.dt.month
def day_type(x):
    if x == 6 or x == 0:
        return 1
    else:
        return 0
test_raw['isweekend'] = test_raw['Dow'].apply(day_type)

In [4]:
test_raw.head()

Unnamed: 0,Date_Time,Street,From,To,Street.Length,Minute,Hour,Dow,Year,Month,isweekend
0,2014-03-28 16:34:00,Stockton Street,Stockton Tunnel,Stockton Street,35.786472,34,16,4,2014,3,0
1,2014-03-28 21:34:00,Van Ness Avenue,Hayes Street,Van Ness Avenue,63.787968,34,21,4,2014,3,0
2,2014-03-28 19:50:00,Van Ness Avenue,McAllister Street,Van Ness Avenue,56.007236,50,19,4,2014,3,0
3,2014-03-28 20:02:00,Mission Street,11th Street,Mission Street,139.6519,2,20,4,2014,3,0
4,2014-03-28 19:43:00,Hyde Street,Golden Gate Avenue,McAllister Street,105.14411,43,19,4,2014,3,0


In [6]:
train_raw = pd.read_csv('../data/train-parking.csv', parse_dates = [[3,4]], infer_datetime_format=True)
train_raw['Minute'] = train_raw.Date_Time.dt.minute
train_raw['Hour'] = train_raw.Date_Time.dt.hour
train_raw['Dow'] = train_raw.Date_Time.dt.weekday
train_raw['Year'] = train_raw.Date_Time.dt.year
train_raw['Month'] = train_raw.Date_Time.dt.month
train_raw['isweekend'] = train_raw['Dow'].apply(day_type)

In [7]:
train_raw.groupby(['Hour'])['Date_Time'].count()

Hour
7      63
8      36
9      58
10     32
11     31
12     33
13    101
14    109
15     97
16     98
17     61
18     99
19     93
20     93
21     91
22      5
Name: Date_Time, dtype: int64

In [8]:
# notice that in test set we have time of 23, which is not present in the train
test_raw.groupby(['Hour'])['Date_Time'].count()

Hour
7       6
8      48
9      51
10     73
11     76
12     23
13     21
14     37
15     29
16     43
17      3
18     26
19    102
20     92
21     72
22     20
23      4
Name: Date_Time, dtype: int64

In [9]:
train_raw.groupby(['Dow'])['Date_Time'].count()

Dow
0    179
1     94
2    112
3     91
4    229
5    284
6    111
Name: Date_Time, dtype: int64

In [10]:
# notice that we dont even have Sunday on testset - so we won't include sundays in validation set
test_raw.groupby(['Dow'])['Date_Time'].count()

Dow
1     54
2    248
3     38
4    194
5    120
6     72
Name: Date_Time, dtype: int64

In [11]:
print('train length: {} test length: {}'.format(train_raw.shape[0], test_raw.shape[0]))

train length: 1100 test length: 726


In [12]:
train_raw[train_raw['Dow'] != 0].groupby(['Dow'])['Date_Time'].count()/train_raw[train_raw['Dow'] != 0].shape[0]

Dow
1    0.102063
2    0.121607
3    0.098806
4    0.248643
5    0.308360
6    0.120521
Name: Date_Time, dtype: float64

In [13]:
test_raw.groupby(['Dow'])['Date_Time'].count()/test_raw.shape[0]

Dow
1    0.074380
2    0.341598
3    0.052342
4    0.267218
5    0.165289
6    0.099174
Name: Date_Time, dtype: float64

In [14]:
(train_raw.groupby(['Street'])['Date_Time'].count()-test_raw.groupby(['Street'])['Date_Time'].count()*train_raw.shape[0]/test_raw.shape[0]).astype('int')

Street
23rd Street           60
Battery Street         0
Bryant Street         11
Bush Street           -2
Columbus Avenue       -1
Geary Street         -14
Grove Street         -13
Hyde Street           -5
Jackson Street         3
Jessie Street          4
Jones Street         -34
Kearny Street        -16
Larkin Street        -31
Leavenworth Street    -5
Mason Street          -7
Mission Street        34
Montgomery Street     -7
Pine Street           15
Polk Street            0
Post Street           -1
Redwood Street        -2
Stockton Street       -8
Sutter Street         -6
Taylor Street        -13
Van Ness Avenue       23
Washington Street     19
Name: Date_Time, dtype: int64

### Validation 1: use the same proportion of oberservation by day of week as the test set

In [66]:
test_prop = test_raw.groupby(['Dow'])['Date_Time'].count()/test_raw.shape[0]

In [67]:
# assume that we want to have ~250 observations in the validation set, so we will select the following number of rows in the train set for each dow:
n_rows_dow=(test_prop*250).astype('int')

In [68]:
valid_dow = pd.DataFrame()
for dow, i in zip(n_rows_dow.index, n_rows_dow):
    df_tmp = train_raw[train_raw['Dow']==dow].reset_index()
    rows = np.random.choice(df_tmp.index.values, i, replace=False)
    sampled_df = df_tmp.iloc[rows]
    valid_dow = pd.concat([valid_dow, sampled_df])

In [69]:
valid_dow.head()

Unnamed: 0,index,Date_Time,Street,From,To,Real.Spots,Street.Length,any_spot,Minute,Hour,Dow,Year,Month,isweekend
27,113,2014-01-28 15:41:00,Jackson Street,Polk Street,Van Ness Avenue,3,139.32397,1,41,15,1,2014,1,0
39,389,2014-02-11 13:38:00,Stockton Street,Stockton Street,Jackson Street,0,84.28298,0,38,13,1,2014,2,0
44,394,2014-02-11 13:43:00,Stockton Street,Stockton Tunnel,Stockton Street,0,35.786472,0,43,13,1,2014,2,0
80,927,2014-03-25 15:41:00,Montgomery Street,Verdi Place,Pacific Avenue,0,66.27028,0,41,15,1,2014,3,0
50,400,2014-02-11 13:15:00,Montgomery Street,Washington Street,Montgomery Street,0,51.695087,0,15,13,1,2014,2,0


In [71]:
valid_dow.drop(['Minute','Hour','Dow','Year','Month','isweekend'], axis=1, inplace=True)

In [52]:
# train_raw = pd.read_csv('../data/train-parking.csv', parse_dates = [[3,4]], infer_datetime_format=True)
# valid_dow = valid_dow[train_raw.columns]

In [81]:
valid_dow.to_csv('../data/valid_dow.csv', index_label='index')

### Validation 2: use the same proportion of oberservation by hour_group as the test set

In [15]:
def hour_group(x):
    if x <= 11:
        return 'morning'
    elif x > 11 and x<=17:
        return  'afternoon'
    else:
        return 'night'

In [16]:
train_raw['Hour_group'] = train_raw['Hour'].apply(hour_group)
test_raw['Hour_group'] = test_raw['Hour'].apply(hour_group)

In [17]:
train_raw.head()

Unnamed: 0,Date_Time,Street,From,To,Real.Spots,Street.Length,any_spot,Minute,Hour,Dow,Year,Month,isweekend,Hour_group
0,2014-01-07 16:19:00,Mission Street,25th Street,26th Street,4,179.13297,1,19,16,1,2014,1,0,afternoon
1,2014-01-18 20:42:00,Polk Street,Ellis Street,Olive Street,0,52.74021,0,42,20,5,2014,1,0,night
2,2014-01-18 20:39:00,Van Ness Avenue,Geary Boulevard,Myrtle Street,0,52.51784,0,39,20,5,2014,1,0,night
3,2014-01-18 20:38:00,Van Ness Avenue,Bush Street,Fern Street,0,52.405315,0,38,20,5,2014,1,0,night
4,2014-01-18 20:38:00,Van Ness Avenue,Daniel Burnham Court,Post Street,0,52.191193,0,38,20,5,2014,1,0,night


In [76]:
test_raw.groupby(['Hour_group'])['Date_Time'].count()/test_raw.shape[0]

Hour_group
afternoon    0.214876
morning      0.349862
night        0.435262
Name: Date_Time, dtype: float64

In [77]:
train_raw.groupby(['Hour_group'])['Date_Time'].count()/train_raw.shape[0]

Hour_group
afternoon    0.453636
morning      0.200000
night        0.346364
Name: Date_Time, dtype: float64

In [78]:
test_prop_hg = test_raw.groupby(['Hour_group'])['Date_Time'].count()/test_raw.shape[0]

# assume that we want to have ~250 observations in the validation set, so we will select the following number of rows in the train set for each dow:
n_rows_hg=(test_prop_hg*250).astype('int')

In [79]:
valid_hg = pd.DataFrame()
for hg, i in zip(n_rows_hg.index, n_rows_hg):
    df_tmp = train_raw[train_raw['Hour_group']==hg].reset_index()
    rows = np.random.choice(df_tmp.index.values, i, replace=False)
    sampled_df = df_tmp.iloc[rows]
    valid_hg = pd.concat([valid_hg, sampled_df])

In [80]:
valid_hg.drop(['Minute','Hour','Dow','Year','Month','isweekend','Hour_group'], axis=1, inplace=True)

In [83]:
valid_hg.head()

Unnamed: 0,index,Date_Time,Street,From,To,Real.Spots,Street.Length,any_spot
448,983,2014-03-26 16:01:00,Bush Street,Taylor Street,Mason Street,0,146.52574,0
445,979,2014-03-26 14:10:00,Polk Street,Bonita Street,Green Street,0,51.76525,0
69,134,2014-01-30 16:27:00,Polk Street,Jackson Street,Washington Street,0,98.40858,0
44,101,2014-01-28 13:56:00,Van Ness Avenue,Clay Street,Washington Street,2,98.93023,1
493,1036,2014-03-27 13:21:00,Washington Street,Hotaling Street,Columbus Avenue,0,51.84337,0


In [84]:
valid_hg.to_csv('../data/valid_hg.csv', index_label='index')

### Group by street , is weekend and hour group

In [18]:
frac = test_raw.groupby(['Street','isweekend','Hour_group'])['Date_Time'].count()/test_raw.shape[0]
frac = frac.reset_index()

In [19]:
frac['pct'] = frac['Date_Time']
frac['count'] = (300*frac['pct']).astype('int')
frac = frac.drop('Date_Time', axis =1)

In [20]:
frac.head()

Unnamed: 0,Street,isweekend,Hour_group,pct,count
0,23rd Street,0,afternoon,0.011019,3
1,23rd Street,0,morning,0.002755,0
2,Battery Street,0,afternoon,0.002755,0
3,Battery Street,0,morning,0.004132,1
4,Battery Street,0,night,0.004132,1


In [23]:
valset = pd.DataFrame()
for i in range(len(frac)):
    street, wkn, hg, n = frac.iloc[i][['Street','isweekend','Hour_group','count']]
    df_tmp = train_raw[(train_raw['Street']==street) & 
                       (train_raw['isweekend']==wkn) & 
                       (train_raw['Hour_group']==hg)].reset_index()
    # In order not to take all the data in one segment into the val set, we only take sample if the number of samples
    # in training is 2 times or more than the number of sample we want to take into val
    if (len(df_tmp)>= 2*n) and (n!=0) :
        rows = np.random.choice(df_tmp.index.values, n, replace=False)
        sampled_df = df_tmp.iloc[rows]
        valset = pd.concat([valset, sampled_df])

In [24]:
valset.shape

(156, 15)

In [25]:
valset.columns

Index(['index', 'Date_Time', 'Street', 'From', 'To', 'Real.Spots',
       'Street.Length', 'any_spot', 'Minute', 'Hour', 'Dow', 'Year', 'Month',
       'isweekend', 'Hour_group'],
      dtype='object')

In [26]:
valset.drop(['Minute','Hour','Dow','Year','Month','isweekend','Hour_group'], axis=1, inplace=True)

In [27]:
valset.to_csv('../data/valid_swhg.csv', index_label='index')