In [1]:
import numpy as np
import pandas as pd

In [4]:
test_raw = pd.read_csv('../data/test-no-label-parking.csv', parse_dates = [[3,4]], infer_datetime_format=True)

In [7]:
test_raw['Minute'] = test_raw.Date_Time.dt.minute
test_raw['Hour'] = test_raw.Date_Time.dt.hour
test_raw['Dow'] = test_raw.Date_Time.dt.weekday

In [9]:
test_raw['Year'] = test_raw.Date_Time.dt.year
test_raw['Month'] = test_raw.Date_Time.dt.month
def day_type(x):
    if x == 6 or x == 0:
        return 1
    else:
        return 0
test_raw['isweekend'] = test_raw['Dow'].apply(day_type)

In [10]:
test_raw.head()

Unnamed: 0,Date_Time,Street,From,To,Street.Length,Minute,Hour,Dow,Year,Month,isweekend
0,2014-03-28 16:34:00,Stockton Street,Stockton Tunnel,Stockton Street,35.786472,34,16,4,2014,3,0
1,2014-03-28 21:34:00,Van Ness Avenue,Hayes Street,Van Ness Avenue,63.787968,34,21,4,2014,3,0
2,2014-03-28 19:50:00,Van Ness Avenue,McAllister Street,Van Ness Avenue,56.007236,50,19,4,2014,3,0
3,2014-03-28 20:02:00,Mission Street,11th Street,Mission Street,139.6519,2,20,4,2014,3,0
4,2014-03-28 19:43:00,Hyde Street,Golden Gate Avenue,McAllister Street,105.14411,43,19,4,2014,3,0


In [13]:
train_raw = pd.read_csv('../data/train-parking.csv', parse_dates = [[3,4]], infer_datetime_format=True)
train_raw['Minute'] = train_raw.Date_Time.dt.minute
train_raw['Hour'] = train_raw.Date_Time.dt.hour
train_raw['Dow'] = train_raw.Date_Time.dt.weekday
train_raw['Year'] = train_raw.Date_Time.dt.year
train_raw['Month'] = train_raw.Date_Time.dt.month
train_raw['isweekend'] = train_raw['Dow'].apply(day_type)

In [14]:
train_raw.groupby(['Hour'])['Date_Time'].count()

Hour
7      63
8      36
9      58
10     32
11     31
12     33
13    101
14    109
15     97
16     98
17     61
18     99
19     93
20     93
21     91
22      5
Name: Date_Time, dtype: int64

In [12]:
# notice that in test set we have time of 23, which is not present in the train
test_raw.groupby(['Hour'])['Date_Time'].count()

Hour
7       6
8      48
9      51
10     73
11     76
12     23
13     21
14     37
15     29
16     43
17      3
18     26
19    102
20     92
21     72
22     20
23      4
Name: Date_Time, dtype: int64

In [15]:
train_raw.groupby(['Dow'])['Date_Time'].count()

Dow
0    179
1     94
2    112
3     91
4    229
5    284
6    111
Name: Date_Time, dtype: int64

In [30]:
# notice that we dont even have Sunday on testset - so we won't include sundays in validation set
test_raw.groupby(['Dow'])['Date_Time'].count()

Dow
1     54
2    248
3     38
4    194
5    120
6     72
Name: Date_Time, dtype: int64

In [18]:
print('train length: {} test length: {}'.format(train_raw.shape[0], test_raw.shape[0]))

train length: 1100 test length: 726


In [35]:
train_raw[train_raw['Dow'] != 0].groupby(['Dow'])['Date_Time'].count()/train_raw[train_raw['Dow'] != 0].shape[0]

Dow
1    0.102063
2    0.121607
3    0.098806
4    0.248643
5    0.308360
6    0.120521
Name: Date_Time, dtype: float64

In [32]:
test_raw.groupby(['Dow'])['Date_Time'].count()/test_raw.shape[0]

Dow
1    0.074380
2    0.341598
3    0.052342
4    0.267218
5    0.165289
6    0.099174
Name: Date_Time, dtype: float64

In [89]:
(train_raw.groupby(['Street'])['Date_Time'].count()-test_raw.groupby(['Street'])['Date_Time'].count()*train_raw.shape[0]/test_raw.shape[0]).astype('int')

Street
23rd Street           60
Battery Street         0
Bryant Street         11
Bush Street           -2
Columbus Avenue       -1
Geary Street         -14
Grove Street         -13
Hyde Street           -5
Jackson Street         3
Jessie Street          4
Jones Street         -34
Kearny Street        -16
Larkin Street        -31
Leavenworth Street    -5
Mason Street          -7
Mission Street        34
Montgomery Street     -7
Pine Street           15
Polk Street            0
Post Street           -1
Redwood Street        -2
Stockton Street       -8
Sutter Street         -6
Taylor Street        -13
Van Ness Avenue       23
Washington Street     19
Name: Date_Time, dtype: int64

### Validation 1: use the same proportion of oberservation by day of week as the test set

In [36]:
test_prop = test_raw.groupby(['Dow'])['Date_Time'].count()/test_raw.shape[0]

In [41]:
# assume that we want to have ~250 observations in the validation set, so we will select the following number of rows in the train set for each dow:
n_rows_dow=(test_prop*250).astype('int')

In [111]:
valid_dow = pd.DataFrame()
for dow, i in zip(n_rows_dow.index, n_rows_dow):
    df_tmp = train_raw[train_raw['Dow']==dow].reset_index()
    rows = np.random.choice(df_tmp.index.values, i, replace=False)
    sampled_df = df_tmp.iloc[rows]
    valid_dow = pd.concat([valid_dow, sampled_df])

In [114]:
valid_dow.head()

Unnamed: 0,index,Date_Time,Street,From,To,Real.Spots,Street.Length,any_spot,Minute,Hour,Dow,Year,Month,isweekend
11,32,2014-01-21 19:25:00,23rd Street,Mission Street,Capp Street,0,96.04013,0,25,19,1,2014,1,0
22,108,2014-01-28 13:56:00,Washington Street,Van Ness Avenue,Polk Street,26,139.37155,1,56,13,1,2014,1,0
80,927,2014-03-25 15:41:00,Montgomery Street,Verdi Place,Pacific Avenue,0,66.27028,0,41,15,1,2014,3,0
79,926,2014-03-25 15:04:00,Montgomery Street,Verdi Place,Pacific Avenue,0,66.27028,0,4,15,1,2014,3,0
76,923,2014-03-25 15:03:00,Kearny Street,Kearny Street,Columbus Avenue,0,55.823406,0,3,15,1,2014,3,0


In [126]:
valid_dow.drop(['index', 'Minute','Hour','Dow','Year','Month','isweekend'], axis=1)

Unnamed: 0,Date_Time,Street,From,To,Real.Spots,Street.Length,any_spot
11,2014-01-21 19:25:00,23rd Street,Mission Street,Capp Street,0,96.040130,0
22,2014-01-28 13:56:00,Washington Street,Van Ness Avenue,Polk Street,26,139.371550,1
80,2014-03-25 15:41:00,Montgomery Street,Verdi Place,Pacific Avenue,0,66.270280,0
79,2014-03-25 15:04:00,Montgomery Street,Verdi Place,Pacific Avenue,0,66.270280,0
76,2014-03-25 15:03:00,Kearny Street,Kearny Street,Columbus Avenue,0,55.823406,0
54,2014-02-25 11:18:00,Jones Street,Ellis Street,Eddy Street,0,104.767970,0
48,2014-02-11 15:24:00,Washington Street,Hotaling Street,Columbus Avenue,0,51.843370,0
7,2014-01-21 17:36:00,23rd Street,South Van Ness Avenue,Capp Street,1,96.301300,1
59,2014-02-25 10:12:00,Stockton Street,Stockton Street,Jackson Street,0,84.282980,0
32,2014-02-11 13:15:00,Montgomery Street,Clay Street,Commercial Street,0,50.629986,0


### Validation 2: use the same proportion of oberservation by hour_group as the test set

In [115]:
def hour_group(x):
    if x <= 11:
        return 'morning'
    elif x > 11 and x<=17:
        return  'afternoon'
    else:
        return 'night'

In [116]:
train_raw['Hour_group'] = train_raw['Hour'].apply(hour_group)
test_raw['Hour_group'] = test_raw['Hour'].apply(hour_group)

In [117]:
train_raw.head()

Unnamed: 0,Date_Time,Street,From,To,Real.Spots,Street.Length,any_spot,Minute,Hour,Dow,Year,Month,isweekend,Hour_group
0,2014-01-07 16:19:00,Mission Street,25th Street,26th Street,4,179.13297,1,19,16,1,2014,1,0,afternoon
1,2014-01-18 20:42:00,Polk Street,Ellis Street,Olive Street,0,52.74021,0,42,20,5,2014,1,0,night
2,2014-01-18 20:39:00,Van Ness Avenue,Geary Boulevard,Myrtle Street,0,52.51784,0,39,20,5,2014,1,0,night
3,2014-01-18 20:38:00,Van Ness Avenue,Bush Street,Fern Street,0,52.405315,0,38,20,5,2014,1,0,night
4,2014-01-18 20:38:00,Van Ness Avenue,Daniel Burnham Court,Post Street,0,52.191193,0,38,20,5,2014,1,0,night


In [118]:
test_raw.groupby(['Hour_group'])['Date_Time'].count()/test_raw.shape[0]

Hour_group
afternoon    0.214876
morning      0.349862
night        0.435262
Name: Date_Time, dtype: float64

In [119]:
train_raw.groupby(['Hour_group'])['Date_Time'].count()/train_raw.shape[0]

Hour_group
afternoon    0.453636
morning      0.200000
night        0.346364
Name: Date_Time, dtype: float64

In [120]:
test_prop_hg = test_raw.groupby(['Hour_group'])['Date_Time'].count()/test_raw.shape[0]

# assume that we want to have ~250 observations in the validation set, so we will select the following number of rows in the train set for each dow:
n_rows_hg=(test_prop_hg*250).astype('int')

In [122]:
valid_hg = pd.DataFrame()
for hg, i in zip(n_rows_hg.index, n_rows_hg):
    df_tmp = train_raw[train_raw['Hour_group']==hg].reset_index()
    rows = np.random.choice(df_tmp.index.values, i, replace=False)
    sampled_df = df_tmp.iloc[rows]
    valid_hg = pd.concat([valid_hg, sampled_df])

In [125]:
valid_hg.groupby(['Hour_group'])['Date_Time'].count()/valid_hg.shape[0]

Hour_group
afternoon    0.213710
morning      0.350806
night        0.435484
Name: Date_Time, dtype: float64

In [127]:
valid_hg.drop(['index', 'Minute','Hour','Dow','Year','Month','isweekend','Hour_group'], axis=1)

Unnamed: 0,Date_Time,Street,From,To,Real.Spots,Street.Length,any_spot,Hour_group
30,2014-01-27 16:57:00,Pine Street,Larkin Street,Polk Street,0,146.416550,0,afternoon
469,2014-03-27 15:51:00,Montgomery Street,Montgomery Street,Jackson Street,0,28.596806,0,afternoon
5,2014-01-21 17:35:00,23rd Street,Mission Street,Bartlett Street,1,96.037110,1,afternoon
424,2014-03-26 16:06:00,Stockton Street,Stockton Street,Jackson Street,0,84.282980,0,afternoon
240,2014-02-11 13:13:00,Columbus Avenue,Kearny Street,Pacific Avenue,0,28.695284,0,afternoon
157,2014-02-03 16:04:00,Van Ness Avenue,McAllister Street,Van Ness Avenue,0,154.604970,0,afternoon
236,2014-02-11 13:21:00,Montgomery Street,Bush Street,Sutter Street,0,105.384210,0,afternoon
353,2014-03-22 13:01:00,Van Ness Avenue,Van Ness Avenue,Bush Street,4,39.027992,1,afternoon
190,2014-02-03 13:03:00,Larkin Street,Turk Street,Eddy Street,0,104.799670,0,afternoon
387,2014-03-23 17:11:00,Washington Street,Polk Street,Larkin Street,0,146.472520,0,afternoon
