In [1]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
import pandas as pd

In [3]:
import logging

# create logger
log = logging.getLogger('hot1_encoding_logger')
log.setLevel(logging.DEBUG)

def hot1_encoding(df, index):
    """One-hot encode the column ``index`` of ``df`` in place.

    For every distinct value found in ``df[index]`` (in the order returned by
    ``value_counts()``, i.e. most frequent first) a new float column named
    ``Field_sourceCol_<index>_value_<value>`` is appended to ``df``; it holds
    1.0 where the source column equals that value and 0.0 elsewhere.
    The source column is then deleted from ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. It is mutated in place.
    index : hashable
        Label of the column to encode.

    Returns
    -------
    None
    """
    log.info("\nApplying Hot 1 encoding method to column {}".format(index))
    source = df[index]

    m_value_counts = source.value_counts()
    log.info("m_value_counts={}".format(m_value_counts))

    # Only the distinct values are needed, so walk the index of the counts.
    # This also avoids Series.iteritems(), which was removed in pandas 2.0.
    for value in m_value_counts.index:
        new_field = "Field_sourceCol_{}_value_{}".format(index, value)
        df[new_field] = (source == value).astype(float)

    del df[index]

In [4]:
from datetime import datetime

def extract_day_of_week(s):
    """Return the weekday number of a date string formatted as ``MM-DD-YYYY``.

    The value follows ``datetime.weekday()``: Monday is 0 and Sunday is 6.
    A string that does not match the format raises ``ValueError`` from
    ``datetime.strptime``.
    """
    return datetime.strptime(s, '%m-%d-%Y').weekday()

In [5]:
def get_reconstruction_from_projection(eigenvectors, Mean, m_projection):
    """Rebuild a signal from its projection coefficients.

    Computes ``m_projection . eigenvectors^T + Mean`` and returns the result
    flattened to a 1-D ``numpy.ndarray``.

    Parameters
    ----------
    eigenvectors : array-like, shape (n_samples, n_components)
        Basis vectors stored as columns; the transpose is taken before the
        product, so the second dimension must match ``m_projection``.
    Mean : array-like, broadcastable against a length ``n_samples`` vector
        Offset added to the reconstructed signal.
    m_projection : array-like, shape (n_components,)
        Projection coefficients of the signal.

    Returns
    -------
    numpy.ndarray
        Flattened reconstruction.
    """
    # Plain ndarrays with np.dot replace np.matrix, which NumPy no longer
    # recommends; the flattened result is the same.
    basis = np.asarray(eigenvectors)
    rec = np.dot(np.asarray(m_projection), basis.T) + Mean
    return np.ravel(rec)

In [6]:
# Load the station metadata records, rename the columns that are kept to the
# upper-case names used as merge keys / features, and drop the rest.
m_fwy_meta_df = (
    pd.read_json('./data/regression/station_meta.json', typ='frame', orient='records')
    .rename(columns={
        'station': 'S_ID',
        'district': 'DISTRICT_ID',
        'latitude': 'LAT',
        'longitude': 'LON',
        'zip': 'ZIP',
    })
    .drop(labels=['direction', 'freeway', 'name', 'urban'], axis=1)
)
m_fwy_meta_df

Unnamed: 0,DISTRICT_ID,LAT,LON,S_ID,ZIP
0,8,33.785651,-117.218642,819200,92570
1,8,33.785554,-117.218747,819201,92570
2,11,32.841682,-117.121715,1114114,92122
3,8,33.782061,-117.213169,819203,92570
4,8,33.889640,-117.270822,819204,92518
5,3,38.852429,-121.146352,316758,95658
6,8,33.897686,-117.274790,819206,92518
7,11,32.845641,-117.209753,1114119,92117
8,12,33.769824,-117.758692,1212424,92602
9,8,33.874539,-117.264715,819209,92518


In [7]:
# Available period tags (two spellings used by different file names) and years.
p1_list = ['wkday', 'wkend']
p2_list = ['weekday', 'weekend']
years = [2008, 2009, 2010, 2011, 2013, 2014, 2015]

# This run processes the first period and the first year.
p1, p2, y = p1_list[0], p2_list[0], years[0]

# Trimmed records for the selected year/period, joined with the station
# metadata on the station id.
a_df = pd.read_csv('./data/regression/trim_{}_{}.csv'.format(y, p1), header=0)
c_df = a_df.merge(m_fwy_meta_df, on='S_ID')

# Mean vector and eigenvectors used to reconstruct the flow from its coefficients.
base_mean_path = './data/{}/total_flow_{}_mean_vector.pivot_{}_grouping_pca_tmp.csv'
base_eigs_path = './data/{}/total_flow_{}_eigenvectors.pivot_{}_grouping_pca_tmp.csv'
# NOTE(review): `mean` rebinds the name `mean` that `%pylab inline` injects
# into the namespace; kept because other cells may rely on this variable.
mean = pd.read_csv(base_mean_path.format(p2, p2, y), header=None).values[0]
# NOTE(review): the original note described this as "eigenvectors per row
# matrix (5 X 288)", but get_reconstruction_from_projection multiplies the
# coefficients by the transpose, which requires one column per coefficient.
# TODO confirm the orientation of the file.
eigs = pd.read_csv(base_eigs_path.format(p2, p2, y), header=None).values

rows = c_df[['Flow_Coef_1', 'Flow_Coef_2', 'Flow_Coef_3', 'Flow_Coef_4', 'Flow_Coef_5']].values

# Aggregate flow per record: mean of the signal reconstructed from its coefficients.
new_columns = np.array(
    [np.mean(get_reconstruction_from_projection(eigs, mean, row)) for row in rows],
    dtype=float,
)
c_df['AGG_TOTAL_FLOW'] = new_columns

# Remove identifiers, the raw coefficients and the columns not used as features,
# and give the date column the name of the value it will hold.
dropped_columns = [
    'S_ID',
    'Flow_Coef_1',
    'Flow_Coef_2',
    'Flow_Coef_3',
    'Flow_Coef_4',
    'Flow_Coef_5',
    'CHP_DESC',
    'CHP_DURATION',
    'CC_CODE',
    'ZIP',
]
c_df = (
    c_df
    .drop(dropped_columns, axis=1)
    .rename(columns={'DATE': 'DAY_OF_WEEK'})
)

# 'T' becomes 1.0, anything else 0.0.
c_df['CHP_INC'] = (c_df['CHP_INC'] == 'T').astype(float)
# Replace each date string by its weekday number.
c_df['DAY_OF_WEEK'] = c_df['DAY_OF_WEEK'].apply(extract_day_of_week)

print(c_df.dtypes)

# One-hot encode the categorical columns (each call mutates c_df in place).
hot1_columns = ['NUM_LANES', 'FWY_NUM', 'FWY_DIR', 'DAY_OF_WEEK', 'DISTRICT_ID']
for c in hot1_columns:
    hot1_encoding(c_df, c)
c_df

INFO:hot1_encoding_logger:
Applying Hot 1 encoding method to column NUM_LANES
INFO:hot1_encoding_logger:m_value_counts=4    732231
3    299021
5    253738
2    107156
6     58719
1     12967
7      4169
dtype: int64
INFO:hot1_encoding_logger:
Applying Hot 1 encoding method to column FWY_NUM
INFO:hot1_encoding_logger:m_value_counts=5      114346
101    111763
405    106675
80      73272
10      66208
805     65402
210     64222
91      63475
15      57985
605     57256
880     52728
60      42330
680     36067
280     32845
57      31678
710     30879
50      29232
99      27611
94      23622
105     21446
22      18920
110     18717
8       17410
118     16741
41      14461
85      14437
180     13183
73      12372
241     12080
30      11960
        ...  
261      4770
37       4172
980      4161
215      4117
205      4031
90       3943
237      3517
133      3458
25       3144
242      2373
23       1962
238      1847
56       1839
65       1445
1        1081
29       1050
126      

NUM_LANES           int64
LEN               float64
URBAN               int64
DENSITY           float64
FWY_NUM             int64
FWY_DIR             int64
AVG_VALUE           int64
CHP_INC           float64
DAY_OF_WEEK         int64
DISTRICT_ID         int64
LAT               float64
LON               float64
AGG_TOTAL_FLOW    float64
dtype: object


Unnamed: 0,LEN,URBAN,DENSITY,AVG_VALUE,CHP_INC,LAT,LON,AGG_TOTAL_FLOW,Field_sourceCol_NUM_LANES_value_4,Field_sourceCol_NUM_LANES_value_3,...,Field_sourceCol_DAY_OF_WEEK_value_0,Field_sourceCol_DISTRICT_ID_value_7,Field_sourceCol_DISTRICT_ID_value_4,Field_sourceCol_DISTRICT_ID_value_12,Field_sourceCol_DISTRICT_ID_value_11,Field_sourceCol_DISTRICT_ID_value_8,Field_sourceCol_DISTRICT_ID_value_3,Field_sourceCol_DISTRICT_ID_value_10,Field_sourceCol_DISTRICT_ID_value_6,Field_sourceCol_DISTRICT_ID_value_5
0,0.966,1,5157.771414,425000,1,38.566948,-121.506049,195.356597,0,1,...,0,0,0,0,0,0,1,0,0,0
1,0.966,1,5157.771414,425000,0,38.566948,-121.506049,189.795183,0,1,...,0,0,0,0,0,0,1,0,0,0
2,0.966,1,5157.771414,425000,0,38.566948,-121.506049,146.227898,0,1,...,0,0,0,0,0,0,1,0,0,0
3,0.966,1,5157.771414,425000,0,38.566948,-121.506049,191.499171,0,1,...,1,0,0,0,0,0,1,0,0,0
4,0.966,1,5157.771414,425000,1,38.566948,-121.506049,187.595178,0,1,...,0,0,0,0,0,0,1,0,0,0
5,0.966,1,5157.771414,425000,0,38.566948,-121.506049,199.154174,0,1,...,0,0,0,0,0,0,1,0,0,0
6,0.966,1,5157.771414,425000,0,38.566948,-121.506049,201.004434,0,1,...,0,0,0,0,0,0,1,0,0,0
7,0.966,1,5157.771414,425000,0,38.566948,-121.506049,230.830796,0,1,...,0,0,0,0,0,0,1,0,0,0
8,0.966,1,5157.771414,425000,1,38.566948,-121.506049,191.970677,0,1,...,1,0,0,0,0,0,1,0,0,0
9,0.966,1,5157.771414,425000,1,38.566948,-121.506049,191.970677,0,1,...,1,0,0,0,0,0,1,0,0,0


In [8]:
# List every column of the encoded frame, one name per line.
for column_name in c_df.columns.tolist():
    print(column_name)

LEN
URBAN
DENSITY
AVG_VALUE
CHP_INC
LAT
LON
AGG_TOTAL_FLOW
Field_sourceCol_NUM_LANES_value_4
Field_sourceCol_NUM_LANES_value_3
Field_sourceCol_NUM_LANES_value_5
Field_sourceCol_NUM_LANES_value_2
Field_sourceCol_NUM_LANES_value_6
Field_sourceCol_NUM_LANES_value_1
Field_sourceCol_NUM_LANES_value_7
Field_sourceCol_FWY_NUM_value_5
Field_sourceCol_FWY_NUM_value_101
Field_sourceCol_FWY_NUM_value_405
Field_sourceCol_FWY_NUM_value_80
Field_sourceCol_FWY_NUM_value_10
Field_sourceCol_FWY_NUM_value_805
Field_sourceCol_FWY_NUM_value_210
Field_sourceCol_FWY_NUM_value_91
Field_sourceCol_FWY_NUM_value_15
Field_sourceCol_FWY_NUM_value_605
Field_sourceCol_FWY_NUM_value_880
Field_sourceCol_FWY_NUM_value_60
Field_sourceCol_FWY_NUM_value_680
Field_sourceCol_FWY_NUM_value_280
Field_sourceCol_FWY_NUM_value_57
Field_sourceCol_FWY_NUM_value_710
Field_sourceCol_FWY_NUM_value_50
Field_sourceCol_FWY_NUM_value_99
Field_sourceCol_FWY_NUM_value_94
Field_sourceCol_FWY_NUM_value_105
Field_sourceCol_FWY_NUM_value_22
F

In [10]:
# Persist the preprocessed frame. The file name is built from the year (`y`)
# and period tag (`p1`) that selected the input files, so it cannot drift from
# them; for the current selection this is preprocessed_2008_wkday.csv.
# NOTE(review): assumes `y` and `p1` still hold the values set when the data
# was loaded — confirm no intermediate cell rebinds them.
c_df.to_csv('./data/regression/preprocessed_{}_{}.csv'.format(y, p1), index=False)