In [1]:
# load traffic dataset into Pandas
# based on TFT data ingestion scripts

In [2]:
import numpy as np
import pandas as pd

In [3]:
import os

In [4]:
data_folder = '/home/developer/gcp/cbidmltsf/datasets/traffic/PEMS-SF'

In [5]:
! ls -l /home/developer/gcp/cbidmltsf/datasets/traffic/PEMS-SF

total 517100
-rw-rw-r-- 1 developer developer 109480788 jul  2 12:14 PEMS-SF.zip
-rw-r--r-- 1 developer developer 165136172 may 22  2011 PEMS_test
-rw-r--r-- 1 developer developer       348 may 22  2011 PEMS_testlabels
-rw-r--r-- 1 developer developer 254855530 may 22  2011 PEMS_train
-rw-r--r-- 1 developer developer       536 may 22  2011 PEMS_trainlabels
-rw-r--r-- 1 developer developer      1654 may 27  2011 randperm
-rw-r--r-- 1 developer developer      6743 may 22  2011 stations_list


In [6]:
# load training dataset

In [7]:
# full path of the training split inside data_folder
train_dataset_filename = os.path.join(data_folder, 'PEMS_train')
train_dataset_filename

'/home/developer/gcp/cbidmltsf/datasets/traffic/PEMS-SF/PEMS_train'

In [8]:
# Read the whole training file into memory as a list of text lines.
# NOTE(review): train_dataset_lines is not referenced by any later cell visible
# in this notebook, and read_matrix('PEMS_train') reads the same file again --
# confirm whether this cell is still needed.
with open(train_dataset_filename, 'r') as train_dataset:
  train_dataset_lines = train_dataset.readlines()

In [9]:
# process lines as in TFT script to avoid crashing the server

In [10]:
def process_list(s, variable_type=int, delimiter=None):
    """Convert one PEMS-formatted line into a list of typed values.

    Square brackets are removed from ``s``; what remains is split on
    ``delimiter`` (runs of whitespace when ``delimiter`` is None) and every
    piece is converted with ``variable_type``.
    """
    without_brackets = s.replace('[', '').replace(']', '')
    # str.split(None) behaves exactly like str.split(): split on whitespace runs
    return [variable_type(token) for token in without_brackets.split(delimiter)]

In [11]:
def read_single_list(filename):
    """Parse the first line of a PEMS-format file in ``data_folder`` as a list of ints."""
    path = os.path.join(data_folder, filename)
    with open(path, 'r') as handle:
        first_line = handle.readlines()[0]
    return process_list(first_line)

In [12]:
def read_matrix(filename):
    """Load a PEMS-format matrix file from ``data_folder`` as nested lists.

    Every line of the file becomes one outer element: the line is split on
    ';' into rows and each row is parsed into a list of floats. A progress
    message is printed every 50 lines.
    """
    path = os.path.join(data_folder, filename)
    with open(path, 'r') as handle:
        lines = handle.readlines()

    total = len(lines)
    matrices = []
    for count, line in enumerate(lines, start=1):
        if count % 50 == 0:
            print('Completed {} of {} rows for {}'.format(count, total, filename))

        rows = process_list(line, variable_type=str, delimiter=';')
        matrices.append([
            process_list(row, variable_type=float, delimiter=None)
            for row in rows
        ])

    return matrices

In [13]:
train_tensor = read_matrix('PEMS_train')

Completed 50 of 267 rows for PEMS_train
Completed 100 of 267 rows for PEMS_train
Completed 150 of 267 rows for PEMS_train
Completed 200 of 267 rows for PEMS_train
Completed 250 of 267 rows for PEMS_train


In [14]:
# train_tensor list: one element per day
len(train_tensor)

267

In [15]:
# train_tensor[x] list: one element per traffic sensor
len(train_tensor[0])

963

In [16]:
# train_tensor[x][y] list: one element per 10-minute reading (144 readings per day)
len(train_tensor[0][0])

144

In [17]:
# total number of 10-minute readings in the training set
267*963*144

37025424

In [18]:
# total number of hourly readings in the training set (267 days x 963 sensors x 24 hours)
267*963*24

6170904

In [19]:
shuffle_order = np.array(read_single_list('randperm')) - 1  # start index from 0, not from 1
len(shuffle_order)

440

In [20]:
train_dayofweek = read_single_list('PEMS_trainlabels')
len(train_dayofweek)

267

In [21]:
test_dayofweek = read_single_list('PEMS_testlabels')

In [22]:
len(test_dayofweek)

173

In [23]:
test_tensor = read_matrix('PEMS_test')

Completed 50 of 173 rows for PEMS_test
Completed 100 of 173 rows for PEMS_test
Completed 150 of 173 rows for PEMS_test


In [24]:
len(test_tensor)

173

In [25]:
len(test_tensor[0])

963

In [26]:
len(test_tensor[0][0])

144

In [27]:
# invert the shuffle permutation: map each entry of shuffle_order to the
# position at which it appears
print('Shuffling')
inverse_mapping = dict(zip(shuffle_order, range(len(shuffle_order))))

Shuffling


In [29]:
shuffle_order

array([245, 246, 430, 117, 380, 388,  12, 405, 276, 356,  77, 214, 368,
       119, 159, 432, 102,   8, 439,   1,  28,  72, 415,   9, 410, 299,
       427,  54, 218,  14, 391, 411, 169,  40,  64, 332, 382, 263, 258,
        62, 115,  21, 167, 257, 290, 203, 406, 304,  76, 147, 151, 114,
       165, 172, 319, 281, 228, 141, 162, 311, 132, 138, 420, 127, 350,
       242,  99, 286, 111, 412, 369, 312, 168, 383,  50, 379, 284,  98,
       123, 277, 275, 223, 148, 158, 253, 240, 112, 262, 261, 180, 231,
       394,   0, 291, 409, 301,  85, 289,  47, 122, 279, 363,  35, 155,
       183, 143, 171, 334, 288, 325,  27,  61, 361, 268,  93,  43, 357,
       116,  48,  67, 331, 377, 400, 274, 270, 421,  97, 200, 414,  86,
       404, 321, 233, 318, 124, 302,  58, 249, 149, 343,  30, 309, 426,
       352, 175,  87,  49, 239,  34, 170, 333, 326, 402, 433,  13,   5,
        66, 395, 129, 130, 330, 264, 157, 393, 267, 327, 282, 347, 140,
       164,  91, 204, 118,  94,  23,  15, 250, 273, 161, 324,  4

In [28]:
inverse_mapping

{245: 0,
 246: 1,
 430: 2,
 117: 3,
 380: 4,
 388: 5,
 12: 6,
 405: 7,
 276: 8,
 356: 9,
 77: 10,
 214: 11,
 368: 12,
 119: 13,
 159: 14,
 432: 15,
 102: 16,
 8: 17,
 439: 18,
 1: 19,
 28: 20,
 72: 21,
 415: 22,
 9: 23,
 410: 24,
 299: 25,
 427: 26,
 54: 27,
 218: 28,
 14: 29,
 391: 30,
 411: 31,
 169: 32,
 40: 33,
 64: 34,
 332: 35,
 382: 36,
 263: 37,
 258: 38,
 62: 39,
 115: 40,
 21: 41,
 167: 42,
 257: 43,
 290: 44,
 203: 45,
 406: 46,
 304: 47,
 76: 48,
 147: 49,
 151: 50,
 114: 51,
 165: 52,
 172: 53,
 319: 54,
 281: 55,
 228: 56,
 141: 57,
 162: 58,
 311: 59,
 132: 60,
 138: 61,
 420: 62,
 127: 63,
 350: 64,
 242: 65,
 99: 66,
 286: 67,
 111: 68,
 412: 69,
 369: 70,
 312: 71,
 168: 72,
 383: 73,
 50: 74,
 379: 75,
 284: 76,
 98: 77,
 123: 78,
 277: 79,
 275: 80,
 223: 81,
 148: 82,
 158: 83,
 253: 84,
 240: 85,
 112: 86,
 262: 87,
 261: 88,
 180: 89,
 231: 90,
 394: 91,
 0: 92,
 291: 93,
 409: 94,
 301: 95,
 85: 96,
 289: 97,
 47: 98,
 122: 99,
 279: 100,
 363: 101,
 35: 102,
 1

In [30]:
# element i is the position in shuffle_order at which the value i appears
reverse_shuffle_order = np.array(
    [inverse_mapping[position] for position in range(len(shuffle_order))]
)

In [31]:
reverse_shuffle_order

array([ 92,  19, 295, 296, 412, 155, 227, 375,  17,  23, 235, 292,   6,
       154,  29, 175, 305, 261, 410, 383, 233,  41, 286, 174, 315, 406,
       348, 110,  20, 334, 140, 277, 346, 436, 148, 102, 427, 328, 343,
       283,  33, 276, 393, 115, 180, 351, 205,  98, 118, 146,  74, 208,
       400, 405,  27, 403, 217, 223, 136, 431, 212, 111,  39, 419,  34,
       216, 156, 119, 407, 236, 199, 415,  21, 347, 352, 371,  48,  10,
       306, 389, 273, 335, 330, 439, 397,  96, 129, 145, 219, 263, 382,
       170, 378, 114, 173, 239, 201, 126,  77,  66, 252, 361,  16, 237,
       214, 423, 340, 248, 399, 270, 379,  68,  86, 325,  51,  40, 117,
         3, 172,  13, 323, 413,  99,  78, 134, 396, 285,  63, 301, 158,
       159, 215,  60, 258, 381, 262, 424, 249,  61, 429, 168,  57, 251,
       105, 209, 198, 425,  49,  82, 138, 272,  50, 282, 269, 362, 103,
       204, 162,  83,  14, 401, 178,  58, 417, 169,  52, 394,  42,  72,
        32, 149, 106,  53, 357, 191, 144, 190, 364, 411, 284,  8

In [32]:
# group train and test days together, then reorder them with the inverted permutation
print('Reodering')
day_of_week = np.array(train_dayofweek + test_dayofweek)[reverse_shuffle_order]
combined_tensor = np.array(train_tensor + test_tensor)[reverse_shuffle_order]

Reodering


In [33]:
# build one 'traj_<station id>' column label per entry of stations_list
print('Parsing as dataframe')
labels = list(map('traj_{}'.format, read_single_list('stations_list')))

Parsing as dataframe


In [34]:
labels

['traj_400000',
 'traj_400001',
 'traj_400009',
 'traj_400010',
 'traj_400015',
 'traj_400017',
 'traj_400025',
 'traj_400026',
 'traj_400027',
 'traj_400030',
 'traj_400031',
 'traj_400035',
 'traj_400037',
 'traj_400039',
 'traj_400040',
 'traj_400041',
 'traj_400043',
 'traj_400044',
 'traj_400045',
 'traj_400049',
 'traj_400052',
 'traj_400053',
 'traj_400057',
 'traj_400059',
 'traj_400060',
 'traj_400065',
 'traj_400067',
 'traj_400071',
 'traj_400073',
 'traj_400074',
 'traj_400075',
 'traj_400078',
 'traj_400079',
 'traj_400082',
 'traj_400083',
 'traj_400085',
 'traj_400086',
 'traj_400088',
 'traj_400090',
 'traj_400091',
 'traj_400093',
 'traj_400094',
 'traj_400095',
 'traj_400096',
 'traj_400097',
 'traj_400100',
 'traj_400103',
 'traj_400107',
 'traj_400108',
 'traj_400109',
 'traj_400110',
 'traj_400113',
 'traj_400115',
 'traj_400116',
 'traj_400118',
 'traj_400122',
 'traj_400124',
 'traj_400125',
 'traj_400126',
 'traj_400127',
 'traj_400132',
 'traj_400137',
 'traj_4

In [35]:
hourly_list = []

In [36]:
for day, day_matrix in enumerate(combined_tensor):

    # transpose so that rows are 10-minute samples and columns are sensors
    hourly = pd.DataFrame(day_matrix.T, columns=labels)
    # six consecutive 10-minute samples fall into the same hour
    hourly['hour_on_day'] = [i // 6 for i in hourly.index]
    if hourly['hour_on_day'].max() > 23 or hourly['hour_on_day'].min() < 0:
        raise ValueError('Invalid hour! {}-{}'.format(
            hourly['hour_on_day'].min(), hourly['hour_on_day'].max()))

    # average the samples of each hour, then attach the day/time metadata
    hourly = (
        hourly.groupby('hour_on_day', as_index=True).mean()[labels]
        .assign(sensor_day=day,
                time_on_day=lambda frame: frame.index,
                day_of_week=day_of_week[day])
    )

    hourly_list.append(hourly)

In [37]:
len(hourly_list)

440

In [38]:
hourly_frame = pd.concat(hourly_list, axis=0, ignore_index=True, sort=False)

In [39]:
hourly_frame

Unnamed: 0,traj_400000,traj_400001,traj_400009,traj_400010,traj_400015,traj_400017,traj_400025,traj_400026,traj_400027,traj_400030,...,traj_402084,traj_402085,traj_402086,traj_402087,traj_402088,traj_402089,traj_402090,sensor_day,time_on_day,day_of_week
0,0.020933,0.005917,0.013850,0.012733,0.007850,0.009867,0.005850,0.010517,0.006083,0.032067,...,0.030283,0.038000,0.015067,0.013883,0.013883,0.013883,0.013883,0,0,4
1,0.019333,0.004367,0.009217,0.008817,0.006333,0.009717,0.003183,0.009133,0.004250,0.027283,...,0.028233,0.033367,0.012833,0.009600,0.009600,0.009600,0.009600,0,1,4
2,0.020200,0.004150,0.008717,0.006483,0.005200,0.007500,0.003433,0.007933,0.003967,0.024450,...,0.029900,0.032800,0.016250,0.008100,0.008100,0.008100,0.008100,0,2,4
3,0.022450,0.007350,0.015067,0.009400,0.005517,0.009667,0.005600,0.007083,0.004033,0.023867,...,0.034967,0.033550,0.026650,0.008817,0.008817,0.008817,0.008817,0,3,4
4,0.029283,0.018833,0.032683,0.018650,0.008083,0.018633,0.009617,0.007983,0.008383,0.025400,...,0.043833,0.037867,0.065750,0.015233,0.015233,0.015233,0.015233,0,4,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10555,0.038100,0.030067,0.063767,0.035733,0.018900,0.070300,0.048883,0.076950,0.086167,0.054167,...,0.034733,0.021050,0.070983,0.059967,0.060400,0.051800,0.041400,439,19,5
10556,0.033550,0.026617,0.051167,0.030267,0.016067,0.052617,0.029900,0.055617,0.063550,0.037917,...,0.032733,0.017817,0.051167,0.053300,0.046383,0.045883,0.037300,439,20,5
10557,0.027783,0.025967,0.048617,0.029100,0.015567,0.044633,0.023917,0.054067,0.054967,0.032050,...,0.030167,0.011833,0.038050,0.049667,0.034100,0.043317,0.035400,439,21,5
10558,0.019467,0.020467,0.042550,0.025050,0.013600,0.036683,0.019567,0.048700,0.044400,0.026017,...,0.027883,0.009800,0.039150,0.040583,0.027933,0.034500,0.029750,439,22,5


In [40]:
# flatten so that each entity uses one row per time step in the dataframe:
# 'traj' columns hold sensor values, every other column is shared metadata
store_columns = [name for name in hourly_frame.columns if 'traj' in name]
other_columns = [name for name in hourly_frame.columns if 'traj' not in name]
flat_df = pd.DataFrame(
    columns=['values', 'prev_values', 'next_values'] + other_columns + ['id']
)

In [41]:
flat_df

Unnamed: 0,values,prev_values,next_values,sensor_day,time_on_day,day_of_week,id


In [42]:
def format_index_string(x):
    """Return ``x`` left-padded with zeros to three characters, for use in sort keys.

    Args:
        x: Non-negative number below 1000.

    Returns:
        ``str(x)`` prefixed with '00' when x < 10, '0' when x < 100, and
        nothing otherwise.

    Raises:
        ValueError: If ``x`` is negative, 1000 or larger, or not comparable
            as a number in that range (e.g. NaN).
    """
    # Negative values used to fall into the first branch and yield keys such
    # as '00-1', which do not sort correctly; reject them like other
    # out-of-range values.
    if not (0 <= x < 1000):
        raise ValueError('Invalid value of x {}'.format(x))

    if x < 10:
        padding = '00'
    elif x < 100:
        padding = '0'
    else:
        padding = ''
    return padding + str(x)

In [43]:
# Build one long-format slice per sensor column and concatenate them all at
# once. Appending to flat_df inside the loop re-copies the accumulated frame
# on every iteration, and DataFrame.append no longer exists in pandas >= 2.0.
flat_slices = []
for store in store_columns:
    print('Processing {}'.format(store))

    sliced = hourly_frame[[store] + other_columns].copy()
    sliced.columns = ['values'] + other_columns
    sliced['id'] = int(store.replace('traj_', ''))

    # sort by sensor-date-time; the key parts are zero-padded strings
    key = sliced['id'].apply(str) \
        + sliced['sensor_day'].apply(lambda x: '_' + format_index_string(x)) \
        + sliced['time_on_day'].apply(lambda x: '_' + format_index_string(x))
    sliced = sliced.set_index(key).sort_index()

    # forward-fill gaps, then add the previous/next value of each row
    sliced['values'] = sliced['values'].ffill()
    sliced['prev_values'] = sliced['values'].shift(1)
    sliced['next_values'] = sliced['values'].shift(-1)

    # rows that still contain NaN (including the shifted edges) are dropped
    flat_slices.append(sliced.dropna())

# flat_df goes first so that its column order is kept in the result
flat_df = pd.concat([flat_df] + flat_slices, ignore_index=True, sort=False)

Processing traj_400000
Processing traj_400001
Processing traj_400009
Processing traj_400010
Processing traj_400015
Processing traj_400017
Processing traj_400025
Processing traj_400026
Processing traj_400027
Processing traj_400030
Processing traj_400031
Processing traj_400035
Processing traj_400037
Processing traj_400039
Processing traj_400040
Processing traj_400041
Processing traj_400043
Processing traj_400044
Processing traj_400045
Processing traj_400049
Processing traj_400052
Processing traj_400053
Processing traj_400057
Processing traj_400059
Processing traj_400060
Processing traj_400065
Processing traj_400067
Processing traj_400071
Processing traj_400073
Processing traj_400074
Processing traj_400075
Processing traj_400078
Processing traj_400079
Processing traj_400082
Processing traj_400083
Processing traj_400085
Processing traj_400086
Processing traj_400088
Processing traj_400090
Processing traj_400091
Processing traj_400093
Processing traj_400094
Processing traj_400095
Processing 

Processing traj_400700
Processing traj_400703
Processing traj_400704
Processing traj_400707
Processing traj_400712
Processing traj_400713
Processing traj_400714
Processing traj_400715
Processing traj_400716
Processing traj_400717
Processing traj_400723
Processing traj_400726
Processing traj_400728
Processing traj_400731
Processing traj_400733
Processing traj_400734
Processing traj_400738
Processing traj_400739
Processing traj_400740
Processing traj_400741
Processing traj_400742
Processing traj_400743
Processing traj_400744
Processing traj_400745
Processing traj_400747
Processing traj_400748
Processing traj_400749
Processing traj_400750
Processing traj_400754
Processing traj_400755
Processing traj_400756
Processing traj_400759
Processing traj_400760
Processing traj_400763
Processing traj_400765
Processing traj_400767
Processing traj_400769
Processing traj_400772
Processing traj_400774
Processing traj_400776
Processing traj_400778
Processing traj_400781
Processing traj_400782
Processing 

Processing traj_401562
Processing traj_401564
Processing traj_401566
Processing traj_401567
Processing traj_401568
Processing traj_401573
Processing traj_401578
Processing traj_401579
Processing traj_401580
Processing traj_401581
Processing traj_401582
Processing traj_401583
Processing traj_401586
Processing traj_401587
Processing traj_401588
Processing traj_401590
Processing traj_401592
Processing traj_401593
Processing traj_401594
Processing traj_401597
Processing traj_401598
Processing traj_401599
Processing traj_401600
Processing traj_401601
Processing traj_401602
Processing traj_401603
Processing traj_401605
Processing traj_401606
Processing traj_401608
Processing traj_401609
Processing traj_401610
Processing traj_401611
Processing traj_401612
Processing traj_401613
Processing traj_401614
Processing traj_401615
Processing traj_401616
Processing traj_401617
Processing traj_401619
Processing traj_401620
Processing traj_401621
Processing traj_401623
Processing traj_401624
Processing 

In [44]:
# keep only rows with sensor_day below 173, to match the range used by other academic papers
index = flat_df['sensor_day']
flat_df = flat_df.loc[index < 173].copy()

In [45]:
flat_df

Unnamed: 0,values,prev_values,next_values,sensor_day,time_on_day,day_of_week,id
0,0.019333,0.020933,0.020200,0,1,4,400000
1,0.020200,0.019333,0.022450,0,2,4,400000
2,0.022450,0.020200,0.029283,0,3,4,400000
3,0.029283,0.022450,0.055483,0,4,4,400000
4,0.055483,0.029283,0.073933,0,5,4,400000
...,...,...,...,...,...,...,...
10160942,0.041400,0.053383,0.037300,172,19,5,402090
10160943,0.037300,0.041400,0.035400,172,20,5,402090
10160944,0.035400,0.037300,0.029750,172,21,5,402090
10160945,0.029750,0.035400,0.022700,172,22,5,402090


In [46]:
# Add the derived input columns:
#   - categorical_id, categorical_day_of_week and categorical_time_on_day are
#     copies of id, day_of_week and time_on_day
#   - hours_from_start = time_on_day + 24 * sensor_day
flat_df['categorical_id'] = flat_df['id'].copy()
flat_df['hours_from_start'] = flat_df['time_on_day'] + flat_df['sensor_day']*24.
flat_df['categorical_day_of_week'] = flat_df['day_of_week'].copy()
flat_df['categorical_time_on_day'] = flat_df['time_on_day'].copy()

In [47]:
flat_df

Unnamed: 0,values,prev_values,next_values,sensor_day,time_on_day,day_of_week,id,categorical_id,hours_from_start,categorical_day_of_week,categorical_time_on_day
0,0.019333,0.020933,0.020200,0,1,4,400000,400000,1,4,1
1,0.020200,0.019333,0.022450,0,2,4,400000,400000,2,4,2
2,0.022450,0.020200,0.029283,0,3,4,400000,400000,3,4,3
3,0.029283,0.022450,0.055483,0,4,4,400000,400000,4,4,4
4,0.055483,0.029283,0.073933,0,5,4,400000,400000,5,4,5
...,...,...,...,...,...,...,...,...,...,...,...
10160942,0.041400,0.053383,0.037300,172,19,5,402090,402090,4147,5,19
10160943,0.037300,0.041400,0.035400,172,20,5,402090,402090,4148,5,20
10160944,0.035400,0.037300,0.029750,172,21,5,402090,402090,4149,5,21
10160945,0.029750,0.035400,0.022700,172,22,5,402090,402090,4150,5,22


In [50]:
# persist the flattened frame next to the raw data
flat_df.to_pickle(os.path.join(data_folder, 'filtered_output.pkl'))

In [52]:
# read filtered_output.pkl back from data_folder
test_df = pd.read_pickle(os.path.join(data_folder, 'filtered_output.pkl'))

In [58]:
test_df.iloc[:24]

Unnamed: 0,values,prev_values,next_values,sensor_day,time_on_day,day_of_week,id,categorical_id,hours_from_start,categorical_day_of_week,categorical_time_on_day
0,0.019333,0.020933,0.0202,0,1,4,400000,400000,1,4,1
1,0.0202,0.019333,0.02245,0,2,4,400000,400000,2,4,2
2,0.02245,0.0202,0.029283,0,3,4,400000,400000,3,4,3
3,0.029283,0.02245,0.055483,0,4,4,400000,400000,4,4,4
4,0.055483,0.029283,0.073933,0,5,4,400000,400000,5,4,5
5,0.073933,0.055483,0.057067,0,6,4,400000,400000,6,4,6
6,0.057067,0.073933,0.059383,0,7,4,400000,400000,7,4,7
7,0.059383,0.057067,0.06405,0,8,4,400000,400000,8,4,8
8,0.06405,0.059383,0.069033,0,9,4,400000,400000,9,4,9
9,0.069033,0.06405,0.066433,0,10,4,400000,400000,10,4,10
