In [1]:
import numpy as np
import pandas as pd

In [2]:
import os

In [3]:
from scipy.stats import zscore

In [4]:
from bokeh.plotting import figure, show, output_file, save
from bokeh.io import output_notebook
from bokeh.layouts import row, gridplot, layout
from bokeh.palettes import d3
# Configure Bokeh so that show() renders figures inline in the notebook.
output_notebook()

In [5]:
def plot_line(title, x, y, width=720, height=240,
              x_label='Lecture', y_label='Car Lane Occupancy', color='red'):
    """Build a Bokeh figure containing a single line.

    Args:
        title: Text shown as the figure title.
        x: Sequence of x coordinates.
        y: Sequence of y coordinates, matching ``x`` in length.
        width: Figure width passed to Bokeh.
        height: Figure height passed to Bokeh.
        x_label: Label of the x axis. The default is kept for backward
            compatibility.
            NOTE(review): 'Lecture' presumably means 'reading' (sample
            index) -- confirm before renaming the default.
        y_label: Label of the y axis. Pass a different label when ``y`` is
            not an occupancy series.
        color: Color of the line.

    Returns:
        The configured figure; the caller is responsible for displaying it
        (for example with ``show``).
    """
    # NOTE(review): plot_width/plot_height were renamed to width/height in
    # newer Bokeh releases -- confirm against the installed Bokeh version.
    p = figure(
        title=title,
        plot_width=width,
        plot_height=height,
    )
    p.grid.grid_line_alpha = 0.3

    p.xaxis.axis_label = x_label
    p.yaxis.axis_label = y_label

    p.line(x, y, color=color)

    return p

In [6]:
# Let pandas render up to 1000 rows before it switches to a truncated view.
pd.set_option('display.max_rows', 1000)

In [7]:
# Root folder of the project. Set the CBIDMLTSF_ROOT environment variable to
# run the notebook from another location; when it is unset the original
# hard-coded path is used.
PROJECT_ROOT = os.environ.get('CBIDMLTSF_ROOT', '/home/developer/gcp/cbidmltsf')

In [8]:
# Folder that read_single_list() opens files from (e.g. the 'stations_list' file).
data_folder = '{}/datasets/traffic/PEMS-SF'.format(PROJECT_ROOT)
data_folder

'/home/developer/gcp/cbidmltsf/datasets/traffic/PEMS-SF'

In [9]:
# Folder with one pickled time series per station, named ST_<station id>.pkl.
raw_ts_folder = '{}/datasets/traffic/separated_raw'.format(PROJECT_ROOT)
raw_ts_folder

'/home/developer/gcp/cbidmltsf/datasets/traffic/separated_raw'

In [10]:
# Column names of the per-station summary table: the station id, the eight
# statistics produced by Series.describe(), then three data-quality counters.
stats_columns = [
    'station_id',
    'count',
    'mean',
    'std',
    'min',
    'q25',
    'q50',
    'q75',
    'max',
    'zeros',  # number of values exactly equal to 0
    'out-q_005',  # values outside the alpha-quantile band widened by `factor`
    'out-z_3_5'  # values whose z-score falls outside the `z_threshold` limits
]

In [11]:
def process_list(s, variable_type=int, delimiter=None):
    """Convert one bracketed PEMS text line into a list of typed values.

    Square brackets are removed, the remaining text is split on ``delimiter``
    (runs of whitespace when ``delimiter`` is None) and each token is passed
    through ``variable_type``.
    """
    unbracketed = s.replace('[', '').replace(']', '')
    # str.split(None) behaves exactly like str.split(): whitespace splitting.
    return [variable_type(token) for token in unbracketed.split(delimiter)]

In [12]:
def read_single_list(filename):
    """Return the list stored on the first line of a PEMS-custom-format file.

    Args:
        filename: Name of the file, resolved against the module-level
            ``data_folder``.

    Returns:
        The first line parsed by ``process_list`` with its default arguments
        (whitespace-separated integers). An empty file yields an empty list.
    """
    with open(os.path.join(data_folder, filename), 'r') as dat:
        # Only the first line is used, so do not load the whole file.
        first_line = dat.readline()
    return process_list(first_line)

In [13]:
# read_single_list() already returns a new list of station ids, so it is used directly.
stations_list = read_single_list('stations_list')
len(stations_list)

963

In [14]:
# Accumulates one summary row per station; later turned into `stats_df`.
stats_list = list()

# Quantile-based outlier rule: values below quantile(alpha) / factor or above
# quantile(1 - alpha) * factor are counted as outliers.
alpha = 0.005
factor = 1.25

# z-score outlier rule: [lower limit, upper limit].
z_threshold = [-3, 5]

# Maps station id -> the object loaded from that station's pickle file.
ts = dict()

In [15]:
# Rebuild the accumulator in this cell so that re-running it does not append
# duplicate rows to a list left over from a previous execution.
stats_list = list()

for station in stations_list:

    # build a path to the persisted time series
    ts_path = '{}/ST_{}.pkl'.format(raw_ts_folder, station)
    # read the pickle file
    # NOTE: unpickling can execute arbitrary code; only load files produced by this project
    ts[station] = pd.read_pickle(ts_path)
    values = ts[station]['values']

    # start the row with the station id, followed by the describe() statistics
    # (count, mean, std, min, q25, q50, q75, max)
    row_list = [station] + list(values.describe())

    # add count of zero-values
    row_list.append(np.sum(values == 0))

    # add outliers outside the band given by the alpha quantiles,
    # widened by `factor` on both sides
    lower = values.quantile(alpha)
    upper = values.quantile(1 - alpha)

    # floor is the lower limit and ceil the upper limit of the accepted band
    floor, ceil = lower / factor, upper * factor

    out_q = np.sum(values < floor) + np.sum(values > ceil)
    row_list.append(out_q)

    # add outliers given the z-score thresholds (z-scores are computed once)
    z_scores = zscore(values)
    out_z = int(np.sum(z_scores < z_threshold[0]) + np.sum(z_scores > z_threshold[1]))
    row_list.append(out_z)

    stats_list.append(row_list)

stats_df = pd.DataFrame(stats_list, columns=stats_columns)

In [16]:
# Display the per-station summary table (one row per station).
stats_df

Unnamed: 0,station_id,count,mean,std,min,q25,q50,q75,max,zeros,out-q_005,out-z_3_5
0,400000,4151.0,0.049574,0.026267,0.005383,0.028408,0.0549,0.065742,0.352417,0,15,16
1,400001,4151.0,0.042554,0.040099,0.00295,0.017375,0.039017,0.051583,0.361667,0,2,31
2,400009,4151.0,0.067712,0.048093,0.00625,0.028825,0.07175,0.089642,0.414533,0,3,20
3,400010,4151.0,0.057218,0.04232,0.0,0.020625,0.0539,0.083933,0.296817,12,15,1
4,400015,4151.0,0.030989,0.025899,0.00205,0.01515,0.02325,0.043358,0.509483,0,26,12
5,400017,4151.0,0.044701,0.04618,0.001783,0.015633,0.0378,0.052917,0.429367,0,13,16
6,400025,4151.0,0.032761,0.029949,0.001783,0.013217,0.025517,0.041283,0.200517,0,11,5
7,400026,4151.0,0.054498,0.048603,0.002067,0.01875,0.049717,0.067642,0.333367,0,6,3
8,400027,4151.0,0.051937,0.038794,0.002067,0.020908,0.0508,0.07065,0.2725,0,7,2
9,400030,4151.0,0.05037,0.032799,0.002383,0.029542,0.04895,0.062258,0.425583,0,7,27


In [17]:
# plot max values for all time series, to find out if a min-max scaler is required or not

In [18]:
# One point per station: x is the row position in stats_df, y is that
# station's maximum value. plot_line labels the x axis for time-series
# readings by default, so the label is set explicitly for this chart.
max_occupancy_plot = plot_line(title='Maximum car lane occupancy per sensor',
                               x=stats_df.index,
                               y=stats_df['max'])
max_occupancy_plot.xaxis.axis_label = 'Sensor index'
show(max_occupancy_plot)

In [19]:
# most of the time series have maximum values far below 1
# then it is possible to train a better model by min-max scaling them

In [20]:
# count the series whose maximum is exactly 1, and those whose maximum exceeds 0.9
stats_df['max'].eq(1).sum(), stats_df['max'].gt(0.9).sum()

(7, 10)

In [21]:
# use individual min-max scaling on all the time series

In [22]:
# count the series that hold fewer than the expected 4151 readings
stats_df['count'].lt(4151).sum()

0

In [23]:
# list the series that contain more than 12 zero values
stats_df.loc[stats_df['zeros'].gt(12)]

Unnamed: 0,station_id,count,mean,std,min,q25,q50,q75,max,zeros,out-q_005,out-z_3_5
193,400382,4151.0,0.061715,0.051625,0.0,0.026242,0.053233,0.073758,0.499033,15,21,8
305,400615,4151.0,0.05318,0.049585,0.0,0.021217,0.044633,0.064208,0.51525,24,2,28
417,400823,4151.0,0.056617,0.043952,0.0,0.022133,0.050317,0.07845,0.289367,18,18,4
418,400828,4151.0,0.056736,0.047905,0.0,0.019375,0.040617,0.085225,0.420333,52,7,6
420,400835,4151.0,0.05264,0.039313,0.0,0.023083,0.051017,0.070767,0.42995,18,23,6
425,400853,4151.0,0.045155,0.041378,0.0,0.016325,0.039217,0.054992,0.36105,14,18,9
427,400861,4151.0,0.065171,0.050703,0.0,0.026458,0.056167,0.090242,0.404183,14,21,4
433,400870,4151.0,0.039198,0.033018,0.0,0.015317,0.035517,0.051167,0.344017,18,30,27
434,400873,4151.0,0.051006,0.045871,0.0,0.023575,0.0433,0.065783,0.439083,14,26,32
482,400993,4151.0,0.0525,0.038832,0.0,0.02045,0.047383,0.0753,0.568067,14,27,4


In [24]:
# count the series whose quantile-based and z-score outlier counts add up to
# more than 50 (a value flagged by both rules contributes twice)
(stats_df['out-q_005'] + stats_df['out-z_3_5']).gt(50).sum()

50

In [25]:
# plot some time series

In [26]:
# Plot the 'values' column of station 400000 against the offset of each index
# label from the first one.
station_id = 400000
show(plot_line(title='ST_{}'.format(station_id),
               x=ts[station_id].index - ts[station_id].index[0],
               y=ts[station_id]['values']))

In [27]:
# Same plot for station 400828, which has 52 zero values in the summary table.
station_id = 400828
show(plot_line(title='ST_{}'.format(station_id),
               x=ts[station_id].index - ts[station_id].index[0],
               y=ts[station_id]['values']))

In [28]:
# Same plot for station 402090; later cells keep using this station_id.
station_id = 402090
show(plot_line(title='ST_{}'.format(station_id),
               x=ts[station_id].index - ts[station_id].index[0],
               y=ts[station_id]['values']))

In [29]:
# Plot the day_of_week column of the station selected in the previous cell.
# plot_line labels the y axis as occupancy by default, so the label is
# corrected before the figure is shown.
day_of_week_plot = plot_line(title='ST_{}'.format(station_id),
                             x=ts[station_id].index - ts[station_id].index[0],
                             y=ts[station_id]['day_of_week'])
day_of_week_plot.yaxis.axis_label = 'Day of Week'
show(day_of_week_plot)

In [30]:
# NOTE(review): scratch arithmetic left over from exploration; presumably the
# gap between two hours_from_start values -- confirm its purpose or remove the cell.
1559-1583

-24

In [34]:
# Inspect the rows whose index labels lie between (first label + 1550) and
# (first label + 1590); .loc slices by label and includes both endpoints.
# NOTE(review): in the displayed rows day_of_week goes from 7 to 2 when
# sensor_day goes from 64 to 65 -- presumably the anomaly being investigated; confirm.
ts[station_id].loc[ts[station_id].index[0]+1550:ts[station_id].index[0]+1590]

Unnamed: 0,values,sensor_day,time_on_day,day_of_week,id,hours_from_start
10158346,0.12905,64,15,7,402090,1551
10158347,0.1101,64,16,7,402090,1552
10158348,0.1264,64,17,7,402090,1553
10158349,0.091217,64,18,7,402090,1554
10158350,0.07875,64,19,7,402090,1555
10158351,0.06915,64,20,7,402090,1556
10158352,0.063067,64,21,7,402090,1557
10158353,0.053517,64,22,7,402090,1558
10158354,0.04005,64,23,7,402090,1559
10158355,0.0285,65,0,2,402090,1560
