# PROCESS RAW NDOT XML

For each raw NDOT real-time XML file, parse it into a DataFrame and save it as a Parquet file (`.pq`) in the `interim/` data folder.

## Imports

In [1]:
import xmltodict
from pathlib import Path
import datetime
import pandas as pd
#import json
import collections
import holoviews as hv
#import dask
#import dask.dataframe as dd
import hvplot.pandas
#import hvplot.dask
import numpy as np

import shutil

pd.options.plotting.backend = 'holoviews'

data_dir = Path('c:/projects/nrel-presentation/data')

## Functions

In [2]:
def mkcsv(adict, this_detector_id, this_detector_status):
    """
    Flatten one lane's readings into a single row (a list, despite the name).

    adict maps the lane's XML tag names to their values; each of the five
    lane fields is converted to int. The detector id and detector status are
    passed through unchanged.

    Returns [detector id, lane number, detector status, lane status,
             vehicle count, occupancy, vehicle speed].
    """
    lane_keys = (
        'detector-lane-number',
        'lane-status',
        'lane-vehicle-count',
        'occupancy',
        'lane-vehicle-speed',
    )
    lane_no, lane_state, n_vehicles, occ, spd = (int(adict[key]) for key in lane_keys)
    return [this_detector_id, lane_no, this_detector_status, lane_state, n_vehicles, occ, spd]


## Parse all realtime XML

In [39]:
# find files to process
def find_files_to_process():
    """
    Work out which raw realtime XML files have no matching interim parquet file.

    Looks in data_dir/'raw' for '*realtime.xml' and in data_dir/'interim' for
    '*.pq', and matches the two by file stem. Returns a dict with:
      raw_file_stems           - stems of every raw file found
      interim_file_stems       - stems of every interim file found
      raw_files_to_process     - set of raw stems with no interim counterpart
      raw_filenames_to_process - sorted list of full paths to those raw files
    """
    raw_dir = data_dir / 'raw'
    interim_dir = data_dir / 'interim'

    # get lists of raw and interim files
    print('getting lists of raw and interim files')
    raw_paths = list(raw_dir.glob('*realtime.xml'))
    interim_paths = list(interim_dir.glob('*.pq'))

    # find raw files not in interim files
    print('finding raw files not in interim files')
    raw_file_stems = [p.stem for p in raw_paths]
    interim_file_stems = [p.stem for p in interim_paths]
    pending_stems = set(raw_file_stems) - set(interim_file_stems)
    pending_paths = sorted(raw_dir / (stem + '.xml') for stem in pending_stems)

    print('{} raw files yet to process'.format(len(pending_paths)))

    return {
        'raw_file_stems': raw_file_stems,
        'interim_file_stems': interim_file_stems,
        'raw_files_to_process': pending_stems,
        'raw_filenames_to_process': pending_paths,
    }

In [40]:
# Scan the raw and interim folders once and keep the result for the cells below.
D = find_files_to_process()
# Show which entries the result dict provides.
print(D.keys())

getting lists of raw and interim files
finding raw files not in interim files
151 raw files yet to process
dict_keys(['raw_file_stems', 'interim_file_stems', 'raw_files_to_process', 'raw_filenames_to_process'])


In [41]:
# plot file dates
def plot_file_dates(raw_file_stems, interim_file_stems, raw_files_to_process):
    """
    Draw three stacked spike timelines showing when files were captured.

    Each argument is an iterable of file stems whose first 15 characters are
    a 'YYYYmmdd_HHMMSS' timestamp. Returns a one-column holoviews layout
    ordered: raw not processed, interim, raw.
    """
    def stem_times(stems):
        # only the leading 15 characters of a stem carry the timestamp
        return [datetime.datetime.strptime(stem[:15], "%Y%m%d_%H%M%S") for stem in stems]

    def spike_panel(times, title):
        panel = hv.Spikes(times)
        panel.opts(width=1200, height=150, title=title)
        return panel

    raw_times = stem_times(raw_file_stems)
    interim_times = stem_times(interim_file_stems)
    pending_times = stem_times(raw_files_to_process)

    pending_panel = spike_panel(pending_times, 'raw not processed')
    raw_panel = spike_panel(raw_times, 'raw')
    interim_panel = spike_panel(interim_times, 'interim')

    stacked = pending_panel + interim_panel + raw_panel
    return stacked.opts(width=1200).cols(1)


In [42]:
# Build the timelines from the stems collected in D; note that
# D['raw_files_to_process'] holds the unprocessed stems, not full paths.
layout = plot_file_dates(D['raw_file_stems'], D['interim_file_stems'], D['raw_files_to_process'])
# Bare last expression so the notebook renders the layout.
layout

In [35]:
# Parse every pending raw XML file into a DataFrame (one row per detector lane)
# and write it to data_dir/'interim' as a parquet file with the same stem.
# The pending paths come from find_files_to_process(), stored in D above; the
# previous loop variable `raw_files_to_process` only existed inside that
# function, so this cell failed with NameError on a fresh kernel.
for i, xml_file in enumerate(D['raw_filenames_to_process']):
    print(i, end=', ')
    #print(xml_file.name, end=', ')
    try:
        # Read the whole file and close it straight away, so the handle is not
        # held open while parsing and writing.
        with open(xml_file) as rt_file:
            xml_data = rt_file.read()
        raw_data = xmltodict.parse(xml_data)

        period = raw_data['DetectorData']['collection-period']['collection-period-item']

        # date is YYYYMMDD and time is HHMMSS (sliced positionally below)
        dt = period['detection-time-stamp']['date']
        tm = period['detection-time-stamp']['time']
        this_dt = datetime.datetime(int(dt[:4]), int(dt[4:6]), int(dt[6:8]),
                                    int(tm[:2]), int(tm[2:4]), int(tm[4:6]))

        # xmltodict yields a single mapping when a tag occurs once and a list
        # when it repeats, so normalise both levels to lists. Checking against
        # dict also covers OrderedDict, which is a dict subclass.
        all_detectors = period['detector-reports']['detector-report']
        if isinstance(all_detectors, dict):
            all_detectors = [all_detectors]

        all_lanes = []
        for this_detector in all_detectors:
            this_detector_id = this_detector['detector-id']
            this_detector_status = this_detector['detector-status']

            lane_items = this_detector['lane-data']['lane-data-item']
            if isinstance(lane_items, dict):
                lane_items = [lane_items]
            # anything that is neither a mapping nor a list is ignored, as before
            if isinstance(lane_items, list):
                for this_lane in lane_items:
                    all_lanes.append(mkcsv(this_lane, this_detector_id, this_detector_status))

        DF = pd.DataFrame(
            all_lanes,
            columns=['detector_id', 'detector_lane_number', 'detector_status',
                     'lane_status', 'lane_vehicle_count', 'occupancy', 'lane_vehicle_speed'])
        DF['timestamp'] = this_dt

        new_fn = xml_file.with_suffix('.pq').name
        DF.to_parquet(data_dir / 'interim' / new_fn)

        # Moving the raw file to data_dir/'archive' is still disabled.
        # NOTE(review): the earlier shutil.move attempt ran while the source
        # file was still open inside the `with` block, which Windows refuses;
        # presumably that is why it failed. Confirm before re-enabling:
        #shutil.move(str(xml_file), str(data_dir / 'archive' / xml_file.name))
    except Exception as exc:
        # Skip the file but report why. Catching Exception rather than using a
        # bare except keeps Ctrl-C / kernel interrupt working.
        print('File: {} is bad ({}: {}). Skipping.'.format(
            xml_file.name, type(exc).__name__, exc), end=', ')

0, File: 20211104_143210_FASTrealtime.xml is bad. Skipping., 1, File: 20211104_152211_FASTrealtime.xml is bad. Skipping., 2, File: 20211104_152311_FASTrealtime.xml is bad. Skipping., 3, File: 20211104_152412_FASTrealtime.xml is bad. Skipping., 4, File: 20211104_152512_FASTrealtime.xml is bad. Skipping., 5, File: 20211104_152711_FASTrealtime.xml is bad. Skipping., 6, File: 20211104_152911_FASTrealtime.xml is bad. Skipping., 7, File: 20211104_153008_FASTrealtime.xml is bad. Skipping., 8, File: 20211104_161410_FASTrealtime.xml is bad. Skipping., 9, File: 20211104_165812_FASTrealtime.xml is bad. Skipping., 10, File: 20211104_165912_FASTrealtime.xml is bad. Skipping., 11, File: 20211104_174212_FASTrealtime.xml is bad. Skipping., 12, File: 20211104_182314_FASTrealtime.xml is bad. Skipping., 13, File: 20211104_182414_FASTrealtime.xml is bad. Skipping., 14, File: 20211104_182714_FASTrealtime.xml is bad. Skipping., 15, File: 20211104_182814_FASTrealtime.xml is bad. Skipping., 16, File: 20211104

## Explore

In [43]:
# Load one processed interim file to inspect what the parsing step produced.
# The path is built from data_dir so it follows the notebook's single data
# location instead of repeating the absolute path.
this_file = data_dir / 'interim' / '20211027_184419_FASTrealtime.pq'
X = pd.read_parquet(this_file)
X.head()

Unnamed: 0,detector_id,detector_lane_number,detector_status,lane_status,lane_vehicle_count,occupancy,lane_vehicle_speed,timestamp
0,101_1_755,1,1,1,1,0,40,2021-10-27 18:43:30
1,101_1_755,2,1,1,7,4,38,2021-10-27 18:43:30
2,101_1_755,3,1,1,5,3,42,2021-10-27 18:43:30
3,10_1_267,1,1,1,2,0,77,2021-10-27 18:43:30
4,10_1_267,2,1,1,23,6,71,2021-10-27 18:43:30


In [44]:
# Index the frame by timestamp. Guarded and not in-place, so re-running this
# cell is a no-op instead of raising KeyError once 'timestamp' is the index.
if 'timestamp' in X.columns:
    X = X.set_index('timestamp')

In [45]:
# Box plots of the three lane measurement columns: vehicle count, occupancy and speed.
X[['lane_vehicle_count', 'occupancy', 'lane_vehicle_speed']].boxplot(figsize=(12, 12))