In [1]:
import copy
from IPython.display import display
import logging
import numpy
import os
import pandas
import pdb
import plotly
import pprint
import pyarrow
import pyarrow.parquet as pq
import six
import sys
import time

# Initialise plotly's offline notebook integration so figures can be rendered
# inline with plotly.offline.iplot (the iplot calls further down are currently
# commented out).
plotly.offline.init_notebook_mode(connected=True)

In [2]:
# Notebook-wide logger. The INFO level means LOG.debug(...) calls elsewhere in
# the notebook are silenced unless this level is lowered.
LOG = logging.getLogger('CS230')
LOG.setLevel(logging.INFO)

# logging.getLogger returns the same logger object on every call, so re-running
# this cell would otherwise attach one more handler each time and every message
# would be printed once per run. Drop handlers left over from earlier runs.
for stale_handler in list(LOG.handlers):
    LOG.removeHandler(stale_handler)

# create a stream handler (a StreamHandler with no argument, not a file handler)
handler = logging.StreamHandler()
handler.setLevel(logging.INFO)

# create a logging format: "HH-MM-SS | LEVEL | message"
formatter = logging.Formatter('%(asctime)s | %(levelname)s | %(message)s', '%H-%M-%S')
handler.setFormatter(formatter)

# add the handler to the logger
LOG.addHandler(handler)

In [3]:
# Data directory: whatever precedes the first 'CS230_project' in the current
# working directory, followed by CS230_project/data.
DATA_DIR = os.path.join(os.getcwd().split('CS230_project')[0], 'CS230_project', 'data')

# NOTE(review): entries are compared against the complete file name, so
# 'grandsport.parquet' only excludes a file with exactly that name. Files such
# as '20130810_01_01_01_grandsport.parquet' (seen in this notebook's output)
# are NOT excluded. Confirm whether suffix matching was intended.
EXCLUDE = ['grandsport.parquet', '250lm.parquet']

# Collect every non-excluded .parquet file below DATA_DIR, in sorted order.
FILE_PATHS = sorted(
    os.path.join(dir_path, file_name)
    for dir_path, dir_names, file_names in os.walk(DATA_DIR)
    for file_name in file_names
    if file_name.endswith('.parquet') and file_name not in EXCLUDE
)
LOG.debug(FILE_PATHS)

In [4]:
# Signals taken from the raw data ('latitude' and 'longitude' are left out).
COLUMNS_ORIG = [
    'time', 'handwheelAngle', 'throttle', 'brake', 'altitude', 'horizontalSpeed',
    'vxCG', 'vyCG', 'yawAngle', 'pitchAngle', 'rollAngle', 'distance',
]

# Signals that get a discrete-difference companion column named 'diff_<signal>'.
COLUMNS_TO_DIFF = ['yawAngle', 'pitchAngle', 'rollAngle', 'horizontalSpeed', 'distance', 'vxCG', 'vyCG']
COLUMN_DIFF_PREFIX = 'diff_'

# Full column set: the raw signals followed by the diff columns, built as a
# separate list so COLUMNS_ORIG itself is left untouched.
COLUMNS = list(COLUMNS_ORIG)
COLUMNS.extend(COLUMN_DIFF_PREFIX + name for name in COLUMNS_TO_DIFF)

# Signals (and their diff columns) that are overwritten together when a
# GPS-related jump is cleaned later in the notebook.
COLUMNS_WITH_GPS_JUMP = ['horizontalSpeed', 'vxCG', 'vyCG', 'yawAngle', 'pitchAngle', 'rollAngle']
DIFF_COLUMNS_WITH_GPS_JUMP = [COLUMN_DIFF_PREFIX + name for name in COLUMNS_WITH_GPS_JUMP]

# Row offsets used when computing the discrete differences.
STRIDES = [1]

# load data from parquet files

In [6]:
# DATA maps a file key to a dict of dataframes; the untouched frame read from
# disk is stored under 'orig'.
DATA = {}

# NOTE(review): only the first two discovered files are loaded ([:2]).
for file_path in FILE_PATHS[:2]:
    # Key for this file: the path with any occurrence of "<cwd>/" removed.
    # Paths that do not contain the working directory keep their full form.
    file_name = file_path.replace(os.getcwd() + '/', '')

    # parquet file -> pyarrow table -> pandas dataframe
    df = pq.read_table(file_path).to_pandas()

    DATA[file_name] = {'orig': df}

    LOG.debug('loaded %s', file_path)

# data size in rows

In [7]:
# Row count of every loaded 'orig' dataframe, in DATA's iteration order.
row_counts = [(file_name, len(dfs['orig'])) for file_name, dfs in six.iteritems(DATA)]

for file_name, row_count in row_counts:
    LOG.info('%s : %s', file_name, row_count)

# Total number of rows across all loaded files.
data_size_total = sum(row_count for _, row_count in row_counts)

LOG.info('total: %s', data_size_total)

00-59-31 | INFO | /home/dave/code/github/CS230_project/data/2013_Monterey_Motorsports_Reunion/20130810_01_01_01_grandsport.parquet : 2564000
00-59-31 | INFO | /home/dave/code/github/CS230_project/data/2013_Monterey_Motorsports_Reunion/20130810_02_01_01_grandsport.parquet : 2325000
00-59-31 | INFO | total: 4889000


# add columns for discrete derivatives

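- `diff_yawAngle` needs correcting wherever `yawAngle` wraps around the ±180 boundary (its sign flips between consecutive samples); without the correction the wrap shows up as a jump of roughly ±360.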

In [8]:
# For every loaded file and every stride, build a copy of the 'orig' dataframe
# with one extra 'diff_<signal>' column per entry of COLUMNS_TO_DIFF
# (value at row r minus value at row r - stride) and store it in
# DATA[file_name][stride].
for file_name, dfs in six.iteritems(DATA):
    for stride in STRIDES:
        LOG.debug('%s stride: %s', file_name, stride)

        # work on a copy so the 'orig' frame stays untouched
        df = dfs['orig'].copy(deep=True)
        for column in COLUMNS_TO_DIFF:
            df[COLUMN_DIFF_PREFIX + column] = df[column] - df[column].shift(stride)

        # Correct for yawAngle wrap-around. yawAngle is treated as living in
        # [-180, 180]; crossing that boundary makes the raw difference jump by
        # about a full turn. Shifting the raw difference by 360 towards zero
        # recovers the true change AND keeps its sign: 179 -> -179 is +2,
        # -179 -> 179 is -2. (The previous formula
        # (180 - |a|) + (180 - |b|) returned +2 in both cases.)
        # Boolean masks are used instead of a per-row loop, which also avoids
        # mixing index labels with iloc positions.
        wrapped_up = df['diff_yawAngle'] > 300
        wrapped_down = df['diff_yawAngle'] < -300
        df.loc[wrapped_up, 'diff_yawAngle'] -= 360
        df.loc[wrapped_down, 'diff_yawAngle'] += 360
        LOG.debug('# diff_yawAngle fixed: %s', int(wrapped_up.sum()) + int(wrapped_down.sum()))

        # replace NaN with zeros in diff columns (the first `stride` rows have
        # no predecessor, so their differences are NaN)
        diff_columns = [COLUMN_DIFF_PREFIX + column for column in COLUMNS_TO_DIFF]
        df[diff_columns] = df[diff_columns].fillna(0)

        DATA[file_name][stride] = df

### print max discrete derivative per column

In [9]:
# Largest absolute discrete difference per diff column, keyed by file base
# name and then by stride.
output = {}

for file_name, dfs in six.iteritems(DATA):
    # report under the bare file name rather than the full path
    file_name = os.path.basename(file_name)
    output[file_name] = {}
    for stride in STRIDES:
        df = dfs[stride]
        # Series.abs().max() is computed by pandas rather than by iterating
        # millions of rows through the Python built-in max(), and it skips NaN
        # instead of giving an order-dependent result. float() keeps the
        # printed values plain numbers.
        output[file_name][stride] = {
            COLUMN_DIFF_PREFIX + column: round(float(df[COLUMN_DIFF_PREFIX + column].abs().max()), 4)
            for column in COLUMNS_TO_DIFF
        }

pprint.pprint(output)

{'20130810_01_01_01_grandsport.parquet': {1: {'diff_distance': 0.534,
                                              'diff_horizontalSpeed': 5.28,
                                              'diff_pitchAngle': 1.23,
                                              'diff_rollAngle': 3.4,
                                              'diff_vxCG': 5.29,
                                              'diff_vyCG': 0.57,
                                              'diff_yawAngle': 48.93}},
 '20130810_02_01_01_grandsport.parquet': {1: {'diff_distance': 0.562,
                                              'diff_horizontalSpeed': 5.99,
                                              'diff_pitchAngle': 0.97,
                                              'diff_rollAngle': 2.27,
                                              'diff_vxCG': 6.02,
                                              'diff_vyCG': 0.68,
                                              'diff_yawAngle': 10.4}}}


# clean discontinuities

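Find diff values whose magnitude exceeds a per-column threshold (an unexpected jump). If the `altitude` value in the prior row is NaN, overwrite that row's GPS-affected diff columns with the prior row's values.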

In [10]:
# (diff column, magnitude) pairs: a row is treated as a jump in that column
# when the absolute diff value exceeds the magnitude.
thresholds = [
    ('diff_yawAngle', 10),
    ('diff_pitchAngle', 2),
    ('diff_rollAngle', 2),
    ('diff_distance', 10),
    ('diff_vxCG', 10),
    ('diff_vyCG', 10)
]

# For every jump whose predecessor row (stride rows earlier) has a NaN
# altitude, overwrite all DIFF_COLUMNS_WITH_GPS_JUMP values of the jump row
# with the predecessor's values. The dataframes stored in DATA are modified
# in place.
for file_name, dfs in six.iteritems(DATA):
    for stride in STRIDES:
        LOG.info('%s stride: %s', file_name, stride)
        df = dfs[stride]

        # Work purely with row/column positions so the logic does not depend
        # on the index labels happening to equal the positions.
        altitude = df['altitude'].values
        gps_diff_positions = [df.columns.get_loc(name) for name in DIFF_COLUMNS_WITH_GPS_JUMP]

        for column, threshold in thresholds:
            jumped = ((df[column] > threshold) | (df[column] < -threshold)).values
            # Rows are visited in ascending order on purpose: a cleaned row
            # can serve as the source for the jump row that follows it.
            for row in numpy.flatnonzero(jumped):
                previous_row = row - stride
                # a negative position would silently wrap to the end of the frame
                if previous_row < 0 or not numpy.isnan(altitude[previous_row]):
                    continue

                current_values = df.iloc[row, gps_diff_positions]
                previous_values = df.iloc[previous_row, gps_diff_positions]
                # Log "old values -> replacement values". The replacement is
                # read from row - stride, the same row that is copied below.
                # to_string() separates entries with '\n' on every platform.
                LOG.warning('GPS NaN at %s : %s -> %s', df.index[row],
                            current_values.to_string(header=False, index=False).replace('\n', ', '),
                            previous_values.to_string(header=False, index=False).replace('\n', ', '))
                # .at is a single-value accessor; assigning several columns at
                # once needs .iloc / .loc.
                df.iloc[row, gps_diff_positions] = previous_values.values

00-59-38 | INFO | /home/dave/code/github/CS230_project/data/2013_Monterey_Motorsports_Reunion/20130810_01_01_01_grandsport.parquet stride: 1
00-59-38 | INFO | /home/dave/code/github/CS230_project/data/2013_Monterey_Motorsports_Reunion/20130810_02_01_01_grandsport.parquet stride: 1


### print max discrete derivative per column after cleaning

In [11]:
# Largest absolute discrete difference per diff column after the cleaning
# step, keyed by file base name and then by stride.
output = {}

for file_name, dfs in six.iteritems(DATA):
    # report under the bare file name rather than the full path
    file_name = os.path.basename(file_name)
    output[file_name] = {}
    for stride in STRIDES:
        df = dfs[stride]
        # Series.abs().max() is computed by pandas rather than by iterating
        # millions of rows through the Python built-in max(), and it skips NaN
        # instead of giving an order-dependent result. float() keeps the
        # printed values plain numbers.
        output[file_name][stride] = {
            COLUMN_DIFF_PREFIX + column: round(float(df[COLUMN_DIFF_PREFIX + column].abs().max()), 4)
            for column in COLUMNS_TO_DIFF
        }

pprint.pprint(output)

{'20130810_01_01_01_grandsport.parquet': {1: {'diff_distance': 0.534,
                                              'diff_horizontalSpeed': 0.66,
                                              'diff_pitchAngle': 0.61,
                                              'diff_rollAngle': 1.26,
                                              'diff_vxCG': 0.32,
                                              'diff_vyCG': 0.57,
                                              'diff_yawAngle': 0.79}},
 '20130810_02_01_01_grandsport.parquet': {1: {'diff_distance': 0.562,
                                              'diff_horizontalSpeed': 0.42,
                                              'diff_pitchAngle': 0.97,
                                              'diff_rollAngle': 2.27,
                                              'diff_vxCG': 0.4,
                                              'diff_vyCG': 0.68,
                                              'diff_yawAngle': 0.73}}}


### inspect any remaining discontinuities

In [12]:
# Set to True to display the rows around every remaining discontinuity.
manually_inspect = False

# (diff column, magnitude) pairs: a row is treated as a jump in that column
# when the absolute diff value exceeds the magnitude.
thresholds = [
    ('diff_yawAngle', 10),
    ('diff_pitchAngle', 2),
    ('diff_rollAngle', 2),
    ('diff_distance', 10),
    ('diff_vxCG', 10),
    ('diff_vyCG', 10)
]

# Tally of rows still exceeding a threshold:
#   discontinuities['count']['total']            -> overall count
#   discontinuities['count'][file_name][stride]  -> count per file and stride
#   discontinuities['indexes'][file_name][stride]-> the offending index values
# A row exceeding the threshold in several columns is counted once per column.
discontinuities = {
    'count': {
        'total': 0
    },
    'indexes': {}
}

for file_name, dfs in six.iteritems(DATA):
    discontinuities['count'][file_name] = {}
    discontinuities['indexes'][file_name] = {}

    for stride in STRIDES:
        LOG.info('%s stride: %s', file_name.split('/')[-1], stride)
        df = dfs[stride]
        discontinuities['count'][file_name][stride] = 0
        discontinuities['indexes'][file_name][stride] = []

        for column, threshold in thresholds:
            indexes = df.index[(df[column] > threshold) | (df[column] < -threshold)]
            indexes_list = indexes.to_list()

            if manually_inspect:
                # NOTE(review): index values are used as iloc positions here,
                # which assumes the default 0..n-1 index -- confirm.
                for i in indexes:
                    LOG.debug('%s : %s : %s', column, i, df.iloc[i][column])
                    # Clamp the window start at 0: for i < 5 a negative start
                    # would wrap around and produce an empty slice.
                    display(df.iloc[max(i-5, 0):i+5,:][COLUMNS])

            num_found = len(indexes_list)
            discontinuities['count']['total'] += num_found
            discontinuities['count'][file_name][stride] += num_found
            discontinuities['indexes'][file_name][stride].extend(indexes_list)

pprint.pprint(discontinuities['count'])

00-59-42 | INFO | 20130810_01_01_01_grandsport.parquet stride: 1
00-59-42 | INFO | 20130810_02_01_01_grandsport.parquet stride: 1


{'/home/dave/code/github/CS230_project/data/2013_Monterey_Motorsports_Reunion/20130810_01_01_01_grandsport.parquet': {1: 0},
 '/home/dave/code/github/CS230_project/data/2013_Monterey_Motorsports_Reunion/20130810_02_01_01_grandsport.parquet': {1: 9},
 'total': 9}


# view data tables

In [13]:
# Show the first and last rows of every dataframe held in DATA. The 'orig'
# frame is limited to the raw signal columns; the per-stride frames also show
# the diff columns.
for file_name, dfs in six.iteritems(DATA):
    for stride, df in six.iteritems(dfs):
        print('%s : %s : %s' % (file_name, stride, len(df)))
        columns = COLUMNS_ORIG if stride == 'orig' else COLUMNS
        selected = df[columns]
        display(selected.head())
        display(selected.tail())

/home/dave/code/github/CS230_project/data/2013_Monterey_Motorsports_Reunion/20130810_01_01_01_grandsport.parquet : orig : 2564000


Unnamed: 0,time,handwheelAngle,throttle,brake,altitude,horizontalSpeed,vxCG,vyCG,yawAngle,pitchAngle,rollAngle,distance
0,0.0,29.3,0.7,2.3,,0.0,-0.0,0.0,0.7,0.4,-1.0,0.0
1,0.001,29.4,0.7,1.9,,0.0,-0.0,0.0,0.7,0.4,-1.0,0.0
2,0.002,29.4,0.7,1.9,,0.0,-0.0,0.0,0.7,0.4,-1.0,0.0
3,0.003,29.4,0.7,1.4,,0.0,-0.0,0.0,0.7,0.4,-1.0,0.0
4,0.004,29.4,0.7,0.9,,0.0,-0.0,0.0,0.7,0.4,-1.0,0.0


Unnamed: 0,time,handwheelAngle,throttle,brake,altitude,horizontalSpeed,vxCG,vyCG,yawAngle,pitchAngle,rollAngle,distance
2563995,2563.995,-6.4,0.9,0.0,194.171,0.03,-0.03,0.02,-14.09,1.22,1.2,39653.28
2563996,2563.996,-6.4,0.9,0.0,194.171,0.03,-0.03,0.02,-14.09,1.22,1.2,39653.28
2563997,2563.997,-6.4,0.9,0.0,194.171,0.03,-0.03,0.02,-14.09,1.22,1.2,39653.28
2563998,2563.998,-6.4,0.9,0.0,194.171,0.03,-0.03,0.02,-14.09,1.22,1.2,39653.28
2563999,2563.999,-6.4,0.9,0.0,194.171,0.03,-0.03,0.02,-14.09,1.22,1.2,39653.28


/home/dave/code/github/CS230_project/data/2013_Monterey_Motorsports_Reunion/20130810_01_01_01_grandsport.parquet : 1 : 2564000


Unnamed: 0,time,handwheelAngle,throttle,brake,altitude,horizontalSpeed,vxCG,vyCG,yawAngle,pitchAngle,rollAngle,distance,diff_yawAngle,diff_pitchAngle,diff_rollAngle,diff_horizontalSpeed,diff_distance,diff_vxCG,diff_vyCG
0,0.0,29.3,0.7,2.3,,0.0,-0.0,0.0,0.7,0.4,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.001,29.4,0.7,1.9,,0.0,-0.0,0.0,0.7,0.4,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.002,29.4,0.7,1.9,,0.0,-0.0,0.0,0.7,0.4,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.003,29.4,0.7,1.4,,0.0,-0.0,0.0,0.7,0.4,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.004,29.4,0.7,0.9,,0.0,-0.0,0.0,0.7,0.4,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,time,handwheelAngle,throttle,brake,altitude,horizontalSpeed,vxCG,vyCG,yawAngle,pitchAngle,rollAngle,distance,diff_yawAngle,diff_pitchAngle,diff_rollAngle,diff_horizontalSpeed,diff_distance,diff_vxCG,diff_vyCG
2563995,2563.995,-6.4,0.9,0.0,194.171,0.03,-0.03,0.02,-14.09,1.22,1.2,39653.28,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2563996,2563.996,-6.4,0.9,0.0,194.171,0.03,-0.03,0.02,-14.09,1.22,1.2,39653.28,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2563997,2563.997,-6.4,0.9,0.0,194.171,0.03,-0.03,0.02,-14.09,1.22,1.2,39653.28,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2563998,2563.998,-6.4,0.9,0.0,194.171,0.03,-0.03,0.02,-14.09,1.22,1.2,39653.28,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2563999,2563.999,-6.4,0.9,0.0,194.171,0.03,-0.03,0.02,-14.09,1.22,1.2,39653.28,0.0,0.0,0.0,0.0,0.0,0.0,0.0


/home/dave/code/github/CS230_project/data/2013_Monterey_Motorsports_Reunion/20130810_02_01_01_grandsport.parquet : orig : 2325000


Unnamed: 0,time,handwheelAngle,throttle,brake,altitude,horizontalSpeed,vxCG,vyCG,yawAngle,pitchAngle,rollAngle,distance
0,0.0,207.2,0.9,0.0,,0.0,-0.0,0.0,0.6,0.4,-1.0,0.0
1,0.001,207.2,0.9,0.0,,0.0,-0.0,0.0,0.6,0.4,-1.0,0.0
2,0.002,207.1,0.9,0.0,,0.0,-0.0,0.0,0.6,0.4,-1.0,0.0
3,0.003,207.1,0.9,0.0,,0.0,-0.0,0.0,0.6,0.4,-1.0,0.0
4,0.004,207.1,0.9,0.0,,0.0,-0.0,0.0,0.6,0.4,-1.0,0.0


Unnamed: 0,time,handwheelAngle,throttle,brake,altitude,horizontalSpeed,vxCG,vyCG,yawAngle,pitchAngle,rollAngle,distance
2324995,2324.995,-3.2,0.9,0.0,194.156,0.01,-0.0,0.01,-12.79,0.66,0.85,39662.235
2324996,2324.996,-3.2,0.9,0.0,194.156,0.01,-0.0,0.01,-12.79,0.66,0.85,39662.235
2324997,2324.997,-3.2,0.9,0.0,194.156,0.01,-0.0,0.01,-12.79,0.66,0.85,39662.235
2324998,2324.998,-3.2,0.9,0.0,194.156,0.01,-0.0,0.01,-12.79,0.66,0.85,39662.235
2324999,2324.999,-3.2,0.9,0.0,194.156,0.01,-0.0,0.01,-12.79,0.66,0.85,39662.235


/home/dave/code/github/CS230_project/data/2013_Monterey_Motorsports_Reunion/20130810_02_01_01_grandsport.parquet : 1 : 2325000


Unnamed: 0,time,handwheelAngle,throttle,brake,altitude,horizontalSpeed,vxCG,vyCG,yawAngle,pitchAngle,rollAngle,distance,diff_yawAngle,diff_pitchAngle,diff_rollAngle,diff_horizontalSpeed,diff_distance,diff_vxCG,diff_vyCG
0,0.0,207.2,0.9,0.0,,0.0,-0.0,0.0,0.6,0.4,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.001,207.2,0.9,0.0,,0.0,-0.0,0.0,0.6,0.4,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.002,207.1,0.9,0.0,,0.0,-0.0,0.0,0.6,0.4,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.003,207.1,0.9,0.0,,0.0,-0.0,0.0,0.6,0.4,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.004,207.1,0.9,0.0,,0.0,-0.0,0.0,0.6,0.4,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,time,handwheelAngle,throttle,brake,altitude,horizontalSpeed,vxCG,vyCG,yawAngle,pitchAngle,rollAngle,distance,diff_yawAngle,diff_pitchAngle,diff_rollAngle,diff_horizontalSpeed,diff_distance,diff_vxCG,diff_vyCG
2324995,2324.995,-3.2,0.9,0.0,194.156,0.01,-0.0,0.01,-12.79,0.66,0.85,39662.235,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2324996,2324.996,-3.2,0.9,0.0,194.156,0.01,-0.0,0.01,-12.79,0.66,0.85,39662.235,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2324997,2324.997,-3.2,0.9,0.0,194.156,0.01,-0.0,0.01,-12.79,0.66,0.85,39662.235,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2324998,2324.998,-3.2,0.9,0.0,194.156,0.01,-0.0,0.01,-12.79,0.66,0.85,39662.235,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2324999,2324.999,-3.2,0.9,0.0,194.156,0.01,-0.0,0.01,-12.79,0.66,0.85,39662.235,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# plot data

In [15]:
# Down-sampling for plotting: every `stride`-th sample among the first
# `num_points` rows.
num_points = 2000000
stride = 2500
print('max points per plot: %s' % (num_points / stride))

file_names = list(DATA.keys())

# Per-file plotly traces, layouts and (currently unused) figures.
data = {}
layouts = {}
figs = {}
i = 0

for file_name in file_names:
    # stride-1 dataframe (key 1) holding the diff columns
    df = DATA[file_name][1]
    times = df['time'].values
    picked = slice(None, num_points, stride)

    # one scatter trace per diff column, plotted against time
    data[file_name] = [
        plotly.graph_objs.Scatter(
            x=times[picked],
            y=df[COLUMN_DIFF_PREFIX + sig_name].values[picked],
            name=COLUMN_DIFF_PREFIX + sig_name,
        )
        for sig_name in COLUMNS_TO_DIFF
    ]

    layouts[file_name] = plotly.graph_objs.Layout(title=file_name)

    # rendering is currently disabled
    #figs[file_name] = plotly.graph_objs.Figure(data=data[file_name], layout=layouts[file_name])
    #plotly.offline.iplot(figs[file_name])

max points per plot: 800.0


# plot as subplots

In [16]:
# Down-sampling for plotting: every `stride`-th sample among the first
# `num_points` rows.
num_points = 2000000
stride = 2500
print('max points per plot: %s' % (num_points / stride))

file_names = list(DATA.keys())

# One subplot row per loaded file, titled with the file key.
fig = plotly.tools.make_subplots(rows=len(DATA), cols=1, subplot_titles=file_names)
fig['layout'].update(height=3000, width=800)

data = {}
layouts = {}
i = 0

# subplot rows are 1-based, hence enumerate(..., 1)
for i, file_name in enumerate(file_names, 1):
    # stride-1 dataframe (key 1) holding the diff columns
    df = DATA[file_name][1]

    data[file_name] = []
    times = df['time'].values
    picked = slice(None, num_points, stride)

    # add one scatter trace per diff column to this file's subplot row
    for sig_name in COLUMNS_TO_DIFF:
        diff_name = COLUMN_DIFF_PREFIX + sig_name
        trace = plotly.graph_objs.Scatter(
            x=times[picked],
            y=df[diff_name].values[picked],
            name=diff_name,
        )
        fig.append_trace(trace, i, 1)

# rendering is currently disabled
#plotly.offline.iplot(fig)

max points per plot: 800.0
This is the format of your plot grid:
[ (1,1) x1,y1 ]
[ (2,1) x2,y2 ]

