## Imports

In [None]:
from bokeh.plotting import figure, output_notebook, show
from bokeh.layouts import gridplot
from bokeh.palettes import Spectral6
output_notebook()

import pandas as pd
import numpy as np

import os

## Create some stats about each trajectory

In [None]:
# Root folder of the raw data; the trajectories analysed here live in its 'sample' sub-folder.
data_dir = '../../data/raw'
base_dir = os.path.join(data_dir, 'sample')
# os.listdir returns entries in arbitrary order. Sorting keeps the order of the
# environments (and of everything derived from it) reproducible between runs.
envs = sorted(os.listdir(base_dir))
envs

In [None]:
# Column layout of the per-trajectory statistics table.
STATS_COLUMNS = ['environment', 'path', 'trajectory', 'samples', 'duration',
                 'linear_mean', 'linear_min', 'linear_max', 'linear_std', 'linear_integral',
                 'angular_mean', 'angular_min', 'angular_max', 'angular_std', 'angular_integral',
                 'target_distance', 'target_distance_mean', 'target_distance_min', 'target_distance_max']


def trajectory_stats(environment, csv_path):
    """Summarise one trajectory CSV as a row matching ``STATS_COLUMNS``.

    Parameters
    ----------
    environment : str
        Name of the environment folder the file was found in.
    csv_path : str
        Path of the trajectory CSV. It must provide the columns 'stamp',
        'target_x', 'target_y', 'linear_x' and 'angular_z'.

    Returns
    -------
    list
        One value per entry of ``STATS_COLUMNS``. A file without data rows
        yields 0 samples and NaN for every numeric statistic.
    """
    # Trajectory id: the token after the last underscore of the file name,
    # with everything from the first dot onwards stripped.
    trajectory_id = os.path.basename(csv_path).split('.')[0].split('_')[-1]

    df = pd.read_csv(csv_path)
    if df.empty:
        # Nothing to summarise; the NaNs let a later dropna() discard the row.
        return [environment, csv_path, trajectory_id, 0.0] + [np.nan] * (len(STATS_COLUMNS) - 4)

    current = df[['stamp', 'linear_x', 'angular_z']].describe()

    # Stamps are scaled by 1e-9 (presumably nanoseconds -> seconds; confirm against the recorder).
    duration = (df.stamp.iloc[-1] - df.stamp.iloc[0]) * 1e-9

    # Euclidean norm of the (target_x, target_y) vector for every sample.
    distance_to_target = np.linalg.norm(df[['target_x', 'target_y']].values, axis=1)

    # Time until the next sample. The last sample has no successor, so it
    # reuses the preceding period (forward fill).
    sample_period = (df.stamp.shift(-1) - df.stamp).ffill() * 1e-9

    return [environment, csv_path,
            trajectory_id,
            current.stamp.loc['count'],
            duration,
            current.linear_x.loc['mean'],
            current.linear_x.loc['min'],
            current.linear_x.loc['max'],
            current.linear_x.loc['std'],
            (df['linear_x'] * sample_period).sum(),
            current.angular_z.loc['mean'],
            current.angular_z.loc['min'],
            current.angular_z.loc['max'],
            current.angular_z.loc['std'],
            (df['angular_z'] * sample_period).sum(),
            distance_to_target[0],
            np.mean(distance_to_target),
            np.min(distance_to_target),
            np.max(distance_to_target)]


# Collect all rows first and build the frame once: appending with
# `stats.loc[i] = ...` re-allocates the frame on every row and leaves every
# column with dtype object.
records = []
for e in envs:
    path = os.path.join(base_dir, e)
    for file_name in os.listdir(path):
        records.append(trajectory_stats(e, os.path.join(path, file_name)))

stats = pd.DataFrame(records, columns=STATS_COLUMNS)

In [None]:
# Store the table inside data_dir: the cell that reloads the statistics reads
# os.path.join(data_dir, 'stats.csv'), not a file in the working directory.
stats.to_csv(os.path.join(data_dir, 'stats.csv'))

## Read already created stats data

In [None]:
# Reload the stored per-trajectory statistics; the first CSV column is used as the row index.
stats_csv = os.path.join(data_dir, 'stats.csv')
stats = pd.read_csv(stats_csv, index_col=0)
print(stats.shape)

In [None]:
# Descriptive statistics of the numeric columns of the statistics table.
stats_summary = stats.describe()
stats_summary

In [None]:
from bokeh.models import Legend

# bokeh.charts (and its Histogram) is no longer part of Bokeh, so the
# histogram is assembled from numpy bin counts and quad glyphs instead.

# Bin edges are computed once on all trajectories so that every environment
# is counted against the same bins.
_, bin_edges = np.histogram(stats['samples'], bins=30)

h = figure(title='Samples per trajectory by environment')
h.xaxis.axis_label = 'samples'
h.yaxis.axis_label = 'trajectories'

legend_items = []
for k, (env_name, env_stats) in enumerate(stats.groupby('environment')):
    counts, _ = np.histogram(env_stats['samples'], bins=bin_edges)
    bars = h.quad(top=counts, bottom=0,
                  left=bin_edges[:-1], right=bin_edges[1:],
                  # Cycle through the palette if there are more than six environments.
                  fill_color=Spectral6[k % len(Spectral6)],
                  fill_alpha=0.6, line_color='white')
    legend_items.append((str(env_name), [bars]))

# Explicit legend placed outside the plot area, one entry per environment.
h.add_layout(Legend(items=legend_items), 'right')
show(h)

Analyze the longest trajectories and compute a few stats

In [None]:
# Trajectories with more than 1000 samples, ordered by length.
# The preceding variant that additionally capped the length at 1500 samples
# was overwritten immediately and never took effect, so it has been removed.
long = stats.loc[stats['samples'] > 1000].sort_values('samples')

Get the number of trajectories included after the long filter

In [None]:
# Non-null entries per column for each environment, i.e. how many long
# trajectories each environment contributes.
long_by_environment = long.groupby('environment')
long_by_environment.count()

Get the number of samples included after the long filter

In [None]:
# Column sums per environment; the 'samples' column gives the number of
# samples kept by the long filter. numeric_only=True keeps the string columns
# (e.g. 'path') out of the sum: recent pandas versions would otherwise
# concatenate them.
long.groupby('environment').sum(numeric_only=True)

List of the included environments

In [None]:
# Distinct environments that still occur among the long trajectories.
# Note: this replaces the directory listing previously stored in `envs`.
envs = pd.unique(long['environment'])
print(envs)

Write the filtered files into a .txt file. Each line contains the path to one trajectory

In [None]:
# Export the long trajectories of a single environment, chosen by its
# position in `envs`.
# NOTE(review): only the environment at index 1 is written - confirm this is intended.
env = envs[1]
paths = long.loc[long['environment'] == env, 'path'].values

# One trajectory path per line.
with open(os.path.join(data_dir, 'paths.txt'), 'w') as f:
    f.writelines(p + '\n' for p in paths)

## Further analysis of trajectories

Remove trajectories with NaN values or with 10 or fewer samples

In [None]:
# Drop rows containing NaN, then keep trajectories with more than 10 samples.
# The mask is derived from the NaN-free frame so that it shares the index of
# the frame it filters.
stats_no_nan = stats.dropna()
stats_clean = stats_no_nan.loc[stats_no_nan['samples'] > 10]
print(stats_clean.shape)
# A bare expression in the middle of a cell is not rendered, so the preview of
# the shortest remaining trajectories is printed explicitly.
print(stats_clean.sort_values('samples').head())

# Split by length. Trajectories with exactly 1000 samples fall into neither group.
long = stats_clean.loc[stats_clean['samples'] > 1000].sort_values('samples')
short = stats_clean.loc[stats_clean['samples'] < 1000].sort_values('samples')
long.describe()

### Perform a PCA to visualize the data in 2D

In [None]:
from sklearn.decomposition import PCA

# Feature matrix of the short trajectories: every statistics column from
# 'linear_mean' through 'target_distance_max' (label slice, both ends included).
feature_columns = short.loc[:, 'linear_mean':'target_distance_max']
X = feature_columns.values
print(X.shape)

# Fit a two-component PCA on the features and project them onto it.
pca_dec = PCA(n_components=2).fit(X)
Xpca = pca_dec.transform(X)
print(Xpca.shape)

# Optional: project the long trajectories with the same fitted model.
#Xpca_long = pca_dec.transform(long.loc[:,'linear_mean':'target_distance_max'].values)
#print(Xpca_long.shape)


In [None]:
# Scatter plot of the two principal components of the short trajectories.
# Each marker's radius is the trajectory's sample count scaled by 1e-3.
marker_radius = short['samples'] * 1e-3

p = figure(plot_height=800, plot_width=800)
p.circle(x=Xpca[:, 0], y=Xpca[:, 1], radius=marker_radius)
# Optional overlay of the long trajectories (requires Xpca_long):
#p.circle(Xpca_long[:,0], Xpca_long[:,1], color='firebrick', radius=long['samples']*1e-3)

show(p)

In [None]:
from bokeh.models import ColumnDataSource, HoverTool

# Data behind the glyphs: PCA coordinates plus the fields shown in the tooltip.
plot_data = ColumnDataSource(data={
    'x': Xpca[:, 0],
    'y': Xpca[:, 1],
    'samples': short['samples'],
    'id': short['trajectory'],
})

# Tooltip rows: "$" fields come from the plot itself, "@" fields from plot_data.
tooltip_rows = [
    ("index", "$index"),
    ("(x,y)", "($x, $y)"),
    ("samples", "@samples"),
    ("id", "@id"),
]
hover = HoverTool(tooltips=tooltip_rows)

# Same projection as before, with fixed-size markers and hover inspection.
p1 = figure(plot_height=800, plot_width=800)
p1.circle('x', 'y', source=plot_data, radius=2, alpha=0.75)
p1.add_tools(hover)

show(p1)

Select the trajectories whose first principal component (the x-axis of the plots above) is greater than 40

In [None]:
# Rows of `short` whose first principal component exceeds 40, ordered by trajectory id.
# The mask is positional: row k of Xpca belongs to row k of `short`.
pc1_above_40 = Xpca[:, 0] > 40
short.iloc[pc1_above_40].sort_values('trajectory')

Write the list of files into a .txt file

In [None]:
# Paths of the short trajectories whose first principal component exceeds 40.
first_component = Xpca[:, 0]
paths = short.iloc[first_component > 40]['path'].values

# One path per line; an existing paths.txt is overwritten.
with open(os.path.join(data_dir, 'paths.txt'), 'w') as f:
    f.write(''.join(p + '\n' for p in paths))

Get a boolean mask of the trajectories to include after the PCA filtering

In [None]:
# Boolean mask, one entry per row of Xpca: True where the first principal
# component is at most 40 (the complement of the outlier selection above 40).
pca_include = np.less_equal(Xpca[:, 0], 40)

## Plot found trajectories

In [None]:
# Environment folders of the 'sample' data set.
base_dir = os.path.join(data_dir, 'sample')
# os.listdir returns entries in arbitrary order; sort them so that selecting
# an environment by position (e.g. envs[0]) picks the same one on every run.
envs = sorted(os.listdir(base_dir))
envs

In [None]:
# Map every environment name to the paths of the files in its folder.
files = dict()
for f in envs:
    path = os.path.join(base_dir, f)
    # os.listdir order is arbitrary; sorting makes positional slices of these
    # lists (used when picking trajectories to plot) reproducible.
    files[f] = sorted(os.path.join(path, x) for x in os.listdir(path))

count = sum(len(found) for found in files.values())
print('We have found {} files'.format(count))

Plot 10 trajectories, selected by the `environment` variable and the start index `r` in the cell below. Each row in the grid represents one trajectory. The left plot shows the linear control commands and the distance to the goal pose. The right plot depicts the angular control commands and the angle to the goal pose.

In [None]:
def plot_trajectory(csv_path, linear_window=7, angular_window=5):
    """Build the pair of diagnostic figures for one trajectory CSV.

    Parameters
    ----------
    csv_path : str
        Path of the trajectory CSV. It must provide the columns 'linear_x',
        'angular_z', 'target_x', 'target_y' and 'target_yaw'.
    linear_window, angular_window : int
        Window sizes of the centered rolling means used to smooth the linear
        and angular commands.

    Returns
    -------
    list
        ``[left, right]``: the left figure shows the raw and smoothed linear
        command and the norm of (target_x, target_y); the right figure shows
        the raw and smoothed angular command and 'target_yaw'.
    """
    df = pd.read_csv(csv_path)

    # A centered rolling mean is NaN near both ends of the series; those
    # positions fall back to the unfiltered command.
    filtered_linear_x = (df['linear_x']
                         .rolling(window=linear_window, center=True).mean()
                         .fillna(df['linear_x']))
    filtered_angular_z = (df['angular_z']
                          .rolling(window=angular_window, center=True).mean()
                          .fillna(df['angular_z']))

    distance = np.linalg.norm(df[['target_x', 'target_y']].values, axis=1)

    left = figure(title=csv_path, plot_width=450, plot_height=300)
    left.line(df.index, df['linear_x'], line_color=Spectral6[0])
    left.line(df.index, filtered_linear_x, line_color=Spectral6[5])
    left.line(df.index, distance, line_color=Spectral6[1])

    right = figure(plot_width=450, plot_height=300)
    right.line(df.index, df['angular_z'], line_color=Spectral6[0])
    right.line(df.index, filtered_angular_z, line_color=Spectral6[5])
    right.line(df.index, df['target_yaw'], line_color=Spectral6[1])

    return [left, right]


environment = envs[0] # Select the source environment
r = 0 # Select the starting index for the plotted trajectories

print('{}: {}'.format(environment, len(files[environment])))

# One grid row per trajectory, at most 10 starting at index r.
plots = [plot_trajectory(t) for t in files[environment][r:r+10]]

show(gridplot(plots))

## Generate final list of filtered trajectories

We will drop 421 trajectories for various reasons, resulting in a set of 11602 trajectories

In [None]:
# Rebuild the set of short trajectories: no NaN values, more than 10 and
# fewer than 1000 samples, ordered by sample count.
complete_rows = stats.dropna()
sample_counts = complete_rows['samples']
within_limits = (sample_counts > 10) & (sample_counts < 1000)
filtered_stats = complete_rows.loc[within_limits].sort_values('samples')

# pca_include is a positional mask computed on the PCA projection of `short`;
# it is only meaningful because the frame above is built with the same filter
# and the same sort as `short`.
filtered_stats = filtered_stats.iloc[pca_include]

In [None]:
# (rows, columns) of the final table of valid trajectories.
n_rows, n_columns = filtered_stats.shape
(n_rows, n_columns)

Write the list of valid trajectories into a .txt file

In [None]:
# One valid trajectory path per line.
valid_list = os.path.join(data_dir, 'valid_trajectories.txt')
with open(valid_list, 'w') as f:
    f.writelines(t + '\n' for t in filtered_stats['path'].values)

### Get a few stats about valid and invalid trajectories

In [None]:
# Every trajectory whose index label is not part of the valid set counts as excluded.
is_valid = stats.index.isin(filtered_stats.index)
excluded_stats = stats.loc[~is_valid]

print('Valid trajectories: {}'.format(len(filtered_stats)))
print('Excluded trajectories: {}'.format(len(excluded_stats)))

In [None]:
# Total number of samples on each side of the valid / excluded split.
valid_samples = filtered_stats['samples'].sum()
excluded_samples = excluded_stats['samples'].sum()
print('Valid samples: {}'.format(valid_samples))
print('Excluded samples: {}'.format(excluded_samples))

## Analyze the evaluation data

Load the trajectories which belong to the evaluation set and print a few stats about them

In [None]:
# Environment folders of the 'evaluation' data set.
base_dir = os.path.join(data_dir, 'evaluation')
# os.listdir returns entries in arbitrary order; sort them so the
# per-environment report below is printed in the same order on every run.
envs = sorted(os.listdir(base_dir))
envs

In [None]:
# Map every evaluation environment to the paths of the files in its folder.
files = {
    env_name: [os.path.join(base_dir, env_name, x)
               for x in os.listdir(os.path.join(base_dir, env_name))]
    for env_name in envs
}

count = sum(len(found) for found in files.values())
print('We have found {} files'.format(count))

In [None]:
# Report the number of trajectories and samples (CSV rows) per environment,
# followed by the overall sample count.
total = 0
for environment in envs:
    trajectory_files = files[environment]
    print('{}: {} trajectories'.format(environment, len(trajectory_files)))

    # Every file is read in full just to count its rows.
    samples = sum(pd.read_csv(t).shape[0] for t in trajectory_files)

    print('{}: {} samples'.format(environment, samples))
    total += samples

print('{}: {} samples'.format('Total', total))