# Analyze the Toss Data from the HW assignment

We will load all the data we can and scan it, one toss type at a time, to see whether we can spot any anomalous data.

In [None]:
import pandas as pd
import os
import matplotlib.pyplot as plt
from math import sqrt
import seaborn as sns

from pathlib import Path

## Load the data

We need a data loader!

In [None]:
# Base directory; fetch_data() looks for data files in sub-directories of it
directory = Path('./data/ClassData')

def fetch_data(sub_dir_name: str) -> pd.DataFrame:
    """Load every ``.txt`` file under ``directory / sub_dir_name`` into one DataFrame.

    Each file is parsed with ``pd.read_csv`` and tagged with three extra columns:

    * ``file_name`` - the ``Path`` of the file the row was read from
    * ``file_index`` - 0-based position of that file in the sorted file list
    * ``measurement_index`` - 1-based row number within that file

    Args:
        sub_dir_name: Sub-directory of ``directory`` to search recursively.

    Returns:
        The rows of all files concatenated, with a fresh 0..N-1 index.

    Raises:
        ValueError: If no ``.txt`` file is found under the directory.
    """
    f_dir = directory / sub_dir_name

    # glob() yields paths in arbitrary, filesystem-dependent order. Sorting makes
    # file_index refer to the same file on every run and on every machine.
    data_files = sorted(f_dir.glob('**/*.txt'))
    if not data_files:
        # Without this guard pd.concat([]) fails with the much less helpful
        # "No objects to concatenate".
        raise ValueError(f'No .txt files found under {f_dir}')

    dfs = []
    for index, file in enumerate(data_files):
        df = pd.read_csv(file)
        # Remember where each row came from and its position within the file
        df['file_name'] = file
        df['file_index'] = index
        df['measurement_index'] = range(1, len(df) + 1)
        dfs.append(df)

    # Single concat at the end; ignore_index gives one continuous row index
    return pd.concat(dfs, ignore_index=True)

In [None]:
# One DataFrame per data set; each is read from its own sub-directory
df_holding = fetch_data(sub_dir_name='held')
df_horizontal = fetch_data(sub_dir_name='horizontal')
df_up = fetch_data(sub_dir_name='up')

## A few things about the data we've loaded.

What are the minimum and maximum numbers of measurements per file in each data set?

In [None]:
def dump_stats(df: pd.DataFrame, name: str):
    """Print per-file measurement counts and plot two diagnostic histograms.

    The histograms show the time difference between consecutive measurements
    and the magnitude of the acceleration vector.

    Side effects: adds (or overwrites) the columns ``time_diff`` and ``a`` on
    ``df`` in place. The in-place update is kept on purpose: ``per_trial``
    plots the ``a`` column.

    Args:
        df: Frame with the columns ``file_index``, ``Time``, ``ax``, ``ay``
            and ``az``.
        name: Label used in the printed summary and in the plot titles.
    """
    # Number of measurements
    by_file = df.groupby('file_index')
    counts = by_file.size()
    print(f'{name}: Min # of measurements: {counts.min()}, max: {counts.max()}')

    # Jitter: difference between consecutive time stamps. Computed per file so
    # the first row of every file is NaN rather than a meaningless difference
    # against the last row of the previous file.
    df['time_diff'] = by_file['Time'].diff()

    # Plot the histogram. The comparison drops the NaN rows, and the -0.5 cut
    # is retained so any large backwards jump in time stays out of the plot.
    df[df.time_diff > -0.5].time_diff.plot(kind='hist', bins=100)
    plt.xlabel('Time Difference')
    plt.ylabel('Frequency')
    plt.title(f'Histogram of Time Differences for {name}')
    plt.show()

    # Acceleration Profile: vector magnitude, computed with vectorized
    # arithmetic instead of a per-row Python call.
    df['a'] = (df['ax']**2 + df['ay']**2 + df['az']**2) ** 0.5
    df['a'].plot(kind='hist', bins=100)
    plt.xlabel('Total Acceleration')
    plt.ylabel('Frequency')
    plt.title(f'Histogram of Total Acceleration for {name}')
    plt.show()

In [None]:
# Measurement counts plus timing and acceleration histograms for the holding data
dump_stats(df=df_holding, name='Holding')

In [None]:
# Measurement counts plus timing and acceleration histograms for the horizontal data
dump_stats(df=df_horizontal, name='Horizontal')

In [None]:
# Measurement counts plus timing and acceleration histograms for the up data
dump_stats(df=df_up, name='Up')

Last time we looked at that bump that was out beyond 0.16.

# Dump of the "holding" data

In [None]:
def per_trial(df: pd.DataFrame, name: str):
    """Plot total acceleration against measurement index, one row per file.

    Args:
        df: Frame with the columns ``file_index`` and ``measurement_index``,
            plus either a precomputed ``a`` column or the components ``ax``,
            ``ay`` and ``az`` from which it can be derived.
        name: Label shown in the figure title.
    """
    # Do not rely on another cell having added the 'a' column already. Derive
    # it on a copy when it is missing, leaving the caller's frame untouched.
    if 'a' not in df.columns:
        df = df.assign(a=(df['ax']**2 + df['ay']**2 + df['az']**2) ** 0.5)

    # Create a FacetGrid with file_index as the row variable
    g = sns.FacetGrid(data=df, row='file_index', sharey=True, aspect=4, height=2)

    # Plot line plots for each file_index
    g.map(sns.lineplot, 'measurement_index', 'a')

    # Add a red reference line at 9.8
    # NOTE(review): presumably g in m/s^2 - confirm the units of the data
    g.map(plt.axhline, y=9.8, color='red')

    # Label the whole figure so it can be identified when skimming the notebook
    g.figure.suptitle(f'Total Acceleration per Trial for {name}')

    # Adjust the layout of the plots
    g.figure.tight_layout()

    # Show the plots
    plt.show()

# One acceleration trace per file for the holding data
per_trial(df=df_holding, name='Holding')

In [None]:
# One acceleration trace per file for the horizontal data
per_trial(df=df_horizontal, name='Horizontal')

In [None]:
# One acceleration trace per file for the up data
per_trial(df=df_up, name='Up')