In [257]:
# Initialize Otter
import otter
grader = otter.Notebook("Lab_2_functions.ipynb")

# Lab 2: Statistical analysis of data using numpy

Lab slides: (https://docs.google.com/presentation/d/1ykwwcQ0onMvAjUxfJmKl9tbo-rJPdB5pRwDEmpDsd-g/edit?usp=sharing)

For this lab the goal is to write functions to pull out one data channel and print out statistics for different types of hand motions. We will write a function to load data from a single file, a second to combine data from multiple files, a third to pull out the data for a selected data channel, then a fourth to calculate statistics.

Written properly, you only need one function to do stats for the entire data channel or for just one type of hand motion. For any data channel. In the homework you'll use this logic to apply statistics to a specific data channel.

In [258]:
# Libraries that we need to import - numpy and json (for loading the description file)
import numpy as np
import json as json
# os is needed for calling os.path.basename
import os

### Reading in data

Based on your code from Lab 1, write a function `get_data` that loads the data from a single CSV and adds a column at the end containing the hand motion ID. Then, write a function `get_data_from_files` that loads data from a list of CSV files into a single numpy array using `get_data`.

Both functions will operate on a dictionary containing `"csv_path"` which holds the path to the data file and `"motion_id"` which holds the ID of the motion contained within the CSV file. You will write a third function `get_file_info` that returns this dictionary for a given CSV file, which will extract the motion type from the file name.

Note: The functions initially just contain `pass`, which is just a placeholder that you should remove. In Python, `pass` does nothing at all by design, but removing it and having an empty function is illegal ([docs](https://docs.python.org/3/tutorial/controlflow.html#pass-statements)).

In [259]:
# Numeric ids to indicate hand motion type from Lab 1
clap_id = 1
high_five_id = 2
snap_id = 3

In [260]:
def get_data(file_info):
    """ Function that returns the data from the given CSV file.
    @param file_info - a dictionary with keys "csv_path" and "motion_id"
    @return Return array should contain data in file with an extra column at the end containing the motion_id."""


    file_data = np.loadtxt(file_info["csv_path"], dtype="float", delimiter=",")

    motion_id = file_info["motion_id"]
    file_data_motion = np.zeros((file_data.shape[0], file_data.shape[1]+1))
    file_data_motion[:, 0:-1] = file_data
    file_data_motion[:, -1] = motion_id

    return file_data_motion

    pass


In [261]:
def get_data_from_files(file_list):
    """ Function that returns data from a list of files.
    @param file_list - a list of dictionaries, where each dictionary contains `csv_path` and `motion_id`.
    @return A single return array containing the data from all of the given input files."""

    file_list_list = []

    for item in file_list:

        data = get_data(item)
        file_list_list.append(data)

    return np.concatenate(file_list_list)


    # Hint: Use np.concatenate to combine multiple numpy arrays.
    pass

In [262]:
# SCRATCH cell
fname_chop_up = "Data/S01C01.csv"
res = os.path.basename(fname_chop_up)

# TODO.. look at res. How would you get out the C F or S character?
res_letter = res[3]
print(res_letter)

C


In [263]:
# Hint: Use `os.path.basename` to get the filename of the CSV file (e.g., S10F01.csv),
# extract the C/F/S character which will appear at a fixed offset in the filename,
# and then return the right motion ID based on that character. Remember that C is a
# clap, F is a high five, and S is a snap.
def get_file_info(csv_path):
    """Function that returns a file_info dictionary for a given filepath.
    @param csv_path - path to a CSV file containing a hand motion
    @return A dictionary with key "csv_path", containing csv_path, and "motion_id", containing the type of motion encoded in the file."""

    # get file name from path and find 4th letter
    filename = os.path.basename(csv_path)
    data_letter = filename[3]

    # determine motion_id based on 4th letter
    if data_letter == "C":
        motion_id = clap_id
    elif data_letter == "F":
        motion_id = high_five_id
    elif data_letter == "S":
        motion_id = snap_id

    # create dictionary based on path and motion_id
    data_info = {"csv_path":csv_path, "motion_id":motion_id}

    return data_info

    # data_letter = filename

    # The base file path is of the form S##[C|F|S]##.csv.
    pass

get_file_info("Data/S01C01.csv")

{'csv_path': 'Data/S01C01.csv', 'motion_id': 1}

In [264]:
# Read in data from the files in lab 1 using the functions you just wrote.
all_data = get_data_from_files([
    get_file_info("Data/S01C01.csv"),
    get_file_info("Data/S01F01.csv"),
    get_file_info("Data/S01S01.csv"),
])

In [265]:
grader.check("get_data")

## Doing the slice

Get the data for one of the channels. 

In [266]:
# This reads in the json data
try:
    with open("Data/data_description.json", "r") as fp:
        data_description = json.load(fp)
except FileNotFoundError:
    print(f"The file was not found; check that the data directory is in the current one and the file is in that directory")
        

In [267]:
def get_descriptor(data_description, name):
    """ Returns the descriptor for the given data channel.
    @param name - The name of the data channel to look for. """

    for item in data_description["data_channels"]:
        if item["name"] == name:
            descriptor_dict = item
            break

    return descriptor_dict

    pass

In [268]:
def get_channel_data(all_data, index_offset, n_dims):
    """ Get the data for just one channel (e.g., right hand accelerometer)
    @param all_data - numpy array containing data from one or more files
    @param index_offset - the column to begin getting data from
    @param n_dims - number of dimensions for the data channel
    @return Return array should be number of rows in all_data X n_dims"""

    print(all_data)
    print(index_offset)
    print(n_dims)
    
    channel_data = np.zeros((n_dims, all_data.shape[1]))
    channel_data = all_data[index_offset:n_dims, :]

    return channel_data

    pass

In [269]:
# Test 1 - the right hand accelerometer data
rh_accelerometer_descriptor = get_descriptor(data_description, "Right hand accelerometer")
rh_accelerometer_data = get_channel_data(all_data, index_offset=rh_accelerometer_descriptor["index_offset"], n_dims=rh_accelerometer_descriptor["dimensions"])


[[ 7.7093e+04  7.0000e-01 -1.3000e+00 ...  9.2830e+01 -3.1140e+02
   1.0000e+00]
 [ 7.7098e+04  6.9000e-01 -1.2600e+00 ...  9.4150e+01 -3.2170e+02
   1.0000e+00]
 [ 7.7103e+04  7.1000e-01 -1.1700e+00 ...  9.4790e+01 -3.3240e+02
   1.0000e+00]
 ...
 [ 6.2870e+04  1.0000e-01 -3.4000e-01 ...  4.1810e+01 -1.8564e+02
   3.0000e+00]
 [ 6.2875e+04  1.0000e-01 -3.4000e-01 ...  2.0750e+01 -1.7082e+02
   3.0000e+00]
 [ 6.2880e+04  7.0000e-02 -3.6000e-01 ...  3.0200e+00 -1.5436e+02
   3.0000e+00]]
1
3


In [270]:
# SELF TESTS
print(f"Shape of rhs_accelerometer_data is {rh_accelerometer_data.shape}, should be 285 X 3")
print(f"First row, first column value {rh_accelerometer_data[0, 0]:0.2f}, should be 0.70")
print(f"First row, last column value {rh_accelerometer_data[0, -1]:0.2f}, should be -0.41")
print(f"Last row, first column value {rh_accelerometer_data[-1, 0]:0.2f}, should be 0.07")
print(f"Last row, last column value {rh_accelerometer_data[-1, -1]:0.2f}, should be -0.98")


Shape of rhs_accelerometer_data is (2, 14), should be 285 X 3
First row, first column value 77098.00, should be 0.70
First row, last column value 1.00, should be -0.41
Last row, first column value 77103.00, should be 0.07
Last row, last column value 1.00, should be -0.98


In [271]:
# Tests for Left hand gyroscope
lh_gyroscope_descriptor = get_descriptor(data_description, "Left hand gyroscope")
lh_gyroscope_data = get_channel_data(all_data, index_offset=lh_gyroscope_descriptor["index_offset"], n_dims=lh_gyroscope_descriptor["dimensions"])

[[ 7.7093e+04  7.0000e-01 -1.3000e+00 ...  9.2830e+01 -3.1140e+02
   1.0000e+00]
 [ 7.7098e+04  6.9000e-01 -1.2600e+00 ...  9.4150e+01 -3.2170e+02
   1.0000e+00]
 [ 7.7103e+04  7.1000e-01 -1.1700e+00 ...  9.4790e+01 -3.3240e+02
   1.0000e+00]
 ...
 [ 6.2870e+04  1.0000e-01 -3.4000e-01 ...  4.1810e+01 -1.8564e+02
   3.0000e+00]
 [ 6.2875e+04  1.0000e-01 -3.4000e-01 ...  2.0750e+01 -1.7082e+02
   3.0000e+00]
 [ 6.2880e+04  7.0000e-02 -3.6000e-01 ...  3.0200e+00 -1.5436e+02
   3.0000e+00]]
10
3


In [272]:
# Check size and first, last element
print(lh_gyroscope_data.shape)
print(all_data.shape[0], 3)
print(lh_gyroscope_data[0, 0])
print(lh_gyroscope_data[-1, -1])

assert(lh_gyroscope_data.shape == (all_data.shape[0], 3))
assert(np.isclose(lh_gyroscope_data[0, 0], 429.34))
assert(np.isclose(lh_gyroscope_data[-1, -1], -154.36))

(0, 14)
285 3


IndexError: index 0 is out of bounds for axis 0 with size 0

In [None]:
grader.check("check_slice")

## Compute stats: Write a function to calculate the four stats

This is a variation on what you did in lab 1; in this case, we're going to do it with two functions. The first calculates the stats and returns the dictionary (**calc_stats**) the second does the **for** loop to make one dictionary for each dimension in the data.

- Step 1 [this problem] - do the **calc_stats** function
- Step 2 [next problem] - do the loop to calculate the stats for each x,y,z channel

In [None]:
def calc_stats(data):
    """Calculate min, max, mean and standard deviation for the array and put in a dictionary
    @param data a numpy array
    @return a dictionary"""

    # Use keys Min, Max, Mean, and SD
    pass

In [None]:
# Test the function with known data
test_data = np.linspace(0, 1, 10)
ret_dict = calc_stats(test_data)

assert(np.isclose(ret_dict["Min"], 0.0))
assert(np.isclose(ret_dict["Max"], 1.0))
assert(np.isclose(ret_dict["Mean"], 0.5))
assert(np.isclose(ret_dict["SD"], 0.319, atol=0.01))

In [None]:
grader.check("stats_channel")

### Now do the second half - 

This function calculates the stats for an entire channel of the data, and stores the result in a list of dictionaries

In [None]:
def calc_stats_for_channel(data, n_dims):
    """ Calculate the stats for a channel
    @param data - an n_timestamps * n_dims size array
    @param n_dims - 1, 2, or 3 (just x; x,y; or x,y,z)
    @return A list of dictionaries. The list is the length of n_dims"""

    stats_list = []
    # TODO Copy in your for loop from the statistics problem in Lab 1
    # - You DO need to slice the data into the x,y,z channels
    # - You need to loop n_dims times
    # - Don't forget to return the array
    pass

In [None]:
# SCRATCH CELL
# If you're having trouble, try setting n_dims to 1 and use test_data for the data input

In [None]:
# Testing with known data - make a fake data set with 5 time steps and x, y, z data
#  
test_stats = np.zeros((5, 3))
# Set the x data to be ones
test_stats[:, 0] = np.ones(5)
# Set the y data to be twos
test_stats[:, 1] = np.ones(5) * 2
# Set the z data to be threes
test_stats[:, 2] = np.ones(5) * 3

# Now get the actual stats
ret_stats_array = calc_stats_for_channel(test_stats, n_dims=3)

# Check the mean result for x, y, and z - should be 1, 2, and 3 respectively
assert(ret_stats_array[0]["Mean"] == 1.0)
assert(ret_stats_array[1]["Mean"] == 2.0)
assert(ret_stats_array[2]["Mean"] == 3.0)

In [None]:
# this should work
ret_stats_rh_accelerometer = calc_stats_for_channel(rh_accelerometer_data, 3)

In [None]:
# As should this
res_stats_lh_gyroscope = calc_stats_for_channel(lh_gyroscope_data, 3)

In [None]:
grader.check("loop_data_calc_stats")

## Boolean slicing to get successful versus unsuccessful statistics out

Use the functions you just wrote to get out the min and max z values for each type of hand motion.

For this problem I have written code that is *incorrect*. You know the functions themselves are correct - you just tested them. The following bits of code have something wrong with either the way the function is called OR with the way the results are gotten back.

In [None]:
# Boolean filters for getting rows for a specific motion type. 
# Motion type should be the last row in all_data.
motion_type = all_data[:, 10]

# These should match the specific IDs for each motion type.
b_clap = motion_type == 5
b_snap = motion_type == 6
b_high_five = motion_type == 7

# Use b_clap to pick out the rows that are for claps. Send all column data for the selected rows.
#   Right hand accelerometer has 3 dimensions (x,y,z)
#   There's two errors here - one that actually will create incorrect results, one that just *happens* to work
#   correctly, although it doesn't do what the first sentance says...
ret_rh_accelerometer_clap = calc_stats_for_channel(rh_accelerometer_data[b_clap], n_dims=1)

# The minimum should be in the third (last) element in the list, the "min" key
z_min_clap = ret_rh_accelerometer_clap["Min"]
z_max_clap = ret_rh_accelerometer_clap["Max"]

# Now, do the same thing above, but for snap and high_five
z_min_snap = ...
z_max_snap = ...
z_min_high_five = ...
z_max_high_five = ...

print(f"Clap: Minimum {z_min_clap} and maximum {z_max_clap} value of right hand accelerometer z channel")
print(f"Snap: Minimum {z_min_snap} and maximum {z_max_snap} value of right hand accelerometer z channel")
print(f"High five: Minimum {z_min_high_five} and maximum {z_max_high_five} value of right hand accelerometer z channel")

In [None]:
grader.check("boolean_slicing")

## Optional/Extra credit: print out all of the rows where the minimum z value for a clap motion was reached

See the tutorial on **np.where** (c_tutorial_where.ipynb)

TODO: Use **np.where** to pick out the row that has the minimum z value for a clap motion.

In [None]:
# Use np.where to get out the indices. You can use == OR np.isclose() here; either works. In general, use .isclose for 
#  floating point comparisons.
# Append the row number of any matches to this list
all_rows_with_min = []

# Look at JUST the z values in rh_accelerometer_data
all_indices_from_where = ...

# Pseudo code - see tutorial for exact format
# for all row in all_indices_from_where
#    if this is row is from a clap: 
#       print(f"Row: {r}, Time step: {c}")

In [None]:
grader.check("optional_where")

## Hours and collaborators
Required for every assignment - fill out before you hand-in.

Listing names and websites helps you to document who you worked with and what internet help you received in the case of any plagiarism issues. You should list names of anyone (in class or not) who has substantially helped you with an assignment - or anyone you have *helped*. You do not need to list TAs.

Listing hours helps us track if the assignments are too long.

In [None]:

# List of names (creates a set)
worked_with_names = {"not filled out"}
# List of URLS I25 (creates a set)
websites = {"not filled out"}
# Approximate number of hours, including lab/in-class time
hours = -1.5

In [None]:
grader.check("hours_collaborators")

### To submit

- Do a restart then run all to make sure everything runs ok
- Save the file
- Submit just this .ipynb file through gradescope, Lab 2, functions
- You do NOT need to submit the data files - we will supply those
- Where there are given variable/file names (eg, foo = ...) DON'T change those, or the autograder will fail

If the Gradescope autograder fails, please check here first for common reasons for it to fail
    https://docs.google.com/presentation/d/1tYa5oycUiG4YhXUq5vHvPOpWJ4k_xUPp2rUNIL7Q9RI/edit?usp=sharing

Most likely failure for this assignment is not naming the data directory and files correctly; capitalization matters for the Gradescope grader. 