### Inspect data
#### List of tasks accomplished in this Jupyter Notebook:
- Remove dead larvae and experiments that were accidentally begun outside the larval daylight cycle (too early or too late in the day).
- Convert experiment start time to number of minutes elapsed since light cycle ON time
- Save cleaned dataset in a new `CSV` file. 
- Check that the number of fed and starved animals in each experiment adds up to the total number of animals. 
- Output the number of animals in each experiment (total) and the number of fed and starved animals for each experiment as a plain-text summary saved to `n_values.csv`
- Check that each larva has exactly one folder and one video HDF5 file for both its `-acclimate` and its `-experiment` phase

In [None]:
import numpy as np
import pandas as pd
import glob, os

- Remove dead larvae and experiments that were accidentally begun outside the larval daylight cycle (too early or too late in the day).
- Convert experiment start time to number of minutes elapsed since light cycle ON time
- Save cleaned dataset in a new `CSV` file. 

In [None]:
# Clean the raw experiment table:
#   1. drop dead larvae,
#   2. drop experiments whose acclimation started outside the daylight window,
#   3. express the experiment start as minutes elapsed since lights-on,
#   4. save the result as a new CSV.

# Hour of day at which the larval light cycle switches ON.
LIGHTS_ON_HOUR = 9
# Arbitrary calendar date used only to turn "HH:MM" strings into timestamps.
DUMMY_DATE = "2018-01-01"

df = pd.read_csv('./data/experiment_IDs/static_data_naive.csv')
print(len(df), "total larvae experiments")

no_dead = df[df['dead'] == 'no'].copy()
print(len(no_dead), "larvae after removing dead larvae")

# Parse the acclimation start once and keep its hour/minute as helper columns
acclimate_start = pd.DatetimeIndex(no_dead['acclimate_start'])
no_dead["start_hour"] = acclimate_start.hour
no_dead["start_min"] = acclimate_start.minute

# Remove experiments whose acclimation started before lights-on (09:00)
times = no_dead[no_dead["start_hour"] >= LIGHTS_ON_HOUR]

# Remove experiments whose acclimation started too late: keep any start up to
# 20:59, plus starts between 21:00 and 21:45 inclusive.
# .copy() so the new column below is added to an independent frame rather
# than to a view of `no_dead` (avoids SettingWithCopyWarning).
times = times[(times["start_hour"] <= LIGHTS_ON_HOUR+11) | \
              ((times["start_hour"] <= LIGHTS_ON_HOUR+12) & (times["start_min"] <= 45))].copy()
print(len(times), "larvae after removing experiments not during daylight")

# Minutes elapsed between lights-on and the experiment start.
# The subtraction yields a timedelta Series; floor-dividing by one minute
# converts it to whole minutes directly (a timedelta must not be pushed
# through DatetimeIndex to read .hour/.minute).
experiment_start = pd.to_datetime(DUMMY_DATE+times["experiment_start"], format="%Y-%m-%d%H:%M")
lights_on = pd.to_datetime(DUMMY_DATE) + pd.Timedelta(hours=LIGHTS_ON_HOUR)
times["minutes_past_L"] = (experiment_start - lights_on) // pd.Timedelta(minutes=1)

times.to_csv("./data/experiment_IDs/cleaned_static_data.csv", index=False)
print("--- Data cleaned and saved to file ---")

- Check that the number of fed and starved animals in each experiment adds up to the total number of animals. 
- Output the number of animals in each experiment (total) and the number of fed and starved animals for each experiment as a plain-text summary saved to `n_values.csv`

In [None]:
# Summarise how many larvae (total / starved / fed) are in the cleaned
# dataset, overall and per treatment odor, and save the summary lines to file.
df = pd.read_csv("./data/experiment_IDs/cleaned_static_data.csv")

# Boolean masks for the two feeding states recorded in the "starved" column
starved_mask = df["starved"] == '1day'
fed_mask = df["starved"] == 'no'
num_s = int(starved_mask.sum())
num_f = int(fed_mask.sum())

textfile = [
    str(len(df))+" Total larvae in entire dataset",
    str(num_s)+" Starved larvae in entire dataset",
    str(num_f)+" Fed larvae in entire dataset",
    '---',
]

# Every larva must be labelled either starved or fed
assert len(df) == num_s + num_f

for odor in df["treatment_odor"].unique():
    subset = df[df["treatment_odor"] == odor]
    n_total = len(subset)
    n_starved = int((subset["starved"] == '1day').sum())
    n_fed = int((subset["starved"] == 'no').sum())

    # Within each experiment, fed + starved must also account for every larva
    assert n_total == n_fed + n_starved

    textfile.extend([
        odor.upper(),
        str(n_total)+" Total larvae",
        str(n_starved)+" Starved larvae",
        str(n_fed)+" Fed larvae",
        '---',
    ])

# One summary line per row, written without header or index
text_df = pd.DataFrame(textfile)
text_df.to_csv("./data/experiment_IDs/n_values.csv", header=None, index=None)
print("--- Data summary values saved to file ---")

- Check that each larva has exactly one folder and one video HDF5 file for both its `-acclimate` and its `-experiment` phase

In [None]:
# Verify that every animal in the cleaned dataset has exactly one folder and
# exactly one HDF5 file for each of its "-acclimate" and "-experiment" phases.

REVIEWED_DIR = "./data/trajectories/analysis_files_reviewed/"


def _animal_id_from_path(path, suffix):
    """Return the animal ID embedded in a reviewed-analysis path.

    The ID is the text between the "reviewed" directory separator and
    `suffix` (e.g. "-acclimate"). Both "/" and "\\" separators are handled
    because glob returns OS-native separators.
    """
    return path.split("reviewed/")[-1].split("reviewed\\")[-1].split(suffix)[0]


def _assert_one_per_animal(expected_ids, found_ids, what):
    """Assert that `found_ids` holds each expected animal ID exactly once.

    On failure the AssertionError message lists which IDs are missing,
    unexpected, or duplicated, so the offending data can be located.
    """
    from collections import Counter

    expected = sorted(expected_ids)
    found = sorted(found_ids)
    missing = sorted(set(expected) - set(found))
    unexpected = sorted(set(found) - set(expected))
    duplicated = sorted(k for k, v in Counter(found).items() if v > 1)
    assert expected == found, (
        "Mismatch for {}: missing={}, unexpected={}, duplicated={}".format(
            what, missing, unexpected, duplicated))


# READ IN CLEANED DATASET
df = pd.read_csv("./data/experiment_IDs/cleaned_static_data.csv")
animals = df["animal_ID"].values

# Create folder names for each animal ID to use in checking
df["acc_filenames"] = REVIEWED_DIR+df["animal_ID"]+"-acclimate"
df["exp_filenames"] = REVIEWED_DIR+df["animal_ID"]+"-experiment"

acc_fnames = df["acc_filenames"].values
exp_fnames = df["exp_filenames"].values

# Check that every animal ID is unique
duplicate_ids = sorted(df.loc[df["animal_ID"].duplicated(keep=False), "animal_ID"].astype(str).unique())
assert len(set(acc_fnames)) == len(acc_fnames), \
    "Duplicate animal IDs in cleaned dataset: {}".format(duplicate_ids)
assert len(set(exp_fnames)) == len(exp_fnames), \
    "Duplicate animal IDs in cleaned dataset: {}".format(duplicate_ids)

# (glob pattern relative to REVIEWED_DIR, phase suffix, description), checked
# in order: folders for both phases first, then the HDF5 files inside them.
checks = [
    ("*-acclimate", "-acclimate", "acclimate folders"),
    ("*-experiment", "-experiment", "experiment folders"),
    ("*-acclimate/data/*.hdf5", "-acclimate", "acclimate HDF5 files"),
    ("*-experiment/data/*.hdf5", "-experiment", "experiment HDF5 files"),
]

for pattern, suffix, what in checks:
    # Convert matched paths to animal IDs for comparison
    found_ids = [_animal_id_from_path(p, suffix) for p in glob.glob(REVIEWED_DIR+pattern)]
    # Check that each animal has 1 and only 1 match
    _assert_one_per_animal(animals, found_ids, what)

print("--- All checks passed ---")

In [None]:
# CHECK THAT MANUALLY ANNOTATED FILES ALL EXIST AND ARE SPELLED CORRECTLY
# Each entry of the "filename" column must correspond to a CSV in video_csvs/.
df = pd.read_csv("./data/trajectories/manually_checked_beginning_pause.csv")

missing = []
for name in df["filename"].values:
    fname = "./data/trajectories/video_csvs/"+name+".csv"
    if not os.path.isfile(fname):
        # Print each missing path so misspellings can be fixed in the CSV
        missing.append(fname)
        print(fname)

# Only report success when nothing was missing
if missing:
    print("--- {} manually annotated file(s) NOT found (listed above) ---".format(len(missing)))
else:
    print("--- All checks passed ---")