### Inspect data
#### List of tasks accomplished in this Jupyter Notebook:
- Remove dead larvae and experiments that were accidentally begun outside the larval daylight cycle (too early or too late). 
- Convert experiment start time to number of minutes elapsed since light cycle ON time
- Save cleaned dataset in a new `CSV` file. 
- Check that the number of fed and starved animals in each experiment adds up to the total number of animals. 
- Output the number of animals in each experiment (total) and the number of fed and starved animals for each experiment as a text summary (saved as `n_values.csv`)
- Check that each larva in the cleaned dataset has exactly one acclimation and one experiment trajectory CSV file, video file, and analysis folder

In [1]:
import numpy as np
import pandas as pd
import glob, os

- Remove dead larvae and experiments that were accidentally begun outside the larval daylight cycle (too early or too late). 
- Convert experiment start time to number of minutes elapsed since light cycle ON time
- Save cleaned dataset in a new `CSV` file. 

In [2]:
# Clean the raw experiment table:
#   1. drop larvae recorded as dead,
#   2. drop experiments whose acclimation began outside the allowed daylight window,
#   3. express the experiment start as whole minutes elapsed since lights-ON,
#   4. save the result for the downstream cells.

LIGHTS_ON_HOUR = 9                          # light cycle switches ON at 09:00
LATEST_START_MINUTE = (9 + 12) * 60 + 45    # acclimation may begin no later than 21:45

df = pd.read_csv('./data/experiment_IDs/static_data_naive.csv')
print(len(df), "total larvae experiments")

no_dead = df[df['dead'] == 'no'].copy()
print(len(no_dead), "larvae after removing dead larvae")

# Parse the acclimation clock time once and keep hour / minute as columns
# (both columns are written to the cleaned CSV).
acclimate_clock = pd.DatetimeIndex(no_dead['acclimate_start'])
no_dead["start_hour"] = acclimate_clock.hour
no_dead["start_min"] = acclimate_clock.minute

# Keep experiments that began between 09:00 and 21:45 inclusive. Comparing
# minutes since midnight is equivalent to the hour / minute comparison
# (hour >= 9) and (hour <= 20 or (hour <= 21 and minute <= 45)).
start_minute_of_day = no_dead["start_hour"] * 60 + no_dead["start_min"]
in_daylight = (start_minute_of_day >= LIGHTS_ON_HOUR * 60) & \
              (start_minute_of_day <= LATEST_START_MINUTE)

# .copy() so that adding a column below does not write into a view of no_dead.
times = no_dead[in_daylight].copy()
print(len(times), "larvae after removing experiments not during daylight")

# Attach an arbitrary dummy date to the HH:MM start time so it can be
# subtracted from the lights-ON timestamp on that same date.
experiment_clock = pd.to_datetime("2018-01-01" + times["experiment_start"],
                                  format="%Y-%m-%d%H:%M")
lights_on = pd.Timestamp(2018, 1, 1, LIGHTS_ON_HOUR)

# Floor-divide the timedelta by one minute to get whole minutes directly,
# instead of re-interpreting the timedelta as a datetime.
times["minutes_past_L"] = (experiment_clock - lights_on) // pd.Timedelta(minutes=1)

times.to_csv("./data/experiment_IDs/cleaned_static_data.csv", index=False)
print("--- Data cleaned and saved to file ---")

540 total larvae experiments
503 larvae after removing dead larvae
499 larvae after removing experiments not during daylight
--- Data cleaned and saved to file ---


- Check that the number of fed and starved animals in each experiment adds up to the total number of animals. 
- Output the number of animals in each experiment (total) and the number of fed and starved animals for each experiment as a text summary (saved as `n_values.csv`)

In [3]:
# Load the cleaned dataset and write a plain-text summary of group sizes:
# overall totals first, then one section per treatment odor.
df = pd.read_csv("./data/experiment_IDs/cleaned_static_data.csv")

# Boolean masks for the two feeding states recorded in the "starved" column.
is_starved = df["starved"] == '1day'
is_fed = df["starved"] == 'no'
n_starved_all = int(is_starved.sum())
n_fed_all = int(is_fed.sum())

# Every larva must be either fed or starved.
assert len(df) == n_starved_all + n_fed_all

summary_lines = [
    "{} Total larvae in entire dataset".format(len(df)),
    "{} Starved larvae in entire dataset".format(n_starved_all),
    "{} Fed larvae in entire dataset".format(n_fed_all),
    '---',
]

# One section per treatment odor, in order of first appearance in the table.
for odor in df["treatment_odor"].unique():
    in_odor = df["treatment_odor"] == odor
    n_total = int(in_odor.sum())
    n_starved = int((in_odor & is_starved).sum())
    n_fed = int((in_odor & is_fed).sum())

    # Fed + starved must account for every larva in this experiment.
    assert n_total == n_fed + n_starved

    summary_lines.extend([
        odor.upper(),
        "{} Total larvae".format(n_total),
        "{} Starved larvae".format(n_starved),
        "{} Fed larvae".format(n_fed),
        '---',
    ])

# A single-column frame written without header or index yields one summary line per row.
pd.DataFrame(summary_lines).to_csv("./data/experiment_IDs/n_values.csv", header=None, index=None)
print("--- Data summary values saved to file ---")

--- Data summary values saved to file ---


- Check that each larva in the cleaned dataset has exactly one acclimation and one experiment trajectory CSV file, and that every manually annotated file exists

In [4]:
# READ IN CLEANED DATASET
df = pd.read_csv("./data/experiment_IDs/cleaned_static_data.csv")
animals = df["animal_ID"].values

# Build the expected trajectory file name for each animal ID
df["acc_filenames"] = "./data/trajectories/video_csvs/"+df["animal_ID"]+"-acclimate.csv"
df["exp_filenames"] = "./data/trajectories/video_csvs/"+df["animal_ID"]+"-experiment.csv"

acc_fnames = df["acc_filenames"].values
exp_fnames = df["exp_filenames"].values

# Check that every animal ID (and therefore every expected file name) is unique
assert len(set(acc_fnames)) == len(acc_fnames), "duplicate animal IDs in cleaned dataset"
assert len(set(exp_fnames)) == len(exp_fnames), "duplicate animal IDs in cleaned dataset"

# Check that each animal has 1 and only 1 file associated with the animal:
# the files on disk must match the expected names exactly, with no extras
# and nothing missing.
accs = glob.glob('./data/trajectories/video_csvs/*-acclimate.csv')
exps = glob.glob('./data/trajectories/video_csvs/*-experiment.csv')

assert sorted(acc_fnames) == sorted(accs), \
    "acclimate CSVs differ from cleaned dataset: " + \
    str(sorted(set(acc_fnames).symmetric_difference(accs)))
assert sorted(exp_fnames) == sorted(exps), \
    "experiment CSVs differ from cleaned dataset: " + \
    str(sorted(set(exp_fnames).symmetric_difference(exps)))

# CHECK THAT MANUALLY ANNOTATED FILES ALL EXIST AND ARE SPELLED CORRECTLY
manual_df = pd.read_csv("./data/trajectories/manually_checked_beginning_pause.csv")

missing_manual = []
for name in manual_df["filename"].values:
    fname = "./data/trajectories/video_csvs/"+name+".csv"
    if not os.path.isfile(fname):
        print(fname)
        missing_manual.append(fname)

# Fail like the checks above instead of reporting success when files are missing.
assert not missing_manual, \
    str(len(missing_manual)) + " manually annotated file(s) not found (listed above)"

print("--- All checks passed ---")

--- All checks passed ---


In [5]:
# READ IN CLEANED DATASET AND TEST THAT ALL FILES EXIST FOR DATA DRYAD UPLOAD

# NOTE: machine-specific location of the raw videos; edit before running elsewhere.
VIDEO_DIR = "/home/eleanor/Downloads/videos/"


def _report_video_mismatches(year_label, acc_expected, exp_expected, acc_pattern, exp_pattern):
    """Print videos that match no cleaned larva, and cleaned larvae that have no video.

    Parameters
    ----------
    year_label : str
        Year shown in the printed messages, e.g. '2018'.
    acc_expected, exp_expected : list of str
        Acclimate / experiment video paths expected from the cleaned dataset.
    acc_pattern, exp_pattern : str
        Glob patterns selecting the acclimate / experiment videos on disk.

    For each direction the counts are printed first, followed by the sorted
    acclimate paths and then the sorted experiment paths.
    """
    acc_found = glob.glob(acc_pattern)
    exp_found = glob.glob(exp_pattern)

    # Sets give constant-time membership tests; the lists keep the counts
    # computed the same way as a plain "not in" filter.
    acc_expected_set, exp_expected_set = set(acc_expected), set(exp_expected)
    acc_found_set, exp_found_set = set(acc_found), set(exp_found)

    # Files on disk that do not belong to any cleaned larva
    # (e.g. dead larvae or those pupated before the end of the experiment).
    acc_extra = [path for path in acc_found if path not in acc_expected_set]
    exp_extra = [path for path in exp_found if path not in exp_expected_set]
    print(len(acc_extra), 'acclimate and', len(exp_extra),
          'experiment videos included that are not valid larvae (' + year_label + ').')
    for path in sorted(acc_extra) + sorted(exp_extra):
        print(path)

    # Cleaned larvae whose video is absent from disk.
    acc_missing = [path for path in acc_expected if path not in acc_found_set]
    exp_missing = [path for path in exp_expected if path not in exp_found_set]
    print(len(acc_missing), 'acclimate and', len(exp_missing),
          'missing videos (' + year_label + ').')
    for path in sorted(acc_missing) + sorted(exp_missing):
        print(path)
    print('---')


df = pd.read_csv("./data/experiment_IDs/cleaned_static_data.csv")
print(len(df), 'cleaned animal IDs to analyze\n---')

# Animal IDs start with the two-digit year; each year is checked separately
# because 2019 videos follow a different naming scheme.
for year_prefix in ('18', '17', '19'):
    ids = df.loc[df['animal_ID'].str.startswith(year_prefix), 'animal_ID']

    if year_prefix == '19':
        # 2019 names are rebuilt from ID characters 0-5, 7-8 and 10 onward,
        # with an "A" (acclimate) or "E" (experiment) marker inserted.
        stem = VIDEO_DIR + ids.str[0:6] + '-' + ids.str[7:9] + '-'
        acc_expected = (stem + 'A' + '-' + ids.str[10:] + '.avi').tolist()
        exp_expected = (stem + 'E' + '-' + ids.str[10:] + '.avi').tolist()
        acc_pattern = VIDEO_DIR + '19*A*.avi'
        exp_pattern = VIDEO_DIR + '19*E*.avi'
    else:
        acc_expected = (VIDEO_DIR + ids + '-acclimate.avi').tolist()
        exp_expected = (VIDEO_DIR + ids + '-experiment.avi').tolist()
        acc_pattern = VIDEO_DIR + year_prefix + '*-acclimate.avi'
        exp_pattern = VIDEO_DIR + year_prefix + '*-experiment.avi'

    _report_video_mismatches('20' + year_prefix, acc_expected, exp_expected,
                             acc_pattern, exp_pattern)

499 cleaned animal IDs to analyze
---
0 acclimate and 0 experiment videos included that are not valid larvae (2018).
0 acclimate and 0 missing videos (2018).
---
0 acclimate and 0 experiment videos included that are not valid larvae (2017).
0 acclimate and 0 missing videos (2017).
---
0 acclimate and 0 experiment videos included that are not valid larvae (2019).
0 acclimate and 0 missing videos (2019).
---


In [6]:
# READ IN CLEANED DATASET AND TEST THAT ALL FILES EXIST FOR DATA DRYAD UPLOAD

# NOTE: machine-specific location of the reviewed analysis files; edit before running elsewhere.
ANALYSIS_DIR = "/home/eleanor/Downloads/analysis_files_reviewed/"


def _report_analysis_file_mismatches(year_label, acc_expected, exp_expected, acc_pattern, exp_pattern):
    """Print analysis entries that match no cleaned larva, and cleaned larvae with no entry.

    Parameters
    ----------
    year_label : str
        Year shown in the printed messages, e.g. '2018'.
    acc_expected, exp_expected : list of str
        Acclimate / experiment paths expected from the cleaned dataset.
    acc_pattern, exp_pattern : str
        Glob patterns selecting the acclimate / experiment entries on disk.

    For each direction the counts are printed first, followed by the sorted
    acclimate paths and then the sorted experiment paths.
    """
    acc_found = glob.glob(acc_pattern)
    exp_found = glob.glob(exp_pattern)

    # Sets give constant-time membership tests; the lists keep the counts
    # computed the same way as a plain "not in" filter.
    acc_expected_set, exp_expected_set = set(acc_expected), set(exp_expected)
    acc_found_set, exp_found_set = set(acc_found), set(exp_found)

    # Entries on disk that do not belong to any cleaned larva
    # (e.g. dead larvae or those pupated before the end of the experiment).
    acc_extra = [path for path in acc_found if path not in acc_expected_set]
    exp_extra = [path for path in exp_found if path not in exp_expected_set]
    print(len(acc_extra), 'acclimate and', len(exp_extra),
          'experiment files included that are not valid larvae (' + year_label + ').')
    for path in sorted(acc_extra) + sorted(exp_extra):
        print(path)

    # Cleaned larvae whose analysis entry is absent from disk.
    acc_missing = [path for path in acc_expected if path not in acc_found_set]
    exp_missing = [path for path in exp_expected if path not in exp_found_set]
    print(len(acc_missing), 'acclimate and', len(exp_missing),
          'missing files (' + year_label + ').')
    for path in sorted(acc_missing) + sorted(exp_missing):
        print(path)
    print('---')


df = pd.read_csv("./data/experiment_IDs/cleaned_static_data.csv")
print(len(df), 'cleaned animal IDs to analyze\n---')

# Animal IDs start with the two-digit year; each year is checked separately
# because 2019 entries follow a different naming scheme.
for year_prefix in ('18', '17', '19'):
    ids = df.loc[df['animal_ID'].str.startswith(year_prefix), 'animal_ID']

    if year_prefix == '19':
        # 2019 names are rebuilt from ID characters 0-5, 7-8 and 10 onward,
        # with an "A" (acclimate) or "E" (experiment) marker inserted.
        stem = ANALYSIS_DIR + ids.str[0:6] + '-' + ids.str[7:9] + '-'
        acc_expected = (stem + 'A' + '-' + ids.str[10:]).tolist()
        exp_expected = (stem + 'E' + '-' + ids.str[10:]).tolist()
        acc_pattern = ANALYSIS_DIR + '19*A*'
        exp_pattern = ANALYSIS_DIR + '19*E*'
    else:
        acc_expected = (ANALYSIS_DIR + ids + '-acclimate').tolist()
        exp_expected = (ANALYSIS_DIR + ids + '-experiment').tolist()
        acc_pattern = ANALYSIS_DIR + year_prefix + '*-acclimate'
        exp_pattern = ANALYSIS_DIR + year_prefix + '*-experiment'

    _report_analysis_file_mismatches('20' + year_prefix, acc_expected, exp_expected,
                                     acc_pattern, exp_pattern)

499 cleaned animal IDs to analyze
---
0 acclimate and 0 experiment files included that are not valid larvae (2018).
0 acclimate and 0 missing files (2018).
---
0 acclimate and 0 experiment files included that are not valid larvae (2017).
0 acclimate and 0 missing files (2017).
---
0 acclimate and 0 experiment files included that are not valid larvae (2019).
0 acclimate and 0 missing files (2019).
---
