# Links

[Open main branch version of this notebook in Colab](https://colab.research.google.com/github/ffvoigt/neuromatch-compneuro-2022-project/blob/main/Preprocessing_NMA2022_Sfenj1.ipynb)

[Open development branch version of this notebook in Colab](https://colab.research.google.com/github/ffvoigt/neuromatch-compneuro-2022-project/blob/development/Preprocessing_NMA2022_Sfenj1.ipynb)


## Caltech_preprocessing notebook


*   The purpose of this notebook is to download the tracking dataset from the CalTech database and convert it to a .csv file that can be loaded using pandas. 




In [None]:
import os
import json
import numpy as np
import pandas as pd

In [None]:
# @title Download and unzip the data
import os, requests, zipfile

def _download_file(url, fname, timeout=60):
  """
  Fetch `url` and save the response body to `fname`.

  Prints a failure banner and returns False when the request cannot be
  completed or the server answers with a non-OK status; returns True once
  the file has been written.

  `timeout` is passed to `requests.get` so that a stalled server cannot
  block the notebook forever (requests waits indefinitely by default).
  """
  try:
    r = requests.get(url, timeout=timeout)
  except (requests.ConnectionError, requests.Timeout):
    print("!!! Failed to download data !!!")
    return False
  if r.status_code != requests.codes.ok:
    print("!!! Failed to download data !!!")
    return False
  with open(fname, "wb") as fid:
    fid.write(r.content)
  return True


# --- Dataset archive ---
fname = 'task1.zip'
url = "https://data.caltech.edu/tindfiles/serve/a86f4297-a087-4f40-9ed4-765779105c2c/"

if os.path.isfile(fname):
  print('Data have already been downloaded!!!')
else:
  _download_file(url, fname)

# Extract only when the target folder is not there yet.
# NOTE(review): 'task1_classic_classification' is presumably the folder the
# archive unpacks to -- confirm against the archive contents.
if not os.path.exists('task1_classic_classification'):
  if os.path.isfile(fname):
    with zipfile.ZipFile(fname, 'r') as zip_ref:
      zip_ref.extractall('.')
  else:
    # The download above failed; report it here instead of letting ZipFile
    # raise an unrelated FileNotFoundError.
    print("!!! Cannot unzip: '" + fname + "' is missing !!!")


# --- Conversion script ---
fname = 'calms21_convert_to_npy.py'
url = "https://data.caltech.edu/tindfiles/serve/ca84a583-ea06-440a-995c-c184bcb0291c/"

if not os.path.isfile(fname):
  _download_file(url, fname)

## Convert .json files to .npy files

In [None]:
# Run the downloaded conversion script, reading from and writing to the current directory.
!python calms21_convert_to_npy.py  --input_directory '.' --output_directory '.'
# Second pass with --parse_treba. NOTE(review): judging by the logged output this
# pass writes the '*_features' files -- confirm against the script's own help text.
!python calms21_convert_to_npy.py  --input_directory '.' --output_directory '.' --parse_treba

Saving ./calms21_task1_test
tcmalloc: large alloc 1224941568 bytes == 0x4e832000 @  0x7f3f6ec451e7 0x4a3940 0x5b438c 0x5ea94f 0x5939cb 0x594cd3 0x5d0ecb 0x5939af 0x594cd3 0x594f8e 0x59526e 0x5bfba0 0x59aeca 0x515655 0x549e0e 0x593fce 0x548ae9 0x51566f 0x549576 0x593fce 0x5118f8 0x593dd7 0x5118f8 0x549576 0x604173 0x5f5506 0x5f8c6c 0x5f9206 0x64faf2 0x64fc4e 0x7f3f6e842c87
tcmalloc: large alloc 1224941568 bytes == 0x97864000 @  0x7f3f6ec451e7 0x4a3940 0x52ab72 0x527cf3 0x51d358 0x59358d 0x548c51 0x51566f 0x549576 0x4bcb19 0x59c019 0x59588e 0x595e64 0x4d8924 0x5bfbcb 0x59aeca 0x515655 0x549e0e 0x593fce 0x548ae9 0x51566f 0x549576 0x593fce 0x5118f8 0x593dd7 0x5118f8 0x549576 0x604173 0x5f5506 0x5f8c6c 0x5f9206
Saving ./calms21_task1_train
Saving ./calms21_task1_test_features
tcmalloc: large alloc 1224941568 bytes == 0xbd92000 @  0x7f8019a241e7 0x4a3940 0x5b438c 0x5ea94f 0x5939cb 0x594cd3 0x5d0ecb 0x5939af 0x594cd3 0x594f8e 0x59526e 0x5bfba0 0x59aeca 0x515655 0x549e0e 0x593fce 0x548ae9 0x51

## Load task data from converted .npy files

In [None]:
def load_task1_data(data_path):
  """
  Read a converted task 1 .npy file.

  Args:
    data_path: path to a .npy file holding a pickled dictionary with an
      'annotator-id_0' entry.

  Returns:
    A (dataset, vocabulary) tuple. `dataset` is the 'annotator-id_0'
    dictionary, keyed by sequence id. `vocabulary` maps behavior names to
    class ids; it is identical for every sequence in this dataset, so it is
    read from the first sequence only.
  """
  # The file stores a pickled dict inside a 0-d object array, hence
  # allow_pickle=True and .item(). Only load files from a trusted source.
  contents = np.load(data_path, allow_pickle=True).item()
  dataset = contents['annotator-id_0']
  # Any sequence would do; take the first one.
  first_sequence = list(dataset)[0]
  vocabulary = dataset[first_sequence]['metadata']['vocab']
  return dataset, vocabulary

# Load the converted train and test splits from the working directory.
# The vocabulary is kept from the training file only; the one returned for
# the test file is discarded.
training_data, vocab = load_task1_data('./calms21_task1_train.npy')
test_data, _ = load_task1_data('./calms21_task1_test.npy')

## Parse the training_data dictionary across all sessions and animals to return tracking data and metadata in a workable dataframe format

In [None]:
# Column names for the tracked body parts. Position i in each list is read
# from index i of the last keypoints axis.
pos_x_names = ['nose_X', 'left_ear_X', 'neck_X', 'right_ear_X', 'left_hip_X', 'right_hip_X', 'tailbase_X']
pos_y_names = ['nose_Y', 'left_ear_Y', 'neck_Y', 'right_ear_Y', 'left_hip_Y', 'right_hip_Y', 'tailbase_Y']
# Bookkeeping columns; they come first in the final table and hold discrete values.
meta_columns = ['frame_num', 'session_num', 'mouse_id', 'annotations']

# One column per session; rows are the per-session entries
# ('annotations', 'keypoints', ...).
training_df = pd.DataFrame(training_data)

# Build one small frame per (session, mouse) pair and concatenate once at the
# end. DataFrame.append was removed in pandas 2.0, and appending inside the
# loop re-copied the whole growing table on every iteration.
session_frames = []
for idx, session in enumerate(training_df.columns):
  annotations = training_df[session]['annotations']
  keypoints = training_df[session]['keypoints']
  for mouse in [0, 1]:
    columns = {
      'frame_num': range(len(annotations)),
      'session_num': idx,
      'mouse_id': mouse,
      'annotations': annotations,
    }
    # keypoints is indexed as [frame, mouse, coordinate (0 = x, 1 = y), body part].
    # X and Y columns are interleaved per body part, matching the original layout.
    for part, (x_name, y_name) in enumerate(zip(pos_x_names, pos_y_names)):
      columns[x_name] = keypoints[:, mouse, 0, part]
      columns[y_name] = keypoints[:, mouse, 1, part]
    session_frames.append(pd.DataFrame(columns))

if session_frames:
  tracking_df = pd.concat(session_frames, ignore_index=True)
else:
  # No sessions at all: keep an empty table with the bookkeeping columns,
  # since pd.concat refuses an empty list.
  tracking_df = pd.DataFrame({name: [] for name in meta_columns})

# change discrete values to int
tracking_df = tracking_df.astype({name: 'int64' for name in meta_columns})

## Export dataframe to .csv

In [None]:
# Write the combined tracking table to 'tracking_df.csv' in the working directory.
# to_csv is left at its default index=True, so the row index is saved as the
# first, unnamed column of the file.
tracking_df.to_csv('tracking_df.csv')