<a href="https://colab.research.google.com/github/grace3999/USV_Python/blob/colab/Notebooks/2_xr_Annotations_from_netcdf_PSD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#mount google drive containings required files: 1) csv of annotation features, 2) netcdf files
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
!pip install netcdf4



In [0]:
import numpy as np
import pandas as pd
import os
import re
from scipy import stats
import xarray as xr

#visualizing results
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

import warnings; warnings.simplefilter('ignore')
np.set_printoptions(suppress=True)

In [0]:
#may need to be updated based on file naming scheme
def get_file_info(path, order):
    """takes in a file path for annotation selections table and finds the animal_number and session and saves each accordingly. 
    each file should be named with animal number and exp (e.g. 100_CPA.Table.1.selections)"""
    
    if order == 'animal':
      animal_number, session = re.split('_|-', path.split('/')[-1].split('.')[0])[0:2]
    else:
      session, animal_number = re.split('_|-', path.split('/')[-1].split('.')[0])[0:2] 
    
    print(animal_number, session)
    
    return animal_number, session

In [0]:
#need to be updated with animal naming scheme (e.g. int(animal_number) vs animal_number)
def create_annot_dataset(path, animal_number, session, annot_data, size_random):
    """takes in path of netcdf file (created using notebook 1) and corresponding annotations (created using notebook 0)
    and creates a new data set of only slices correspondingn to annotations (and random noise if set to True)"""
    
    #open xarray Dataset
    data = xr.open_dataset(path)
    print(data['slices'].shape)
    
    #get time_stamps of animal's annotations, select slices and save
    yes = annot_data[(annot_data['animal_number'] ==  animal_number) & (annot_data['session'] ==  session)]['time_stamp'].sort_values().values
    data_yes = data.sel(slices = yes)
    
    #get slice indexs (e.g. time_stamps) that are not annotations, select slices and save
    slice_indexes = data['slices'].values
    no = np.setdiff1d(slice_indexes,yes)
    print(yes.shape[0] + no.shape[0])
    no_short = np.random.choice(no, size_random, replace=False)
    data_no = data.sel(slices = no_short)
    
    return data_yes, data_no

In [0]:
def compute_psd(Dataset):
  """takes in netcdf dataset and computes psd, saves as df"""
  
  psd_df = pd.DataFrame(data = Dataset.groupby('slices').sum(dim='times')['__xarray_dataarray_variable__'].values, index = Dataset['slices'].values, columns = Dataset['freq'].values/1000)
  
  return psd_df

In [0]:
def create_annotation_slice_psds(path, order, annot_data, size_random = 15):
    """takes in path of netcdf file (created using notebook 1), corresponding annotations (created using notebook 0),
    and size desired for random slicecs from each file.
    uses create_annot_dataset and compute_psd functions to create a new data set of computed features
    for only slices correspondingn to annotations and random noise"""

    animal_number, session = get_file_info(path, order)
    print(animal_number, session)
    
    #create datasets of slices of known annotations and a random selection of noise
    data_yes, data_no = create_annot_dataset(path, animal_number, session, annot_data, size_random)
    
    #compute psd
    yes_psd = compute_psd(data_yes)
    no_psd = compute_psd(data_no)

    #add psd to exisiting dataframe of known annotations
    annot_yes_psd = annot_data[(annot_data['animal_number'] ==  animal_number) & (annot_data['session'] ==  session)].sort_values(by=['time_stamp'])
    annot_yes_psd = pd.merge(annot_yes_psd, yes_psd, left_on=annot_yes_psd['time_stamp'], right_on=yes_psd.index.values)
    
    #create and fill dataframe for randomly selected noise slices
    annot_no_psd = pd.DataFrame(columns = ['animal_number', 'session', 'time_stamp', 'Annotation'], index = np.arange(0,size_random))
    annot_no_psd['animal_number'] = animal_number
    annot_no_psd['session'] = session
    annot_no_psd['Annotation'] = 'rand_noise'
    annot_no_psd['time_stamp'] = data_no['slices'].values
    annot_no_psd = pd.merge(annot_no_psd, no_psd, left_on=annot_no_psd['time_stamp'], right_on=no_psd.index.values)
    
    return annot_yes_psd, annot_no_psd 

Create data frame of annotation info

Find path names for each netcdf file corresponding to wav file that has annotated data

In [0]:
annot_path_cage = '/content/gdrive/Team Drives/USV_eScience_Incubator/Data/annotation_data_frames/annot_df_homecage.csv'
annot_path_CPApost = '/content/gdrive/Team Drives/USV_eScience_Incubator/Data/annotation_data_frames/annot_df_CPA.csv'
annot_path_pain = '/content/gdrive/Team Drives/USV_eScience_Incubator/Data/annotation_data_frames/annot_df_pain.csv'

annot_paths = [annot_path_CPApost, annot_path_cage, annot_path_pain]

In [11]:
annot_data = pd.DataFrame()

for path in annot_paths:
    annot = pd.read_csv(path)
    annot = pd.DataFrame(data = annot)
    if 'radar' in annot['Annotation'].values:
      annot = annot[annot['Annotation'] != 'radar']
    if True in pd.isna(annot['Annotation']).values:
      annot['Annotation'] = ['BBC'] * annot['Annotation'].shape[0]
    
    print(annot.shape)
    print(annot.Annotation.value_counts())
    
    annot_data = annot_data.append(annot)

print(annot_data.shape)
annot_data.head()

(90, 6)
low slug      48
low multi     20
bbc           15
high slug      6
high multi     1
Name: Annotation, dtype: int64
(69, 6)
bbc           30
high slug     21
low slug      12
low multi      3
high multi     3
Name: Annotation, dtype: int64
(178, 6)
BBC    178
Name: Annotation, dtype: int64
(337, 6)


Unnamed: 0.1,Unnamed: 0,animal_number,session,time_stamp,Annotation,High Freq (Hz)
0,0,533,CPApair,46305.0,low slug,16500.0
2,2,533,CPApair,149692.5,low slug,16074.5
3,3,533,CPApair,243157.5,low slug,16500.0
4,4,533,CPApair,295560.0,low multi,27489.6
141,141,533,CPApair,376560.0,low slug,9940.8


In [0]:
netcdf_path_fear = '/content/gdrive/Team Drives/USV_eScience_Incubator/Data/netcdf_files/Fear'

netcdf_path_pain = '/content/gdrive/Team Drives/USV_eScience_Incubator/Data/netcdf_files/Pain'

In [19]:
path = netcdf_path_pain

path_names = []
files = os.listdir(path)
for file in files:
  path_names.append(path + "/" + file)

print(len(path_names))
path_names

18


['/content/gdrive/Team Drives/USV_eScience_Incubator/Data/netcdf_files/Pain/A2_D2_xr_Dataset.nc',
 '/content/gdrive/Team Drives/USV_eScience_Incubator/Data/netcdf_files/Pain/A4_D2_xr_Dataset.nc',
 '/content/gdrive/Team Drives/USV_eScience_Incubator/Data/netcdf_files/Pain/A6_D2_xr_Dataset.nc',
 '/content/gdrive/Team Drives/USV_eScience_Incubator/Data/netcdf_files/Pain/A8_D2_xr_Dataset.nc',
 '/content/gdrive/Team Drives/USV_eScience_Incubator/Data/netcdf_files/Pain/A10_D2_xr_Dataset.nc',
 '/content/gdrive/Team Drives/USV_eScience_Incubator/Data/netcdf_files/Pain/A12_D2_xr_Dataset.nc',
 '/content/gdrive/Team Drives/USV_eScience_Incubator/Data/netcdf_files/Pain/A2_D3_xr_Dataset.nc',
 '/content/gdrive/Team Drives/USV_eScience_Incubator/Data/netcdf_files/Pain/A4_D3_xr_Dataset.nc',
 '/content/gdrive/Team Drives/USV_eScience_Incubator/Data/netcdf_files/Pain/A6_D3_xr_Dataset.nc',
 '/content/gdrive/Team Drives/USV_eScience_Incubator/Data/netcdf_files/Pain/A8_D3_xr_Dataset.nc',
 '/content/gdrive/

In [23]:
### may have to update int(animal_number)

order = 'animal'
size_random = 15
session_name = 'pain'
save_path = '/content/gdrive/Team Drives/USV_eScience_Incubator/Data/feature_data_frames'

annot_features_yes_psd = pd.DataFrame()
annot_features_no_psd = pd.DataFrame()

for path in path_names:
    
    annot_yes, annot_no = create_annotation_slice_psds(path, order, annot_data, size_random)
    
    annot_features_yes_psd = annot_features_yes_psd.append(annot_yes, ignore_index=True)
    annot_features_no_psd = annot_features_no_psd.append(annot_no, ignore_index=True)

#create and save combined dataframe of yes and no
annot_features_yes_psd.drop(['Unnamed: 0', 'High Freq (Hz)'], axis=1, inplace=True)
annot_features_full_psd = pd.concat([annot_features_yes_psd, annot_features_no_psd])
print(annot_features_full_psd.shape)
print(annot_features_full_psd.Annotation.value_counts())

annot_features_full_psd.to_csv(str(save_path + '/annot_features_full_' + 'psd_' + session_name + '_new.csv'))

A2 D2
A2 D2
(13333,)
13333
A4 D2
A4 D2
(13333,)
13333
A6 D2
A6 D2
(13333,)
13333
A8 D2
A8 D2
(13333,)
13333
A10 D2
A10 D2
(13333,)
13333
A12 D2
A12 D2
(13333,)
13333
A2 D3
A2 D3
(13333,)
13333
A4 D3
A4 D3
(13333,)
13333
A6 D3
A6 D3
(13333,)
13333
A8 D3
A8 D3
(13333,)
13333
A10 D3
A10 D3
(13333,)
13333
A12 D3
A12 D3
(13333,)
13333
A2 D4
A2 D4
(13333,)
13333
A4 D4
A4 D4
(13333,)
13333
A6 D4
A6 D4
(13333,)
13333
A8 D4
A8 D4
(13333,)
13333
A10 D4
A10 D4
(13333,)
13333
A12 D4
A12 D4
(13333,)
13333
(448, 261)
rand_noise    270
BBC           178
Name: Annotation, dtype: int64
