<a href="https://colab.research.google.com/github/grace3999/USV_Python/blob/colab/Notebooks/2_xr_Annotations_from_netcdf_8features.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [43]:
# Mount Google Drive, which holds the files this notebook needs:
# 1) CSVs of annotation features, 2) netcdf files
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [44]:
# Install the netcdf4 package into the Colab runtime.
# NOTE(review): presumably this is the backend xarray uses to open the .nc files below - confirm;
# the version is not pinned, so results may depend on whatever release pip resolves.
!pip install netcdf4



In [0]:
import numpy as np
import pandas as pd
import os
import re
from scipy import stats
import xarray as xr

#visualizing results
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# Raise pandas display limits so wide/long frames are shown without truncation
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings from pandas/xarray - consider narrowing it.
import warnings; warnings.simplefilter('ignore')
# Print floats in fixed-point notation instead of scientific notation
np.set_printoptions(suppress=True)

In [0]:
#may need to be updated based on file naming scheme
def get_file_info(path, order):
    """Parse the animal number and session out of a file name and print them.

    Only the last '/'-separated component of `path` is used, and only the text
    before its first '.'. That stem is split on '_' or '-' and the first two
    tokens are taken (e.g. 100_CPA.Table.1.selections -> '100', 'CPA').

    path: file path whose name starts with the two tokens.
    order: 'animal' when the name starts with the animal number followed by the
        session; any other value means the session comes first.

    Returns (animal_number, session), both as strings.
    Raises ValueError when the stem contains fewer than two tokens.
    """
    # file name without directories and without anything after the first '.'
    stem = path.split('/')[-1].split('.')[0]
    first, second = re.split('_|-', stem)[0:2]

    # decide which of the two leading tokens is the animal number
    animal_number, session = (first, second) if order == 'animal' else (second, first)

    print(animal_number, session)

    return animal_number, session

In [0]:
#need to be updated with animal naming scheme (e.g. int(animal_number) vs animal_number)
def create_annot_dataset(path, animal_number, session, annot_data, size_random):
    """Split one netcdf recording into annotated slices and a random sample of the rest.

    path: netcdf file (created using notebook 1) opened with xarray.
    animal_number: animal identifier; converted with int() before it is compared
        against annot_data['animal_number'].
    session: session label compared against annot_data['session'].
    annot_data: annotation data frame (created using notebook 0) with
        'animal_number', 'session' and 'time_stamp' columns.
    size_random: number of non-annotated slices to draw, without replacement.

    Returns (data_yes, data_no): the dataset restricted to the annotated time
    stamps (sorted ascending) and the dataset restricted to the randomly drawn
    non-annotated slices (in the order they were drawn).
    """
    #open xarray Dataset
    dataset = xr.open_dataset(path)
    print(dataset['slices'].shape)

    #time stamps of this animal's annotations for this session, ascending
    is_animal = annot_data['animal_number'] == int(animal_number)
    is_session = annot_data['session'] == session
    annotated_stamps = annot_data[is_animal & is_session]['time_stamp'].sort_values().values
    annotated_slices = dataset.sel(slices=annotated_stamps)

    #every slice label that is not an annotation is a candidate for noise
    all_stamps = dataset['slices'].values
    unannotated_stamps = np.setdiff1d(all_stamps, annotated_stamps)
    print(annotated_stamps.shape[0] + unannotated_stamps.shape[0])

    #draw the noise sample and select the matching slices
    noise_stamps = np.random.choice(unannotated_stamps, size_random, replace=False)
    noise_slices = dataset.sel(slices=noise_stamps)

    return annotated_slices, noise_slices

In [0]:
def compute_spectral_features(Dataset):
    """Compute 8 features for every slice of a netcdf-backed xarray Dataset.

    Dataset must expose a 'freq' coordinate, a 'slices' coordinate and the data
    variable '__xarray_dataarray_variable__' with a 'times' dimension.

    All eight outputs are produced in the same order as Dataset['slices'].values,
    one entry per slice. Power used to be computed with groupby('slices'), which
    yields one value per unique label in sorted label order; for a dataset whose
    slices are not sorted (e.g. a random selection) that put power out of step
    with the other seven features. It is now computed inside the per-slice loop.

    Returns a tuple of
        spec_power    (numpy array) sum of all values in the slice
        spec_purs     (list) geometric mean / arithmetic mean of the slice values
        spec_centroid (list) sum(freq * p), p = per-frequency share of the slice total
        spec_spread   (list) np.var(freq * p)
        spec_skewness (list) stats.skew(freq * p)
        spec_kurtosis (list) stats.kurtosis(freq * p)
        spec_slope    (list) slope of a linear regression of freq * p on freq
        spec_roll_off (list) 0.95 * sum(freq * p)
    """
    var_name = '__xarray_dataarray_variable__'

    freq_array = Dataset['freq'].values
    spectrograms = Dataset[var_name]

    spec_power = []
    spec_purs = []
    spec_centroid = []
    spec_spread = []
    spec_skewness = []
    spec_kurtosis = []
    spec_slope = []
    spec_roll_off = []

    for value in Dataset['slices'].values:

        #select the slice once and reuse it for every feature
        slice_da = spectrograms.sel(slices=value)
        slice_vals = slice_da.values

        #total power of this slice, appended in loop order so it lines up with the other features
        spec_power.append(slice_da.sum().values)

        spec_pur = stats.gmean(slice_vals, axis=None) / slice_vals.mean()

        #collapse the time axis, then normalise so the per-frequency magnitudes sum to 1
        mag_array = slice_da.sum(dim='times').values
        mag_probs = mag_array / sum(mag_array)
        freq_mag = freq_array * mag_probs

        # NOTE(review): spread/skew/kurtosis below are statistics of the freq*p products and
        # roll-off is 0.95 * centroid. If the textbook spectral definitions were intended these
        # differ - confirm before changing, since existing feature files use these values.
        spec_cent = sum(freq_mag)
        spec_spr = np.var(freq_mag)
        spec_skew = stats.skew(freq_mag)
        spec_kurt = stats.kurtosis(freq_mag)
        slope, intercept, r_value, p_value, std_err = stats.linregress(freq_array, freq_mag)
        spec_ro = .95 * sum(freq_mag)

        spec_purs.append(spec_pur)
        spec_centroid.append(spec_cent)
        spec_spread.append(spec_spr)
        spec_skewness.append(spec_skew)
        spec_kurtosis.append(spec_kurt)
        spec_slope.append(slope)
        spec_roll_off.append(spec_ro)

    #keep returning power as a numpy array, as the groupby version did
    return np.array(spec_power), spec_purs, spec_centroid, spec_spread, spec_skewness, spec_kurtosis, spec_slope, spec_roll_off

In [0]:
#need to be updated with animal naming scheme (e.g. int(animal_number) vs animal_number)
def create_annotation_slice_features(path, order, annot_data, size_random):
    """Build feature tables for the annotated slices and random noise slices of one recording.

    path: netcdf file (created using notebook 1); its name encodes animal number and session.
    order: passed to get_file_info to say which token of the file name comes first.
    annot_data: annotation data frame (created using notebook 0).
    size_random: number of random non-annotated slices to sample from the file.

    Uses get_file_info, create_annot_dataset and compute_spectral_features.

    Returns (annot_yes, annot_no):
        annot_yes - the rows of annot_data for this animal/session, sorted by
            time_stamp, with the 8 feature columns added.
        annot_no  - one row per sampled noise slice with animal_number, session,
            time_stamp, Annotation == 'rand_noise' and the 8 feature columns.
    animal_number is stored as int in annot_no so it matches the int values
    that the annot_data filter (and therefore annot_yes) uses.
    """
    feature_columns = ['power_sum', 'spec_pur', 'spec_cent', 'spec_spread',
                       'spec_skew', 'spec_kurt', 'spec_slope', 'spec_roll']

    #get_file_info already prints the parsed animal number and session
    animal_number, session = get_file_info(path, order)

    #create datasets of slices of known annotations and a random selection of noise
    data_yes, data_no = create_annot_dataset(path, animal_number, session, annot_data, size_random)

    #compute spectral features (tuples ordered like feature_columns)
    yes_features = compute_spectral_features(data_yes)
    no_features = compute_spectral_features(data_no)

    #add computed features to the known annotations; sorted by time_stamp to match data_yes
    this_recording = (annot_data['animal_number'] == int(animal_number)) & (annot_data['session'] == session)
    annot_yes = annot_data[this_recording].sort_values(by=['time_stamp'])
    for column, values in zip(feature_columns, yes_features):
        annot_yes[column] = values

    #create and fill dataframe for randomly selected noise slices
    annot_no = pd.DataFrame(columns=['animal_number', 'session', 'time_stamp', 'Annotation'], index=np.arange(0, size_random))
    annot_no['animal_number'] = int(animal_number)
    annot_no['session'] = session
    annot_no['Annotation'] = 'rand_noise'
    annot_no['time_stamp'] = data_no['slices'].values
    for column, values in zip(feature_columns, no_features):
        annot_no[column] = values

    return annot_yes, annot_no

Create a data frame of annotation info

In [0]:
# All annotation CSVs live in one shared-drive folder; one file per cohort and session type.
annot_dir = '/content/gdrive/Shared drives/USV_eScience_Incubator/Data/annotation_data_frames'

# cFos cohort
annot_path_cFos_CPA = annot_dir + '/annot_df_CPApost_cFos_CPA.csv'
annot_path_cFos_neutral = annot_dir + '/annot_df_neutral_cFos_neutral.csv'

# PETr1 cohort
annot_path_PETr1_CPA = annot_dir + '/annot_df_CPApost_PETr1_CPA.csv'

# round2 cohort
annot_path_round2_CPA = annot_dir + '/annot_df_CPApost_round2_CPA.csv'
annot_path_round2_neutral = annot_dir + '/annot_df_neutral_round2_neutral.csv'

# order here is the order in which the files are read and stacked
annot_paths = [
    annot_path_cFos_CPA,
    annot_path_cFos_neutral,
    annot_path_PETr1_CPA,
    annot_path_round2_CPA,
    annot_path_round2_neutral,
]

In [51]:
# Read every annotation CSV, clean its labels, and stack the results into annot_data.
# Frames are collected in a list and concatenated once: DataFrame.append was removed
# in pandas 2.0 and re-copies the accumulated frame on every iteration.
annot_frames = []

for path in annot_paths:
    annot = pd.read_csv(path)

    #drop 'radar' annotations (rows with a missing label are kept)
    annot = annot[annot['Annotation'] != 'radar'].copy()

    #label rows with a missing annotation as 'BBC'; rows that already have a label keep it
    #(previously a single missing value caused every label in the file to be overwritten)
    # NOTE(review): other files spell this label 'bbc', and one file has a second 'low'
    # entry in its value counts - labels may need normalising; confirm before changing.
    annot['Annotation'] = annot['Annotation'].fillna('BBC')

    print(annot.shape)
    print(annot.Annotation.value_counts())

    annot_frames.append(annot)

#original row indexes are kept, as before
annot_data = pd.concat(annot_frames) if annot_frames else pd.DataFrame()

print(annot_data.shape)
annot_data.head()

(90, 6)
low           48
low multi     20
bbc           15
high           6
high multi     1
Name: Annotation, dtype: int64
(69, 6)
bbc           30
high          21
low           12
high multi     3
low multi      3
Name: Annotation, dtype: int64
(154, 6)
low            87
low multi      25
low complex    18
high           12
bbc            11
high multi      1
Name: Annotation, dtype: int64
(81, 6)
BBC    81
Name: Annotation, dtype: int64
(34, 6)
low            14
high            9
low complex     5
bbc             3
low multi       2
low             1
Name: Annotation, dtype: int64
(428, 6)


Unnamed: 0.1,Unnamed: 0,animal_number,session,time_stamp,Annotation,High Freq (Hz)
0,0,533,CPApost,46305.0,low,16500.0
2,2,533,CPApost,149692.5,low,16074.5
3,3,533,CPApost,243157.5,low,16500.0
4,4,533,CPApost,295560.0,low multi,27489.6
141,141,533,CPApost,376560.0,low,9940.8


Find the path of each netcdf file that corresponds to a wav file with annotated data

In [0]:
# netcdf files are stored in one folder per session type under a common root
netcdf_root = '/content/gdrive/Shared drives/USV_eScience_Incubator/Data/netcdf_files'

netcdf_path_CPA = netcdf_root + '/CPA'
netcdf_path_neutral = netcdf_root + '/Neutral'

In [55]:
# Folder of netcdf files to process in this run (switch to netcdf_path_CPA for the CPA session)
path = netcdf_path_neutral

# os.listdir returns entries in arbitrary order, so sort them to process files in a
# stable order, and keep only .nc files so stray files in the folder are not passed on.
files = sorted(name for name in os.listdir(path) if name.endswith('.nc'))
path_names = [path + "/" + name for name in files]

print(len(path_names))
path_names

17


['/content/gdrive/Shared drives/USV_eScience_Incubator/Data/netcdf_files/Neutral/527_neutral_xr_Dataset.nc',
 '/content/gdrive/Shared drives/USV_eScience_Incubator/Data/netcdf_files/Neutral/529_neutral_xr_Dataset.nc',
 '/content/gdrive/Shared drives/USV_eScience_Incubator/Data/netcdf_files/Neutral/540_neutral_xr_Dataset.nc',
 '/content/gdrive/Shared drives/USV_eScience_Incubator/Data/netcdf_files/Neutral/541_neutral_xr_Dataset.nc',
 '/content/gdrive/Shared drives/USV_eScience_Incubator/Data/netcdf_files/Neutral/552_neutral_xr_Dataset.nc',
 '/content/gdrive/Shared drives/USV_eScience_Incubator/Data/netcdf_files/Neutral/553_neutral_xr_Dataset.nc',
 '/content/gdrive/Shared drives/USV_eScience_Incubator/Data/netcdf_files/Neutral/556_neutral_xr_Dataset.nc',
 '/content/gdrive/Shared drives/USV_eScience_Incubator/Data/netcdf_files/Neutral/557_neutral_xr_Dataset.nc',
 '/content/gdrive/Shared drives/USV_eScience_Incubator/Data/netcdf_files/Neutral/621_neutral_xr_Dataset.nc',
 '/content/gdrive/S

In [56]:
# Run settings.
order = 'animal'          #file names start with the animal number, then the session
size_random = 100         #number of random noise slices sampled per netcdf file
session_name = 'neutral'  #used in the output file name; should match the folder listed into path_names
save_path = '/content/gdrive/Shared drives/USV_eScience_Incubator/Data/feature_data_frames'

#seed numpy so the noise slices drawn with np.random.choice in create_annot_dataset are reproducible
np.random.seed(0)

#collect per-file frames and concatenate once (DataFrame.append was removed in pandas 2.0)
yes_frames = []
no_frames = []

for path in path_names:

  annot_yes, annot_no = create_annotation_slice_features(path, order, annot_data, size_random)

  yes_frames.append(annot_yes)
  no_frames.append(annot_no)

annot_features_yes = pd.concat(yes_frames, ignore_index=True)
annot_features_no = pd.concat(no_frames, ignore_index=True)

#create and save combined dataframe of yes and no
#the annotated frame carries two columns the noise frame does not have; drop them before stacking
annot_features_yes = annot_features_yes.drop(['Unnamed: 0', 'High Freq (Hz)'], axis=1)
annot_features_full = pd.concat([annot_features_yes, annot_features_no])
print(annot_features_full.shape)
print(annot_features_full.Annotation.value_counts())

#the noise count in the file name follows size_random (identical to the old name when size_random == 100)
annot_features_full.to_csv(f'{save_path}/annot_8features_{size_random}noise_{session_name}_.csv')

527 neutral
527 neutral
(26666,)
26666
529 neutral
529 neutral
(26666,)
26666
540 neutral
540 neutral
(26666,)
26666
541 neutral
541 neutral
(26666,)
26666
552 neutral
552 neutral
(26666,)
26666
553 neutral
553 neutral
(26666,)
26666
556 neutral
556 neutral
(26666,)
26666
557 neutral
557 neutral
(26666,)
26666
621 neutral
621 neutral
(26666,)
26666
622 neutral
622 neutral
(26666,)
26666
624 neutral
624 neutral
(26666,)
26666
626 neutral
626 neutral
(26666,)
26666
628 neutral
628 neutral
(26666,)
26666
629 neutral
629 neutral
(26666,)
26666
630 neutral
630 neutral
(26666,)
26666
631 neutral
631 neutral
(26666,)
26666
632 neutral
632 neutral
(26666,)
26666
(1803, 12)
rand_noise     1700
bbc              33
high             30
low              26
low multi         5
low complex       5
high multi        3
low               1
Name: Annotation, dtype: int64
