In [1]:
import numpy as np
import pandas as pd
import os
import re
from scipy import stats
import xarray as xr

# Widen pandas' display limits so larger frames are shown with fewer truncated rows/columns
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# NOTE(review): this silences every warning for the rest of the session, including
# deprecation warnings from pandas/xarray -- consider narrowing the filter
import warnings; warnings.simplefilter('ignore')
# suppress=True makes numpy print floats in fixed-point instead of scientific notation
np.set_printoptions(suppress=True)



In [2]:
def create_annot_dataset(path, annot_data, size_random):
    """Split one animal's netCDF slices into annotated slices and a random sample of the rest.

    Parameters
    ----------
    path : str
        Path of a netCDF file (created using notebook 1). The first run of three
        consecutive digits found anywhere in the path is used as the animal number.
    annot_data : pandas.DataFrame
        Annotations (created using notebook 0). The 'animal_number' and
        'time_stamp' columns are read here.
    size_random : int
        Number of non-annotated slices to draw at random, without replacement.

    Returns
    -------
    tuple
        (data_yes, data_no): the dataset restricted to the animal's annotated
        time stamps (sorted ascending), and the dataset restricted to
        `size_random` randomly chosen non-annotated slices (in draw order).

    Raises
    ------
    ValueError
        If `path` contains no three-digit animal number, or if `size_random`
        is larger than the number of non-annotated slices.

    Notes
    -----
    Sampling uses numpy's global random state, so seed it with np.random.seed
    beforehand for reproducible noise slices. The dataset opened here is not
    closed, because the returned selections may still read from it lazily.
    """

    #get animal name from path (raw string so that \d is a regex class, not a string escape)
    match = re.search(r"\d\d\d", path)
    if match is None:
        raise ValueError("no three-digit animal number found in path: %r" % (path,))
    name = match.group(0)
    print(name)

    #open xarray Dataset
    data = xr.open_dataset(path)
    print(data['slices'].shape)

    #get time_stamps of animal's annotations, select slices and save
    is_animal = annot_data['animal_number'] == int(name)
    yes = annot_data.loc[is_animal, 'time_stamp'].sort_values().values
    data_yes = data.sel(slices = yes)

    #get slice indexes (e.g. time_stamps) that are not annotations
    slice_indexes = data['slices'].values
    no = np.setdiff1d(slice_indexes, yes)
    print(yes.shape[0] + no.shape[0])

    #fail with an explicit message instead of numpy's generic sampling error
    if size_random > no.shape[0]:
        raise ValueError(
            "size_random=%d exceeds the %d non-annotated slices available for animal %s"
            % (size_random, no.shape[0], name)
        )

    #draw the random noise slices, select them and save
    no_short = np.random.choice(no, size_random, replace=False)
    data_no = data.sel(slices = no_short)

    return data_yes, data_no

In [3]:
def compute_spectral_features(Dataset, var_name = '__xarray_dataarray_variable__'):
    """Compute 8 spectral features for every slice of a netCDF dataset.

    Parameters
    ----------
    Dataset : xarray.Dataset
        Must provide a 'freq' coordinate, a 'slices' coordinate and the data
        variable `var_name`, which is reduced over its 'times' dimension.
    var_name : str, optional
        Name of the data variable holding the spectrogram values. Defaults to
        the name xarray gives an unnamed DataArray when it is saved.

    Returns
    -------
    tuple
        (spec_power, spec_purs, spec_centroid, spec_spread, spec_skewness,
        spec_kurtosis, spec_slope, spec_roll_off). spec_power is a numpy array,
        the others are lists. Every entry follows the order of
        Dataset['slices'].values.

    Notes
    -----
    With freq_mag = freq * (max-over-time magnitude / its sum):
    power is the sum of all values of the slice, purity is geometric mean /
    arithmetic mean of all values of the slice, centroid is sum(freq_mag),
    spread / skewness / kurtosis are np.var / stats.skew / stats.kurtosis of
    freq_mag, slope is the linear-regression slope of freq_mag against freq,
    and roll-off is 0.95 * centroid.
    NOTE(review): spread, skewness, kurtosis and roll-off are taken on freq_mag
    itself rather than as centroid-centred moments / a cumulative-energy
    frequency -- confirm that this is the intended definition.
    """

    freq_array = Dataset['freq'].values
    spectrograms = Dataset[var_name]

    #compute power sum per slice by summing over every dimension except 'slices'.
    #This keeps the dataset's own slice order; grouping by 'slices' instead returns
    #the groups in sorted order, which misaligns power with the other features when
    #the slices are not sorted (e.g. randomly drawn noise slices).
    other_dims = [dim for dim in spectrograms.dims if dim != 'slices']
    spec_power = spectrograms.sum(dim = other_dims).values

    spec_purs = []
    spec_centroid = []
    spec_spread = []
    spec_skewness = []
    spec_kurtosis = []
    spec_slope = []
    spec_roll_off = []

    #compute other features for each slice individually
    for value in Dataset['slices'].values:

        #select the slice once and reuse it for all features
        slice_data = spectrograms.sel(slices = value)
        slice_values = slice_data.values

        #spectral purity: geometric mean over arithmetic mean of the whole slice
        spec_pur = stats.gmean(slice_values, axis = None) / slice_values.mean()

        #peak magnitude per frequency, normalised to sum to 1, weighted by frequency
        mag_array = slice_data.max(dim = 'times').values
        mag_probs = mag_array / mag_array.sum()
        freq_mag = freq_array * mag_probs

        spec_cent = freq_mag.sum()
        slope = stats.linregress(freq_array, freq_mag)[0]

        spec_purs.append(spec_pur)
        spec_centroid.append(spec_cent)
        spec_spread.append(np.var(freq_mag))
        spec_skewness.append(stats.skew(freq_mag))
        spec_kurtosis.append(stats.kurtosis(freq_mag))
        spec_slope.append(slope)
        spec_roll_off.append(.95 * spec_cent)

    return spec_power, spec_purs, spec_centroid, spec_spread, spec_skewness, spec_kurtosis, spec_slope, spec_roll_off

In [12]:
def create_annotation_slice_features(path, annot_data, size_random = 50):
    """Build feature tables for one animal's annotated slices and random noise slices.

    Uses create_annot_dataset to select the slices and compute_spectral_features
    to compute the features.

    Parameters
    ----------
    path : str
        Path of a netCDF file (created using notebook 1). The first run of three
        consecutive digits found in the path is used as the animal number.
    annot_data : pandas.DataFrame
        Annotations (created using notebook 0). The 'animal_number',
        'time_stamp' and 'session' columns are read here.
    size_random : int, optional
        Number of random non-annotated slices to take from the file.

    Returns
    -------
    tuple
        (annot_yes, annot_no): the animal's annotation rows sorted by
        'time_stamp' with the 8 feature columns added, and a frame of
        `size_random` rows labelled 'rand_noise' with the same feature columns.

    Raises
    ------
    ValueError
        If `path` contains no three-digit animal number.
    """

    #get animal name from path (raw string so that \d is a regex class, not a string escape)
    match = re.search(r"\d\d\d", path)
    if match is None:
        raise ValueError("no three-digit animal number found in path: %r" % (path,))
    name = match.group(0)
    print(name)
    animal_number = int(name)

    #create datasets of slices of known annotations and a random selection of noise
    data_yes, data_no = create_annot_dataset(path, annot_data, size_random)

    #column names, in the same order as the tuple returned by compute_spectral_features
    feature_columns = ['power_sum', 'spec_pur', 'spec_cent', 'spec_spread',
                       'spec_skew', 'spec_kurt', 'spec_slope', 'spec_roll']

    #compute spectral features
    yes_features = compute_spectral_features(data_yes)
    no_features = compute_spectral_features(data_no)

    #add computed features to existing dataframe of known annotations
    #(sorted by time_stamp to match the order in which data_yes was selected)
    animal_annot = annot_data[annot_data['animal_number'] == animal_number]
    annot_yes = animal_annot.sort_values(by=['time_stamp'])
    for column, values in zip(feature_columns, yes_features):
        annot_yes[column] = values

    #create and fill dataframe for randomly selected noise slices
    annot_no = pd.DataFrame(columns = ['animal_number', 'session', 'time_stamp', 'Annotation'], index = np.arange(0,size_random))
    #store the animal number as int so it has the same type as in annot_yes
    annot_no['animal_number'] = animal_number
    annot_no['session'] = animal_annot['session'].iloc[0]
    annot_no['Annotation'] = 'rand_noise'
    annot_no['time_stamp'] = data_no['slices'].values
    for column, values in zip(feature_columns, no_features):
        annot_no[column] = values

    return annot_yes, annot_no

Create data frame of annotation info

In [5]:
# Annotation tables (created using notebook 0), listed in the order they are loaded.
# NOTE(review): these are absolute paths on one machine -- adjust before running elsewhere.
annot_paths = [
    "C:/Users/Schindler/Documents/ProgrammingFun/USV_python/annot_df_3x_CPA_pair_tables.csv",
    "C:/Users/Schindler/Documents/ProgrammingFun/USV_python/annot_df_3x_novel_pair_tables.csv",
]
# keep the individual names available as well
annot_path_CPA, annot_path_novel = annot_paths

In [6]:
# Load every annotation table, drop the 'radar' annotations and stack the rest.
# The frames are collected in a list and concatenated once: DataFrame.append was
# removed in pandas 2.0 and re-copied the accumulated frame on every iteration.
annot_frames = []

for path in annot_paths:
    annot = pd.read_csv(path)
    annot = annot[annot['Annotation'] != 'radar']
    print(annot.shape)
    print(annot.Annotation.value_counts())

    annot_frames.append(annot)

# original row labels are kept (no ignore_index), as before
annot_data = pd.concat(annot_frames) if annot_frames else pd.DataFrame()

print(annot_data.shape)
annot_data.head()

(90, 5)
low slug      48
low multi     20
bbc           15
high slug      6
high multi     1
Name: Annotation, dtype: int64
(69, 5)
bbc           30
high slug     21
low slug      12
low multi      3
high multi     3
Name: Annotation, dtype: int64
(159, 5)


Unnamed: 0.1,Unnamed: 0,animal_number,session,time_stamp,Annotation
0,0,533,CPApair,46305.0,low slug
2,2,533,CPApair,149692.5,low slug
3,3,533,CPApair,243157.5,low slug
4,4,533,CPApair,295560.0,low multi
141,141,533,CPApair,376560.0,low slug


Find the path of each netCDF file that corresponds to a wav file with annotated data

In [7]:
# Directory whose entries are listed to build the netCDF file paths
# NOTE(review): absolute path on one machine -- adjust before running elsewhere
netcdf_path = 'C:/Users/Schindler/Documents/ProgrammingFun/USV_python/Datasets'

In [8]:
# os.listdir returns entries in arbitrary order; sort them so the files are
# processed (and the output rows written) in the same order on every run
files = sorted(os.listdir(netcdf_path))
path_names = [netcdf_path + "/" + file for file in files]

len(path_names)

16

In [13]:
# The noise slices are drawn with numpy's global random state; seed it so that
# re-running the notebook selects the same noise slices
RANDOM_SEED = 0
np.random.seed(RANDOM_SEED)

# collect the per-file frames and concatenate once (DataFrame.append was removed
# in pandas 2.0 and re-copied the accumulated frame on every iteration)
yes_frames = []
no_frames = []

for path in path_names:

    annot_yes, annot_no = create_annotation_slice_features(path, annot_data, size_random = 50)

    yes_frames.append(annot_yes)
    no_frames.append(annot_no)

# drop the leftover CSV index column from the annotation tables
annot_features_yes = pd.concat(yes_frames, ignore_index=True).drop(columns=['Unnamed: 0'])
annot_features_no = pd.concat(no_frames, ignore_index=True)

#create and save combined dataframe of yes and no
# ignore_index gives every row a unique label; previously the yes and no rows
# shared index labels, and those duplicates were written to the CSV
annot_features_full = pd.concat([annot_features_yes, annot_features_no], ignore_index=True)
print(annot_features_full.shape)
print(annot_features_full.Annotation.value_counts())

annot_features_full.to_csv('annot_features_full.csv')

527
527
(26666,)
26666
529
529
(26666,)
26666
533
533
(26666,)
26666
534
534
(26666,)
26666
535
535
(26666,)
26666
540
540
(26666,)
26666
541
541
(26666,)
26666
542
542
(26666,)
26666
543
543
(26666,)
26666
552
552
(26666,)
26666
553
553
(26666,)
26666
554
554
(26666,)
26666
555
555
(26666,)
26666
556
556
(26666,)
26666
557
557
(26666,)
26666
559
559
(26666,)
26666
(959, 12)
rand_noise    800
low slug       60
bbc            45
high slug      27
low multi      23
high multi      4
Name: Annotation, dtype: int64
