<a href="https://colab.research.google.com/github/grace3999/USV_Python/blob/colab/Notebooks/1_xr_Process_wav_to_netcdf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
#mount google drive containings required files: 1) wav files
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
!pip install netcdf4



In [0]:
import numpy as np
import pandas as pd
import math
import time
import re
import os

from skimage import util
from scipy.io import wavfile
from scipy import signal
from scipy import stats

import xarray as xr

%matplotlib inline
import matplotlib.pyplot as plt

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

import warnings; warnings.simplefilter('ignore')
np.set_printoptions(suppress=True)

In [0]:
def create_slice_from_wav(file_path, file_len, slice_len, step_size):
    """Creates small slices from wav file. file_length is desired length of file in minutes.
    slice_len is desired legth of each slice in ms. 
    step_size is how big of step to take between steps (larger size is less overlap)."""
    
    #read in wav file
    samp_freq, sig_data = wavfile.read(file_path)
    print('Sampling frequency: ' + str(samp_freq))
    
    #convert file_length from min to sample numbers
    file_length_num = file_len*samp_freq*60
    
    #convert slice_len from ms to sample numbers
    slice_sample_num = slice_len*samp_freq/1000
    
    #use only slices within file_length
    sig_data = sig_data[0:file_length_num]
    
    #determine number of samples and length
    n_samples = sig_data.shape[0]
    print('Number of samples: ' + str(n_samples))
    sig_len = n_samples/samp_freq
    print('Length: ' + str(sig_len) + ' sec')
    
    #create slices 
    steps = int(slice_sample_num*step_size)
    slices = util.view_as_windows(sig_data, window_shape=(slice_sample_num,), step=steps)
    print(f'Audio shape: {sig_data.shape}, Sliced audio shape: {slices.shape}')
    
    return samp_freq, sig_data, slices, steps, sig_len

In [0]:
def create_spec_from_slice_array(slices, steps, spec_window, NFFT, samp_freq):
    """Creates fft spectrogram from slice. spec_window is length of each segment (nperseg).
    NFFT is length of the FFT used (nfft). samp_freq is sampling frequency (in Hz) of slice (fs).
    steps is step size between slices"""
    
    spec_slices = {}
    samp_freq_kHz = samp_freq/1000

    for i in range(slices.shape[0]): 
        if i % 5000 == 0:
            print(i)
            
        #spectrogram
        freqs_spec, times, Sx = signal.spectrogram(slices[i,:], fs=samp_freq, nperseg = spec_window, nfft = NFFT)
    
        time_stamp = ((i*steps) / samp_freq_kHz)
    
        #store as dic
        spec_slices[time_stamp] = Sx

    return spec_slices, freqs_spec, times


In [0]:
def create_xarray_dataset_from_dic(dic, freqs_spec, times):
    """Creates an xarray.Dataset object from a dictionary input."""
    
    slices_combined = {}
    
    for key, fft_slice in dic.items():
        slices_combined[key] = xr.DataArray(fft_slice, dims = ('freq', 'times'), coords = {'freq': freqs_spec, 'times': times})
    slices_Dataset = xr.Dataset(slices_combined).to_array(dim = 'slices')
    
    return slices_Dataset

In [0]:
#testing function
def get_remainders(slices_Dataset, slices, step_size, samp_freq, sig_len):
    slice_remainder = np.round((sig_len - (slices.shape[0] * slices.shape[1] * step_size / samp_freq))*1000, 1)
    netcdf_remainder = (sig_len * 1000) - slices_Dataset.slices.values[-1] - 22.5
    
    return slice_remainder, netcdf_remainder

In [0]:
#may need to be updated based on file naming scheme
def get_file_info(path, order):
    """takes in a file path for annotation selections table and finds the animal_number and session and saves each accordingly. 
    each file should be named with animal number and exp (e.g. 100_CPA.Table.1.selections)"""
    
    if order == 'animal':
      print('animal first')
      animal_number, session = re.split('_|-', path.split('/')[-1].split('.')[0])[0:2]
    else:
      print('session first')
      session, animal_number = re.split('_|-', path.split('/')[-1].split('.')[0])[0:2] 
    
    print(animal_number, session)
    
    return animal_number, session

Find path names for each wav file corresponding to annotated data

In [0]:
wav_cFos_CPA = '/content/gdrive/Shared drives/USV_eScience_Incubator/Data/wav_files/Fear/CPA_pair_cFos/18.12.07_CPA_pair_3x'
wav_cFos_neutral = '/content/gdrive/Shared drives/USV_eScience_Incubator/Data/wav_files/Fear/CPA_pair_cFos/18.12.05_housing_nopair_3x'

wav_PETr1_CPA = '/content/gdrive/Shared drives/USV_eScience_Incubator/Data/wav_files/Fear/CPA_pair_PET/CPApair'

wav_r2_CPA = '/content/gdrive/Shared drives/USV_eScience_Incubator/Data/wav_files/Fear/CPA_pair_round2/19.04.03_CPApost'
wav_r2_neutral = '/content/gdrive/Shared drives/USV_eScience_Incubator/Data/wav_files/Fear/CPA_pair_round2/19.04.01_neutralpost'

In [15]:
path = wav_r2_neutral

path_names = []
files = os.listdir(path)
for file in files: 
  path_names.append(path + "/" + file)

print(len(path_names))
path_names

12


['/content/gdrive/Shared drives/USV_eScience_Incubator/Data/wav_files/Fear/CPA_pair_round2/19.04.01_neutralpost/626_neutralpost.wav',
 '/content/gdrive/Shared drives/USV_eScience_Incubator/Data/wav_files/Fear/CPA_pair_round2/19.04.01_neutralpost/628_neutralpost.wav',
 '/content/gdrive/Shared drives/USV_eScience_Incubator/Data/wav_files/Fear/CPA_pair_round2/19.04.01_neutralpost/623_neutralpost.wav',
 '/content/gdrive/Shared drives/USV_eScience_Incubator/Data/wav_files/Fear/CPA_pair_round2/19.04.01_neutralpost/629_neutralpost.wav',
 '/content/gdrive/Shared drives/USV_eScience_Incubator/Data/wav_files/Fear/CPA_pair_round2/19.04.01_neutralpost/625_neutralpost.wav',
 '/content/gdrive/Shared drives/USV_eScience_Incubator/Data/wav_files/Fear/CPA_pair_round2/19.04.01_neutralpost/624_neutralpost.wav',
 '/content/gdrive/Shared drives/USV_eScience_Incubator/Data/wav_files/Fear/CPA_pair_round2/19.04.01_neutralpost/630_neutralpost.wav',
 '/content/gdrive/Shared drives/USV_eScience_Incubator/Data/wa

In [16]:
order = 'animal'
save_path = '/content/gdrive/Shared drives/USV_eScience_Incubator/Data/netcdf_files'

file_len=10
slice_len=25
step_size=0.9

spec_window=128
NFFT=512

for path in path_names:
    
    animal_number, session = get_file_info(path, order)
    
    #process wav file of animal corresponding to annotations
    print(str('Begin processing animal # ' + animal_number))

    #create slices
    start = time.time()
    samp_freq, sig_data, slices, steps, sig_len = create_slice_from_wav(path, file_len, slice_len, step_size)
    end = time.time()
    print(str('Slices created in ' + str(end - start) + '  seconds'))

    #create spectrograms
    start = time.time()
    spec_slices, freqs_spec, times = create_spec_from_slice_array(slices, steps, spec_window, NFFT, samp_freq)
    end = time.time()
    print(str('Spectrograms created in ' + str(end - start) + '  seconds'))

    #create xarray Dataset
    start = time.time()
    slices_Dataset = create_xarray_dataset_from_dic(spec_slices, freqs_spec, times)
    end = time.time()
    print(str('xarray created in ' + str(end - start) + '  seconds'))

    #confirm timestamps are correct
    slice_remainder, netcdf_remainder = get_remainders(slices_Dataset, slices, step_size, samp_freq, sig_len)
    if slice_remainder != netcdf_remainder:
        raise Exception('Mismatch between slice and timestamp remainders')
    else:
        #save    
        start = time.time()
        slices_Dataset.to_netcdf(str(save_path + '/' + animal_number + '_' + session + '_xr_Dataset.nc'))
        end = time.time()
        print(str('xarray saved in ' + str(end - start) + '  seconds'))

animal first
626 neutralpost
Begin processing animal # 626
Sampling frequency: 250000
Number of samples: 150000000
Length: 600.0 sec
Audio shape: (150000000,), Sliced audio shape: (26666, 6250)
Slices created in 7.061180830001831  seconds
0
5000
10000
15000
20000
25000
Spectrograms created in 20.37616991996765  seconds
xarray created in 16.260170698165894  seconds
xarray saved in 6.7128190994262695  seconds
animal first
628 neutralpost
Begin processing animal # 628
Sampling frequency: 250000
Number of samples: 150000000
Length: 600.0 sec
Audio shape: (150000000,), Sliced audio shape: (26666, 6250)
Slices created in 5.153409481048584  seconds
0
5000
10000
15000
20000
25000
Spectrograms created in 21.342045307159424  seconds
xarray created in 16.872518301010132  seconds
xarray saved in 6.8849196434021  seconds
animal first
623 neutralpost
Begin processing animal # 623
Sampling frequency: 250000
Number of samples: 150000000
Length: 600.0 sec
Audio shape: (150000000,), Sliced audio shape: 