In [1]:
import numpy as np
import pandas as pd
import math
import time
import re
import os
from scipy.io import wavfile
from skimage import util
from scipy import signal
from scipy import stats

In [2]:
def create_slice_from_wav(file_path, slice_len, step_size, max_samples=150000000):
    """Read a wav file and cut it into fixed-length, regularly spaced slices.

    Parameters
    ----------
    file_path : str
        Path of the wav file to read.
    slice_len : int
        Length of every slice, in samples (use the sampling frequency to
        convert a duration into a sample count).
    step_size : float
        Hop between the starts of consecutive slices, expressed as a fraction
        of ``slice_len``. The overlap between neighbouring slices is therefore
        ``1 - step_size``: 0.9 gives 10 % overlap, 1.0 gives no overlap.
    max_samples : int or None, optional
        Only the first ``max_samples`` samples of the recording are kept.
        ``None`` keeps the whole recording. Defaults to 150000000, the cap
        that used to be hard-coded here.

    Returns
    -------
    samp_freq : int
        Sampling frequency reported by the wav file.
    sig_data : numpy.ndarray
        The (possibly truncated) 1-D signal.
    slices : numpy.ndarray
        Read-only 2-D view of ``sig_data`` with shape ``(n_slices, slice_len)``;
        row ``i`` starts at sample ``i * steps``.
    steps : int
        Hop between consecutive slices, in samples.

    Raises
    ------
    ValueError
        If the hop rounds down to less than one sample, if the audio is not
        single-channel, or if it is shorter than one slice.
    """

    M = slice_len
    steps = int(M*step_size)
    # A hop of zero samples can never advance through the signal.
    if steps < 1:
        raise ValueError(
            f'slice_len * step_size must be at least 1 sample, got {M} * {step_size}'
        )

    #read in wav file
    samp_freq, sig_data = wavfile.read(file_path)
    sig_data = sig_data[0:max_samples]
    # The 1-D windowing below cannot handle (n_samples, n_channels) arrays.
    if sig_data.ndim != 1:
        raise ValueError(
            f'Expected single-channel audio, got array of shape {sig_data.shape}'
        )
    print('Sampling frequency: ' + str(samp_freq))

    #determine number of samples and length
    n_samples = sig_data.shape[0]
    print('Number of samples: ' + str(n_samples))
    sig_len = n_samples/samp_freq
    print('Length: ' + str(sig_len) + ' sec')

    if n_samples < M:
        raise ValueError(
            f'Recording has {n_samples} samples, fewer than one slice of {M}'
        )

    #create slices
    # All length-M windows as a strided view (no copy), then keep every
    # `steps`-th one. For 1-D input this yields the same windows as
    # skimage.util.view_as_windows(sig_data, (M,), step=steps).
    slices = np.lib.stride_tricks.sliding_window_view(sig_data, M)[::steps]
    print(f'Audio shape: {sig_data.shape}, Sliced audio shape: {slices.shape}')

    return samp_freq, sig_data, slices, steps

Read in the DataFrame of annotated USVs

In [3]:
# Excel workbook holding the table of annotated USVs; it is loaded with pd.read_excel.
# NOTE(review): absolute, machine-specific path — the notebook only runs where this file exists.
annot_path = "C:/Users/Schindler/Documents/Schindler_Lab/Data/Analysis/Excel files/USV/annot_info_df.xlsx"

In [4]:
# Load the annotation workbook.
data = pd.read_excel(annot_path)
# NOTE(review): pd.read_excel already returns a DataFrame for a single sheet,
# so this wrap looks redundant — confirm before removing.
annot_data = pd.DataFrame(data = data)
# Preview the first rows to check that the columns were read as expected.
annot_data.head()

Unnamed: 0,Begin Time (s),Annotation,Animal,Group,Begin Time (s)_1000,time_stamp
0,376.574455,low slug,533,5,376574.4545,376560.0
1,46.306579,low slug,533,5,46306.57941,46305.0
2,243.272865,low slug,533,5,243272.8651,243270.0
3,149.708324,low slug,533,5,149708.324,149692.5
4,243.176192,low slug,533,5,243176.1917,243157.5


Find path names for each wav file corresponding to annotated data

In [5]:
# Directory whose entries are listed to build the wav file paths.
# NOTE(review): absolute, machine-specific path — the notebook only runs where this directory exists.
wav_dir_path = 'C:/Users/Schindler/Documents/Schindler_Lab/Data/USVs/CPA_pair_exp/18.12.07_CPA_pair_3x'

In [6]:
# Build the full path of every wav recording found in wav_dir_path.
# os.listdir returns entries in arbitrary order, so they are sorted here to
# keep positional lookups such as path_names[8] stable between runs and machines.
files = sorted(os.listdir(wav_dir_path))
# Keep only .wav entries so every path can be handed to wavfile.read.
path_names = [wav_dir_path + "/" + file for file in files if file.lower().endswith(".wav")]

path_names

['C:/Users/Schindler/Documents/Schindler_Lab/Data/USVs/CPA_pair_exp/18.12.07_CPA_pair_3x/533.wav',
 'C:/Users/Schindler/Documents/Schindler_Lab/Data/USVs/CPA_pair_exp/18.12.07_CPA_pair_3x/534.wav',
 'C:/Users/Schindler/Documents/Schindler_Lab/Data/USVs/CPA_pair_exp/18.12.07_CPA_pair_3x/535.wav',
 'C:/Users/Schindler/Documents/Schindler_Lab/Data/USVs/CPA_pair_exp/18.12.07_CPA_pair_3x/542.wav',
 'C:/Users/Schindler/Documents/Schindler_Lab/Data/USVs/CPA_pair_exp/18.12.07_CPA_pair_3x/543.wav',
 'C:/Users/Schindler/Documents/Schindler_Lab/Data/USVs/CPA_pair_exp/18.12.07_CPA_pair_3x/554.wav',
 'C:/Users/Schindler/Documents/Schindler_Lab/Data/USVs/CPA_pair_exp/18.12.07_CPA_pair_3x/555.wav',
 'C:/Users/Schindler/Documents/Schindler_Lab/Data/USVs/CPA_pair_exp/18.12.07_CPA_pair_3x/559.wav',
 'C:/Users/Schindler/Documents/Schindler_Lab/Data/USVs/CPA_pair_exp/18.12.07_CPA_pair_3x/600.wav']

In [7]:
# Accumulator for the annotated spectrogram rows: the per-animal processing cell
# adds its rows to this frame on every run. Re-running this cell resets it to empty.
annot_slices = pd.DataFrame()

In [24]:
# Per-animal working containers, rebuilt from scratch on every run of this cell.
spec_slices = {}          # slice time stamp -> 2-D spectrogram
spec_slices_ravel = {}    # slice time stamp -> the same spectrogram flattened to 1-D
spec_slices_df = pd.DataFrame()
annot_slices_int = pd.DataFrame()

#select animal to process (need to process individually due to memory constraints)
path = path_names[8]

# The animal ID is the three-digit number in the file name. Only the file name
# is searched so digits in a parent directory can never be picked up, and the
# pattern is a raw string because "\d" is an invalid escape in a normal literal.
id_match = re.search(r"\d\d\d", os.path.basename(path))
if id_match is None:
    raise ValueError('No three-digit animal ID found in file name: ' + path)
name = id_match.group(0)

#create data frame of annotations from animal
# reset_index() returns a new frame (the old index is kept as an 'index'
# column, as before) instead of mutating a filtered selection of annot_data in place.
annot = annot_data[annot_data['Animal'] == int(name)].reset_index()

# Announce which animal's recording is about to be processed.
print('Begin processing animal # ' + name)

# Spectrogram settings: samples per segment, and the FFT length each segment is padded to.
spec_window = 128
NFFT = 512

# Cut the recording into 6250-sample slices whose starts are 0.9 * 6250 samples apart.
start = time.time()
samp_freq, sig_data, slices, steps = create_slice_from_wav(path, 6250, 0.9)
end = time.time()
print('Slices created in ' + str(end - start) + '  seconds')

# Compute one spectrogram per slice, keyed by the slice's start time.
start = time.time()
i = 0
samp_freq_kHz = samp_freq/1000

for i, audio_slice in enumerate(slices):
    # progress marker every 1000 slices
    if not i % 1000:
        print(i)

    freqs_spec, times, Sx = signal.spectrogram(audio_slice, fs=samp_freq, nperseg=spec_window, nfft=NFFT)

    # start of this slice: sample offset divided by samples-per-millisecond
    time_stamp = i*steps / samp_freq_kHz

    # keep both the 2-D spectrogram and its flattened form under the same key
    spec_slices[time_stamp] = Sx
    spec_slices_ravel[time_stamp] = Sx.ravel().T

end = time.time()
print('Spectrograms created in ' + str(end - start) + '  seconds')

#store as data frame
# One row per slice (indexed by time stamp), one column per flattened spectrogram bin.
start = time.time()
spec_slices_df = pd.DataFrame(spec_slices_ravel).T

end = time.time()
print(str('Data frame created in ' + str(end - start) + '  seconds'))

#create new data frame of only slices corresponding to annotations
# Label-based lookup: every annotated time stamp must exactly match a slice
# time stamp, otherwise .loc raises KeyError.
annot_slices_int = spec_slices_df.loc[annot['time_stamp']]
print(annot_slices_int.shape)

#add data frame to the accumulated frame of all animals
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
# Empty frames are skipped so the initial empty accumulator does not take part
# in the concatenation.
# NOTE: running this cell twice for the same animal adds its rows twice.
non_empty_frames = [frame for frame in (annot_slices, annot_slices_int) if not frame.empty]
annot_slices = pd.concat(non_empty_frames) if non_empty_frames else annot_slices_int.copy()
print(annot_slices.shape)


Begin processing animal # 600
Sampling frequency: 250000
Number of samples: 76005376
Length: 304.021504 sec
Audio shape: (76005376,), Sliced audio shape: (13511, 6250)
Slices created in 0.5842423439025879  seconds
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
Spectrograms created in 76.05259823799133  seconds
Data frame created in 1.148120403289795  seconds
(75, 14135)
(552, 14135)


In [25]:
# Save the annotated spectrogram rows of the animal just processed; the file name
# is prefixed with `name` and, being a relative path, lands in the working directory.
annot_slices_int.to_csv(name + '_annot_slices.csv')

In [25]:
# Save the rows accumulated in annot_slices so far to one CSV in the working directory.
# NOTE(review): the contents depend on which animals have been run through the
# processing cell in this kernel session — confirm all were processed before exporting.
annot_slices.to_csv('full_annot_slices.csv')