In [1]:
import numpy as np
import pandas as pd
import math
import time
import re
import os
from scipy.io import wavfile
from skimage import util
from scipy import signal
from scipy import stats

#from sklearn.preprocessing import StandardScaler
#from sklearn.model_selection import KFold, train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV

#from sklearn.cluster import KMeans
#from sklearn.metrics.cluster import silhouette_score

#visualizing results
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
#import yellowbrick as yb

In [2]:
def create_slice_from_wav(file_path, slice_len, step_size, max_samples=150000000):
    """Read a wav file and cut it into fixed-length, regularly spaced slices.

    Parameters
    ----------
    file_path : str
        Path of the wav file to read.
    slice_len : int
        Length of each slice in samples (multiply a duration in seconds by
        the sampling frequency to convert).
    step_size : float
        Fraction of ``slice_len`` between the starts of consecutive slices.
        The hop is ``int(slice_len * step_size)`` samples, so 0.9 advances by
        90% of a slice (consecutive slices then overlap by 10%).
    max_samples : int or None, optional
        Only the first ``max_samples`` samples of the file are used. The
        default keeps the cap this function always applied; pass ``None`` to
        use the whole recording.

    Returns
    -------
    samp_freq : int
        Sampling frequency reported by the wav file.
    sig_data : numpy.ndarray
        The (possibly truncated) signal.
    slices : numpy.ndarray
        Windowed view of ``sig_data``, one slice per row.
    steps : int
        Hop between consecutive slices, in samples.

    Raises
    ------
    ValueError
        If the hop rounds down to less than one sample.
    """

    #validate the hop before doing any (potentially slow) file I/O
    M = slice_len
    steps = int(M*step_size)
    if steps < 1:
        raise ValueError(
            'slice_len * step_size must be at least 1 sample, got ' + str(steps)
        )

    #read in wav file
    samp_freq, sig_data = wavfile.read(file_path)
    sig_data = sig_data[:max_samples]
    print('Sampling frequency: ' + str(samp_freq))

    #determine number of samples and length
    n_samples = sig_data.shape[0]
    print('Number of samples: ' + str(n_samples))
    sig_len = n_samples/samp_freq
    print('Length: ' + str(sig_len) + ' sec')

    #create slices
    # NOTE(review): the 1-D window shape assumes a single-channel recording - confirm
    slices = util.view_as_windows(sig_data, window_shape=(M,), step=steps)
    print(f'Audio shape: {sig_data.shape}, Sliced audio shape: {slices.shape}')

    return samp_freq, sig_data, slices, steps

In [3]:
def plot_spec(Sx, times, steps, time_stamp, freqs=None):
    """Plot the spectrogram of a single slice.

    Parameters
    ----------
    Sx : array-like
        Spectrogram values; plotted as ``10 * log10(Sx)``.
    times : array-like
        Time bins of the spectrogram. They are multiplied by 1000 and shown
        on an axis labelled msec.
    steps : int
        Unused; kept so existing calls keep working.
    time_stamp : float
        Offset added to the scaled time axis, so the x axis shows the
        position within the recording rather than within the slice.
    freqs : array-like, optional
        Frequency bins of the spectrogram. They are divided by 1000 and shown
        on an axis labelled kHz. When omitted, the notebook-level variable
        ``freqs_spec`` is used, as this function originally did.

    Returns
    -------
    module
        The ``matplotlib.pyplot`` module, after the figure has been shown.
    """
    if freqs is None:
        # Backward-compatible fallback to the notebook global set by the
        # spectrogram cell; raises NameError if that cell has not been run.
        freqs = freqs_spec

    f, ax = plt.subplots()
    ax.pcolormesh((times*1000) + (time_stamp), freqs / 1000, 10 * np.log10(Sx), cmap = 'cubehelix')
    # show absolute tick values instead of an offset plus small deltas
    ax.ticklabel_format(useOffset=False)
    ax.set_ylabel('Frequency [kHz]')
    ax.set_xlabel('Time [msec]')
    plt.show()

    return plt

In [4]:
def multi_plot(image_df, time_stamp_list, x, y, spec_times=None, spec_freqs=None):
    """Plot one spectrogram per time stamp in ``time_stamp_list[x:y]``.

    Parameters
    ----------
    image_df : mapping
        Indexed as ``image_df[time_stamp]`` to obtain the values to plot.
        NOTE(review): presumably a 2-D spectrogram per time stamp - confirm
        against the caller.
    time_stamp_list : sequence
        Time stamps to look up in ``image_df``.
    x, y : int
        Start and stop positions of the slice of ``time_stamp_list`` to plot.
    spec_times, spec_freqs : array-like, optional
        Time and frequency bins of the spectrograms. When omitted, the
        notebook-level variables ``times`` and ``freqs_spec`` are used, as
        this function originally did.
    """
    # Backward-compatible fallback to the notebook globals; raises NameError
    # if the spectrogram cell has not been run yet.
    if spec_times is None:
        spec_times = times
    if spec_freqs is None:
        spec_freqs = freqs_spec

    for time_stamp in time_stamp_list[x:y]:
        plt.figure(figsize = (2,5))
        plt.pcolormesh((spec_times*1000) + (time_stamp), spec_freqs / 1000, 10 * np.log10(image_df[time_stamp]), cmap = 'cubehelix')
        plt.show()

In [5]:
def find_features(data):
    """Compute spectral flatness and power sum for each row (time stamp) of a df.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per time stamp (the index). Must contain an ``'Animal'``
        column; every other column is treated as a spectral value.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``data`` with the columns ``'animal'``, ``'time_stamp'``,
        ``'spec_flat'`` (geometric mean divided by arithmetic mean of the
        row) and ``'power_sum'`` (sum of the row).

    Raises
    ------
    KeyError
        If ``data`` has no ``'Animal'`` column.
    """

    start = time.time()

    # The animal label is metadata, not a spectral value, so it must not take
    # part in the means and sums below.
    animal = data['Animal'].to_numpy()
    spectra = data.drop(columns='Animal').astype(float)

    # Whole-frame computation: avoids chained assignment (which is not
    # guaranteed to write into the frame) and works when index labels repeat.
    geo_mean = stats.gmean(spectra.to_numpy(), axis=1)
    spec_flat = geo_mean / spectra.mean(axis=1).to_numpy()
    power_sum = spectra.sum(axis=1).to_numpy()

    feature_df = pd.DataFrame(
        {
            'animal': animal,
            'time_stamp': data.index.to_numpy(),
            'spec_flat': spec_flat,
            'power_sum': power_sum,
        },
        index = data.index,
        columns = ['animal', 'time_stamp', 'spec_flat', 'power_sum'],
    )

    end = time.time()
    print(end - start)

    return feature_df

Create df of annotated USVs from RavenLite

In [6]:
# Location of the annotation CSV that the next cell loads.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
annot_path = "C:/Users/Schindler/Documents/Schindler_Lab/Data/Analysis/Excel files/USV/USV_annot.csv"

In [7]:
# Load the annotation table from the CSV at annot_path.
data = pd.read_csv(annot_path)
# Wrap the loaded table in a second DataFrame object, which the following cells use.
annot_data = pd.DataFrame(data = data)
# Preview the first rows.
annot_data.head()

Unnamed: 0.1,Unnamed: 0,Begin Time (s),End Time (s),Low Freq (Hz),High Freq (Hz),Delta Time (s),Delta Freq (Hz),Avg Power Density (dB FS),Annotation,Animal,Session
0,0,18.929297,18.934698,74332.5,78276.7,0.005,3944.2,-44.4,high slug,527,cagepair
1,1,24.758961,24.769763,10012.1,13046.1,0.011,3034.0,-41.6,low slug,527,cagepair
2,2,174.353273,174.357636,34706.7,41988.2,0.004,7281.5,-41.5,bbc,527,cagepair
3,3,342.886783,342.892601,16874.1,53888.7,0.006,37014.6,-39.1,bbc,527,cagepair
4,4,393.056112,393.06193,35194.2,50970.9,0.006,15776.7,-39.1,bbc,527,cagepair


In [8]:
# Summary statistics of every numeric column for each animal / session / annotation type.
annot_data.groupby(['Animal', 'Session', 'Annotation']).describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Avg Power Density (dB FS),Avg Power Density (dB FS),Avg Power Density (dB FS),Avg Power Density (dB FS),Avg Power Density (dB FS),Avg Power Density (dB FS),Avg Power Density (dB FS),Avg Power Density (dB FS),Begin Time (s),Begin Time (s),...,Low Freq (Hz),Low Freq (Hz),Unnamed: 0,Unnamed: 0,Unnamed: 0,Unnamed: 0,Unnamed: 0,Unnamed: 0,Unnamed: 0,Unnamed: 0
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Animal,Session,Annotation,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2
527,cagepair,bbc,5.0,-37.56,4.500889,-41.5,-39.1,-39.1,-38.3,-29.8,5.0,272.998069,...,34706.7,35194.2,5.0,4.0,1.581139,2.0,3.0,4.0,5.0,6.0
527,cagepair,high slug,1.0,-44.4,,-44.4,-44.4,-44.4,-44.4,-44.4,1.0,18.929297,...,74332.5,74332.5,1.0,0.0,,0.0,0.0,0.0,0.0,0.0
527,cagepair,low slug,1.0,-41.6,,-41.6,-41.6,-41.6,-41.6,-41.6,1.0,24.758961,...,10012.1,10012.1,1.0,1.0,,1.0,1.0,1.0,1.0,1.0
529,cagepair,bbc,5.0,-37.36,6.689021,-49.1,-36.4,-34.7,-33.6,-33.0,5.0,318.486728,...,33677.2,34890.8,5.0,12.2,2.588436,9.0,10.0,13.0,14.0,15.0
529,cagepair,high slug,3.0,-52.666667,0.152753,-52.8,-52.75,-52.7,-52.6,-52.5,3.0,231.577334,...,72682.5,77669.9,3.0,10.0,2.645751,7.0,9.0,11.0,11.5,12.0
529,cagepair,low slug,2.0,-41.4,1.555635,-42.5,-41.95,-41.4,-40.85,-40.3,2.0,228.678717,...,26288.725,30643.2,2.0,12.0,5.656854,8.0,10.0,12.0,14.0,16.0
533,CPApair,high multi,1.0,-43.2,,-43.2,-43.2,-43.2,-43.2,-43.2,1.0,379.543597,...,66835.9,66835.9,1.0,17.0,,17.0,17.0,17.0,17.0,17.0
533,CPApair,high slug,1.0,-45.3,,-45.3,-45.3,-45.3,-45.3,-45.3,1.0,567.125668,...,65487.6,65487.6,1.0,18.0,,18.0,18.0,18.0,18.0,18.0
533,CPApair,low multi,3.0,-38.366667,1.855622,-40.3,-39.25,-38.2,-37.4,-36.6,3.0,350.898081,...,7648.15,9321.2,3.0,20.0,1.0,19.0,19.5,20.0,20.5,21.0
533,CPApair,low slug,7.0,-39.7,3.510461,-42.5,-41.7,-41.2,-39.25,-32.3,7.0,250.077887,...,13701.5,14170.9,7.0,25.0,2.160247,22.0,23.5,25.0,26.5,28.0


Map the begin time of each annotation (converted to ms) down to the nearest lower multiple of 22.5 ms, and add the result as a `time_stamp` column to the df

In [9]:
# Begin times converted from seconds to milliseconds
annot_data['Begin Time (s)_1000'] = annot_data['Begin Time (s)']*1000
values = annot_data['Begin Time (s)_1000'].values

# Truncate each begin time to a whole number of 22.5 ms steps
annot_time_stamps = [int(begin_ms / 22.5) * 22.5 for begin_ms in values]

annot_data['time_stamp'] = annot_time_stamps
print(annot_data.shape)

(552, 13)


In [36]:
# Export the annotation table (including the time_stamp column) to Excel.
# The context manager saves and closes the workbook even if writing fails;
# ExcelWriter.save() is deprecated and no longer exists in pandas 2.0+.
with pd.ExcelWriter('annot_data_raven.xlsx', engine='xlsxwriter') as writer:
    annot_data.to_excel(writer, sheet_name='Sheet1')

In [32]:
# Shape of the subset of annotations whose Session is 'CPApair'
is_cpa_pair = annot_data['Session'].eq('CPApair')
annot_data.loc[is_cpa_pair].shape

(478, 13)

Find path names for each wav file corresponding to annotated data

In [11]:
# Directory whose entries are listed in the next cell to build the wav file paths.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
wav_dir_path = 'C:/Users/Schindler/Documents/Schindler_Lab/Data/USVs/CPA_pair_exp/18.12.07_CPA_pair_3x'

In [26]:
# os.listdir returns entries in arbitrary order, and a later cell picks a
# recording by position (path_names[8]), so sort to make that selection
# reproducible. Only .wav entries are kept so stray files cannot shift positions.
files = sorted(os.listdir(wav_dir_path))
path_names = [
    wav_dir_path + "/" + file
    for file in files
    if file.lower().endswith(".wav")
]

path_names

['C:/Users/Schindler/Documents/Schindler_Lab/Data/USVs/CPA_pair_exp/18.12.07_CPA_pair_3x/533.wav',
 'C:/Users/Schindler/Documents/Schindler_Lab/Data/USVs/CPA_pair_exp/18.12.07_CPA_pair_3x/534.wav',
 'C:/Users/Schindler/Documents/Schindler_Lab/Data/USVs/CPA_pair_exp/18.12.07_CPA_pair_3x/535.wav',
 'C:/Users/Schindler/Documents/Schindler_Lab/Data/USVs/CPA_pair_exp/18.12.07_CPA_pair_3x/542.wav',
 'C:/Users/Schindler/Documents/Schindler_Lab/Data/USVs/CPA_pair_exp/18.12.07_CPA_pair_3x/543.wav',
 'C:/Users/Schindler/Documents/Schindler_Lab/Data/USVs/CPA_pair_exp/18.12.07_CPA_pair_3x/554.wav',
 'C:/Users/Schindler/Documents/Schindler_Lab/Data/USVs/CPA_pair_exp/18.12.07_CPA_pair_3x/555.wav',
 'C:/Users/Schindler/Documents/Schindler_Lab/Data/USVs/CPA_pair_exp/18.12.07_CPA_pair_3x/559.wav',
 'C:/Users/Schindler/Documents/Schindler_Lab/Data/USVs/CPA_pair_exp/18.12.07_CPA_pair_3x/600.wav']

In [13]:
# Accumulator for the annotated slices of all animals; the processing cell below
# adds to it each time it is run. Re-running this cell discards what was accumulated.
annot_slices = pd.DataFrame()

In [30]:
# Per-animal processing: slice one wav file, compute a spectrogram for every
# slice, keep the slices that match an annotation and add them to annot_slices.
# NOTE: every run of this cell adds rows to annot_slices, so running it twice
# for the same animal duplicates that animal's rows.
spec_slices = {}
spec_slices_ravel = {}
spec_slices_df = pd.DataFrame()
annot_slices_int = pd.DataFrame()

#select animal to process (need to process individually due to memory constraints)
path = path_names[8]
# Search only the file name (not the directories) for the 3-digit animal number,
# so digits elsewhere in the path cannot be picked up by mistake.
name_match = re.search(r"\d\d\d", os.path.basename(path))
if name_match is None:
    raise ValueError('No 3-digit animal number found in file name: ' + path)
name = name_match.group(0)

#create data frame of annotations from animal
# (reset_index returns a new frame, so the filtered selection is not modified in place)
annot = annot_data[(annot_data['Animal'] == int(name))].reset_index()

#process wav file of animal corresponding to annotations
print('Begin processing animal # ' + name)

spec_window = 128
NFFT = 512

#create slices
start = time.time()
samp_freq, sig_data, slices, steps = create_slice_from_wav(path, 6250, 0.9)
end = time.time()
print('Slices created in ' + str(end - start) + '  seconds')

#create spectrogram from each slice
start = time.time()
samp_freq_kHz = samp_freq/1000

for i in range(slices.shape[0]):
    if i % 1000 == 0:
        print(i)
    #spectrogram
    freqs_spec, times, Sx = signal.spectrogram(slices[i,:], fs=samp_freq, nperseg = spec_window, nfft = NFFT)

    #start of the slice in ms: sample offset divided by samples per ms
    time_stamp = i*steps / samp_freq_kHz

    #store as dic (2-D spectrogram and its flattened form, keyed by time stamp)
    spec_slices[time_stamp] = Sx
    spec_slices_ravel[time_stamp] = Sx.ravel()

end = time.time()
print('Spectrograms created in ' + str(end - start) + '  seconds')

#store as data frame (one row per time stamp)
start = time.time()
spec_slices_df = pd.DataFrame(spec_slices_ravel).T

end = time.time()
print('Data frame created in ' + str(end - start) + '  seconds')

#create new data frame of only slices corresponding to annotations
# NOTE(review): recent pandas raises KeyError here if an annotation time stamp
# has no matching slice - confirm this is the desired failure mode.
# .copy() so the Animal column is added to an independent frame, not to a
# selection of spec_slices_df.
annot_slices_int = spec_slices_df.loc[annot['time_stamp']].copy()
print(annot_slices_int.shape)
name_mult = [name] * len(annot_slices_int)
annot_slices_int['Animal'] = name_mult

#add data frame to the accumulated frame
# (DataFrame.append no longer exists in pandas 2.0+; pd.concat is the replacement)
annot_slices = pd.concat([annot_slices, annot_slices_int], ignore_index=False)
print(annot_slices.shape)


Begin processing animal # 600
Sampling frequency: 250000
Number of samples: 76005376
Length: 304.021504 sec
Audio shape: (76005376,), Sliced audio shape: (13511, 6250)
Slices created in 0.6097986698150635  seconds
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
Spectrograms created in 87.0321478843689  seconds
Data frame created in 1.203193187713623  seconds
(0, 14135)
(478, 14136)


In [34]:
# Preview the last rows of the accumulated annotated slices.
annot_slices.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14126,14127,14128,14129,14130,14131,14132,14133,14134,Animal
491670.0,0.019183,0.017771,0.151593,0.208036,0.157183,0.113061,0.027229,0.06913,0.104704,0.126154,...,0.119235,0.000167,0.022791,0.004284,0.020812,0.033002,0.002019,0.033532,0.000759,559
574672.5,0.005228,0.141959,0.01176,0.020904,0.000413,0.026608,0.058466,0.474165,0.009032,0.223671,...,0.01973,0.20822,0.026557,0.024421,0.067778,0.00019,0.00839,0.016071,0.045426,559
581107.5,0.233618,0.762965,16.717909,1.721122,4.868372,0.219798,0.350043,0.232977,1.5e-05,0.12788,...,0.003329,0.326374,0.03094,0.034727,0.005775,0.0002,0.002509,0.019687,0.032781,559
581332.5,0.139566,0.028976,0.08033,0.000404,0.189469,0.025914,0.054812,0.005557,0.148925,0.226749,...,0.04604,0.008197,0.007777,0.050236,0.11921,0.000174,0.080439,0.048799,0.027694,559
213997.5,0.05779,0.531218,0.07967,0.041175,0.50333,0.002556,0.405981,2.653546,0.104074,0.113949,...,0.01094,0.002559,0.0076,0.001338,0.096458,0.022573,0.102888,0.013343,0.034777,559


In [35]:
# Export the accumulated annotated slices to Excel.
# The context manager saves and closes the workbook even if writing fails;
# ExcelWriter.save() is deprecated and no longer exists in pandas 2.0+.
with pd.ExcelWriter('annot_slices_df.xlsx', engine='xlsxwriter') as writer:
    annot_slices.to_excel(writer, sheet_name='Sheet1')