In [1]:
# Imports 
import librosa
import librosa.display
import librosa.effects
import librosa.util

import numpy as np
import sys, os
import pandas as pd

import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from tqdm import tqdm

In [3]:
#Features we want right now: min f0, max f0, and mean f0 and maybe rms (not sure exactly what that is but was used in the paper)
path = '/home/CREMA-D/AudioWAV/'
files = os.listdir(path)

summary = pd.read_csv('/home/CREMA-D/processedResults/summaryTable.csv')

num_files = len(os.listdir(path)) #not sure how you want to count files
count = 0

# Aim to get to 12 features
num_features = 8

# Keep track of min and max duration of all data
min_dur = np.inf
max_dur = 0

X = np.zeros((num_files, num_features))
Y = np.zeros(num_files).astype(str)
for sample in tqdm(files): #depends on how you access
    file = os.path.join(path,sample)
    current_wav, current_sr = librosa.load(file) #fix for set up 
    f0_series = librosa.yin(current_wav, librosa.note_to_hz('C2'), librosa.note_to_hz('C7'))
    rms_series = librosa.feature.rms(y=current_wav)
    f0_max = np.amax(f0_series)
    f0_min = np.amin(f0_series)
    # Get f0 range
    f0_range = f0_max - f0_min
    # duration
    duration = librosa.get_duration(y=current_wav, sr=current_sr)
    
    # Outer duration
    if duration > max_dur:
        max_dur = duration
    if duration < min_dur:
        min_dur = duration
        
    # Get the pitches
#     pitches, magnitudes = librosa.piptrack(y=current_wav, sr=current_sr)
#     pitch_max = np.amax(pitches)
#     pitch_min = np.amin(pitches)
#     # Get f0 range
#     pitch_range = pitch_max - pitch_min
#     pitch_mean = np.mean(pitches)
    
    f0_mean = np.mean(f0_series)
    rms_max = np.amax(rms_series)
    rms_min = np.amin(rms_series)
    rms_mean = np.mean(rms_series)
#     x = np.array([f0_min, f0_max, f0_mean, f0_range, duration, rms_min, rms_max, rms_mean, pitch_max, pitch_min, 
#                   pitch_range, pitch_mean])
    x = np.array([f0_min, f0_max, f0_mean, f0_range, duration, rms_min, rms_max, rms_mean])
    X[count,:] = x
    # Get the label for VoiceVote
    info = summary.loc[summary['FileName'] == sample.split('.')[0]]
    try:
        Y[count] = info['VoiceVote'].values[0]
    except Exception as ex:
        print(f'info: {info}')
        print(f'index count: {count}')
        index = count
        print(f'unable to find file: {sample}')
        count -= 1
    count += 1
print(f'shape of train data: {X.shape}')
print(f'shape of labels: {Y.shape}')

 39%|███▉      | 2914/7442 [04:25<07:22, 10.24it/s]

info: Empty DataFrame
Columns: [Unnamed: 0, FileName, VoiceVote, VoiceLevel, FaceVote, FaceLevel, MultiModalVote, MultiModalLevel]
Index: []
index count: 2912
unable to find file: 1040_ITH_SAD_X.wav


100%|██████████| 7442/7442 [11:22<00:00, 10.91it/s]

shape of train data: (7442, 12)
shape of labels: (7442,)





In [6]:
#For Logistic Regression, can use sklearn.linear_model.LogisticRegression
# !pip install -U scikit-learn
# import sklearn
# from sklearn.linear_model import LogisticRegression
# Remove the file that wasn't in results
print(f'shape of train data: {X.shape}')
print(f'shape of labels: {Y.shape}')
print(Y[7440:])

# Remove that one example without a label
X = np.delete(X,-1,axis=0)
Y = Y[:-1]
print(f'New X shape: {X.shape}')
print(f'New Y shape: {Y.shape}')
print(f'Example feature: {X[4,:]}')
print(f'Last Y to check: {Y[-1]}')
print(f'min duration: {min_dur}')
print(f'max duration: {max_dur}')

shape of train data: (7440, 12)
shape of labels: (7440,)
[]
New X shape: (7439, 12)
New Y shape: (7439,)
Example feature: [7.25388728e+01 2.79622457e+02 1.54880632e+02 2.07083584e+02
 2.70272109e+00 4.40623751e-03 5.49128167e-02 1.39149548e-02
 3.98937622e+03 0.00000000e+00 3.98937622e+03 8.57418728e+00]
Last Y to check: S
min duration: 1.2679818594104308
max duration: 5.005034013605442


In [7]:
# Get data and labels for training
train_split = int(0.8 * num_files)
print('train_split: ', train_split)
print(f'train size: {X[:train_split].shape}, val size: {X[train_split:].shape}')

model = LogisticRegression().fit(X[:train_split], Y[:train_split])

# Predict on validation/test (80-20 split)
predictions = model.predict(X[train_split:])

# Output score (mean accuracy)
score = model.score(X[train_split:],Y[train_split:])
print(f'Test accuracy score: {score}')

f1 = f1_score(Y[train_split:], predictions, average='macro')
print(f'macro f1 score: {f1}')
f1 = f1_score(Y[train_split:], predictions, average='micro')
print(f'micro f1 score: {f1}')

train_split:  5953
train size: (5953, 12), val size: (1486, 12)
Test accuracy score: 0.5356662180349933
macro f1 score: 0.03730999819169182
micro f1 score: 0.5356662180349933


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [9]:
# Some basic stats for the dataset
avg = np.mean(X, axis=0)
print(f'avg shape: {avg.shape}')
# stats = ['f0_min', 'f0_max', 'f0_mean', 'f0_range', 'duration', 'rms_min', 'rms_max', 'rms_mean', 'pitch_max', 'pitch_min', 
#                   'pitch_range', 'pitch_mean']
stats = ['f0_min', 'f0_max', 'f0_mean', 'f0_range', 'duration', 'rms_min', 'rms_max', 'rms_mean']
for j,stat in enumerate(stats):
    print(f'{stat} average: {avg[j]}')

avg shape: (12,)
f0_min average: 68.42691202324633
f0_max average: 1108.2675292037368
f0_mean average: 197.609968232653
f0_range average: 1039.8406171804854
duration average: 2.542796678289513
rms_min average: 0.003896825949034051
rms_max average: 0.10693585834278004
rms_mean average: 0.027580022187794605
pitch_max average: 3729.755598693593
pitch_min average: 0.0
pitch_range average: 3729.755598693593
pitch_mean average: 10.164273588371174
