In [None]:
import librosa
import numpy as np
import pandas as pd
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, GRU, Flatten, Conv1D, MaxPooling1D
from keras.layers import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder, StandardScaler
from algorithms import *
from scipy.stats import skew, kurtosis
import noisereduce as nr
import os
import pickle

In [None]:
# Read the development and evaluation tables from their CSV files.
csv_paths = ("dsl_data/development.csv", "dsl_data/evaluation.csv")
df, df_eval = map(pd.read_csv, csv_paths)

In [None]:
# Encode the categorical speaker attributes, one column at a time, applying
# each column to the development table first and then the evaluation table.
# NOTE(review): `label_encoder` comes from the star-import of `algorithms`;
# it is presumably expected to encode the column in place - confirm.
for column in ('gender', 'ageRange'):
    for frame in (df, df_eval):
        label_encoder(frame, column)

In [None]:
# Keep only the development rows whose speaker profile equals the reference
# profile of the evaluation table. The reference value of each column is the
# first entry returned by np.unique (i.e. the smallest unique value).
# NOTE(review): this presumably relies on the evaluation table holding a single
# value per profile column - confirm. `ageRange` was label-encoded separately
# on each table, so the comparison only makes sense if both encodings agree.
profile_columns = [
    'Current language used for work/school',
    'First Language spoken',
    'Self-reported fluency level ',
    'ageRange',
]
condition = pd.Series(True, index=df.index)
for column in profile_columns:
    reference_value = np.unique(df_eval[column].values)[0]
    condition &= df[column] == reference_value

# Take an explicit copy: the filtered frame is modified by later cells, and
# writing into a flagged slice raises SettingWithCopyWarning.
df = df.loc[condition].copy()

In [None]:
# Columns that are no longer needed once the development rows have been
# filtered: the row id and the three profile columns used by the filter.
cols = ['Id','Self-reported fluency level ', 'First Language spoken', 'Current language used for work/school']

# Rebind instead of dropping in place: `df` is a filtered frame, and in-place
# mutation of it triggers SettingWithCopyWarning. The former `cols[:4]` slice
# selected the whole four-element list, so both tables drop the same columns.
df = df.drop(columns=cols)
df_eval = df_eval.drop(columns=cols)

In [None]:
def _overall_stats(name, values, std_key=None):
    """Return mean/std/min/max over all of ``values`` keyed ``<name>_<stat>``.

    ``std_key`` overrides the key used for the standard deviation.
    """
    if std_key is None:
        std_key = f"{name}_std"
    return {
        f"{name}_mean": float(np.mean(values)),
        std_key: float(np.std(values)),
        f"{name}_min": float(np.min(values)),
        f"{name}_max": float(np.max(values)),
    }


def _per_band_stats(name, matrix):
    """Return six statistics for every row of ``matrix``, keyed ``<name>_<stat><k>`` with k starting at 1."""
    stats = {}
    for band, row in enumerate(matrix, start=1):
        stats[f"{name}_mean{band}"] = float(np.mean(row))
        stats[f"{name}_std{band}"] = float(np.std(row))
        stats[f"{name}_min{band}"] = float(np.min(row))
        stats[f"{name}_max{band}"] = float(np.max(row))
        stats[f"{name}_skew{band}"] = float(skew(row))
        stats[f"{name}_kurtosis{band}"] = float(kurtosis(row))
    return stats


def _audio_features(path):
    """Load one audio file and return its feature record as an ordered dict."""
    # Load as mono, trim leading/trailing silence, then denoise.
    signal, sr = librosa.load(path, mono=True)
    trimmed, _ = librosa.effects.trim(signal, top_db=30, frame_length=2048, hop_length=512)
    clean = nr.reduce_noise(y=trimmed, sr=sr)

    # Chroma is summarised twice (overall and per row), so compute it once.
    chroma = librosa.feature.chroma_stft(y=clean, sr=sr)

    record = {"audio_duration": float(librosa.get_duration(y=clean, sr=sr))}

    # Whole-matrix summaries.
    record.update(_overall_stats("chroma_stft", chroma))
    record.update(_overall_stats("rmse", librosa.feature.rms(y=clean)))
    record.update(_overall_stats("spectral_centroid", librosa.feature.spectral_centroid(y=clean, sr=sr)))
    record.update(_overall_stats("spectral_bandwidth", librosa.feature.spectral_bandwidth(y=clean, sr=sr)))
    # The double underscore in "rolloff__std" is the historical column name;
    # it is kept so frames cached by earlier runs keep the same schema.
    record.update(_overall_stats("rolloff", librosa.feature.spectral_rolloff(y=clean, sr=sr), std_key="rolloff__std"))
    record.update(_overall_stats("zero_crossing_rate", librosa.feature.zero_crossing_rate(y=clean)))

    # Per-row summaries ("spectogram" is likewise the historical spelling).
    record.update(_per_band_stats("mfcc", librosa.feature.mfcc(y=clean, sr=sr)))
    record.update(_per_band_stats("tonnetz", librosa.feature.tonnetz(y=clean, sr=sr)))
    record.update(_per_band_stats("spectral_contrast", librosa.feature.spectral_contrast(y=clean, sr=sr)))
    record.update(_per_band_stats("chroma_stft", chroma))
    record.update(_per_band_stats("spectogram", librosa.feature.melspectrogram(y=clean, sr=sr)))
    return record


def extract_all_features(df):
    """Add audio feature columns to ``df`` for the file named in each row's ``path``.

    Each file is loaded as mono, silence-trimmed and noise-reduced. The
    resulting signal is summarised by its duration, by mean/std/min/max of
    several spectral descriptors, and by per-row mean/std/min/max/skew/kurtosis
    of the MFCC, tonnetz, spectral-contrast, chroma and mel-spectrogram
    matrices.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``path`` column of audio file paths.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place, with one float column per
        feature appended (existing columns of the same name are overwritten).

    Notes
    -----
    Features are collected per row and then written one whole column at a
    time, instead of one scalar ``df.at`` write per row and feature. Columns
    are assigned positionally, so rows sharing an index label keep their own
    values. Columns are only written once every file has been processed.
    """
    if len(df) == 0:
        return df

    records = [_audio_features(path) for path in df["path"]]
    features = pd.DataFrame(records)

    # Assign raw arrays (not Series) so values are matched by position rather
    # than by index label.
    for column in features.columns:
        df[column] = features[column].to_numpy()

    return df



In [None]:
# Reuse cached feature tables when available; otherwise extract features from
# the audio files. Both cache files must exist: previously only "df.pkl" was
# checked, so a missing "df_eval.pkl" crashed the load.
# NOTE: pickle.load can execute arbitrary code - only load cache files that
# were produced by this notebook.
if os.path.isfile("df.pkl") and os.path.isfile("df_eval.pkl"):
    with open('df.pkl', 'rb') as f:
        df = pickle.load(f)
    with open('df_eval.pkl', 'rb') as f:
        df_eval = pickle.load(f)
else:
    df = extract_all_features(df)
    df_eval = extract_all_features(df_eval)

In [None]:
# The target is the `action` column joined with the `object` column; each
# distinct combination is mapped to an integer class id.
encoder = LabelEncoder()
target_class = df['action'].add(df['object'])
y = encoder.fit_transform(target_class)

In [None]:
# Feature matrix: everything except the file path, the speaker id and the two
# columns the target is built from.
X = df.drop(['path', 'speakerId', 'action', 'object'], axis=1)

In [None]:
# Standardise the development features; the fitted scaler is kept in `scaler`.
scaler = StandardScaler()
X = np.array(X, dtype = float)
X = scaler.fit_transform(X)

In [None]:
# Evaluation feature matrix: drop the file path and speaker id columns.
X_eval = df_eval.drop(['path', 'speakerId'], axis=1)

In [None]:
# X_eval = normalize_dataframe(X_eval)

In [None]:
# Scale the evaluation features with the statistics learned from the
# development set. Calling fit_transform here would refit the scaler on the
# evaluation data, so the model would see inputs on a different scale than the
# one it was trained on.
X_eval = scaler.transform(np.array(X_eval, dtype = float))

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
from keras import models
from keras import layers

# One softmax unit per distinct target class. The output layer was previously
# sized with len(y), i.e. the number of training samples, which lets argmax
# return indices that the label encoder cannot decode.
n_classes = len(encoder.classes_)

# Fully connected classifier: four ReLU layers of decreasing width followed by
# the softmax output layer.
model = Sequential()
model.add(Dense(512, input_shape=(X.shape[1],), activation='relu'))
for units in (256, 128, 64):
    model.add(Dense(units, activation='relu'))
model.add(Dense(n_classes, activation='softmax'))

In [None]:
# Integer class ids are used as targets, hence the sparse cross-entropy loss.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train on the full development set (no validation split is held out here).
history = model.fit(x=X, y=y, batch_size=300, epochs=100)

In [None]:
# Plotting accuracy history
# plt.plot(history.history['accuracy'])
# plt.plot(history.history['val_acc'])
# plt.title('Model Accuracy')
# plt.ylabel('Accuracy')
# plt.xlabel('Epoch')
# plt.legend(['Train', 'Test'], loc='upper left')
# plt.show()

In [None]:
# test_loss, test_acc = model.evaluate(X_test,y_test)
# print('test_acc: ',test_acc)

In [None]:
# Model outputs for every evaluation row (one score per output unit).
predictions = model.predict(x=X_eval)


In [None]:
# Index of the highest-scoring output unit for each evaluation row.
y_pred_classes = np.argmax(predictions, axis=-1)

# Map the class indices back to the original target labels and wrap them in a
# named pandas Series.
y_pred_classes_decoded = pd.Series(
    encoder.inverse_transform(y_pred_classes),
    name='Predicted',
)


In [None]:
import time

# One-column table of predicted labels; the positional index is written out
# under the header `Id`.
y_evaluation_df = pd.DataFrame(y_pred_classes_decoded, columns = ['Predicted'])
y_evaluation_df.index.name = 'Id'

from datetime import datetime

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('evaluation', exist_ok=True)

# Time-of-day stamp for the file name. Dashes are used as separators because
# ':' is not a valid file-name character on Windows.
now = int(time.time())
readable_time = datetime.fromtimestamp(now).strftime('%H-%M-%S')
y_evaluation_df.to_csv(f'evaluation/copy_predictions-{readable_time}.csv')

In [None]:
# def predict(model, X_test, le):
#     y_pred = model.predict(X_test)
#     y_pred_classes = np.argmax(y_pred, axis=1)
#     y_pred_labels = le.inverse_transform(y_pred_classes)
#     y_pred_labels = pd.Series(y_pred_labels)
#     return y_pred_labels

In [85]:
# Persist the feature tables so later runs can skip audio feature extraction.
# The cache is written only when it is not already complete. Previously the
# condition was inverted (it ran only when "df.pkl" already existed) and the
# files were opened read-only, so pickle.dump could never succeed.
if not (os.path.isfile("df.pkl") and os.path.isfile("df_eval.pkl")):
    with open('df.pkl', 'wb') as f:
        pickle.dump(df, f)
    with open('df_eval.pkl', 'wb') as f:
        pickle.dump(df_eval, f)
