# SpeakSense - Language Detection System (Machine Learning CSCI 6364)

**Abde Manaaf Ghadiali (G29583342), Gehna Ahuja (G00000000), Venkatesh Shanmugam (G00000000)**

The objective of this project is to develop a robust and accurate system capable of detecting the language spoken in audio recordings. By leveraging advanced machine learning algorithms and signal processing techniques, the system aims to accurately identify the language spoken in various audio inputs, spanning diverse accents, dialects, and environmental conditions. This language detection solution seeks to provide practical applications in speech recognition, transcription, translation, and other fields requiring language-specific processing, thereby enhancing accessibility and usability across linguistic boundaries.

This code sets up an environment for working with audio data, particularly focusing on Indian languages. Here's a breakdown of what each part does:

1. **Importing Libraries**: Imports necessary libraries for data manipulation, visualization, machine learning, and audio processing.

2. **Setting Display Options and Suppressing Warnings**: Configures display options for Pandas and suppresses warnings.

3. **Setting Random Seed**: Sets a random seed for reproducibility.

4. **Downloading Datasets**: Checks if the necessary datasets are downloaded, and if not, downloads them from Kaggle using the OpenDatasets library and organizes them into appropriate directories.

5. **Audio Data Processing**: Prepares the audio data for further analysis. This might include feature extraction, preprocessing, and organizing the data for training machine learning models.

6. **Machine Learning**: Utilizes machine learning techniques for tasks such as spoken language identification. This involves splitting the data into training and testing sets, building machine learning models (such as Random Forest or Gradient Boosting), evaluating the models, and generating classification reports and confusion matrices.

7. **Deep Learning**: Utilizes deep learning techniques, specifically convolutional neural networks (CNNs), for tasks such as spoken language identification. This involves building and training deep learning models using the TensorFlow and Keras libraries.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import IPython.display as ipd
import tensorflow as tf

import warnings
import math

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler, LabelBinarizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import roc_auc_score

from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dense, Flatten, Dropout, BatchNormalization, Input, Rescaling
from tensorflow.keras.callbacks import ModelCheckpoint, LearningRateScheduler
from tensorflow.keras.preprocessing import image_dataset_from_directory

# Notebook-wide configuration.
pd.set_option('display.max_columns', None)  # never truncate columns when displaying wide frames
warnings.filterwarnings('ignore')  # NOTE: silences every warning, deprecations included

# Seed NumPy (used by scikit-learn estimators created without random_state) and
# TensorFlow (weight initialisation, dropout, batch shuffling) so reruns are comparable.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

# Model Training

### Classical ML Models

In [None]:
# Load the MFCC-mean feature table. The converter re-parses the text stored in
# each 'mfcc_features_mean' cell back into a Python object.
# NOTE(review): pd.eval evaluates the cell text as an expression; only use this on trusted CSV files.
mfcc_feature_mean_dataframe = pd.read_csv('../data/model_data/mfcc_feature_mean_dataframe_v1.csv', converters={'mfcc_features_mean': pd.eval})
mfcc_feature_mean_dataframe

In [None]:
# Hold out a stratified 5% of the rows as the test set.
label_values = mfcc_feature_mean_dataframe['language_label']
X_train, X_test, y_train, y_test = train_test_split(
    mfcc_feature_mean_dataframe, label_values, stratify=label_values, test_size=0.05, random_state=0)

X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)

print(f'\nTrain Shape: {X_train.shape}, Test Shape: {X_test.shape}')

ipd.display(X_train)

# Sorted class names; a label's position in this list is its integer class id,
# so the list can be used directly as confusion-matrix display labels.
language_labels_cols = sorted(label_values.unique())

# Stack the per-row feature vectors into (n_samples, 40) matrices.
X_train, X_test = (np.concatenate(X_train['mfcc_features_mean'].values, axis=0).reshape(-1, 40),
                   np.concatenate(X_test['mfcc_features_mean'].values, axis=0).reshape(-1, 40))

# Encode both splits against the SAME category list. Factorizing each split on
# its own numbers classes by order of first appearance, so the same language
# could receive different ids in train and test.
y_train, y_test = (pd.Categorical(y_train, categories=language_labels_cols).codes.astype(np.int64),
                   pd.Categorical(y_test, categories=language_labels_cols).codes.astype(np.int64))

print(f'Train Shape: {X_train.shape}, Test Shape: {X_test.shape}')
print(f'Target: Train Shape: {y_train.shape}, Test Shape: {y_test.shape}')

In [None]:
# Estimate mean/variance on the training split only, then apply the same
# transform to both splits so no test statistics leak into the scaler.
scaler_object = StandardScaler().fit(X_train)

X_train_scaled = scaler_object.transform(X_train)
X_test_scaled = scaler_object.transform(X_test)

In [None]:
# Display the unscaled training feature matrix.
X_train

In [None]:
# Display the standardised training feature matrix for comparison.
X_train_scaled

In [None]:
def _fit_tree_ensembles(features, targets):
    """Fit a 10-tree random forest and a 10-stage gradient-boosting model on the given data."""
    forest = RandomForestClassifier(n_estimators=10, n_jobs=-1).fit(X=features, y=targets)
    boosting = GradientBoostingClassifier(n_estimators=10).fit(X=features, y=targets)
    return forest, boosting


# Same two estimators, trained once on raw features and once on standardised ones.
print('Training RFC and GBC on Unscaled Data!')
rfc_model, gbc_model = _fit_tree_ensembles(X_train, y_train)

print('Training RFC and GBC on Scaled Data!')
rfc_model_scaled, gbc_model_scaled = _fit_tree_ensembles(X_train_scaled, y_train)

In [None]:
# Confusion matrix: random forest trained and evaluated on unscaled features.
confusion_matrix_data = confusion_matrix(y_test, rfc_model.predict(X_test))

# Draw on an explicitly created 8x6 axes. Without `ax`, ConfusionMatrixDisplay.plot()
# opens its own figure and the pre-sized one is left blank.
fig, ax = plt.subplots(figsize=(8, 6))
cmd = ConfusionMatrixDisplay(confusion_matrix_data, display_labels=language_labels_cols)
cmd.plot(ax=ax, xticks_rotation=45)

ax.set_title('Confusion Matrix')

In [None]:
# Confusion matrix: gradient boosting trained and evaluated on unscaled features.
confusion_matrix_data = confusion_matrix(y_test, gbc_model.predict(X_test))

# Draw on an explicitly created 8x6 axes. Without `ax`, ConfusionMatrixDisplay.plot()
# opens its own figure and the pre-sized one is left blank.
fig, ax = plt.subplots(figsize=(8, 6))
cmd = ConfusionMatrixDisplay(confusion_matrix_data, display_labels=language_labels_cols)
cmd.plot(ax=ax, xticks_rotation=45)

ax.set_title('Confusion Matrix')

In [None]:
# Confusion matrix: random forest trained on scaled features, evaluated on the
# scaled test set. The model must be the *_scaled one: feeding scaled inputs to
# the forest fitted on raw features evaluates it on data it was never trained for.
confusion_matrix_data = confusion_matrix(y_test, rfc_model_scaled.predict(X_test_scaled))

# Draw on an explicitly created 8x6 axes. Without `ax`, ConfusionMatrixDisplay.plot()
# opens its own figure and the pre-sized one is left blank.
fig, ax = plt.subplots(figsize=(8, 6))
cmd = ConfusionMatrixDisplay(confusion_matrix_data, display_labels=language_labels_cols)
cmd.plot(ax=ax, xticks_rotation=45)

ax.set_title('Confusion Matrix')

In [None]:
# Confusion matrix: gradient boosting trained on scaled features, evaluated on
# the scaled test set. The model must be the *_scaled one: feeding scaled inputs
# to the model fitted on raw features evaluates it on data it was never trained for.
confusion_matrix_data = confusion_matrix(y_test, gbc_model_scaled.predict(X_test_scaled))

# Draw on an explicitly created 8x6 axes. Without `ax`, ConfusionMatrixDisplay.plot()
# opens its own figure and the pre-sized one is left blank.
fig, ax = plt.subplots(figsize=(8, 6))
cmd = ConfusionMatrixDisplay(confusion_matrix_data, display_labels=language_labels_cols)
cmd.plot(ax=ax, xticks_rotation=45)

ax.set_title('Confusion Matrix')

In [None]:
# Display the random forest's predicted integer class ids for the unscaled test set.
rfc_model.predict(X_test)

In [None]:
# Macro-averaged one-vs-rest ROC AUC of the unscaled random forest, from class probabilities.
rfc_test_probabilities = rfc_model.predict_proba(X_test)
roc_auc_score(y_test, rfc_test_probabilities, multi_class='ovr', average='macro')

In [None]:
# Macro-averaged one-vs-rest ROC AUC of the unscaled gradient-boosting model, from class probabilities.
gbc_test_probabilities = gbc_model.predict_proba(X_test)
roc_auc_score(y_test, gbc_test_probabilities, multi_class='ovr', average='macro')

In [None]:
# Macro-averaged one-vs-rest ROC AUC of the scaled random forest, from class probabilities.
rfc_scaled_test_probabilities = rfc_model_scaled.predict_proba(X_test_scaled)
roc_auc_score(y_test, rfc_scaled_test_probabilities, multi_class='ovr', average='macro')

In [None]:
# Macro-averaged one-vs-rest ROC AUC of the scaled gradient-boosting model, from class probabilities.
gbc_scaled_test_probabilities = gbc_model_scaled.predict_proba(X_test_scaled)
roc_auc_score(y_test, gbc_scaled_test_probabilities, multi_class='ovr', average='macro')

In [None]:
# Per-class precision / recall / F1 for the unscaled random forest.
rfc_report = classification_report(y_test, rfc_model.predict(X_test))
print(rfc_report)

In [None]:
# Per-class precision / recall / F1 for the unscaled gradient-boosting model.
gbc_report = classification_report(y_test, gbc_model.predict(X_test))
print(gbc_report)

In [None]:
# Per-class precision / recall / F1 for the random forest trained on scaled
# features. Scaled inputs must go to the *_scaled model, not the one fitted on raw features.
print(classification_report(y_test, rfc_model_scaled.predict(X_test_scaled)))

In [None]:
# Per-class precision / recall / F1 for the gradient-boosting model trained on
# scaled features. Scaled inputs must go to the *_scaled model, not the one fitted on raw features.
print(classification_report(y_test, gbc_model_scaled.predict(X_test_scaled)))

### Dense Model

In [None]:
# Reload the MFCC-mean feature table for the dense-network experiments. The
# converter re-parses the text stored in each 'mfcc_features_mean' cell.
# NOTE(review): pd.eval evaluates the cell text as an expression; only use this on trusted CSV files.
mfcc_feature_mean_dataframe = pd.read_csv('../data/model_data/mfcc_feature_mean_dataframe_v1.csv', converters={'mfcc_features_mean': pd.eval})
mfcc_feature_mean_dataframe

In [None]:
# Stratified split: 3% of all rows become the test set, then 3% of the remainder
# becomes the validation set.
label_values = mfcc_feature_mean_dataframe['language_label']
X_train, X_test, y_train, y_test = train_test_split(
    mfcc_feature_mean_dataframe, label_values, stratify=label_values, test_size=0.03, random_state=0)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, X_train['language_label'], stratify=X_train['language_label'], test_size=0.03, random_state=0)

X_train = X_train.reset_index(drop=True)
X_val = X_val.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)

print(f'\nTrain Shape: {X_train.shape}, Validation Shape: {X_val.shape}, Test Shape: {X_test.shape}')

ipd.display(X_train)

# Sorted class names; index in this list == one-hot column == predicted class id.
language_labels_cols = sorted(label_values.unique())


def _one_hot_labels(labels):
    """One-hot encode `labels` against the shared class list.

    Encoding every split against the same categories guarantees identical
    column count and order even when a split happens to miss a class;
    calling get_dummies on each split separately does not.
    """
    categorical = pd.Categorical(labels, categories=language_labels_cols)
    return pd.get_dummies(categorical, dtype=np.int32).values


# Stack the per-row feature vectors into (n_samples, 40, 1) arrays.
X_train, X_val, X_test = (np.concatenate(X_train['mfcc_features_mean'].values, axis=0).reshape(-1, 40, 1),
                          np.concatenate(X_val['mfcc_features_mean'].values, axis=0).reshape(-1, 40, 1),
                          np.concatenate(X_test['mfcc_features_mean'].values, axis=0).reshape(-1, 40, 1))

y_train, y_val, y_test = (_one_hot_labels(y_train.values),
                          _one_hot_labels(y_val.values),
                          _one_hot_labels(y_test.values))

print(f'Train Shape: {X_train.shape}, Validation Shape: {X_val.shape}, Test Shape: {X_test.shape}')
print(f'Target: Train Shape: {y_train.shape}, Validation Shape: {y_val.shape}, Test Shape: {y_test.shape}')

In [None]:
# Fit the scaler on the flattened (n, 40) training features only, then apply it
# to every split and restore the trailing singleton axis.
scaler_object = StandardScaler().fit(X_train.reshape(-1, 40))

X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler_object.transform(split.reshape(-1, 40)).reshape(-1, 40, 1)
    for split in (X_train, X_val, X_test)
)

In [None]:
def build_model_dense(input_shape: tuple, output_shape: int) -> object:
    """Build (but do not compile) the fully connected classifier.

    Args:
        input_shape: Shape of one input sample, forwarded to the first Dense layer.
        output_shape: Number of units in the final softmax layer.

    Returns:
        An uncompiled Keras Sequential model: six ReLU Dense layers
        (64-128-256-512-256-128), each followed by batch normalisation, with
        20% dropout after the 1st, 4th and 6th, ending in a softmax layer.
    """
    layer_stack = [
        Dense(64, activation='relu', input_shape=input_shape),
        BatchNormalization(),
        Dropout(0.2),
        Dense(128, activation='relu'),
        BatchNormalization(),
        Dense(256, activation='relu'),
        BatchNormalization(),
        Dense(512, activation='relu'),
        BatchNormalization(),
        Dropout(0.2),
        Dense(256, activation='relu'),
        BatchNormalization(),
        Dense(128, activation='relu'),
        BatchNormalization(),
        Dropout(0.2),
        Dense(output_shape, activation='softmax'),
    ]

    return Sequential(layer_stack)

In [None]:
def learning_rate_decay(epoch: int) -> float:
    """Return the learning rate for `epoch`: 0.00158 multiplied by 0.9 once per elapsed epoch."""
    elapsed_epochs = math.floor(epoch + 1)
    return 0.00158 * math.pow(0.9, elapsed_epochs)


# Dense network on the UNSCALED features.
model = build_model_dense(input_shape=(40, ), output_shape=y_train.shape[1])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Keep only the weights of the epoch with the best validation accuracy.
checkpoint_callback = ModelCheckpoint(
    '../data/models/language_detection_model_unscaled_v1.keras',
    monitor='val_accuracy', verbose=0, save_best_only=True, mode='max')
learning_rate_callback = LearningRateScheduler(learning_rate_decay)

model_history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=20, batch_size=32, verbose=1,
    callbacks=[checkpoint_callback, learning_rate_callback])

In [None]:
def learning_rate_decay(epoch: int) -> float:
    """Return the learning rate for `epoch`: 0.00158 multiplied by 0.9 once per elapsed epoch."""
    elapsed_epochs = math.floor(epoch + 1)
    return 0.00158 * math.pow(0.9, elapsed_epochs)


# Dense network on the STANDARDISED features.
model = build_model_dense(input_shape=(40, ), output_shape=y_train.shape[1])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Keep only the weights of the epoch with the best validation accuracy.
checkpoint_callback = ModelCheckpoint(
    '../data/models/language_detection_model_scaled_v1.keras',
    monitor='val_accuracy', verbose=0, save_best_only=True, mode='max')
learning_rate_callback = LearningRateScheduler(learning_rate_decay)

model_history = model.fit(
    X_train_scaled, y_train,
    validation_data=(X_val_scaled, y_val),
    epochs=20, batch_size=32, verbose=1,
    callbacks=[checkpoint_callback, learning_rate_callback])

In [None]:
# Reload the checkpoint saved by the unscaled dense run (best val_accuracy epoch)
# and report its loss/accuracy on the unscaled test set.
model = load_model('../data/models/language_detection_model_unscaled_v1.keras')
model.evaluate(X_test, y_test)

In [None]:
# Class probabilities for the unscaled test set.
y_pred = model.predict(X_test)

# Collapse one-hot targets and probability rows to class ids in one vectorised
# call each (kept as plain lists for the cells that follow).
y_test_argmax = np.argmax(y_test, axis=1).tolist()
y_pred_argmax = np.argmax(y_pred, axis=1).tolist()

confusion_matrix_data = confusion_matrix(y_test_argmax, y_pred_argmax)

# Draw on an explicitly created 8x6 axes. Without `ax`, ConfusionMatrixDisplay.plot()
# opens its own figure and the pre-sized one is left blank.
fig, ax = plt.subplots(figsize=(8, 6))
cmd = ConfusionMatrixDisplay(confusion_matrix_data, display_labels=language_labels_cols)
cmd.plot(ax=ax, xticks_rotation=45)

ax.set_title('Confusion Matrix')

In [None]:
# Macro-averaged one-vs-rest ROC AUC computed from the predicted class
# PROBABILITIES (y_pred) against the one-hot targets. ROC AUC is a ranking
# metric: binarising the argmax predictions first discards the scores and
# reduces every per-class curve to a single operating point. This also matches
# the classical-model cells, which score predict_proba output.
# NOTE(review): every class needs at least one positive test sample for its AUC to be defined.
roc_auc_score(y_test, y_pred, multi_class='ovr', average='macro')

In [None]:
# Per-class precision / recall / F1 for the dense model on the unscaled test set.
dense_unscaled_report = classification_report(y_test_argmax, y_pred_argmax)
print(dense_unscaled_report)

In [None]:
# Training vs. validation accuracy per epoch.
# NOTE(review): `model_history` is rebound by every model.fit cell; if the scaled
# training cell ran after the unscaled one, these curves belong to the scaled run.
history_log = model_history.history
print(history_log.keys())

for series_name in ('accuracy', 'val_accuracy'):
    plt.plot(history_log[series_name])

plt.title('Training vs Validation Model Accuracy')
plt.xlabel('Epoch [number]')
plt.ylabel('Accuracy [%]')

plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()

In [None]:
# Training vs. validation categorical cross-entropy per epoch.
# NOTE(review): `model_history` is rebound by every model.fit cell; if the scaled
# training cell ran after the unscaled one, these curves belong to the scaled run.
history_log = model_history.history
for series_name in ('loss', 'val_loss'):
    plt.plot(history_log[series_name])

plt.title('Training vs Validation Model Loss')
plt.xlabel('Epoch [number]')
plt.ylabel('Loss [CCE]')
plt.legend(['Train', 'Validation'], loc='upper right')
plt.show()

In [None]:
# Reload the checkpoint saved by the scaled dense run (best val_accuracy epoch)
# and report its loss/accuracy on the standardised test set.
model = load_model('../data/models/language_detection_model_scaled_v1.keras')
model.evaluate(X_test_scaled, y_test)

In [None]:
# Class probabilities for the standardised test set.
y_pred = model.predict(X_test_scaled)

# Collapse one-hot targets and probability rows to class ids in one vectorised
# call each (kept as plain lists for the cells that follow).
y_test_argmax = np.argmax(y_test, axis=1).tolist()
y_pred_argmax = np.argmax(y_pred, axis=1).tolist()

confusion_matrix_data = confusion_matrix(y_test_argmax, y_pred_argmax)

# Draw on an explicitly created 8x6 axes. Without `ax`, ConfusionMatrixDisplay.plot()
# opens its own figure and the pre-sized one is left blank.
fig, ax = plt.subplots(figsize=(8, 6))
cmd = ConfusionMatrixDisplay(confusion_matrix_data, display_labels=language_labels_cols)
cmd.plot(ax=ax, xticks_rotation=45)

ax.set_title('Confusion Matrix')

In [None]:
# Macro-averaged one-vs-rest ROC AUC computed from the predicted class
# PROBABILITIES (y_pred) against the one-hot targets. ROC AUC is a ranking
# metric: binarising the argmax predictions first discards the scores and
# reduces every per-class curve to a single operating point. This also matches
# the classical-model cells, which score predict_proba output.
# NOTE(review): every class needs at least one positive test sample for its AUC to be defined.
roc_auc_score(y_test, y_pred, multi_class='ovr', average='macro')

In [None]:
# Per-class precision / recall / F1 for the dense model on the standardised test set.
dense_scaled_report = classification_report(y_test_argmax, y_pred_argmax)
print(dense_scaled_report)

In [None]:
# Training vs. validation accuracy per epoch for the most recent model.fit run.
history_log = model_history.history
print(history_log.keys())

for series_name in ('accuracy', 'val_accuracy'):
    plt.plot(history_log[series_name])

plt.title('Training vs Validation Model Accuracy')
plt.xlabel('Epoch [number]')
plt.ylabel('Accuracy [%]')

plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()

In [None]:
# Training vs. validation categorical cross-entropy per epoch for the most recent model.fit run.
history_log = model_history.history
for series_name in ('loss', 'val_loss'):
    plt.plot(history_log[series_name])

plt.title('Training vs Validation Model Loss')
plt.xlabel('Epoch [number]')
plt.ylabel('Loss [CCE]')
plt.legend(['Train', 'Validation'], loc='upper right')
plt.show()

### CNN Model

In [None]:
# Load only the first 10 rows of the full MFCC table. `nrows` reads the same
# rows as taking the first chunk of a chunksize=10 reader, but does not leave an
# unclosed file reader behind or rebind the reader's name to a DataFrame.
# NOTE(review): 10 rows looks like a memory-saving/debug limit; raise it for real training.
# NOTE(review): pd.eval evaluates the cell text as an expression; only use this on trusted CSV files.
MFCC_ROWS_TO_LOAD = 10

mfcc_feature_dataframe = pd.read_csv(
    '../data/model_data/mfcc_feature_dataframe_v1.csv',
    converters={'mfcc_features': pd.eval}, verbose=2, nrows=MFCC_ROWS_TO_LOAD)

In [None]:
# Stratified split: 5% of all rows become the test set, then 5% of the remainder
# becomes the validation set.
label_values = mfcc_feature_dataframe['language_label']
X_train, X_test, y_train, y_test = train_test_split(
    mfcc_feature_dataframe, label_values, stratify=label_values, test_size=0.05, random_state=0)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, X_train['language_label'], stratify=X_train['language_label'], test_size=0.05, random_state=0)

X_train = X_train.reset_index(drop=True)
X_val = X_val.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)

print(f'\nTrain Shape: {X_train.shape}, Validation Shape: {X_val.shape}, Test Shape: {X_test.shape}')

ipd.display(X_train)

# Sorted class names; index in this list == one-hot column == predicted class id.
language_labels_cols = sorted(label_values.unique())


def _one_hot_labels(labels):
    """One-hot encode `labels` against the shared class list.

    Encoding every split against the same categories guarantees identical
    column count and order even when a split happens to miss a class;
    calling get_dummies on each split separately does not.
    """
    categorical = pd.Categorical(labels, categories=language_labels_cols)
    return pd.get_dummies(categorical, dtype=np.int32).values


# Stack the per-row MFCC matrices into (n_samples, 40, 431, 1) arrays.
X_train, X_val, X_test = (np.concatenate(X_train['mfcc_features'].values, axis=0).reshape(-1, 40, 431, 1),
                          np.concatenate(X_val['mfcc_features'].values, axis=0).reshape(-1, 40, 431, 1),
                          np.concatenate(X_test['mfcc_features'].values, axis=0).reshape(-1, 40, 431, 1))

y_train, y_val, y_test = (_one_hot_labels(y_train.values),
                          _one_hot_labels(y_val.values),
                          _one_hot_labels(y_test.values))

print(f'Train Shape: {X_train.shape}, Validation Shape: {X_val.shape}, Test Shape: {X_test.shape}')
print(f'Target: Train Shape: {y_train.shape}, Validation Shape: {y_val.shape}, Test Shape: {y_test.shape}')

In [None]:
# Fit the scaler on the training samples flattened to (n, 40 * 431), then apply
# it to every split and restore the (40, 431, 1) sample shape.
scaler_object = StandardScaler().fit(X_train.reshape(-1, 40 * 431))

X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler_object.transform(split.reshape(-1, 40 * 431)).reshape(-1, 40, 431, 1)
    for split in (X_train, X_val, X_test)
)

In [None]:
# Sanity check: shape of the unscaled CNN training tensor.
X_train.shape

In [None]:
# Sanity check: scaling must leave the tensor shape unchanged.
X_train_scaled.shape

In [None]:
def build_model_cnn(input_shape: tuple, output_shape: int) -> object:
    """Build (but do not compile) the convolutional classifier for MFCC inputs.

    Args:
        input_shape: Shape of one input sample, forwarded to the first Conv2D layer.
        output_shape: Number of units in the final softmax layer.

    Returns:
        An uncompiled Keras Sequential model: three Conv2D/BatchNorm/MaxPool
        stages (32 filters 7x7 'valid', 64 filters 5x5 'same', 128 filters 3x3
        'same'; every pool is 3x3 with stride 2), then a flattened,
        batch-normalised 256-unit ReLU head with 20% dropout and a softmax layer.
    """
    feature_extractor = [
        Conv2D(32, (7, 7), activation='relu', padding='valid', input_shape=input_shape),
        BatchNormalization(),
        MaxPooling2D(pool_size=(3, 3), strides=2, padding='same'),
        Conv2D(64, (5, 5), activation='relu', padding='same'),
        BatchNormalization(),
        MaxPooling2D(pool_size=(3, 3), strides=2, padding='same'),
        Conv2D(128, (3, 3), activation='relu', padding='same'),
        BatchNormalization(),
        MaxPooling2D(pool_size=(3, 3), strides=2, padding='same'),
    ]
    classifier_head = [
        Flatten(),
        BatchNormalization(),
        Dense(256, activation='relu'),
        BatchNormalization(),
        Dropout(0.2),
        Dense(output_shape, activation='softmax'),
    ]

    return Sequential(feature_extractor + classifier_head)

In [None]:
def learning_rate_decay(epoch: int) -> float:
    """Return the learning rate for `epoch`: 0.00158 multiplied by 0.9 once per elapsed epoch."""
    elapsed_epochs = math.floor(epoch + 1)
    return 0.00158 * math.pow(0.9, elapsed_epochs)


# CNN on the UNSCALED MFCC tensors.
model = build_model_cnn(input_shape=(40, 431, 1), output_shape=y_train.shape[1])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Keep only the weights of the epoch with the best validation accuracy.
checkpoint_callback = ModelCheckpoint(
    '../data/models/language_detection_model_unscaled_v2.keras',
    monitor='val_accuracy', verbose=0, save_best_only=True, mode='max')
learning_rate_callback = LearningRateScheduler(learning_rate_decay)

model_history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=5, batch_size=32, verbose=1,
    callbacks=[checkpoint_callback, learning_rate_callback])

In [None]:
def learning_rate_decay(epoch: int) -> float:
    """Return the learning rate for `epoch`: 0.00158 multiplied by 0.9 once per elapsed epoch."""
    elapsed_epochs = math.floor(epoch + 1)
    return 0.00158 * math.pow(0.9, elapsed_epochs)


# CNN on the STANDARDISED MFCC tensors.
model = build_model_cnn(input_shape=(40, 431, 1), output_shape=y_train.shape[1])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Keep only the weights of the epoch with the best validation accuracy.
checkpoint_callback = ModelCheckpoint(
    '../data/models/language_detection_model_scaled_v2.keras',
    monitor='val_accuracy', verbose=0, save_best_only=True, mode='max')
learning_rate_callback = LearningRateScheduler(learning_rate_decay)

model_history = model.fit(
    X_train_scaled, y_train,
    validation_data=(X_val_scaled, y_val),
    epochs=5, batch_size=32, verbose=1,
    callbacks=[checkpoint_callback, learning_rate_callback])

In [None]:
# Reload the checkpoint saved by the unscaled CNN run (best val_accuracy epoch)
# and report its loss/accuracy on the unscaled test set.
model = load_model('../data/models/language_detection_model_unscaled_v2.keras')
model.evaluate(X_test, y_test)

In [None]:
# Class probabilities for the unscaled test set.
y_pred = model.predict(X_test)

# Collapse one-hot targets and probability rows to class ids in one vectorised
# call each (kept as plain lists for the cells that follow).
y_test_argmax = np.argmax(y_test, axis=1).tolist()
y_pred_argmax = np.argmax(y_pred, axis=1).tolist()

confusion_matrix_data = confusion_matrix(y_test_argmax, y_pred_argmax)

# Draw on an explicitly created 8x6 axes. Without `ax`, ConfusionMatrixDisplay.plot()
# opens its own figure and the pre-sized one is left blank.
fig, ax = plt.subplots(figsize=(8, 6))
cmd = ConfusionMatrixDisplay(confusion_matrix_data, display_labels=language_labels_cols)
cmd.plot(ax=ax, xticks_rotation=45)

ax.set_title('Confusion Matrix')

In [None]:
# Macro-averaged one-vs-rest ROC AUC computed from the predicted class
# PROBABILITIES (y_pred) against the one-hot targets. ROC AUC is a ranking
# metric: binarising the argmax predictions first discards the scores and
# reduces every per-class curve to a single operating point. This also matches
# the classical-model cells, which score predict_proba output.
# NOTE(review): every class needs at least one positive test sample for its AUC to be defined.
roc_auc_score(y_test, y_pred, multi_class='ovr', average='macro')

In [None]:
# Per-class precision / recall / F1 for the CNN on the unscaled test set.
cnn_unscaled_report = classification_report(y_test_argmax, y_pred_argmax)
print(cnn_unscaled_report)

In [None]:
# Training vs. validation accuracy per epoch.
# NOTE(review): `model_history` is rebound by every model.fit cell; if the scaled
# training cell ran after the unscaled one, these curves belong to the scaled run.
history_log = model_history.history
print(history_log.keys())

for series_name in ('accuracy', 'val_accuracy'):
    plt.plot(history_log[series_name])

plt.title('Training vs Validation Model Accuracy')
plt.xlabel('Epoch [number]')
plt.ylabel('Accuracy [%]')

plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()

In [None]:
# Training vs. validation categorical cross-entropy per epoch.
# NOTE(review): `model_history` is rebound by every model.fit cell; if the scaled
# training cell ran after the unscaled one, these curves belong to the scaled run.
history_log = model_history.history
for series_name in ('loss', 'val_loss'):
    plt.plot(history_log[series_name])

plt.title('Training vs Validation Model Loss')
plt.xlabel('Epoch [number]')
plt.ylabel('Loss [CCE]')
plt.legend(['Train', 'Validation'], loc='upper right')
plt.show()

In [None]:
# Reload the checkpoint saved by the scaled CNN run (best val_accuracy epoch)
# and report its loss/accuracy on the standardised test set.
model = load_model('../data/models/language_detection_model_scaled_v2.keras')
model.evaluate(X_test_scaled, y_test)

In [None]:
# Class probabilities for the standardised test set.
y_pred = model.predict(X_test_scaled)

# Collapse one-hot targets and probability rows to class ids in one vectorised
# call each (kept as plain lists for the cells that follow).
y_test_argmax = np.argmax(y_test, axis=1).tolist()
y_pred_argmax = np.argmax(y_pred, axis=1).tolist()

confusion_matrix_data = confusion_matrix(y_test_argmax, y_pred_argmax)

# Draw on an explicitly created 8x6 axes. Without `ax`, ConfusionMatrixDisplay.plot()
# opens its own figure and the pre-sized one is left blank.
fig, ax = plt.subplots(figsize=(8, 6))
cmd = ConfusionMatrixDisplay(confusion_matrix_data, display_labels=language_labels_cols)
cmd.plot(ax=ax, xticks_rotation=45)

ax.set_title('Confusion Matrix')

In [None]:
# Macro-averaged one-vs-rest ROC AUC computed from the predicted class
# PROBABILITIES (y_pred) against the one-hot targets. ROC AUC is a ranking
# metric: binarising the argmax predictions first discards the scores and
# reduces every per-class curve to a single operating point. This also matches
# the classical-model cells, which score predict_proba output.
# NOTE(review): every class needs at least one positive test sample for its AUC to be defined.
roc_auc_score(y_test, y_pred, multi_class='ovr', average='macro')

In [None]:
# Per-class precision / recall / F1 for the CNN on the standardised test set.
cnn_scaled_report = classification_report(y_test_argmax, y_pred_argmax)
print(cnn_scaled_report)

In [None]:
# Training vs. validation accuracy per epoch for the most recent model.fit run.
history_log = model_history.history
print(history_log.keys())

for series_name in ('accuracy', 'val_accuracy'):
    plt.plot(history_log[series_name])

plt.title('Training vs Validation Model Accuracy')
plt.xlabel('Epoch [number]')
plt.ylabel('Accuracy [%]')

plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()

In [None]:
# Training vs. validation categorical cross-entropy per epoch for the most recent model.fit run.
history_log = model_history.history
for series_name in ('loss', 'val_loss'):
    plt.plot(history_log[series_name])

plt.title('Training vs Validation Model Loss')
plt.xlabel('Epoch [number]')
plt.ylabel('Loss [CCE]')
plt.legend(['Train', 'Validation'], loc='upper right')
plt.show()

### CNN Spectrogram Model

In [None]:
# Load every spectrogram as an unbatched (image, integer label) pair in a fixed
# file order; class names are inferred from the sub-directory names.
spectogram_images = image_dataset_from_directory(
    "../data/model_data/spectogram_images", labels="inferred", image_size=(64, 64), batch_size=None, shuffle=False)
class_names = spectogram_images.class_names

# Shuffle ONCE with a fixed seed and freeze that order. With shuffle=True the
# loader reshuffles on every iteration, so the take()/skip() subsets below would
# be drawn from a different permutation each time they are iterated and the
# train / validation / test sets would overlap.
shuffled_images = spectogram_images.shuffle(
    buffer_size=len(spectogram_images.file_paths), seed=42, reshuffle_each_iteration=False)

test_val_records = 100

# First 100 records -> test, next 100 -> validation, the rest -> train.
test_images = shuffled_images.take(test_val_records)
train_images = shuffled_images.skip(test_val_records)

val_images = train_images.take(test_val_records)
train_images = train_images.skip(test_val_records)

size = (64, 64)

# Images are already loaded at 64x64; the resize keeps the size explicit.
train_data = train_images.map(lambda x, y: (tf.image.resize(x, size), y))
val_data = val_images.map(lambda x, y: (tf.image.resize(x, size), y))
test_data = test_images.map(lambda x, y: (tf.image.resize(x, size), y))

print(f'Train Data Size: {sum(1 for _ in train_data.as_numpy_iterator())}')
print(f'Validation Data Size: {sum(1 for _ in val_data.as_numpy_iterator())}')
print(f'Test Data Size: {sum(1 for _ in test_data.as_numpy_iterator())}')

In [None]:
def _dataset_to_arrays(dataset, num_classes):
    """Materialise an (image, integer label) dataset as NumPy arrays.

    Args:
        dataset: Object exposing as_numpy_iterator() that yields (image, label) pairs.
        num_classes: Width of the one-hot label encoding.

    Returns:
        Tuple (images, one_hot_labels): images reshaped to (-1, 64, 64, 3) and
        int32 one-hot labels of shape (n, num_classes).
    """
    images, labels = zip(*dataset.as_numpy_iterator())
    image_array = np.stack(images).reshape(-1, 64, 64, 3)
    # Index an identity matrix so every split gets the same label width, even
    # when a class is absent from that split.
    one_hot = np.eye(num_classes, dtype=np.int32)[np.asarray(labels, dtype=np.int64)]
    return image_array, one_hot


# Images and labels are collected separately: packing (image, label) tuples into
# one np.array creates a ragged object array, which recent NumPy versions reject.
# train_data / val_data / test_data are left as datasets.
train_image_data, train_labels = _dataset_to_arrays(train_data, len(class_names))
val_image_data, val_labels = _dataset_to_arrays(val_data, len(class_names))
test_image_data, test_labels = _dataset_to_arrays(test_data, len(class_names))

In [None]:
# Display the one-hot training labels.
train_labels

In [None]:
# Preview the first training image (cast to uint8 for imshow) and show the class
# name that its one-hot label maps to.
plt.imshow(np.array(train_image_data[0]).astype('uint8'))
class_names[np.argmax(train_labels[0])]

In [None]:
def build_model_cnn_spectrogram(output_shape: int) -> object:
    """Build (but do not compile) the CNN that classifies 64x64 RGB spectrogram images.

    Args:
        output_shape: Number of units in the final softmax layer.

    Returns:
        An uncompiled Keras Sequential model: a (64, 64, 3) input rescaled by
        1/255, three Conv2D/BatchNorm/MaxPool(3x3) stages (32 filters 5x5,
        64 filters 5x5, 128 filters 3x3), then a flattened, batch-normalised
        256-unit ReLU head with 20% dropout and a softmax layer.
    """
    preprocessing = [
        Input(shape=(64, 64, 3), name='input'),
        Rescaling(1./255),
    ]
    feature_extractor = [
        Conv2D(32, (5, 5), activation='relu'),
        BatchNormalization(),
        MaxPooling2D(pool_size=(3, 3)),
        Conv2D(64, (5, 5), activation='relu'),
        BatchNormalization(),
        MaxPooling2D(pool_size=(3, 3)),
        Conv2D(128, (3, 3), activation='relu'),
        BatchNormalization(),
        MaxPooling2D(pool_size=(3, 3)),
    ]
    classifier_head = [
        Flatten(),
        BatchNormalization(),
        Dense(256, activation='relu'),
        BatchNormalization(),
        Dropout(0.2),
        Dense(output_shape, activation='softmax'),
    ]

    model = Sequential()
    for layer in preprocessing + feature_extractor + classifier_head:
        model.add(layer)

    return model

In [None]:
def learning_rate_decay(epoch: int) -> float:
    """Return the learning rate for `epoch`: 0.00158 multiplied by 0.9 once per elapsed epoch."""
    return 0.00158 * math.pow(0.9, math.floor(epoch + 1))


# Size the softmax layer from the label matrix instead of a hard-coded 12, so the
# model always matches the encoded targets (as the other training cells do).
model = build_model_cnn_spectrogram(output_shape=train_labels.shape[1])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Keep only the weights of the epoch with the best validation accuracy.
checkpoint_callback = ModelCheckpoint('../data/models/language_detection_model_raw_v3.keras', monitor='val_accuracy', verbose=0, save_best_only=True, mode='max')
learning_rate_callback = LearningRateScheduler(learning_rate_decay)

model_history = model.fit(
    train_image_data, train_labels, epochs=20, verbose=1, batch_size=32, callbacks=[checkpoint_callback, learning_rate_callback],
    validation_data=(val_image_data, val_labels))

In [None]:
# Reload the checkpoint saved by the spectrogram CNN run (best val_accuracy epoch)
# and report its loss/accuracy on the held-out test images.
model = load_model('../data/models/language_detection_model_raw_v3.keras')
model.evaluate(test_image_data, test_labels)

In [None]:
# Class probabilities for the held-out spectrogram images.
y_pred = model.predict(test_image_data)

# Collapse one-hot targets and probability rows to class ids in one vectorised
# call each (kept as plain lists for the cells that follow).
y_test_argmax = np.argmax(test_labels, axis=1).tolist()
y_pred_argmax = np.argmax(y_pred, axis=1).tolist()

confusion_matrix_data = confusion_matrix(y_test_argmax, y_pred_argmax)

# Draw on an explicitly created 8x6 axes. Without `ax`, ConfusionMatrixDisplay.plot()
# opens its own figure and the pre-sized one is left blank.
fig, ax = plt.subplots(figsize=(8, 6))
cmd = ConfusionMatrixDisplay(confusion_matrix_data, display_labels=class_names)
cmd.plot(ax=ax, xticks_rotation=45)

ax.set_title('Confusion Matrix')

print(classification_report(y_test_argmax, y_pred_argmax))

In [None]:
# Macro-averaged one-vs-rest ROC AUC computed from the predicted class
# PROBABILITIES (y_pred) against the one-hot targets. ROC AUC is a ranking
# metric: binarising the argmax predictions first discards the scores and
# reduces every per-class curve to a single operating point. This also matches
# the classical-model cells, which score predict_proba output.
# NOTE(review): every class needs at least one positive test sample for its AUC to be defined.
roc_auc_score(test_labels, y_pred, multi_class='ovr', average='macro')

In [None]:
# Training vs. validation accuracy per epoch for the most recent model.fit run.
history_log = model_history.history
print(history_log.keys())

for series_name in ('accuracy', 'val_accuracy'):
    plt.plot(history_log[series_name])

plt.title('Training vs Validation Model Accuracy')
plt.xlabel('Epoch [number]')
plt.ylabel('Accuracy [%]')

plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()

In [None]:
# Training vs. validation categorical cross-entropy per epoch for the most recent model.fit run.
history_log = model_history.history
for series_name in ('loss', 'val_loss'):
    plt.plot(history_log[series_name])

plt.title('Training vs Validation Model Loss')
plt.xlabel('Epoch [number]')
plt.ylabel('Loss [CCE]')
plt.legend(['Train', 'Validation'], loc='upper right')
plt.show()