# SpeakSense - Language Detection System

The objective of this project is to develop a robust and accurate system that detects the language spoken in audio recordings. Using machine learning algorithms and signal processing techniques, the system aims to identify the spoken language across diverse accents, dialects, and environmental conditions. This language detection solution is intended for practical use in speech recognition, transcription, translation, and other fields that require language-specific processing, thereby improving accessibility and usability across linguistic boundaries.

In [None]:
import numpy as np
import pandas as pd
import opendatasets as od
import matplotlib.pyplot as plt
import IPython.display as ipd

import librosa
import os
import glob
import warnings

from tqdm import tqdm

# Notebook-wide configuration: show every DataFrame column, silence library
# warnings, and seed NumPy's global random generator.
pd.set_option('display.max_columns', None)
warnings.filterwarnings('ignore')
np.random.seed(42)


def _ensure_dataset(dataset_url: str, downloaded_dir: str, target_dir: str) -> None:
    """Make sure a Kaggle dataset is available under ``target_dir``.

    Args:
        dataset_url: URL handed to ``opendatasets.download``.
        downloaded_dir: Folder the download is expected to create (hyphenated name).
        target_dir: Underscore-named folder the rest of the notebook reads from.
    """
    if os.path.exists(target_dir):
        return

    # Download only when no earlier download is present. A run interrupted
    # between download and rename is completed here instead of being skipped,
    # which previously left only the hyphenated folder that later cells never read.
    if not os.path.exists(downloaded_dir):
        od.download(dataset_id_or_url=dataset_url, data_dir='./data/')

    os.rename(downloaded_dir, target_dir)


_ensure_dataset(
    dataset_url="https://www.kaggle.com/datasets/hbchaitanyabharadwaj/audio-dataset-with-10-indian-languages",
    downloaded_dir='./data/audio-dataset-with-10-indian-languages',
    target_dir='./data/audio_dataset_indian_languages',
)

_ensure_dataset(
    dataset_url="https://www.kaggle.com/datasets/toponowicz/spoken-language-identification",
    downloaded_dir='./data/spoken-language-identification',
    target_dir='./data/spoken_language_identification',
)

In [None]:
# Where the raw audio lives: a single folder of clips for the spoken-language
# set, and a glob over per-language sub-folders of MP3s for the Indian set.
spoken_languages_train_path_dataset = './data/spoken_language_identification/train/train/'
indian_languages_train_path_dataset = './data/audio_dataset_indian_languages/Language Detection Dataset/*/*.mp3'

# Two hand-picked sample clips per language, used by the exploratory cells.
filename_de = [
    'de_f_0809fd0642232f8c85b0b3d545dc2b5a.fragment1.flac',
    'de_f_5d2e7f30d69f2d1d86fd05f3bbe120c2.fragment1.flac',
]
filename_en = [
    'en_f_058b70233667e1b64506dddf9f9d6b46.fragment1.flac',
    'en_f_386ee651f6f1539ff5622c55e234e5a4.fragment3.flac',
]
filename_es = [
    'es_f_47bd2e6178465cd745c86c9db5ffe447.fragment1.flac',
    'es_f_ea5fee5b16a663c988fbddb2137cf573.fragment15.flac',
]

In [None]:
# Decode the first German sample clip and report the rate librosa returned.
# NOTE(review): no `sr` is passed, so the printed value is librosa's default
# target rate rather than necessarily the file's native rate — confirm intent.
sample_path_de = spoken_languages_train_path_dataset + filename_de[0]
data_de, sample_rate_de = librosa.load(sample_path_de)
print(f'Audio Data Sample Rate: {sample_rate_de}')

# Kept as the cell's last expression so the notebook renders an audio player.
ipd.Audio(data=data_de, rate=sample_rate_de)

In [None]:
# Decode the first English sample clip and report the rate librosa returned.
sample_path_en = spoken_languages_train_path_dataset + filename_en[0]
data_en, sample_rate_en = librosa.load(sample_path_en)
print(f'Audio Data Sample Rate: {sample_rate_en}')

# Kept as the cell's last expression so the notebook renders an audio player.
ipd.Audio(data=data_en, rate=sample_rate_en)

In [None]:
# Decode the first Spanish sample clip and report the rate librosa returned.
sample_path_es = spoken_languages_train_path_dataset + filename_es[0]
data_es, sample_rate_es = librosa.load(sample_path_es)
print(f'Audio Data Sample Rate: {sample_rate_es}')

# Kept as the cell's last expression so the notebook renders an audio player.
ipd.Audio(data=data_es, rate=sample_rate_es)

In [None]:
# Decode one Bengali clip (file 0) and report the rate librosa returned.
sample_path_bengali = './data/audio_dataset_indian_languages/Language Detection Dataset/Bengali/0.mp3'
data_bengali, sample_rate_bengali = librosa.load(sample_path_bengali)
print(f'Audio Data Sample Rate: {sample_rate_bengali}')

# Kept as the cell's last expression so the notebook renders an audio player.
ipd.Audio(data=data_bengali, rate=sample_rate_bengali)

In [None]:
# Decode one Gujarati clip (file 214) and report the rate librosa returned.
sample_path_gujarati = './data/audio_dataset_indian_languages/Language Detection Dataset/Gujarati/214.mp3'
data_gujarati, sample_rate_gujarati = librosa.load(sample_path_gujarati)
print(f'Audio Data Sample Rate: {sample_rate_gujarati}')

# Kept as the cell's last expression so the notebook renders an audio player.
ipd.Audio(data=data_gujarati, rate=sample_rate_gujarati)

In [None]:
# Decode one Hindi clip (file 0) and report the rate librosa returned.
sample_path_hindi = './data/audio_dataset_indian_languages/Language Detection Dataset/Hindi/0.mp3'
data_hindi, sample_rate_hindi = librosa.load(sample_path_hindi)
print(f'Audio Data Sample Rate: {sample_rate_hindi}')

# Kept as the cell's last expression so the notebook renders an audio player.
ipd.Audio(data=data_hindi, rate=sample_rate_hindi)

In [None]:
# Decode one Kannada clip (file 214) and report the rate librosa returned.
sample_path_kannada = './data/audio_dataset_indian_languages/Language Detection Dataset/Kannada/214.mp3'
data_kannada, sample_rate_kannada = librosa.load(sample_path_kannada)
print(f'Audio Data Sample Rate: {sample_rate_kannada}')

# Kept as the cell's last expression so the notebook renders an audio player.
ipd.Audio(data=data_kannada, rate=sample_rate_kannada)

In [None]:
# Decode one Malayalam clip (file 214) and report the rate librosa returned.
sample_path_malayalam = './data/audio_dataset_indian_languages/Language Detection Dataset/Malayalam/214.mp3'
data_malayalam, sample_rate_malayalam = librosa.load(sample_path_malayalam)
print(f'Audio Data Sample Rate: {sample_rate_malayalam}')

# Kept as the cell's last expression so the notebook renders an audio player.
ipd.Audio(data=data_malayalam, rate=sample_rate_malayalam)

In [None]:
# Decode one Marathi clip (file 214) and report the rate librosa returned.
sample_path_marathi = './data/audio_dataset_indian_languages/Language Detection Dataset/Marathi/214.mp3'
data_marathi, sample_rate_marathi = librosa.load(sample_path_marathi)
print(f'Audio Data Sample Rate: {sample_rate_marathi}')

# Kept as the cell's last expression so the notebook renders an audio player.
ipd.Audio(data=data_marathi, rate=sample_rate_marathi)

In [None]:
# Decode one Punjabi clip (file 214) and report the rate librosa returned.
sample_path_punjabi = './data/audio_dataset_indian_languages/Language Detection Dataset/Punjabi/214.mp3'
data_punjabi, sample_rate_punjabi = librosa.load(sample_path_punjabi)
print(f'Audio Data Sample Rate: {sample_rate_punjabi}')

# Kept as the cell's last expression so the notebook renders an audio player.
ipd.Audio(data=data_punjabi, rate=sample_rate_punjabi)

In [None]:
# Decode one Tamil clip (file 214) and report the rate librosa returned.
sample_path_tamil = './data/audio_dataset_indian_languages/Language Detection Dataset/Tamil/214.mp3'
data_tamil, sample_rate_tamil = librosa.load(sample_path_tamil)
print(f'Audio Data Sample Rate: {sample_rate_tamil}')

# Kept as the cell's last expression so the notebook renders an audio player.
ipd.Audio(data=data_tamil, rate=sample_rate_tamil)

In [None]:
# Decode one Telugu clip (file 214) and report the rate librosa returned.
sample_path_telugu = './data/audio_dataset_indian_languages/Language Detection Dataset/Telugu/214.mp3'
data_telugu, sample_rate_telugu = librosa.load(sample_path_telugu)
print(f'Audio Data Sample Rate: {sample_rate_telugu}')

# Kept as the cell's last expression so the notebook renders an audio player.
ipd.Audio(data=data_telugu, rate=sample_rate_telugu)

In [None]:
# Decode one Urdu clip (file 214) and report the rate librosa returned.
sample_path_urdu = './data/audio_dataset_indian_languages/Language Detection Dataset/Urdu/214.mp3'
data_urdu, sample_rate_urdu = librosa.load(sample_path_urdu)
print(f'Audio Data Sample Rate: {sample_rate_urdu}')

# Kept as the cell's last expression so the notebook renders an audio player.
ipd.Audio(data=data_urdu, rate=sample_rate_urdu)

In [None]:
def amplitude_plot_audio(data_dict: dict, n_rows: int = 1, n_cols: int = 3, figsize: tuple = (20, 5), file_name: str = 'amplitude_plot'):
    """Plot each waveform in its own subplot and save the figure as a PNG.

    Args:
        data_dict: Maps ``'<index>_<label>'`` keys to the samples to plot. The
            integer prefix selects the subplot (counted row by row); the rest
            of the key is used as the label in the subplot title.
        n_rows: Number of subplot rows.
        n_cols: Number of subplot columns.
        figsize: Figure size forwarded to ``plt.subplots``.
        file_name: Stem of the PNG written to ``./data/eda_results/``.
    """
    # squeeze=False always returns a 2-D array of axes, so flattening it gives
    # one consistent index for any grid shape (1x1, 1xN, Nx1 or NxM).
    fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=figsize, squeeze=False)
    axes = axes.ravel()

    for key, samples in data_dict.items():
        # Split on the first underscore only, so a label may itself contain one.
        idx, lang = key.split('_', 1)
        axis = axes[int(idx)]

        axis.plot(samples)

        axis.set_ylabel('Amplitude')
        axis.set_xlabel('Time in samples')
        axis.set_title(f'Audio Amplitude vs Time ({lang})')

    # Create the output folder on first use instead of failing in savefig.
    output_dir = './data/eda_results'
    os.makedirs(output_dir, exist_ok=True)
    fig.savefig(f'{output_dir}/{file_name}.png')

In [None]:
# Plot the i-th German / English / Spanish sample clips side by side.
for i, (name_de, name_en, name_es) in enumerate(zip(filename_de, filename_en, filename_es)):
    data_de, _ = librosa.load(spoken_languages_train_path_dataset + name_de)
    data_en, _ = librosa.load(spoken_languages_train_path_dataset + name_en)
    data_es, _ = librosa.load(spoken_languages_train_path_dataset + name_es)

    data_dict = {'0_de': data_de, '1_en': data_en, '2_es': data_es}

    # Suffix the sample index: with one shared file name every iteration
    # overwrote the PNG saved by the previous one.
    amplitude_plot_audio(data_dict=data_dict, file_name=f'amplitude_plot_de_en_es_{i}')

In [None]:
# Waveform plots for clip 214 of each Indian language, drawn in three figures.
# Each entry: (languages in subplot order, output file stem, extra layout kwargs).
amplitude_plot_groups = [
    (['Hindi', 'Bengali', 'Gujarati'], 'amplitude_plot_hbg', {}),
    (['Kannada', 'Malayalam', 'Marathi'], 'amplitude_plot_kmm', {}),
    (['Punjabi', 'Tamil', 'Telugu', 'Urdu'], 'amplitude_plot_pttu', {'n_rows': 1, 'n_cols': 4, 'figsize': (25, 5)}),
]

for indian_languages_list, plot_name, layout in amplitude_plot_groups:
    data_dict = {}

    for idx, lang in enumerate(indian_languages_list):
        clip_path = f'./data/audio_dataset_indian_languages/Language Detection Dataset/{lang}/214.mp3'
        # librosa.load returns (samples, rate); only the samples are plotted.
        data_dict[f'{idx}_{lang.lower()}'] = librosa.load(clip_path)[0]

    amplitude_plot_audio(data_dict=data_dict, file_name=plot_name, **layout)

In [None]:
def spectogram_plot_audio(data_dict: dict, sample_rate_dict: dict, n_rows: int = 1, n_cols: int = 3, figsize: tuple = (20, 5), file_name: str = 'amplitude_plot'):
    """Draw a spectrogram per clip in its own subplot and save the figure as a PNG.

    Args:
        data_dict: Maps ``'<index>_<label>'`` keys to the samples to analyse.
            The integer prefix selects the subplot (counted row by row); the
            rest of the key is used as the label in the subplot title.
        sample_rate_dict: Sampling rate for each clip, looked up with the same
            keys as ``data_dict``.
        n_rows: Number of subplot rows.
        n_cols: Number of subplot columns.
        figsize: Figure size forwarded to ``plt.subplots``.
        file_name: Stem of the PNG written to ``./data/eda_results/``.
    """
    # NOTE(review): the 'amplitude_plot' default looks copied from the
    # amplitude helper; it is kept so existing callers relying on it still
    # write the same file. Every call in this notebook passes file_name.

    # squeeze=False always returns a 2-D array of axes, so flattening it gives
    # one consistent index for any grid shape (1x1, 1xN, Nx1 or NxM).
    fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=figsize, squeeze=False)
    axes = axes.ravel()

    for key, samples in data_dict.items():
        # Split on the first underscore only, so a label may itself contain one.
        idx, lang = key.split('_', 1)
        axis = axes[int(idx)]

        axis.specgram(samples, Fs=sample_rate_dict[key])

        axis.set_ylabel('Frequency [Hz]')
        axis.set_xlabel('Time [sec]')
        axis.set_title(f'Audio Frequency vs Time ({lang})')

    # Create the output folder on first use instead of failing in savefig.
    output_dir = './data/eda_results'
    os.makedirs(output_dir, exist_ok=True)
    fig.savefig(f'{output_dir}/{file_name}.png')

In [None]:
# Draw spectrograms of the i-th German / English / Spanish sample clips side by side.
for i, (name_de, name_en, name_es) in enumerate(zip(filename_de, filename_en, filename_es)):
    data_de, samplerate_de = librosa.load(spoken_languages_train_path_dataset + name_de)
    data_en, samplerate_en = librosa.load(spoken_languages_train_path_dataset + name_en)
    data_es, samplerate_es = librosa.load(spoken_languages_train_path_dataset + name_es)

    data_dict = {'0_de': data_de, '1_en': data_en, '2_es': data_es}
    sample_rate_dict = {'0_de': samplerate_de, '1_en': samplerate_en, '2_es': samplerate_es}

    # Suffix the sample index: with one shared file name every iteration
    # overwrote the PNG saved by the previous one.
    spectogram_plot_audio(data_dict=data_dict, sample_rate_dict=sample_rate_dict, file_name=f'spectogram_plot_de_en_es_{i}')

In [None]:
# Spectrograms for one clip of each Indian language, drawn in three figures.
audio_file_name = '214'

# Each entry: (languages in subplot order, output file stem, extra layout kwargs).
spectrogram_plot_groups = [
    (['Hindi', 'Bengali', 'Gujarati'], 'spectogram_plot_hbg', {}),
    (['Kannada', 'Malayalam', 'Marathi'], 'spectogram_plot_kmm', {}),
    (['Punjabi', 'Tamil', 'Telugu', 'Urdu'], 'spectogram_plot_pttu', {'n_rows': 1, 'n_cols': 4, 'figsize': (25, 5)}),
]

for indian_languages_list, plot_name, layout in spectrogram_plot_groups:
    data_dict = {}
    sample_rate_dict = {}

    for idx, lang in enumerate(indian_languages_list):
        subplot_key = f'{idx}_{lang.lower()}'
        data, sample_rate = librosa.load(f'./data/audio_dataset_indian_languages/Language Detection Dataset/{lang}/{audio_file_name}.mp3')

        # Samples and rate are stored under the same key so the plotting
        # helper can pair them up.
        data_dict[subplot_key] = data
        sample_rate_dict[subplot_key] = sample_rate

    spectogram_plot_audio(data_dict=data_dict, sample_rate_dict=sample_rate_dict, file_name=plot_name, **layout)

In [None]:
def load_data(file_name: str) -> tuple:
    """Return ``(sample_rate, duration_in_whole_seconds)`` for an audio file.

    The file is decoded with ``librosa.load(..., sr=None)`` and its duration is
    truncated to an ``int``. Any exception raised while processing is printed
    and swallowed, and ``(np.nan, np.nan)`` is returned instead, so one bad
    file does not abort a batch run.

    Args:
        file_name: Path of the audio file to inspect.

    Returns:
        A 2-tuple ``(sample_rate, audio_duration_sec)``, or two NaNs on failure.
    """
    failure = (np.nan, np.nan)

    try:
        samples, rate = librosa.load(file_name, sr=None)
        seconds = librosa.get_duration(y=samples, sr=rate)
        outcome = (rate, int(seconds))
    except Exception as e:
        # Deliberately best-effort: report the problem and fall back to NaNs.
        print(f"Error processing {file_name}: {str(e)}")
        outcome = failure

    return outcome

In [None]:
# Output folders for the metadata CSVs; created up front so to_csv cannot fail
# on a fresh checkout.
model_data_dir = './data/model_data'
subset_dir = f'{model_data_dir}/data_subset'
os.makedirs(subset_dir, exist_ok=True)

# Spoken-language set: one row per file in the training folder. The language
# code is the part of the file name before the first underscore (de/en/es),
# taken from the base name rather than from a fixed '/'-depth index.
spoken_language_dataframe = pd.DataFrame({'file_name': [spoken_languages_train_path_dataset + file_name for file_name in os.listdir(spoken_languages_train_path_dataset)]})
spoken_language_dataframe['language_label'] = (
    spoken_language_dataframe['file_name']
    .map(os.path.basename)
    .str.split('_')
    .str[0]
    .replace(to_replace={'de': 'german', 'en': 'english', 'es': 'spanish'})
)

# Indian-language set: the label is the name of the folder holding the clip.
# os.path handles the platform's separators, whereas splitting on '\\' only
# worked with the backslash paths glob returns on Windows.
indian_language_dataframe = pd.DataFrame({'file_name': glob.glob(indian_languages_train_path_dataset)})
indian_language_dataframe['language_label'] = (
    indian_language_dataframe['file_name']
    .map(lambda path: os.path.basename(os.path.dirname(path)))
    .str.lower()
)

# Punjabi clips are left out of the modelling table (reason not recorded here).
indian_language_dataframe = indian_language_dataframe[indian_language_dataframe['language_label'] != 'punjabi']

language_dataframe = pd.concat([spoken_language_dataframe, indian_language_dataframe], ignore_index=True)
language_dataframe['file_size_kb'] = (language_dataframe['file_name'].map(os.path.getsize) / 1024).round(3)

# Checkpoint the file list before the slow per-file audio decoding below.
language_dataframe.to_csv(f'{model_data_dir}/language_dataframe_v1.csv', index=False)

language_labels = language_dataframe['language_label'].unique()

# Decode every clip one language at a time and write a CSV per language, so
# partial results remain on disk if the run is interrupted.
for lang in tqdm(language_labels, desc="Languages"):
    lang_data = language_dataframe[language_dataframe['language_label'] == lang].copy()
    lang_data[['sample_rate', 'audio_duration_sec']] = lang_data['file_name'].apply(lambda file_name: pd.Series(load_data(file_name=file_name)))

    lang_data.to_csv(f'{subset_dir}/language_dataframe_{lang}_v1.csv', index=False)

# Stitch the per-language CSVs back together; dropna() removes the files that
# load_data could not read (it reports those as NaN).
language_dataframe = pd.concat([pd.read_csv(f'{subset_dir}/language_dataframe_{lang}_v1.csv') for lang in language_labels], ignore_index=True).dropna()
language_dataframe.to_csv(f'{model_data_dir}/language_dataframe_v1.csv', index=False)

language_dataframe

In [None]:
# Reload the metadata table from its CSV so the cells below can run without
# rebuilding it.
language_dataframe = pd.read_csv('./data/model_data/language_dataframe_v1.csv')

In [None]:
# Display the metadata table.
language_dataframe

In [None]:
# Distinct values found in the sample_rate column.
language_dataframe['sample_rate'].unique()

In [None]:
# Mean and median of the file_size_kb column, shown as a (mean, median) pair.
file_sizes_kb = language_dataframe['file_size_kb']
file_sizes_kb.mean(), file_sizes_kb.median()

In [None]:
# Number of rows for each distinct audio_duration_sec value.
language_dataframe['audio_duration_sec'].value_counts()