# Spoken Language Recognition Using Convolutional Neural Networks

_written by Joscha S. Rieber (Fraunhofer IAIS) in 2020_

## Environment

In [3]:
train = 'train'
test = 'test'

eng = 'english'
ger = 'german'
swe = 'swedish'

languages = [eng, ger, swe]
categories = [train, test]

dataset_root_path = '../data/'

image_width = 500
image_height = 128

augment_data_factor = 1.0 # Set-up how much of the training data should be augmented

## Import Libraries

In [5]:
import os
import librosa as lr
#from librosa.display import waveplot
import librosa.display
import numpy as np
from glob import glob
import matplotlib.pyplot as plt
import warnings
from IPython.display import Audio
import soundfile as sf

## Helper Functions

### Function for loading the audio file

In [6]:
def load_audio_file(audio_file_path):
    warnings.simplefilter('ignore', UserWarning)
    
    audio_segment, sample_rate = lr.load(audio_file_path)
    return audio_segment, sample_rate

    warnings.simplefilter('default', UserWarning)

### Functions for data augmentation

In [7]:
def add_noise(audio_segment, gain):
    num_samples = audio_segment.shape[0]
    noise = gain * np.random.normal(size=num_samples)
    return audio_segment + noise

### Determine all available audio files

In [8]:
audio_files = {}

for lang in languages:
    for category in categories:
        audio_files[lang + '.' + category] = glob(dataset_root_path + category + '/' + lang + '/*.mp3')

print(audio_files.keys())

dict_keys(['english.train', 'english.test', 'german.train', 'german.test', 'swedish.train', 'swedish.test'])


## Example

In [9]:
key = list(audio_files.keys())[1]
audio_file = audio_files[key][1]
audio_file

'../data/test/english\\common_voice_en_102074.mp3'

In [10]:
audio, sample_rate = load_audio_file(audio_file)

Audio(audio, rate=sample_rate)

In [8]:
audio_segment_with_noise = add_noise(audio, 0.005)

Audio(audio_segment_with_noise, rate=sample_rate)

## Augment training dataset

In [11]:
def augment_audio_file_with_noise(audio_file_path):
    audio_segment, sample_rate = load_audio_file(audio_file_path)
    audio_segment_with_noise = add_noise(audio_segment, 0.005)
    audio_file_path_without_extension = os.path.splitext(audio_file_path)[0]
    augmented_audio_file_path = audio_file_path_without_extension + '_augmented_noise.wav'
    sf.write(augmented_audio_file_path, audio_segment_with_noise, sample_rate)

In [12]:
for lang in languages:
    category = train
        
    all_audio_files = audio_files[lang + '.' + category]
        
    num_files = int(len(all_audio_files) * augment_data_factor)

    for i in range(num_files):
        if i % (num_files / 10) == 0:
            print('Still processing ' + lang + ' ' + category + ' ' + str(i) + '/' + str(num_files))
        augment_audio_file_with_noise(all_audio_files[i])

Still processing english train 0/464
Still processing german train 0/464
Still processing swedish train 0/464
