# Clean up dataset and trim

In [None]:
from google.colab import drive
# Mount Google Drive inside the Colab runtime so the audio archives are reachable.
GDRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(GDRIVE_MOUNT_POINT, force_remount=True)

Mounted at /content/gdrive


In [None]:
import zipfile
import librosa
import os

# Extract all language audio archives (~15 minutes).
# AUDIO_ZIP_DIR is relative to the working directory, so it only resolves when
# the notebook runs from the folder that contains the `gdrive` mount.
AUDIO_ZIP_DIR = 'gdrive/MyDrive/audio'
list_dir = os.listdir(AUDIO_ZIP_DIR)

for archive_name in list_dir:
    archive_path = os.path.join(AUDIO_ZIP_DIR, archive_name)
    # Skip sub-folders and stray non-zip files instead of aborting the whole
    # extraction part-way through.
    if not zipfile.is_zipfile(archive_path):
        print("Skipping ", archive_name)
        continue
    with zipfile.ZipFile(archive_path, 'r') as zip_ref:
        print("Extracting ", archive_name)
        zip_ref.extractall()  # no path given: extract into the working directory

In [None]:
# Snapshot the file names found in each extracted language folder.
(de_wavs, el_wavs, en_wavs, es_wavs,
 fr_wavs, nl_wavs, no_wavs, sv_wavs) = (
    os.listdir(lang_dir)
    for lang_dir in ('de', 'el', 'en', 'es', 'fr', 'nl', 'no', 'sv')
)

In [None]:
def clean_up(language, path, min_dur=10, max_dur=100000):
    """Delete audio files whose duration lies outside ``[min_dur, max_dur]``.

    Args:
        language: Directory that holds the audio files (e.g. ``'de'``).
        path: Iterable of file names inside ``language`` to inspect.
        min_dur: Shortest duration to keep; shorter files are removed.
        max_dur: Longest duration to keep; longer files are removed.

    Durations are compared against the value returned by
    ``librosa.get_duration`` (seconds, per the librosa documentation).
    Files are removed from disk with ``os.remove``; nothing is returned.
    """
    for f_name in path:
        file_path = language + '/' + f_name
        audio, fs = librosa.load(file_path, sr=None)
        # Pass the signal by keyword: recent librosa releases made `y`
        # keyword-only, so a positional call raises TypeError.
        duration = librosa.get_duration(y=audio, sr=fs)
        if duration < min_dur or duration > max_dur:
            os.remove(file_path)

In [None]:
# Run clean_up (default duration bounds) on every language folder in turn.
for lang_code, wav_names in (
    ('de', de_wavs), ('el', el_wavs), ('en', en_wavs), ('es', es_wavs),
    ('fr', fr_wavs), ('nl', nl_wavs), ('no', no_wavs), ('sv', sv_wavs),
):
    clean_up(lang_code, wav_names)

In [None]:
# Re-list every language folder so the name lists reflect what is on disk now.
(de_wavs, el_wavs, en_wavs, es_wavs,
 fr_wavs, nl_wavs, no_wavs, sv_wavs) = map(
    os.listdir, ('de', 'el', 'en', 'es', 'fr', 'nl', 'no', 'sv')
)

# Report how many files each language has, in the order de, el, en, es, fr, nl, no, sv.
print('After cleanup')
for remaining in (de_wavs, el_wavs, en_wavs, es_wavs,
                  fr_wavs, nl_wavs, no_wavs, sv_wavs):
    print(len(remaining))

# Balance dataset and trim to 10 seconds

In [None]:
# WARNING: the command below permanently deletes up to 500 randomly chosen .wav files from `directory`; it was used to balance the datasets.
# !find directory -maxdepth 1 -type f -name "*.wav" -print0 | sort -z -R | head -z -n 500 | xargs -0 rm

In [None]:
# Create one output folder per language for the trimmed clips.
# exist_ok=True keeps the cell re-runnable: folders that already exist are
# left alone instead of producing an error.
for lang_code in ('de', 'el', 'en', 'es', 'fr', 'nl', 'no', 'sv'):
    os.makedirs(lang_code + '_trim', exist_ok=True)

In [None]:
import soundfile as sf

def trim(language, path, trim_dur=10):
    """Save the last ``trim_dur`` seconds of each file into ``<language>_trim/``.

    Args:
        language: Directory that holds the source audio (e.g. ``'de'``).
            Output goes to the sibling directory ``language + '_trim'``,
            which must already exist.
        path: Iterable of file names inside ``language`` to process.
        trim_dur: Length, in seconds, of the tail to keep from each clip.

    Clips shorter than ``trim_dur`` are written out unchanged.
    """
    for f_name in path:
        audio, fs = librosa.load(language + '/' + f_name, sr=None)
        n_keep = int(trim_dur * fs)  # number of samples in the kept tail
        # Clamp at 0: for a clip shorter than the window, a negative start
        # index would wrap around and cut the clip instead of keeping it whole.
        start = max(len(audio) - n_keep, 0)
        sf.write(language + '_trim/' + f_name, audio[start:], fs)  # Save new file


In [None]:
# Trim every language's clips with the default window (trim_dur=10).
for lang_code, wav_names in (
    ('de', de_wavs), ('el', el_wavs), ('en', en_wavs), ('es', es_wavs),
    ('fr', fr_wavs), ('nl', nl_wavs), ('no', no_wavs), ('sv', sv_wavs),
):
    trim(lang_code, wav_names)

# Make pandas dataframe


In [None]:
# From here on the *_wavs lists refer to the trimmed folders, not the originals.
(de_wavs, el_wavs, en_wavs, es_wavs,
 fr_wavs, nl_wavs, no_wavs, sv_wavs) = (
    os.listdir(lang_code + '_trim')
    for lang_code in ('de', 'el', 'en', 'es', 'fr', 'nl', 'no', 'sv')
)

In [None]:
import pandas as pd
# Group the trimmed file names by language code (insertion order is kept).
wavs_by_language = {
    'de': de_wavs, 'el': el_wavs, 'en': en_wavs, 'es': es_wavs,
    'fr': fr_wavs, 'nl': nl_wavs, 'no': no_wavs, 'sv': sv_wavs,
}

# Get labels of each audio: one language code per file, aligned with `files`.
labels = [
    lang_code
    for lang_code, wav_names in wavs_by_language.items()
    for _ in wav_names
]

# Get filename of each audio
files = [
    f_name
    for wav_names in wavs_by_language.values()
    for f_name in wav_names
]

# Name each column after what it holds: file names under 'file', language
# codes under 'labels'. The file-name column stays first.
data = pd.DataFrame({'file': files, 'labels': labels})

In [None]:
def wavs_to_list(language, path):
    """Load every file named in ``path`` from the ``language`` directory.

    Args:
        language: Directory that holds the audio files.
        path: Iterable of file names inside ``language``.

    Returns:
        A pair ``(audio_list, fs_list)``: the loaded signals and the
        sampling rate reported for each, both in the order of ``path``.
    """
    loaded = [librosa.load(language + '/' + f_name, sr=None) for f_name in path]
    audio_list = [signal for signal, _ in loaded]
    fs_list = [rate for _, rate in loaded]
    return audio_list, fs_list

In [None]:
# Load the trimmed German clips and their sampling rates into memory.
de_audio, de_fs = wavs_to_list(language='de_trim', path=de_wavs)

# The remaining languages are left disabled; uncomment the ones you need.
# el_audio, el_fs = wavs_to_list('el_trim', el_wavs)
# en_audio, en_fs = wavs_to_list('en_trim', en_wavs)
# es_audio, es_fs = wavs_to_list('es_trim', es_wavs)
# fr_audio, fr_fs = wavs_to_list('fr_trim', fr_wavs)
# nl_audio, nl_fs = wavs_to_list('nl_trim', nl_wavs)
# no_audio, no_fs = wavs_to_list('no_trim', no_wavs)
# sv_audio, sv_fs = wavs_to_list('sv_trim', sv_wavs)

In [None]:
# Show the working directory that the relative paths in this notebook resolve against.
os.getcwd()

In [None]:
!sudo mv de_trim/ gdrive/MyDrive/audio_clean_v2/

In [None]:
!sudo mv el_trim/ gdrive/MyDrive/audio_clean_v2/
!sudo mv en_trim/ gdrive/MyDrive/audio_clean_v2/
!sudo mv es_trim/ gdrive/MyDrive/audio_clean_v2/
!sudo mv fr_trim/ gdrive/MyDrive/audio_clean_v2/
!sudo mv nl_trim/ gdrive/MyDrive/audio_clean_v2/
!sudo mv no_trim/ gdrive/MyDrive/audio_clean_v2/
!sudo mv sv_trim/ gdrive/MyDrive/audio_clean_v2/