In [1]:
import os
import pandas as pd
import numpy as np
import librosa
import soundfile as sf
from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Shift

In [2]:
# Define the audio augmentation pipeline
augment = Compose([
    AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),
    TimeStretch(min_rate=0.8, max_rate=1.25, p=0.5),
    PitchShift(min_semitones=-4, max_semitones=4, p=0.5),
    Shift(p=0.5),
])

In [None]:
# Paths to data and CSV files
DATA_PATH = "data/"
TRAIN_CSV = DATA_PATH + "train.csv"
    
# Define the output directory for augmented audio
AUGMENTED_DIR = os.path.join(DATA_PATH, "augmented_audio")
OUTPUT_CSV = DATA_PATH + "train_aug.csv"
sample_rate = 16000

In [26]:
train_df = pd.read_csv(TRAIN_CSV)

In [27]:
for i, row in train_df.iterrows():
	file_path = os.path.join(DATA_PATH, row["file"])
	# print(file_path)
	# output_path = AUGMENTED_DIR + "/"
	# print(output_path)
    
	samples, sr = librosa.load(file_path, sr=sample_rate)
	augmented_samples = augment(samples=samples, sample_rate=sample_rate)

	base_name, ext = os.path.splitext(row['file'])
	base_name = base_name.split("geo/")[1]
	output_file = f"{base_name}_aug{ext}"
	# print(output_file)
	output_path = os.path.join(AUGMENTED_DIR, output_file)
	# print(output_path)
	sf.write(output_path, augmented_samples, sample_rate)

	# Print every 1000 files processed
	if (i + 1) % 1000 == 0:  # `i + 1` because `i` starts at 0
		print(f"Processed {i + 1} files. Augmented file saved: {output_path}")

Processed 1000 files. Augmented file saved: data/augmented_audio/train_999_aug.wav
Processed 2000 files. Augmented file saved: data/augmented_audio/train_1999_aug.wav
Processed 3000 files. Augmented file saved: data/augmented_audio/train_2999_aug.wav
Processed 4000 files. Augmented file saved: data/augmented_audio/train_3999_aug.wav
Processed 5000 files. Augmented file saved: data/augmented_audio/train_4999_aug.wav
Processed 6000 files. Augmented file saved: data/augmented_audio/train_5999_aug.wav


In [28]:
augmented_files = []
for file in os.listdir(AUGMENTED_DIR):
    if file.endswith(".wav"):  # Adjust file extension if necessary
        augmented_files.append(file)

In [31]:
len(augmented_files)

6000

In [61]:
augmented_labels = []


for file in augmented_files:
    # print(file)
    base_name = "geo/" + file.replace('_aug', '')
    # print(base_name)
    
    # Find the original file's label
    original_label = train_df[train_df['file'] == base_name]['transcript']
    augmented_labels.append(original_label.values[0])
    # print(original_label.values[0])
    

In [72]:
# Create a DataFrame for augmented files
import os.path


df_augmented = pd.DataFrame({
    'file': [os.path.join("augmented_audio", file) for file in augmented_files],
    'transcript': augmented_labels
})

In [73]:
df_augmented.head(10)

Unnamed: 0,file,transcript
0,augmented_audio/train_0_aug.wav,dangon pro la sukisto
1,augmented_audio/train_1_aug.wav,ja igzestas gilgaj opiroj plini in ispiranto
2,augmented_audio/train_2_aug.wav,roles pluraj vagtoroj gilkaj ple sireozaj ol a...
3,augmented_audio/train_3_aug.wav,teutimpi multaj rusaj pintrestoj gaj gomponest...
4,augmented_audio/train_4_aug.wav,gun sea dua idzo carlos thompson ŝe fefes ĉivi...
5,augmented_audio/train_5_aug.wav,teo ogazas in la vranca etala hespana kirmana ...
6,augmented_audio/train_6_aug.wav,me ni gulpas pre teo
7,augmented_audio/train_7_aug.wav,teo istas por me triiki entirisa
8,augmented_audio/train_8_aug.wav,la akado ogazas ĉi la gaspea maro gaj cintra azeo
9,augmented_audio/train_9_aug.wav,ni deras la senjoreno gapsguanti ni me ni parolos


In [74]:
df_combined = pd.concat([train_df, df_augmented], ignore_index=True)


In [75]:
# shuffle
df_combined = df_combined.sample(frac=1, random_state=42).reset_index(drop=True)

In [76]:
df_combined

Unnamed: 0,file,transcript
0,geo/train_1935.wav,ŝe istas tri populara in tajfano
1,augmented_audio/train_494_aug.wav,la unusola sfilta turo istas mizi gei istas su...
2,geo/train_1720.wav,plui aleaj ambecioj jam di lonki enstikas mean...
3,augmented_audio/train_3120_aug.wav,ĉu sifiri
4,geo/train_360.wav,sur la ple granda me fedes nineon grom sablo
...,...,...
11995,augmented_audio/train_5964_aug.wav,tamin la ditalojn poste
11996,geo/train_5191.wav,sangta domenko hafas endustreajn branĉojn pre ...
11997,geo/train_5390.wav,aparti la polico mortekes protistanton dum tum...
11998,geo/train_860.wav,ruvus iggomprines


In [77]:
# save the combined to csv
df_combined.to_csv('data/train_aug.csv', index=False)