# Data Augmentation 

## Import modules and implementation of data augmentation functions

In [1]:
import pandas as pd
import librosa
import csv
from sklearn.model_selection import train_test_split
import tensorflow as tf
from pandarallel import pandarallel
import numpy as np
import cv2
import h5py

pd.options.mode.chained_assignment = None  # default='warn'

def load_soundfile_from_drive(filename, path='public_dataset_cleaned_segmented//'):
    x , sr = librosa.load(path + filename)
    return x, sr

def addNoise(data):
    noise_factor = 0.05*np.random.rand()
    sampling_rate = 22050
    noise = np.random.randn(len(data))
    augmented_data = data + noise_factor * noise
    # Cast back to same data type
    augmented_data = augmented_data.astype(type(data[0]))
    return augmented_data

def shiftTime(data):
    sampling_rate = 22050
    shift_max = 8
    shift = np.random.randint(sampling_rate * shift_max)
    direction = np.random.randint(0, 2)
    if direction == 1:
        shift = -shift
    augmented_data = np.roll(data, shift)
    # Set to silence for heading/ tailing
    if shift > 0:
        augmented_data[:shift] = 0
    else:
        augmented_data[shift:] = 0
    return augmented_data

def changeSpeed(data):
    sampling_rate = 22050
    speed_factor = 1 + 0.4*(np.random.rand() - 0.5)
    augmented_data = librosa.effects.time_stretch(data, speed_factor)
    return augmented_data

def changePitch(data):
    pitch_factor = 1 + 0.4 * (np.random.rand() - 0.5)
    sampling_rate = 22050
    augmented_data = librosa.effects.pitch_shift(data, sampling_rate, pitch_factor)
    return augmented_data

def generateMelSpectrogram(data):
    x = data
    sr = 22050
    M = librosa.feature.melspectrogram(y=x, sr=sr, n_mels=128)
    M_db = librosa.power_to_db(M, ref=np.max) * -1
    dim = (320,240) # resize to dimensions from https://www.sciencedirect.com/science/article/pii/S2352914820303026
    resized = cv2.resize(M_db, dim, interpolation=cv2.INTER_LINEAR)
    return resized

def expandDimensions(data):
    return np.expand_dims(data, axis=2)

## Use of a pandas extension for parallel processing in order to decrease waiting times

In [2]:
pandarallel.initialize(use_memory_fs=False)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


## Reading in data

In [3]:
data = pd.read_csv("public_dataset_cleaned_segmented//labels.csv", names=["soundfilename","status","cough_value"])

In [4]:
data.dropna(inplace=True)

In [5]:
data["sounddata"] = data["soundfilename"].apply(lambda x: load_soundfile_from_drive(x,"public_dataset_cleaned_segmented//")[0]) 

In [7]:
data

Unnamed: 0,soundfilename,status,cough_value,sounddata
0,28632a0e-f38e-4195-a566-1cc3ad08e5b90.wav,healthy,0.9967,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,28632a0e-f38e-4195-a566-1cc3ad08e5b91.wav,healthy,0.9967,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,28632a0e-f38e-4195-a566-1cc3ad08e5b92.wav,healthy,0.9967,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
6,082991fb-ee66-4970-91dd-50890f4cc9b10.wav,COVID-19,0.9431,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
8,082991fb-ee66-4970-91dd-50890f4cc9b11.wav,COVID-19,0.9431,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...
46970,e30be719-4bdc-46c5-b515-7549d06f12a20.wav,symptomatic,0.9984,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
46972,e30be719-4bdc-46c5-b515-7549d06f12a21.wav,symptomatic,0.9984,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
46974,e30be719-4bdc-46c5-b515-7549d06f12a22.wav,symptomatic,0.9984,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
46976,e3157d31-7cc6-4a85-a041-793bbe4cc0ce0.wav,healthy,0.9890,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [8]:
data = data.reset_index(drop=True)

In [9]:
pos_data = data[data["status"]=="COVID-19"]
neg_data = data[data["status"]!="COVID-19"]

pos_data_train, pos_data_test = train_test_split(pos_data, test_size=0.3)
print("Covid positive test data: ",len(pos_data_test))
neg_data_train, neg_data_test = train_test_split(neg_data, test_size=0.1)
print("Covid negative test data: ",len(neg_data_test))

Covid positive test data:  582
Covid negative test data:  2156


## Data augmentation part

In [10]:
pos_data_train_noised = pos_data_train
pos_data_train_noised["sounddata"] = pos_data_train_noised["sounddata"].parallel_apply(lambda x: addNoise(x))
#pos_data_train_noised["augmentation"] = "Noise"
pos_data_train_aug = pos_data_train.append(pos_data_train_noised)

In [10]:
pos_data_train_aug = pos_data_train

In [11]:
pos_data_train_pitched = pos_data_train_aug
pos_data_train_pitched["sounddata"] = pos_data_train_pitched["sounddata"].parallel_apply(lambda x: changePitch(x))
#pos_data_train_pitched["augmentation"] = "pitch"
pos_data_train_aug = pos_data_train_aug.append(pos_data_train_pitched)

In [11]:
pos_data_train_shifted = pos_data_train_aug
pos_data_train_shifted["sounddata"] = pos_data_train_shifted["sounddata"].parallel_apply(lambda x: shiftTime(x))
#pos_data_train_shifted["augmentation"] = "shift"
pos_data_train_aug = pos_data_train_aug.append(pos_data_train_shifted)

In [11]:
pos_data_train_speed = pos_data_train_aug
pos_data_train_speed["sounddata"] = pos_data_train_speed["sounddata"].parallel_apply(lambda x: changeSpeed(x))
#pos_data_train_speed["augmentation"] = "speed"
pos_data_train_aug = pos_data_train_aug.append(pos_data_train_speed)

In [12]:
pos_data_train_aug.reset_index(drop=True)

Unnamed: 0,soundfilename,status,cough_value,sounddata,augmentation
0,a7758e1f-e7b5-42cc-a66d-99966ea8a44e1.wav,COVID-19,0.8515,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",shift
1,7f8e60f4-5cbd-4ec6-bc73-e8ab2b6c3abd0.wav,COVID-19,0.9958,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",shift
2,3be23405-2584-49fd-8f74-a22fbeeea0440.wav,COVID-19,0.0511,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",shift
3,2c38b55e-7a52-4b23-a4a5-f0da83cd2cba1.wav,COVID-19,0.9377,"[-0.00036621094, 0.0006713867, -0.0040893555, ...",shift
4,95e40d65-5398-4dd8-bda8-5da98d06ed800.wav,COVID-19,0.4218,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",shift
...,...,...,...,...,...
2705,ab228702-1c84-4514-8ea9-7a207b1919c91.wav,COVID-19,0.9911,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",shift
2706,41a9aa47-9814-4a0f-a0a3-f3d919bb893d1.wav,COVID-19,0.1800,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",shift
2707,0ed846f2-5ce3-4f7f-9f60-b0258ab2047e1.wav,COVID-19,0.0352,"[0.0017089844, 0.0062561035, 0.009246826, 0.01...",shift
2708,36e4f76f-ebb6-45e4-b246-64c8fbb9e7db2.wav,COVID-19,0.5202,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",shift


In [13]:
train_set = pos_data_train_aug.append(neg_data_train, ignore_index=True)
test_set = pos_data_test.append(neg_data_test, ignore_index=True)

In [14]:
train_set.reset_index(drop=True)
test_set.reset_index(drop=True)

Unnamed: 0,soundfilename,status,cough_value,sounddata
0,e136f725-f618-40e9-82e9-d4712a5479dd1.wav,COVID-19,0.7330,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,d5b05c83-3dab-4fc9-8db7-9f46bcb06cac1.wav,COVID-19,0.7904,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,920ae37e-af2e-4e43-a80e-2150a66db6b10.wav,COVID-19,0.9949,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,e8537e2a-98b1-4ddd-8237-3835215218b43.wav,COVID-19,0.9956,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,814a5c9e-6174-4efa-a378-a990ac566ae21.wav,COVID-19,0.9930,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...
2733,dd3e8166-0375-4882-b037-add3a813f84f1.wav,healthy,0.8056,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2734,11324b9a-7584-42eb-95af-fbb39dcfc1054.wav,healthy,0.9949,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2735,ff18f5dd-fc75-4f22-be2e-374eb3e2372d4.wav,healthy,0.4838,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2736,b17e7db8-041b-40e9-81f3-8771bafc010a4.wav,healthy,0.6983,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


## Generating mel spectrograms from augmented sound data

In [15]:
train_set["Melspecs"] = train_set["sounddata"].parallel_apply(lambda x: generateMelSpectrogram(x))

In [16]:
test_set["Melspecs"] = test_set["sounddata"].parallel_apply(lambda x: generateMelSpectrogram(x))

In [17]:
train_set["Melspecs"] = train_set["Melspecs"].parallel_apply(lambda x: expandDimensions(x))

In [18]:
test_set["Melspecs"] = test_set["Melspecs"].parallel_apply(lambda x: expandDimensions(x))

In [19]:
train_set.drop(["sounddata","soundfilename","augmentation"],axis = 1, inplace=True)
test_set.drop(["sounddata","soundfilename","augmentation"],axis=1, inplace=True)

In [20]:
train_set

Unnamed: 0,status,cough_value,augmentation,Melspecs
0,COVID-19,0.8515,shift,"[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0..."
1,COVID-19,0.9958,shift,"[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0..."
2,COVID-19,0.0511,shift,"[[[72.59998], [72.59998], [72.59998], [72.5999..."
3,COVID-19,0.9377,shift,"[[[7.0643253], [7.7245693], [10.176903], [12.6..."
4,COVID-19,0.4218,shift,"[[[80.0], [80.0], [80.0], [80.0], [80.0], [80...."
...,...,...,...,...
22102,symptomatic,0.9926,,"[[[80.0], [80.0], [80.0], [80.0], [80.0], [80...."
22103,healthy,0.3102,,"[[[80.0], [80.0], [80.0], [80.0], [80.0], [80...."
22104,healthy,0.4330,,"[[[80.0], [80.0], [80.0], [80.0], [80.0], [80...."
22105,healthy,0.9961,,"[[[80.0], [80.0], [80.0], [80.0], [80.0], [80...."


## Save datasets to pickle file

In [21]:
train_set.to_pickle("trainshift.pkl")
test_set.to_pickle("testshift.pkl")