# Preprocess json to csv

- Map each available audio file to its emotion label and save the result in `csv` format



# Mount Google Drive

In [1]:
import os
from google.colab import drive

# Mount Google Drive at /content/gdrive; `google_drive_path` is the base path
# that the later cells build their Drive paths from.
drive.mount('/content/gdrive')
google_drive_path = "/content/gdrive/MyDrive/"

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
# Change the working directory to the project folder on Google Drive so that
# the relative paths used below ("./meta_data", "./dataset", ...) resolve there.

project_path = os.path.join(google_drive_path, "Colab Notebooks/SER/")
os.chdir(project_path)
# Display the project folder content as the cell output.
os.listdir("./")

['utils',
 'fonts',
 'meta_data',
 'dataset',
 '.git',
 '.env',
 'libs',
 'README.md',
 'models',
 '.gitignore',
 'logs',
 'SER Model idea.drawio',
 'deprecated',
 'Data distribution.ipynb',
 'Notebook Template.ipynb',
 'Notebook Template Train.ipynb',
 'Untitled0.ipynb',
 'Experiment Preprocessing.ipynb',
 'Preprocess Json to CSV.ipynb']

# Prepare ENV/Load libraries

In [3]:
# !rm -rf tmp && git clone https://github.com/chuan-khuna/my-python-utils.git tmp && cp -R tmp/utils ./ && cp -R tmp/fonts ./ && rm -rf tmp

In [4]:
!python3 --version

Python 3.8.16


In [5]:
# Pin cuDNN to 8.1.0.77 (CUDA 11.2 build), remove the installed
# TensorFlow/Keras packages, then install tensorflow_datasets and the newest
# tensorflow-text / tensorflow.
!apt install --allow-change-held-packages libcudnn8=8.1.0.77-1+cuda11.2
!pip uninstall -y -q tensorflow keras tensorflow-estimator tensorflow-text
!pip install -q tensorflow_datasets
!pip install -q -U tensorflow-text tensorflow

Reading package lists... Done
Building dependency tree       
Reading state information... Done
libcudnn8 is already the newest version (8.1.0.77-1+cuda11.2).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 18 not upgraded.
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tf2onnx 1.13.0 requires flatbuffers<3.0,>=1.12, but you have flatbuffers 22.12.6 which is incompatible.
onnx 1.13.0 requires protobuf<4,>=3.20.2, but you have protobuf 3.19.6 which is incompatible.[0m


In [6]:
# Plotting, Thai NLP, hyper-parameter tuning and YAML helpers.
!pip install matplotlib seaborn -Uq
!pip install pythainlp -q
!pip install keras-tuner -q
!pip install pyYAML -q

# NOTE(review): the recorded output of this cell reports tensorflow 2.9.0 being
# installed while tensorflow-text 2.11.0 requires tensorflow>=2.11 -- the
# versions are not pinned, so confirm which combination is actually intended.
# !pip install tensorflow -Uq
!pip install tensorflow-io[tensorflow] tf2onnx onnxruntime -Uq

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-text 2.11.0 requires tensorflow<2.12,>=2.11.0; platform_machine != "arm64" or platform_system != "Darwin", but you have tensorflow 2.9.0 which is incompatible.[0m


# Import Libraries

In [7]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib
from matplotlib import patheffects
import seaborn as sns

import json
import yaml
import re
from tqdm.notebook import trange, tqdm

import datetime
import time
from multiprocessing import Pool

In [8]:
import tensorflow as tf
import tensorflow_io as tfio
import tensorflow_datasets as tfds
import keras_tuner

# Seed both TensorFlow's and NumPy's global random generators with the same value.
seed_ = 20200218
tf.random.set_seed(seed_)
np.random.seed(seed_)

from sklearn.metrics import confusion_matrix, classification_report

In [9]:
from utils.vis_utils import *

# `google_drive_path` already ends with "/", so this path contains a double
# slash ("MyDrive//code_assets/fonts/").
# NOTE(review): `mpl_import_fonts` is presumably provided by the star-import
# from utils.vis_utils above -- confirm.
font_dir = [f"{google_drive_path}/code_assets/fonts/"]
mpl_import_fonts(font_dir)

In [10]:
import librosa

## Ensure that matplotlib can use subplot mosaic

In [11]:
def check_version(version_str, major, minor):
    """Print a version string and assert that it is at least ``major.minor``.

    Args:
        version_str (str): dotted version string, e.g. ``"3.6.2"``.
        major (int): minimum required major version.
        minor (int): minimum required minor version.

    Raises:
        AssertionError: if the version is older than ``major.minor``.
    """
    print(version_str)

    # Only the first two components are compared. Keep just the leading digits
    # of each one so suffixes such as "0rc1" do not break the int conversion.
    version = []
    for part in version_str.split('.')[:2]:
        digits = ''
        for char in part:
            if not char.isdigit():
                break
            digits += char
        version.append(int(digits) if digits else 0)
    while len(version) < 2:
        version.append(0)

    # Compare as a tuple: a newer major release satisfies the requirement even
    # when its minor number is smaller (e.g. 4.0 >= 3.6).
    assert tuple(version) >= (major, minor)

# Require matplotlib >= 3.6 and seaborn >= 0.12; an AssertionError stops the
# notebook here when an installed version is too old.
check_version(matplotlib.__version__, 3, 6)
check_version(sns.__version__, 0, 12)

# The helper is only needed for this check, so remove it from the namespace.
del check_version

# Display the versions in use as the cell output.
matplotlib.__version__, sns.__version__, tf.__version__

3.6.2
0.12.2


('3.6.2', '0.12.2', '2.9.0')

## View hardware spec

In [12]:
!nvidia-smi

NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.



In [13]:
# This call is not the last expression of the cell, so its result is discarded
# rather than displayed.
tf.config.list_physical_devices('GPU')

# Enable memory growth on every visible GPU (no-op when no GPU is listed).
for device in tf.config.experimental.list_physical_devices('GPU'):
    tf.config.experimental.set_memory_growth(device, True)

In [14]:
def print_ds(ds, take_n=10):
    """Print the first ``take_n`` elements of a dataset, one per line.

    Args:
        ds: object providing ``take(n)`` whose result exposes
            ``as_numpy_iterator()`` (presumably a ``tf.data.Dataset``).
        take_n (int): number of leading elements to print.
    """
    first_elements = ds.take(take_n).as_numpy_iterator()
    for item in first_elements:
        print(item)

# Preprocess Json to CSV

In [15]:
# Load the label JSON with one row per top-level key; column 0 holds the
# per-file record, which is flattened into regular columns below.
label_json = pd.read_json("./meta_data/emotion_label.json", orient='index')

raw_df = (
    label_json
    .reset_index()
    .join(pd.json_normalize(label_json[0]))
    .drop(columns=[0, 'annotated'])
    # the former index (the JSON keys) becomes the `file` column
    .rename(columns={'index': 'file'})
)

In [16]:
# NOTE(review): this overwrites the `raw_df` built from emotion_label.json in
# the previous cell, and no visible cell writes emotion_label.csv -- confirm
# where that file comes from and whether the JSON cell is still needed.
raw_df = pd.read_csv("./meta_data/emotion_label.csv")

In [17]:
raw_df

Unnamed: 0,file,assigned_emo,majority_emo,agreement
0,s001_con_actor001_impro1_1.flac,Neutral,Neutral,1.000000
1,s001_con_actor001_impro1_10.flac,Neutral,Neutral,1.000000
2,s001_con_actor001_impro1_11.flac,Neutral,Neutral,0.857143
3,s001_con_actor001_impro1_12.flac,Neutral,Neutral,1.000000
4,s001_con_actor001_impro1_13.flac,Neutral,Neutral,0.875000
...,...,...,...,...
27849,z020_mic_actor076_script3_2_3b.flac,Happy,Happy,0.600000
27850,z020_mic_actor076_script3_2_4a.flac,Sad,Frustrated,0.750000
27851,z020_mic_actor076_script3_2_4b.flac,Sad,Frustrated,0.500000
27852,z020_mic_actor076_script3_2_5a.flac,Frustrated,,0.000000


# Unzip file from Google Drive to VM storage

I hypothesise that Google Drive is slow when reading a large number of files, so I extract the files to VM storage instead.

## Old: Google drive takes long time to run through all files?

- limit?
    - for 1000 files, Google Drive takes ~15 sec to run
    - so 27k files ≈ 15*27 sec (about 7-10 minutes) -- but it takes much longer than I calculated.
- copy the files to a temp (VM) location
- this may take ~10-15 min to copy the files and ~5-6 min to run `get_duration`
- result:
    - status bar at the bottom of the Colab window: the timer keeps counting, whereas it froze when the files were accessed from Google Drive

In [18]:
# !rm -rf /content/dataset/

In [19]:
def unzip_dataset(zipfile, target_loc):
    """Extract a zip file from Google Drive into its own folder on VM storage.

    I hypothesise that Google Drive has some limits to read files.
    So I decided to extract file to VM storage instead

    ```python
    target_loc = dataset_path = "/content/dataset/"
    ```

    The archive ``<name>.zip`` is extracted into ``<target_loc>/<name>``.
    Extraction uses the standard-library ``zipfile`` module instead of a
    shell ``unzip`` command, so paths containing spaces or shell
    metacharacters work and a broken archive raises instead of being
    silently ignored.

    Args:
        zipfile (str): path to the ``.zip`` archive to extract.
        target_loc (str): target root folder to place content in the zip file.

    Raises:
        OSError: if the archive cannot be opened or a file cannot be written.
        zipfile.BadZipFile: if the file is not a valid zip archive.
    """
    # The `zipfile` parameter shadows the standard-library module of the same
    # name, so the module is imported locally under an alias.
    import zipfile as zip_module

    # create root path to store the dataset
    os.makedirs(target_loc, exist_ok=True)

    subfolder = os.path.basename(zipfile).split('.zip')[0]
    target_path = os.path.join(target_loc, subfolder)
    print(f"Unzip {zipfile} to {target_path}")
    with zip_module.ZipFile(zipfile) as archive:
        archive.extractall(target_path)

path_to_zipfiles = "./dataset/"
dataset_path = "/content/dataset/"

# Extract every archive whose target folder does not exist yet. Checking each
# archive separately avoids hard-coding the expected number of archives and
# avoids re-extracting folders that are already present.
# Delete a folder under `dataset_path` to force that archive to be extracted again.
for filename in os.listdir(path_to_zipfiles):
    if not filename.endswith(".zip"):
        continue

    # same sub-folder naming rule as `unzip_dataset`
    extracted_dir = os.path.join(dataset_path, filename.split('.zip')[0])
    if os.path.isdir(extracted_dir):
        continue

    zip_path = os.path.join(path_to_zipfiles, filename)
    unzip_dataset(zip_path, dataset_path)

print(os.listdir(dataset_path))

Unzip ./dataset/studio71-80.zip to /content/dataset/studio71-80
Unzip ./dataset/studio41-50.zip to /content/dataset/studio41-50
Unzip ./dataset/studio51-60.zip to /content/dataset/studio51-60
Unzip ./dataset/studio61-70.zip to /content/dataset/studio61-70
Unzip ./dataset/studio21-30.zip to /content/dataset/studio21-30
Unzip ./dataset/studio31-40.zip to /content/dataset/studio31-40
Unzip ./dataset/zoom1-10.zip to /content/dataset/zoom1-10
Unzip ./dataset/studio1-10.zip to /content/dataset/studio1-10
Unzip ./dataset/zoom11-20.zip to /content/dataset/zoom11-20
Unzip ./dataset/studio11-20.zip to /content/dataset/studio11-20


## Search for available files

In [20]:
# Walk the extracted dataset tree and keep the full path of every `.flac` file.
audio_files = [
    os.path.join(folder, entry)
    for folder, _, entries in os.walk(dataset_path)
    for entry in entries
    if entry.endswith(".flac")
]

# Merge available files with corresponding labels

In [21]:
# One row per discovered audio file: its full path and its bare file name
# (the last "/"-separated component), which is the key used to look up labels.
df = pd.DataFrame({
    'path': audio_files,
    'file': [audio_path.split('/')[-1] for audio_path in audio_files],
})

In [22]:
# Left-join the labels onto the discovered files by file name; files without
# a matching label row get NaN in the label columns.
df = df.join(raw_df.set_index('file'), on='file')
df

Unnamed: 0,path,file,assigned_emo,majority_emo,agreement
0,/content/dataset/studio11-20/studio013/middle/...,s013_middle_actor026_impro2_6.flac,,,
1,/content/dataset/studio11-20/studio013/middle/...,s013_middle_actor025_impro2_29.flac,,,
2,/content/dataset/studio11-20/studio013/middle/...,s013_middle_actor025_impro4_6.flac,,,
3,/content/dataset/studio11-20/studio013/middle/...,s013_middle_actor025_impro2_12.flac,,,
4,/content/dataset/studio11-20/studio013/middle/...,s013_middle_actor026_impro1_14.flac,,,
...,...,...,...,...,...
61967,/content/dataset/studio21-30/studio025/con/s02...,s025_con_actor090_script2_1_4a.flac,Sad,Sad,0.9
61968,/content/dataset/studio21-30/studio025/con/s02...,s025_con_actor089_script1_1_1a.flac,Neutral,Neutral,0.6
61969,/content/dataset/studio21-30/studio025/con/s02...,s025_con_actor090_impro7_9.flac,Frustrated,,0.0
61970,/content/dataset/studio21-30/studio025/con/s02...,s025_con_actor089_impro10_9.flac,Angry,Frustrated,0.8


In [23]:
# drop nan labels
df = df.dropna().reset_index(drop=True)
df

Unnamed: 0,path,file,assigned_emo,majority_emo,agreement
0,/content/dataset/studio11-20/studio013/con/s01...,s013_con_actor025_impro4_7.flac,Angry,Angry,0.642857
1,/content/dataset/studio11-20/studio013/con/s01...,s013_con_actor025_impro4_15.flac,Angry,Frustrated,0.857143
2,/content/dataset/studio11-20/studio013/con/s01...,s013_con_actor025_impro2_24.flac,Happy,Happy,0.857143
3,/content/dataset/studio11-20/studio013/con/s01...,s013_con_actor026_impro4_16.flac,Frustrated,Frustrated,0.750000
4,/content/dataset/studio11-20/studio013/con/s01...,s013_con_actor025_script3_2_4b.flac,Sad,Sad,1.000000
...,...,...,...,...,...
27849,/content/dataset/studio21-30/studio025/con/s02...,s025_con_actor090_script2_1_4a.flac,Sad,Sad,0.900000
27850,/content/dataset/studio21-30/studio025/con/s02...,s025_con_actor089_script1_1_1a.flac,Neutral,Neutral,0.600000
27851,/content/dataset/studio21-30/studio025/con/s02...,s025_con_actor090_impro7_9.flac,Frustrated,,0.000000
27852,/content/dataset/studio21-30/studio025/con/s02...,s025_con_actor089_impro10_9.flac,Angry,Frustrated,0.800000


In [24]:
# Save the file/label table (durations are not joined yet at this point).
df.to_csv("./meta_data/dataset_raw.csv", index=False)

# Get each audio file's duration and test whether the file is corrupted

- test my preprocessing function and list the corrupted files that make it raise an error

- get the duration of each file in order to decide how to pad and trim the audio when preprocessing the data

In [25]:
def read_audio_tensor(filename):
    """Read an audio file with tensorflow-io and return its first channel.

    Args:
        filename: path of the audio file, passed to
            ``tfio.IOTensor.graph(tf.int16).from_audio``.

    Returns:
        The decoded audio restricted to column 0 and squeezed
        (presumably a 1-D tensor of samples -- the layout is assumed to be
        ``(samples, channels)``; confirm against tensorflow-io).
    """
    content = tfio.IOTensor.graph(tf.int16).from_audio(filename)
    audio_tensor = content.to_tensor()
    # keep only the first column (channel) and drop size-1 dimensions
    return tf.squeeze(audio_tensor[:, 0])

In [26]:
def get_duration(filename):
    """Return the duration of an audio file in seconds, rounded to 3 decimals.

    Args:
        filename: path of the audio file to load with librosa.

    Returns:
        The rounded duration, or ``-1`` when loading or measuring the file
        raises; in that case the file name and the error are printed so that
        unreadable files can be listed.
    """
    try:
        # sr=None keeps the file's native sampling rate. librosa's default
        # resamples everything to 22050 Hz, which is slow and not needed to
        # measure the duration.
        y, sr = librosa.load(filename, sr=None)
        duration = librosa.get_duration(y=y, sr=sr)
        return np.round(duration, 3)
    except Exception as e:
        # Deliberately broad: any failure marks the file as unreadable.
        print(filename)
        print(e)
        return -1

def get_duration_tf(filename):
    """Measure an audio file's duration by decoding it with tensorflow-io.

    Args:
        filename: path of the audio file.

    Returns:
        ``(duration, filename)`` where ``duration`` is the number of samples in
        column 0 of the decoded audio divided by the sampling rate. When
        anything raises, the file name and error are printed and
        ``(-1, filename)`` is returned instead.
    """
    # how to use tfio with tf.data.Dataset
    # ref: https://github.com/tensorflow/io/issues/581
    try:
        # alternative reader: tfio.audio.AudioIOTensor(filename, dtype=tf.int16)
        audio = tfio.IOTensor.graph(tf.int16).from_audio(filename)
        sample_rate = tf.cast(audio.rate, dtype=tf.int64)

        first_channel = tf.squeeze(audio.to_tensor()[:, 0])
        n_samples = tf.shape(first_channel)[0]

        seconds = tf.cast(n_samples, dtype=tf.int32) / tf.cast(sample_rate, dtype=tf.int32)
    except Exception as error:
        print(filename)
        print(error)
        return -1, filename

    return seconds, filename

In [27]:
def multiprocessing_get_duration(files):
    """Run ``get_duration_tf`` over ``files`` using a pool of 4 processes.

    Args:
        files: iterable of audio file paths.

    Returns:
        list: one ``get_duration_tf`` result per input, in input order.
    """
    worker_pool = Pool(4)
    with worker_pool:
        return worker_pool.map(get_duration_tf, files)

In [28]:
# Time the duration scan over every labelled file.
st_time = time.time()

durations = multiprocessing_get_duration(df['path'])

# elapsed wall-clock time in seconds
print(round(time.time() - st_time, 2))

288.37


In [29]:
# Each entry of `durations` is a (duration, path) pair; build the frame and
# derive the final `file` / `duration` columns in one chain.
duration_df = (
    pd.DataFrame(durations, columns=['duration', 'path'])
    .assign(
        # from tensor to float
        duration=lambda frame: frame['duration'].apply(lambda x: np.round(np.array(x), 3)),
        # get only file name
        file=lambda frame: frame['path'].apply(lambda p: p.split('/')[-1]),
    )
    .loc[:, ['file', 'duration']]
)

In [30]:
duration_df

Unnamed: 0,file,duration
0,s013_con_actor025_impro4_7.flac,4.780
1,s013_con_actor025_impro4_15.flac,0.807
2,s013_con_actor025_impro2_24.flac,3.752
3,s013_con_actor026_impro4_16.flac,1.832
4,s013_con_actor025_script3_2_4b.flac,7.492
...,...,...
27849,s025_con_actor090_script2_1_4a.flac,8.854
27850,s025_con_actor089_script1_1_1a.flac,4.416
27851,s025_con_actor090_impro7_9.flac,1.085
27852,s025_con_actor089_impro10_9.flac,3.404


In [31]:
# Save the per-file durations (columns: file, duration).
duration_df.to_csv("./meta_data/file_durations.csv", index=False)

# Join duration data with file-label data

In [32]:
# Attach each file's duration to the file/label table, matching on file name.
df = df.join(duration_df.set_index('file'), on='file')
df

Unnamed: 0,path,file,assigned_emo,majority_emo,agreement,duration
0,/content/dataset/studio11-20/studio013/con/s01...,s013_con_actor025_impro4_7.flac,Angry,Angry,0.642857,4.780
1,/content/dataset/studio11-20/studio013/con/s01...,s013_con_actor025_impro4_15.flac,Angry,Frustrated,0.857143,0.807
2,/content/dataset/studio11-20/studio013/con/s01...,s013_con_actor025_impro2_24.flac,Happy,Happy,0.857143,3.752
3,/content/dataset/studio11-20/studio013/con/s01...,s013_con_actor026_impro4_16.flac,Frustrated,Frustrated,0.750000,1.832
4,/content/dataset/studio11-20/studio013/con/s01...,s013_con_actor025_script3_2_4b.flac,Sad,Sad,1.000000,7.492
...,...,...,...,...,...,...
27849,/content/dataset/studio21-30/studio025/con/s02...,s025_con_actor090_script2_1_4a.flac,Sad,Sad,0.900000,8.854
27850,/content/dataset/studio21-30/studio025/con/s02...,s025_con_actor089_script1_1_1a.flac,Neutral,Neutral,0.600000,4.416
27851,/content/dataset/studio21-30/studio025/con/s02...,s025_con_actor090_impro7_9.flac,Frustrated,,0.000000,1.085
27852,/content/dataset/studio21-30/studio025/con/s02...,s025_con_actor089_impro10_9.flac,Angry,Frustrated,0.800000,3.404


In [33]:
# Save the final table: path, file, labels, agreement and duration.
df.to_csv("./meta_data/dataset.csv", index=False)