# Preprocess json to csv

- Map the emotion labels to the available audio files and save the result in `csv` format



# Mount Google Drive

In [1]:
import os
from google.colab import drive

# Mount Google Drive into the Colab VM at /content/gdrive.
drive.mount('/content/gdrive')
# Root of the mounted Drive; the project path and the font directory are built from it.
google_drive_path = "/content/gdrive/MyDrive/"

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
# change directory to the project path

# Relative paths used later ("./meta_data/...", "./dataset/") resolve against this directory.
project_path = os.path.join(google_drive_path, "Colab Notebooks/SER/")
os.chdir(project_path)
# Last expression: list the project root so the cell output shows what is available.
os.listdir("./")

['utils',
 'fonts',
 'meta_data',
 'dataset',
 '.git',
 '.env',
 'libs',
 'README.md',
 'models',
 '.gitignore',
 'logs',
 'SER Model idea.drawio',
 'Notebook Template.ipynb',
 'deprecated',
 'Untitled0.ipynb',
 'Preprocess Json to CSV.ipynb']

# Prepare ENV/Load libraries

In [3]:
# !rm -rf tmp && git clone https://github.com/chuan-khuna/my-python-utils.git tmp && cp -R tmp/utils ./ && cp -R tmp/fonts ./ && rm -rf tmp

In [4]:
!python3 --version

Python 3.8.16


In [5]:
# Install the cuDNN 8.1 build for CUDA 11.2, then reinstall the TensorFlow stack.
# NOTE(review): the pip packages below are unpinned, so a rerun may resolve different
# versions — consider pinning them.
!apt install --allow-change-held-packages libcudnn8=8.1.0.77-1+cuda11.2
# Remove any existing TensorFlow packages before reinstalling.
!pip uninstall -y -q tensorflow keras tensorflow-estimator tensorflow-text
!pip install -q tensorflow_datasets
!pip install -q -U tensorflow-text tensorflow

Reading package lists... Done
Building dependency tree       
Reading state information... Done
libcudnn8 is already the newest version (8.1.0.77-1+cuda11.2).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 18 not upgraded.
[K     |████████████████████████████████| 1.1 MB 5.2 MB/s 
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tf2onnx 1.13.0 requires flatbuffers<3.0,>=1.12, but you have flatbuffers 22.12.6 which is incompatible.
onnx 1.13.0 requires protobuf<4,>=3.20.2, but you have protobuf 3.19.6 which is incompatible.[0m
[?25h

In [6]:
# Extra libraries; -U upgrades matplotlib/seaborn, which the version check further down relies on.
!pip install matplotlib seaborn -Uq
!pip install pythainlp -q
!pip install keras-tuner -q
!pip install pyYAML -q

# !pip install tensorflow -Uq
# NOTE(review): the recorded output of this cell reports tensorflow 2.9.0 installed while
# tensorflow-text 2.11.0 requires tensorflow>=2.11 — pin mutually compatible versions.
!pip install tensorflow-io[tensorflow] tf2onnx onnxruntime -Uq

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-text 2.11.0 requires tensorflow<2.12,>=2.11.0; platform_machine != "arm64" or platform_system != "Darwin", but you have tensorflow 2.9.0 which is incompatible.[0m


# Import Libraries

In [7]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib
from matplotlib import patheffects
import seaborn as sns

import json
import yaml
import re
from tqdm.notebook import trange, tqdm

import datetime
import time
from multiprocessing import Pool

In [8]:
import tensorflow as tf
import tensorflow_io as tfio
import tensorflow_datasets as tfds
import keras_tuner

# Fix the TensorFlow and NumPy global seeds so random operations repeat across runs.
seed_ = 20200218
tf.random.set_seed(seed_)
np.random.seed(seed_)

from sklearn.metrics import confusion_matrix, classification_report

In [9]:
from utils.vis_utils import *

# NOTE(review): google_drive_path already ends with "/", so this f-string yields a doubled
# slash ("MyDrive//code_assets") — harmless on POSIX, but os.path.join would be cleaner.
font_dir = [f"{google_drive_path}/code_assets/fonts/"]
# mpl_import_fonts is not defined in this notebook; presumably it comes from the star
# import of utils.vis_utils — an explicit import would make that dependency visible.
mpl_import_fonts(font_dir)

In [10]:
import librosa

## Ensure that matplotlib can use subplot mosaic

In [11]:
def check_version(version_str, major, minor):
    """Print a version string and assert that it is at least ``major.minor``.

    Args:
        version_str: Dotted version string such as ``"3.6.2"``. Pre-release
            suffixes such as ``"rc1"`` are tolerated, and a missing minor
            component counts as 0.
        major: Minimum required major version.
        minor: Minimum required minor version; only decisive when the major
            versions are equal.

    Raises:
        AssertionError: If the parsed version is older than ``major.minor``.
    """
    import re  # local import keeps this throwaway helper self-contained

    print(version_str)

    # Keep only the leading digits of the first two components, so tags like
    # "0rc1" do not make int() fail.
    parsed = []
    for part in version_str.split('.')[:2]:
        digits = re.match(r'\d+', part)
        parsed.append(int(digits.group()) if digits else 0)
    parsed += [0] * (2 - len(parsed))

    # Compare as a tuple: a newer major release satisfies the requirement even
    # when its minor number is smaller (4.0 >= 3.6), which a component-wise
    # "major >= and minor >=" test would wrongly reject.
    assert tuple(parsed) >= (major, minor), (
        f"version {version_str} is older than the required {major}.{minor}"
    )

# Require matplotlib >= 3.6 and seaborn >= 0.12 (see the section heading about subplot mosaic).
check_version(matplotlib.__version__, 3, 6)
check_version(sns.__version__, 0, 12)

# The helper is only needed for this one-off check, so remove it from the namespace.
del check_version

# Last expression: show the library versions in use as the cell output.
matplotlib.__version__, sns.__version__, tf.__version__

3.6.2
0.12.2


('3.6.2', '0.12.2', '2.9.0')

## View hardware spec

In [12]:
!nvidia-smi

NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.



In [13]:
# Query the visible GPUs once and keep the result, so it can be both reused
# below and displayed as the cell output.
gpu_devices = tf.config.list_physical_devices('GPU')

# Enable memory growth on every detected GPU.
for device in gpu_devices:
    tf.config.experimental.set_memory_growth(device, True)

# Last expression: show the detected GPUs (an empty list means none were found).
gpu_devices

In [14]:
def print_ds(ds, take_n=10):
    """Print the first ``take_n`` elements of a dataset, one element per line.

    Args:
        ds: Object providing ``take(n)``, whose result provides
            ``as_numpy_iterator()``.
        take_n: Number of leading elements to print.
    """
    preview = ds.take(take_n)
    numpy_items = preview.as_numpy_iterator()
    for item in numpy_items:
        print(item)

# Preprocess Json to CSV

In [15]:
# Load the label JSON with its top-level keys as the row index; column 0 then holds
# one nested record per key.
label_records = pd.read_json("./meta_data/emotion_label.json", orient='index')

# Expand the nested records into flat columns.
label_fields = pd.json_normalize(label_records[0])

# Move the keys into a regular "file" column, attach the flattened fields, and drop
# the now-redundant nested column together with the 'annotated' field.
raw_df = (
    label_records
    .reset_index()
    .join(label_fields)
    .drop(columns=[0, 'annotated'])
    .rename(columns={'index': 'file'})
)

In [16]:
# NOTE(review): this replaces the raw_df built from the JSON in the previous cell, and no
# cell in this notebook writes emotion_label.csv — confirm where that file comes from (or
# save the converted frame before reading it back) so a fresh Restart & Run All works.
raw_df = pd.read_csv("./meta_data/emotion_label.csv")

In [17]:
raw_df

Unnamed: 0,file,assigned_emo,majority_emo,agreement
0,s001_con_actor001_impro1_1.flac,Neutral,Neutral,1.000000
1,s001_con_actor001_impro1_10.flac,Neutral,Neutral,1.000000
2,s001_con_actor001_impro1_11.flac,Neutral,Neutral,0.857143
3,s001_con_actor001_impro1_12.flac,Neutral,Neutral,1.000000
4,s001_con_actor001_impro1_13.flac,Neutral,Neutral,0.875000
...,...,...,...,...
27849,z020_mic_actor076_script3_2_3b.flac,Happy,Happy,0.600000
27850,z020_mic_actor076_script3_2_4a.flac,Sad,Frustrated,0.750000
27851,z020_mic_actor076_script3_2_4b.flac,Sad,Frustrated,0.500000
27852,z020_mic_actor076_script3_2_5a.flac,Frustrated,,0.000000


# Search for available files

In [18]:
# Recursively collect the path of every ".flac" file under ./dataset/, in os.walk order.
audio_files = [
    os.path.join(folder, fname)
    for folder, _subfolders, fnames in os.walk("./dataset/")
    for fname in fnames
    if fname.endswith(".flac")
]

# Merge available files with corresponding labels

In [19]:
# One row per discovered audio file: the full path plus the bare file name
# (the text after the last "/"), which is the key used to look up labels.
df = pd.DataFrame({
    'path': audio_files,
    'file': [p.rsplit('/', 1)[-1] for p in audio_files],
})

In [20]:
# Attach the labels to the discovered files by file name. Every file row is kept;
# files without a label entry get NaN in the label columns.
labels_by_file = raw_df.set_index('file')
df = df.join(labels_by_file, on='file')
df

Unnamed: 0,path,file,assigned_emo,majority_emo,agreement
0,./dataset/zoom11-20/zoom011/mic/z011_mic_actor...,z011_mic_actor057_script1_1_2b.flac,Angry,Frustrated,0.625
1,./dataset/zoom11-20/zoom011/mic/z011_mic_actor...,z011_mic_actor057_script1_1_3a.flac,Happy,Neutral,0.750
2,./dataset/zoom11-20/zoom011/mic/z011_mic_actor...,z011_mic_actor057_script1_1_3b.flac,Happy,Happy,0.750
3,./dataset/zoom11-20/zoom011/mic/z011_mic_actor...,z011_mic_actor057_script1_1_4a.flac,Sad,Neutral,0.800
4,./dataset/zoom11-20/zoom011/mic/z011_mic_actor...,z011_mic_actor057_script1_1_4b.flac,Sad,Neutral,0.875
...,...,...,...,...,...
60464,./dataset/studio61-70/studio069/middle/s069_mi...,s069_middle_actor178_impro13_13.flac,,,
60465,./dataset/studio61-70/studio069/middle/s069_mi...,s069_middle_actor177_impro13_13.flac,,,
60466,./dataset/studio61-70/studio069/middle/s069_mi...,s069_middle_actor177_impro11_7.flac,,,
60467,./dataset/studio61-70/studio069/middle/s069_mi...,s069_middle_actor177_impro15_10.flac,,,


In [21]:
# drop nan labels
# NOTE(review): dropna() without `subset` removes a row if ANY column is NaN, so rows that
# have assigned_emo but a missing majority_emo are dropped as well — confirm this is intended.
df = df.dropna().reset_index(drop=True)
df

Unnamed: 0,path,file,assigned_emo,majority_emo,agreement
0,./dataset/zoom11-20/zoom011/mic/z011_mic_actor...,z011_mic_actor057_script1_1_2b.flac,Angry,Frustrated,0.625000
1,./dataset/zoom11-20/zoom011/mic/z011_mic_actor...,z011_mic_actor057_script1_1_3a.flac,Happy,Neutral,0.750000
2,./dataset/zoom11-20/zoom011/mic/z011_mic_actor...,z011_mic_actor057_script1_1_3b.flac,Happy,Happy,0.750000
3,./dataset/zoom11-20/zoom011/mic/z011_mic_actor...,z011_mic_actor057_script1_1_4a.flac,Sad,Neutral,0.800000
4,./dataset/zoom11-20/zoom011/mic/z011_mic_actor...,z011_mic_actor057_script1_1_4b.flac,Sad,Neutral,0.875000
...,...,...,...,...,...
27326,./dataset/studio61-70/studio069/con/s069_con_a...,s069_con_actor177_impro11_13.flac,Frustrated,Neutral,0.571429
27327,./dataset/studio61-70/studio069/con/s069_con_a...,s069_con_actor177_impro11_20.flac,Frustrated,Frustrated,0.714286
27328,./dataset/studio61-70/studio069/con/s069_con_a...,s069_con_actor177_impro15_3.flac,Sad,Sad,1.000000
27329,./dataset/studio61-70/studio069/con/s069_con_a...,s069_con_actor178_impro15_9.flac,Angry,Frustrated,0.625000


In [22]:
# Persist the labeled file list; index=False keeps the row index out of the CSV.
df.to_csv("./meta_data/dataset_raw.csv", index=False)

# Get Audio duration(s)

- This may take a long time to run (about 15 s per 1000 files using `tensorflow-io`)

- Knowing how long each audio clip is helps us decide how to pad or trim the audio

In [23]:
def read_audio_tensor(filename):
    """Read an audio file with tensorflow-io and return its first channel.

    Args:
        filename: Path of the audio file to open.

    Returns:
        The int16 samples selected by ``[:, 0]`` from the decoded audio, with
        any size-1 dimensions squeezed away.
    """
    content = tfio.IOTensor.graph(tf.int16).from_audio(filename)
    audio_tensor = content.to_tensor()
    # Index 0 on the second axis keeps a single channel
    # (presumably the layout is [samples, channels] — confirm against tensorflow-io docs).
    return tf.squeeze(audio_tensor[:, 0])

In [24]:
def get_duration(filename):
    """Return the duration of an audio file in seconds, rounded to 3 decimals.

    Args:
        filename: Path of the audio file to measure.

    Returns:
        The duration in seconds, or ``-1`` if the file could not be processed
        (the file name and the error are printed in that case).
    """
    try:
        # sr=None keeps the file's native sampling rate. The default would
        # resample every file to 22050 Hz first, which is slow and makes the
        # computed length an approximation of the true one.
        y, sr = librosa.load(filename, sr=None)
        duration = librosa.get_duration(y=y, sr=sr)
        return np.round(duration, 3)
    except Exception as e:
        # Best effort: report the bad file and return a sentinel so a batch run
        # over many files is not aborted by a single unreadable one.
        print(filename)
        print(e)
        return -1

def get_duration_tf(filename):
    """Measure an audio file's length with tensorflow-io.

    Args:
        filename: Path of the audio file to open.

    Returns:
        ``(duration, filename)``, where ``duration`` is the number of samples
        in the first channel divided by the sampling rate. If anything raises,
        the file name and the error are printed and ``(-1, filename)`` is
        returned instead.
    """
    # how to use tfio with tf.data.Dataset
    # ref: https://github.com/tensorflow/io/issues/581
    try:
        # Alternative reader: tfio.audio.AudioIOTensor(filename, dtype=tf.int16)
        audio_io = tfio.IOTensor.graph(tf.int16).from_audio(filename)
        sample_rate = tf.cast(audio_io.rate, dtype=tf.int64)

        # Keep a single channel, then count its samples.
        first_channel = tf.squeeze(audio_io.to_tensor()[:, 0])
        n_samples = tf.cast(tf.shape(first_channel)[0], dtype=tf.int32)

        return n_samples / tf.cast(sample_rate, dtype=tf.int32), filename
    except Exception as err:
        print(filename)
        print(err)
        return -1, filename

In [25]:
# Start of the wall-clock timing for the duration extraction.
st_time = time.time()

# NOTE(review): only the first 100 files are processed here, so the durations saved later
# cover just that subset — remove .head(100) for the full dataset if that is not intended.
durations = df['path'].head(100).apply(get_duration_tf)

# Elapsed seconds.
print(time.time() - st_time)

31.05422592163086


In [26]:
# Each entry of `durations` is a (duration, path) pair; unpack the pairs into two columns.
duration_df = pd.DataFrame(durations.tolist(), columns=['duration', 'path'])
# Convert every duration (a scalar tensor on success, the int -1 on failure) to a plain
# float so the column is numeric instead of holding one 0-d array object per row.
duration_df['duration'] = duration_df['duration'].apply(float)

In [27]:
# Persist the (duration, path) table; index=False keeps the row index out of the CSV.
duration_df.to_csv("./meta_data/file_durations.csv", index=False)