## 参考emotion

In [1]:
import json
import os
import yaml
import pandas as pd
from s3prl.downstream.emotion.dataset import IEMOCAPDataset
from s3prl.downstream.emotion.expert import DownstreamExpert
from s3prl.downstream.emotion.model import *

In [2]:
SAMPLE_RATE = 16000

yaml_path = "emotion_config.yaml"

# train_set path
train_set_path = '../Data/train'
train_label_path = "../Data/training-groundtruth.csv"

# test_set path
sample_path = "../Data/sample"
sample_label_path = "../Data/sample-groundtruth.csv"
Ivanova_path = "../Data/Ivanova"
need_MCI = False


# 生成的json 存放位置
train_json_path = "../Data/train.json"
test_json_path = "../Data/test.json"

## 创建json文件，存储path, label

In [3]:
symbols_dict = {'Control':0, "ProbableAD": 1}
def get_train_json():
    """ return a dictionary:  dict = { label: symbol_dict,
                                       data: { {path:str, label:str}, {}, {} } } """
    # 判断json是否已经存在
    if os.path.exists(train_json_path):
        return

    data = {"labels": symbols_dict} # 创建一个字典

    # get wavs_path
    wavs_path = [x for x in os.listdir(train_set_path)]

    y_data = pd.read_csv(train_label_path, usecols=[0,4])
    y_dict = y_data.set_index('adressfname').to_dict()['dx']

    data['meta_data'] = [{"path": k, "label": y_dict[k[: -4]]} for k in wavs_path]
    print(f"train_data count = {len(data['meta_data'])}")

    with open(train_json_path, 'w') as f:
        json.dump(data, f, ensure_ascii=False,indent=1)


In [29]:
def get_test_json():
    # 判断json是否已经存在
    if os.path.exists(test_json_path):
        return

    data = {"labels": symbols_dict} # 创建一个字典
    # test数据集有两个：sample Ivanova

    # get sample_list
    wavs_path = [x for x in os.listdir(sample_path)]

    y_data = pd.read_csv(sample_label_path, usecols=[0,4])
    y_dict = y_data.set_index('adressfname').to_dict()['dx']

    sample_list = [{"path": k, "label": y_dict[k[: -4]]} for k in wavs_path]

    # get Ivanova_list
    types = os.listdir(Ivanova_path)
    Ivanova_list = []
    for type in types:
        type_files_path = os.path.join(Ivanova_path,type)
        for file in os.listdir(type_files_path):
            file_path = os.path.join(type, file)
            file_label = type
            Ivanova_list.append({"path": file_path, "label": file_label})

    data['meta_data'] = sample_list + Ivanova_list
    print(f"test_data count = {len(data['meta_data'])}")
    with open(test_json_path, 'w') as f:
        json.dump(data, f, ensure_ascii=False,indent=1)


In [30]:
get_test_json()

test_data count = 369


In [31]:
get_train_json()

train_data count = 237


In [21]:
# symbols_dict = {'MCI': -1, "Control": 0, "ProbableAD": 1}
symbols_dict = {'Control':0, "ProbableAD": 1}
def get_eng_spa_dict(country: str):
    """ return a dictionary:  dict = { label: symbol_dict,
                                       data: { {path:str, label:str}, {}, {} } } """
    data = {"labels": symbols_dict} # 创建一个字典
    if country == "English":
        file_path = Eng_path
        label_path = Eng_labels_path
        json_path = Eng_json_path
    elif country == "Spain":
        file_path = Spa_path
        label_path = Spa_labels_path
        json_path = Spa_json_path
    else:
        raise ValueError(f"Unknown contry")

    # 判断json是否已经存在
    if os.path.exists(json_path):
        return

    # get wavs_path
    wavs_path = [x for x in os.listdir(file_path)]

    y_data = pd.read_csv(label_path, usecols=[0,4])
    y_dict = y_data.set_index('adressfname').to_dict()['dx']

    data['meta_data'] = [{"path": k, "label": y_dict[k[: -4]]} for k in wavs_path]

    with open(json_path, 'w') as f:
        json.dump(data, f, ensure_ascii=False,indent=1)

In [23]:
get_eng_spa_dict("Spain")
get_eng_spa_dict("English")

In [6]:
A = IEMOCAPDataset(data_dir=Eng_path, meta_path= Eng_json_path)
print(Eng_path, Eng_json_path)

./data/train ./data/English.json


In [7]:
print(len(A))
print(A[0], A[1][0].shape)
print(A[24], A[24][0].shape)

237
(array([ 0.00036621,  0.0012207 ,  0.0020752 , ..., -0.00091553,
       -0.00079346, -0.00112915], dtype=float32), 0, 'adrso002') (537217,)
(array([0., 0., 0., ..., 0., 0., 0.], dtype=float32), 1, 'adrso027') (1319915,)


In [14]:
with open(yaml_path, "r") as f:
    cfg = f.read()
    config = yaml.load(cfg, Loader=yaml.SafeLoader)
print(type(config['downstream_expert']))
B = DownstreamExpert(upstream_dim=1, downstream_expert=config['downstream_expert'], expdir="./emotion")

<class 'dict'>
[Expert] - using the testing fold: "fold1". Ps. Use -o config.downstream_expert.datarc.test_fold=fold2 to change test_fold in config.
[Expert] - Training path: data/English.json
[Expert] - Testing path: data/Spanish.json


In [15]:
train_loader, dev_loader, test_loader = B.get_train_dataloader(), B.get_dev_dataloader(), B.get_test_dataloader()

In [18]:
len(train_loader.dataset), len(dev_loader.dataset), len(test_loader.dataset)

(189, 48, 8)

In [19]:
B.forward()

TypeError: forward() missing 5 required positional arguments: 'mode', 'features', 'labels', 'filenames', and 'records'