### Import audio and transcription data

In [None]:
#| eval: false

from glob import glob
from pathlib import Path
from tqdm import tqdm

# Pair every audio slice with its transcript (matched on the file stem) and
# load the transcript text into `audios_data`, keyed by slice id.
#
# Earlier data locations, kept for reference:
# audios = glob('selected/*.wav')
# transcripts = glob('Panda_Express_Transcripts/*/*/*.txt')
audios = glob(f'{Path.home()}/.cache/panda/audio_slices/*.wav')
transcripts = glob(f'{Path.home()}/.cache/panda/trans_slices/*.txt')

# Index transcripts by id once, instead of rescanning the whole transcript
# list for every audio file (which made the loop quadratic).
# `Path(...).stem` strips only the final extension, unlike
# `.replace('.txt', '')`, which would also remove the substring mid-name.
transcripts_by_id = {}
for transcript_path in transcripts:
    transcripts_by_id.setdefault(Path(transcript_path).stem, []).append(transcript_path)

audios_data = {}
for a in tqdm(audios):
    audio_id = Path(a).stem  # avoids shadowing the builtin `id`
    trans_file = transcripts_by_id.get(audio_id, [])
    if len(trans_file) > 1:
        # Several transcripts share this id: dump them all for manual inspection.
        for file in trans_file:
            with open(file, 'r') as f:
                print(f"{file} ---- {audio_id}")
                print(f.read())
        print('-------------------------------')
    if len(trans_file) == 0:
        # No transcript for this slice: report the id and skip it.
        print(audio_id)
        continue
    # When duplicates exist, the first match (in glob order) is used.
    with open(trans_file[0], 'r') as f:
        text = f.read()
    audios_data[audio_id] = {'audio_file': a, 'trans_file': trans_file[0], 'transcript': text}
print(f"Trackers: {len(audios_data)}")

100%|██████████| 29887/29887 [05:31<00:00, 90.25it/s]

Trackers: 29887





### Import item tag data

In [None]:
#| eval: false

import json

# Read the tag configuration and turn each entry into a working record with
# an (initially empty) list of matching files.
with open('pd-112705/12-tag.json', 'r') as f:
    config = json.load(f)

tags = []
for entry in config:
    tags.append({
        'tag_class': entry['tag_class'],
        'tags': entry['tags'],
        'files': [],  # filled in later with the ids of matching audio slices
    })

print(f"items: {len(tags)}")


items: 474


### Group transcriptions and audio by items

In [None]:
#| eval: false

from collections import defaultdict
import re
from random import uniform, seed

# Group audio slices by menu item and assign each slice to train/val/test.
#
# 1. For every tag entry, collect the ids of slices whose transcript contains
#    any of its tag strings (plain substring match).
# 2. Merge entries whose cleaned `tag_class` is the same into one item.
# 3. De-duplicate, then draw a random ~80/10/10 split per file.
seed(1991)


def _split_name(r):
    """Map a draw from uniform(0, 1) to a split name (<0.8 train, <0.9 val, else test)."""
    if r < 0.8:
        return 'train'
    if r < 0.9:
        return 'val'
    return 'test'


for tag in tags:
    # Rebuild the list from scratch so re-running this cell does not keep
    # appending to the results of a previous run.
    tag['files'] = [
        k for k, v in audios_data.items()
        if any(t in v['transcript'] for t in tag['tags'])
    ]

data = {}
for tag in tags:
    # Strip bookkeeping words from the class name so variants of the same
    # item collapse onto one key.
    label = tag['tag_class'].lower()
    label = re.sub(r'\bdefault\b|\btag class\b|\basr\b|\bfollowup\b|\broot\b|\bunavailable\b|\bcorrection\b|\bfollow-up\b', '', label).strip()
    # Start from fresh lists: aliasing tag['tags'] / tag['files'] here would
    # make the extend() calls below silently mutate the entries of `tags`.
    item = data.setdefault(label, {'tags': [], 'files': []})
    item['tags'].extend(tag['tags'])
    item['files'].extend(tag['files'])

for v in data.values():
    # sorted(set(...)) instead of list(set(...)): the iteration order of a set
    # of strings changes between interpreter runs (hash randomization), which
    # would pair the seeded random draws with different files each session.
    v['tags'] = sorted(set(v['tags']))
    v['files'] = sorted(set(v['files']))
    v['split'] = [_split_name(uniform(0, 1)) for _ in v['files']]

# Show the five items with the most files.
sorted([(k, len(v['files'])) for k, v in data.items()], key=lambda x: x[1], reverse=True)[:5]



[('the original orange chicken', 2005),
 ('kung pao chicken', 1585),
 ('chow mein', 1504),
 ('grilled teriyaki chicken', 1487),
 ('mushroom chicken', 1385)]

In [None]:
#| eval: false

# Persist the grouped item data (tags, files, split) as JSON.
with open('tags_data.json', 'w') as f:
    serialized = json.dumps(data)
    f.write(serialized)

In [None]:
#| eval: false

# Sanity check: list every item whose tag list contains exactly 'chow mein'.
tags_pool = []
for label, info in data.items():
    if 'chow mein' in info['tags']:
        tags_pool.append(label)
tags_pool

['chow mein']

### Add label

### Make train/val/test datasets covering all items

In [None]:
#| eval: false

from pathlib import Path
import shutil
import pandas as pd

# Flatten the per-item data into (label, path, split) rows and write one CSV
# per split. Audio files are referenced in place in the slices cache, not copied.
slices_dir = f"{Path.home()}/.cache/panda/audio_slices"
records = [
    (label, f"{slices_dir}/{file_id}.wav", split_name)
    for label, info in data.items()
    for file_id, split_name in zip(info['files'], info['split'])
]

df = pd.DataFrame.from_records(records, columns=['label', 'path', 'split'])

Path("dataset").mkdir(parents=True, exist_ok=True)

# Boolean masks select the same rows (and keep the same index) as the
# equivalent query strings.
train = df[df['split'] == 'train']
val = df[df['split'] == 'val']
test = df[df['split'] == 'test']

train.to_csv('dataset/slices_train.csv')
val.to_csv('dataset/slices_val.csv')
test.to_csv('dataset/slices_test.csv')

In [None]:
#| eval: false

# Preview the first rows of the combined (label, path, split) table.
df.head()

Unnamed: 0,label,path,split
0,aquafina,/home/jovyan/.cache/panda/audio_slices/water_0...,train
1,aquafina,/home/jovyan/.cache/panda/audio_slices/vitamin...,train
2,aquafina,/home/jovyan/.cache/panda/audio_slices/water_0...,val
3,aquafina,/home/jovyan/.cache/panda/audio_slices/waters_...,train
4,aquafina,/home/jovyan/.cache/panda/audio_slices/water_0...,test
