In [5]:
import os
import pandas as pd

from torch.utils.data import DataLoader

from lhotse import Fbank, FbankConfig, load_manifest_lazy
from lhotse.manipulation import combine
from lhotse.dataset import DynamicBucketingSampler, K2SpeechRecognitionDataset, OnTheFlyFeatures

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
split = 'train'
FLEURS_PATH = '/exp/ddegenaro/fleurs'

In [7]:
langs = ['ar_eg', 'en_us', 'es_419', 'fr_fr', 'pt_br', 'ru_ru']
fnames = [
    f'fleurs-{lang}_recordings_{split}.jsonl.gz'
    for lang in langs
]
paths = [
    os.path.join(FLEURS_PATH, langs[i], fnames[i])
    for i in range(len(langs))
]

In [3]:
recordings = [
    load_manifest_lazy(path)
    for path in paths
]
supervisions = [
    load_manifest_lazy(path.replace('recordings', 'supervisions'))
    for path in paths
]

In [8]:
paths

['/exp/ddegenaro/fleurs/ar_eg/fleurs-ar_eg_recordings_train.jsonl.gz',
 '/exp/ddegenaro/fleurs/en_us/fleurs-en_us_recordings_train.jsonl.gz',
 '/exp/ddegenaro/fleurs/es_419/fleurs-es_419_recordings_train.jsonl.gz',
 '/exp/ddegenaro/fleurs/fr_fr/fleurs-fr_fr_recordings_train.jsonl.gz',
 '/exp/ddegenaro/fleurs/pt_br/fleurs-pt_br_recordings_train.jsonl.gz',
 '/exp/ddegenaro/fleurs/ru_ru/fleurs-ru_ru_recordings_train.jsonl.gz']

In [9]:
recordings_df = pd.read_json(
    paths[1],
    lines=True,
    compression='gzip'
)
supervisions_df = pd.read_json(
    paths[1].replace('recordings', 'supervisions'),
    lines=True,
    compression='gzip'
)

In [23]:
supervisions_df

Unnamed: 0,id,recording_id,start,duration,channel,text,language,speaker,gender,custom
0,1.511116e+23,1.155955e+19,0,7.50,0,sir richard branson's virgin group had a bid f...,en_us,1511,FEMALE,{'raw_text': 'Sir Richard Branson's Virgin Gro...
1,5.151452e+22,4.521018e+18,0,12.36,0,the approach to obtaining information was diff...,en_us,5151,FEMALE,{'raw_text': 'The approach to obtaining inform...
2,2.115197e+21,5.196819e+18,0,8.58,0,hsieh implied during the election that ma migh...,en_us,211,FEMALE,{'raw_text': 'Hsieh implied during the electio...
3,5.811316e+22,1.315974e+19,0,9.66,0,according to ansa police were concerned by a c...,en_us,581,FEMALE,"{'raw_text': 'According to Ansa, ""police were ..."
4,3.931618e+22,6.183930e+18,0,7.44,0,with the battle for france over germany began ...,en_us,3931,MALE,"{'raw_text': 'With the battle for France over,..."
...,...,...,...,...,...,...,...,...,...,...
2597,1.024280e+23,8.004854e+18,0,8.40,0,tokyo will be the only asian city to have host...,en_us,10242,MALE,{'raw_text': 'Tokyo will be the only Asian cit...
2598,1.331248e+23,4.815251e+18,0,7.44,0,while project based learning should make learn...,en_us,13312,MALE,{'raw_text': 'While project based learning sho...
2599,5.402520e+22,5.202453e+18,0,4.62,0,the colonists seeing this activity had also ca...,en_us,5402,MALE,"{'raw_text': 'The Colonists, seeing this activ..."
2600,1.144239e+22,3.889964e+17,0,15.12,0,no major damage or injuries have been reported...,en_us,11442,FEMALE,{'raw_text': 'No major damage or injuries have...


In [10]:
recordings_df.head()
recordings_df['supervisions'] = supervisions_df

Unnamed: 0,id,sources,sampling_rate,num_samples,duration,channel_ids
0,1.155955e+19,"[{'type': 'file', 'channels': [0], 'source': '...",16000,120000,7.5,[0]
1,4.521018e+18,"[{'type': 'file', 'channels': [0], 'source': '...",16000,197760,12.36,[0]
2,5.196819e+18,"[{'type': 'file', 'channels': [0], 'source': '...",16000,137280,8.58,[0]
3,1.315974e+19,"[{'type': 'file', 'channels': [0], 'source': '...",16000,154560,9.66,[0]
4,6.18393e+18,"[{'type': 'file', 'channels': [0], 'source': '...",16000,119040,7.44,[0]


In [13]:
TEST_PATH = '/expscratch/mwiesner/scale23/scale2023/icefall/tools/icefall/egs/scale24/ASR/data/manifests/english/cuts_fleurs_dev.jsonl.gz'

In [17]:
test_df = pd.read_json(
    TEST_PATH,
    lines=True,
    compression='gzip'
)
test_df.head()

Unnamed: 0,id,start,duration,channel,supervisions,recording,type
0,1.590115e+24,0,11.46,0,"[{'id': '1590_1_15158676295442294624', 'record...","{'id': '15158676295442294624', 'sources': [{'t...",MonoCut
1,1.590213e+24,0,10.58,0,"[{'id': '1590_2_12952903060751652532', 'record...","{'id': '12952903060751652532', 'sources': [{'t...",MonoCut
2,1.544116e+24,0,12.84,0,"[{'id': '1544_1_16131823300806444840', 'record...","{'id': '16131823300806444840', 'sources': [{'t...",MonoCut
3,1.544226e+23,0,9.12,0,"[{'id': '1544_2_2606692427476446963', 'recordi...","{'id': '2606692427476446963', 'sources': [{'ty...",MonoCut
4,1.544328e+23,0,9.12,0,"[{'id': '1544_3_2812938565630042744', 'recordi...","{'id': '2812938565630042744', 'sources': [{'ty...",MonoCut


In [22]:
test_df['supervisions'][0][0] # list of dict

{'id': '1590_1_15158676295442294624',
 'recording_id': '15158676295442294624',
 'start': 0.0,
 'duration': 11.46,
 'channel': 0,
 'text': "the main local beer is number one' it is not a complex beer but pleasant and refreshing the other local beer is called manta",
 'language': 'en_us',
 'speaker': '1590_1',
 'gender': 'MALE',
 'custom': {'raw_text': 'The main local beer is \'Number One\', it is not a complex beer, but pleasant and refreshing. The other local beer is called "Manta".'}}