# Make data splits

In [None]:
from pathlib import Path

# Seed used when shuffling the Punjabi file list, so the splits are reproducible.
RANDOM_STATE = 42

# Root folder of the IndicSUPERB corpus; language folders are resolved below it.
isb_dir = Path("/workspace/data/IndicSUPERB/")

## Punjabi data (target language)

In [2]:
import torchaudio
import pandas as pd

pa_dir = isb_dir / "punjabi"

# Sort the glob result first so that, for a fixed seed, the shuffle below
# always starts from the same file order.
pa_wav_paths = sorted(pa_dir.glob("audio/*.wav"))

pa_audio_df = (
    pd.DataFrame({'path': pa_wav_paths})
    .sample(frac=1.0, random_state=RANDOM_STATE)
)

# Length of every recording in frames, as reported by torchaudio.info.
pa_audio_df['num_frames'] = pa_audio_df['path'].map(lambda wav: torchaudio.info(wav).num_frames)

# Keep only the bare file name; the manifests written later store the audio
# directory once on their first line.
pa_audio_df['path'] = pa_audio_df['path'].map(lambda wav: wav.name)

pa_audio_df

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,path,num_frames
68813,844424933070449-1194-f.wav,140063
59674,844424932868816-775-f.wav,75790
83342,844424933635741-1199-m.wav,82106
27670,844424931382953-270-f.wav,100682
72831,844424933136076-196-f.wav,57957
...,...,...
6265,844424930708146-101-f.wav,116286
54886,844424932732343-496-f.wav,72818
76820,844424933153156-180-f.wav,88793
860,844424930469993-1158-f.wav,86936


In [3]:
# Running set of `pa_audio_df` row indices already assigned to a split; later
# cells add to it and use it to exclude those rows from further selections.
pa_selected_indices = set()

### Select 2 hour test set

In [4]:
# `pa_audio_df` is already shuffled, so the leading rows whose running frame
# total stays within 2 hours (counting 16_000 frames per second) form the test set.
pa_test_frames_so_far = pa_audio_df.num_frames.cumsum()
pa_test_2h = pa_audio_df[pa_test_frames_so_far <= (16_000 * 60 * 60 * 2)]

# Record the chosen rows so later selections can skip them.
pa_selected_indices = pa_selected_indices | set(pa_test_2h.index)

In [5]:
# Display the selected test split for a visual check.
pa_test_2h

Unnamed: 0,path,num_frames
68813,844424933070449-1194-f.wav,140063
59674,844424932868816-775-f.wav,75790
83342,844424933635741-1199-m.wav,82106
27670,844424931382953-270-f.wav,100682
72831,844424933136076-196-f.wav,57957
...,...,...
84119,844424933638938-886-m.wav,67617
41725,844424931566967-594-f.wav,75419
33070,844424931450718-1031-f.wav,60558
14623,844424930998225-389-m.wav,66874


### Select 1 hour validation set

In [6]:
# Rows whose index is not already in `pa_selected_indices`, still in shuffled order.
pa_valid_pool = pa_audio_df.loc[~pa_audio_df.index.isin(pa_selected_indices)]

# Leading rows of that pool whose running frame total stays within 1 hour
# (counting 16_000 frames per second).
pa_valid_1h = pa_valid_pool[pa_valid_pool.num_frames.cumsum() <= (16_000 * 60 * 60 * 1)]

# Record the chosen rows so later selections can skip them.
pa_selected_indices = pa_selected_indices | set(pa_valid_1h.index)

In [7]:
# Display the selected validation split for a visual check.
pa_valid_1h

Unnamed: 0,path,num_frames
17282,844424931120531-812-f.wav,98453
60501,844424932888217-850-f.wav,57586
56028,844424932784328-66-f.wav,114428
16249,844424931067342-812-f.wav,80991
36505,844424931505758-249-f.wav,104397
...,...,...
14882,844424931005405-1031-f.wav,81363
12689,844424930924365-930-f.wav,127431
59578,844424932867791-1200-f.wav,86936
1830,844424930566725-396-f.wav,93623


Expect the size of `pa_selected_indices` to equal the sum of the `test` and `validation` data frame lengths, i.e. the two splits do not overlap.

In [8]:
# `pa_selected_indices` is a set, so its size equals the summed split lengths
# only when the test and validation splits share no row index.
pa_splits_disjoint = len(pa_selected_indices) == len(pa_valid_1h) + len(pa_test_2h)

# Fail loudly: a bare comparison would only display False and let "Run All"
# continue with overlapping evaluation splits.
assert pa_splits_disjoint, "test and validation splits overlap (or the index has duplicates)"

pa_splits_disjoint

True

Use remaining data not assigned to test or validation sets.

In [9]:
# Everything whose index has not been recorded in `pa_selected_indices`,
# kept in the shuffled order of `pa_audio_df`.
pa_is_selected = pa_audio_df.index.isin(pa_selected_indices)
pa_not_selected = pa_audio_df.loc[~pa_is_selected]

### Select pre-training/fine-tuning subsets

#### Select pre-training sets

In [10]:
# Leading rows of `pa_not_selected` whose running frame total stays within
# 70 hours (counting 16_000 frames per second).
pa_pretrain_70h = pa_not_selected[pa_not_selected.num_frames.cumsum() <= (16_000 * 60 * 60 * 70)]

In [11]:
# Leading rows of `pa_not_selected` within a 10 hour frame budget. It starts
# from the same first row as the 70 hour selection, so it is a prefix of it.
pa_pretrain_10h = pa_not_selected[pa_not_selected.num_frames.cumsum() <= (16_000 * 60 * 60 * 10)]

#### Select 1h fine-tuning set

In [12]:
# Leading rows of `pa_not_selected` within a 1 hour frame budget. The
# pre-training selections are taken from the head of the same frame, so these
# files are also part of them.
pa_finetune_1h = pa_not_selected[pa_not_selected.num_frames.cumsum() <= (16_000 * 60 * 60 * 1)]

### Write pretraining data to repo

In [13]:
data_out = Path("/workspace/data/manifests/pretrain/")

# open(..., 'w') does not create missing parent folders, so make sure the
# output directory exists before writing any manifest.
data_out.mkdir(parents=True, exist_ok=True)

# Manifest file stem -> frame holding the rows to list in that manifest.
manifests_dict = {
    'punjabi_train-70h': pa_pretrain_70h,
    'punjabi_train-10h': pa_pretrain_10h,
    'punjabi_valid-1h': pa_valid_1h
}

for (tsv_name, tsv_data) in manifests_dict.items():
    manifest_tsv = data_out / (tsv_name + ".tsv")

    # The first line of each manifest is the audio root directory ...
    with open(manifest_tsv, 'w') as f:
        f.write("/workspace/data/IndicSUPERB/punjabi/audio\n")

    # ... followed by one tab-separated row per file, appended without header or index.
    tsv_data.to_csv(manifest_tsv, sep="\t", index=False, header=False, mode='a')

### Prepare fine-tuning data

In [14]:
# Stack the three splits used for supervised training and evaluation into one
# frame; it is used to pick out the transcriptions that are actually needed.
supervised_splits = [pa_test_2h, pa_valid_1h, pa_finetune_1h]
supervised_all = pd.concat(supervised_splits)

In [15]:
transcriptions = (
    pd.read_csv("/workspace/data/IndicSUPERB_punjabi_transcriptions.tsv", sep="\t")
    # Keep only the files that belong to one of the supervised splits.
    .loc[lambda df: df.file.isin(supervised_all.path)]
    # Mark word boundaries with "|" instead of a space.
    .assign(transcript=lambda df: df.transcript.str.replace(" ", "|"))
)

transcriptions

Unnamed: 0.1,Unnamed: 0,file,transcript
5,5126,844424933061065-1200-f.wav,ਮਾਪਿਆਂ|ਜਾਂ|ਦੇਖਭਾਲ|ਕਰਨ|ਵਾਲਿਆਂ|ਦੀ|ਰਿਪੋਰਟ|ਤਿਆਰ|ਕਰ...
43,5164,844424932719276-496-f.wav,ਧੱਬੇ|ਨੂੰ|ਹਟਾਉਣ|ਦੀ|ਗੁੰਝਲਤਾ|ਨੂੰ|ਉਹ|ਪ੍ਰਾਪਤ|ਕੀਤਾ|ਗ...
48,5169,844424933057478-704-f.wav,ਇਸ|ਮਾਮਲੇ|ਤੇ|ਪੁਲਿਸ|ਪੂਰੀ|ਕਾਰਵਾਈ|ਕਰ|ਰਹੀ|ਹੈ
84,5205,844424931500820-1200-f.wav,ਇਕ|ਕੁੜੀ|ਹੈ|ਜੋ|ਤਬਦੀਲ|ਨਾ|ਕਰੇਗਾ|ਦਾ|ਪਤਾ|ਕਰਨ|ਲਈ|ਕਰਨ...
141,5262,844424931282882-664-f.wav,ਇਹ|ਵਿਕਲਪਿਕ|ਚੀਜ਼|ਵੱਡੇ|ਪਰਿਵਾਰਾਂ|ਅਤੇ|ਸਮੂਹਾਂ|ਦੇ|ਲਈ...
...,...,...,...
86752,91873,844424933650549-601-m.wav,ਮੁਨਰੋ|ਇਸਦੇ|ਬੁਨਿਆਦ|ਰੱਖਣ|ਦੇ|ਸਮੇਂ|ਸੰਯੁਕਤ|ਰਾਜ|ਅਮਰੀ...
86755,91876,844424933610704-1-m.wav,ਨਿਊਜ਼|ਚੈਨਲ|ਨੇ|ਨਵੇਂ|ਟਰਾਂਸਜੈਂਡਰ|ਐਂਕਰਾਂ|ਨੂੰ|ਕੰਮ|ਲ...
86758,91879,844424933633726-601-m.wav,ਭਾਵੇਂ|ਉਸ|ਲੋੜ|ਦੀ|ਹੋਂਦ|ਹੋਵੇ|ਜਾਂ|ਨਾ|ਹੋਵੇ
86772,91893,844424933647743-601-m.wav,ਲੋਕਧਾਰਾ|ਅਧਿਐਨ|ਦੀਆਂ|ਨਵੀਆਂ|ਤਕਨੀਕਾਂ|ਪ੍ਰਾਪਤੀਆਂ|ਤੇ|...


In [16]:
from collections import Counter
from tqdm.contrib.concurrent import process_map

def get_vocab(texts_list):
    """Count how often each character occurs across a collection of texts.

    Parameters
    ----------
    texts_list : iterable of str
        Texts to scan; handed unchanged to ``process_map``, which builds one
        ``Counter`` per text and shows a progress bar.

    Returns
    -------
    collections.Counter
        Maps each character to its total number of occurrences over all
        texts. Keys appear in order of first occurrence, so ``most_common()``
        breaks ties by that order.
    """
    char_counts = process_map(Counter, texts_list, chunksize=1000)

    # Fold everything into one accumulator in place. Adding Counters with `+`
    # allocates a fresh Counter per addition; `update` gives the same totals
    # (all per-text counts are positive) without the intermediate copies.
    char_aggs = Counter()
    for text_counts in char_counts:
        char_aggs.update(text_counts)

    return char_aggs

In [17]:
# Character frequencies over the filtered transcripts; spaces were already
# replaced by "|" above, so the word-boundary symbol is counted as well.
ltr_counts = get_vocab(transcriptions.transcript)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2686/2686 [00:00<00:00, 75882.53it/s]


In [18]:
# Relative path: resolved against the kernel's current working directory.
finetune_data_out = Path("data/manifests/finetune_punjabi-1h/")

# open(..., 'w') does not create missing parent folders.
finetune_data_out.mkdir(parents=True, exist_ok=True)

dict_ltr_file = finetune_data_out / "dict.ltr.txt"

# One "<symbol> <count>" line per character, most frequent first. The symbols
# are non-ASCII, so pin the encoding instead of relying on the locale default.
with open(dict_ltr_file, 'w', encoding='utf-8') as f:
    f.writelines(f"{ltr} {count}\n" for (ltr, count) in ltr_counts.most_common())

In [19]:
# Split name (used as the output file stem) -> frame listing that split's files.
finetune_data_dict = {
    'train-1h': pa_finetune_1h,
    'valid-1h': pa_valid_1h,
    'test-2h': pa_test_2h
}

for split, split_files in finetune_data_dict.items():

    # Join each file with its transcript on the file name. All three outputs
    # below are written from this one frame so their lines stay aligned.
    data = split_files.rename(columns={'path': 'file'}).merge(transcriptions, on='file')

    ltr_file = finetune_data_out / f"{split}.ltr"

    # Letter file: space-separated characters wrapped in "|".
    # NOTE(review): this relies on pandas treating the empty split pattern as
    # a regex, which yields every character plus an empty string at both ends
    # (hence the spaces next to the outer "|"); re-check if pandas is upgraded.
    # The transcripts are non-ASCII, so the encoding is pinned to UTF-8.
    with open(ltr_file, 'w', encoding='utf-8') as f:
        f.writelines("|" + " ".join(t) + "|\n" for t in data.transcript.str.split(""))

    wrd_file = finetune_data_out / f"{split}.wrd"

    # Word file: the transcript with "|" turned back into spaces, one per line.
    # The text is a single string, so write() is used; writelines() would
    # iterate over it one character at a time.
    with open(wrd_file, 'w', encoding='utf-8') as f:
        f.write("\n".join(data.transcript.str.replace("|", " ", regex=False)) + "\n")

    tsv_file = finetune_data_out / f"{split}.tsv"

    # Manifest: audio root on the first line, then "<file>\t<num_frames>" rows.
    with open(tsv_file, 'w', encoding='utf-8') as f:
        f.write("/workspace/data/IndicSUPERB/punjabi/audio\n")
        f.writelines(f"{file}\t{n_frames}\n" for (file, n_frames) in zip(data.file, data.num_frames))

## Transfer languages

### Hindi

In [23]:
hi_dir = isb_dir / "hindi"

# Sorted so the row order (and therefore any seeded shuffle of it) is stable.
hi_wav_paths = sorted(hi_dir.glob("audio/*.wav"))

hi_audio_df = pd.DataFrame({'path': hi_wav_paths})

# Length of every recording in frames, as reported by torchaudio.info.
hi_audio_df['num_frames'] = hi_audio_df['path'].map(lambda wav: torchaudio.info(wav).num_frames)

# Keep only the bare file name; the manifests store the audio directory once
# on their first line.
hi_audio_df['path'] = hi_audio_df['path'].map(lambda wav: wav.name)

hi_audio_df

Unnamed: 0,path,num_frames
0,844424930324966-261-f.wav,97710
1,844424930324970-261-f.wav,82106
2,844424930324972-261-f.wav,60929
3,844424930324975-261-f.wav,68731
4,844424930324976-261-f.wav,112571
...,...,...
96248,844424933583343-256-m.wav,76162
96249,844424933583344-256-m.wav,43096
96250,844424933583345-256-m.wav,62787
96251,844424933583346-256-m.wav,74676


#### Select 3 different subsets of Hindi data

In [27]:
# Write three Hindi manifests, each from a differently seeded shuffle.
for SEED in (1, 2, 3):
    hi_shuffled = hi_audio_df.sample(frac=1.0, random_state=SEED)

    # Leading rows of the shuffle whose running frame total stays within
    # 60 hours (counting 16_000 frames per second).
    hi_sample_df = hi_shuffled[hi_shuffled.num_frames.cumsum() <= (16_000 * 60 * 60 * 60)]

    tsv_name = f"hindi_train-60h-seed-{SEED}"
    manifest_tsv = data_out / f"{tsv_name}.tsv"

    # Audio root on the first line ...
    with open(manifest_tsv, 'w') as f:
        f.write("/workspace/data/IndicSUPERB/hindi/audio\n")

    # ... then one tab-separated row per file, appended without header or index.
    hi_sample_df.to_csv(manifest_tsv, sep="\t", index=False, header=False, mode='a')