In [None]:
import glob
import os
import pandas as pd

## Embeddings

In [None]:
!wget -O 'embeddings/word2vec_cbow_s50.zip' 'http://143.107.183.175:22980/download.php?file=embeddings/word2vec/cbow_s50.zip'
!wget -O 'embeddings/word2vec_skip_s50.zip' 'http://143.107.183.175:22980/download.php?file=embeddings/word2vec/skip_s50.zip'
!wget -O 'embeddings/wang2vec_cbow_s50.zip' 'http://143.107.183.175:22980/download.php?file=embeddings/wang2vec/cbow_s50.zip'
!wget -O 'embeddings/wang2vec_skip_s50.zip' 'http://143.107.183.175:22980/download.php?file=embeddings/wang2vec/skip_s50.zip'

In [None]:
!cp embeddings /content/drive/My Drive/TCC_data/embeddings/

## Mozilla

In [None]:
!wget 'https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-5.1-2020-06-22/pt.tar.gz'
!tar -xzf pt.tar.gz

df = pd.read_csv('cv-corpus-5.1-2020-06-22/pt/validated.tsv', sep='\t')
dfs = dict()

genders = df['gender'].dropna().unique()
ages = df['age'].dropna().unique()

for gender in genders:
    _df = df[df['gender'] == gender]
    for age in ages:
        dfs['{}_{}'.format(gender, age)] = _df[_df['age'] == age]

final_df = pd.DataFrame(columns=df.columns)
for v in dfs.values():
    final_df = final_df.append(v.sample(min(1500, len(v))))

final_df = final_df.sample(frac=1)
final_df.reset_index(inplace=True, drop=True)
final_df.rename(columns={'path': 'filepath'}, inplace=True)

# os.makedirs('mozilla/wav')
for f in final_df['filepath'].values:
    os.rename('/content/cv-corpus-5.1-2020-06-22/pt/clips/{}'.format(f), 'mozilla/wav/{}'.format(f))

final_df['filepath'] = final_df['filepath'].apply(lambda x: 'mozilla/wav/{}'.format(x))
final_df.to_csv('mozilla/sentences.tsv', sep='\t', index=False)

In [None]:
# Archive the prepared mozilla/ folder (wav clips plus sentences.tsv) and copy
# the zip to the corpus folder under /content/drive/My Drive.
!zip -r mozilla.zip mozilla
!cp mozilla.zip '/content/drive/My Drive/TCC_data/corpus/'

## Voxforge

In [None]:
%%bash
#!/bin/bash
# Mirror the VoxForge `pt` 48kHz_16bit uploads and flatten them into
# voxforge/wav plus a single voxforge/sentences.txt prompt list.

# Recursive download limited to one level below the listing page; the files
# end up under a directory named after the host (entered with `cd` below).
wget -r -l1  http://www.repository.voxforge1.org/downloads/pt/Trunk/Audio/Original/48kHz_16bit/
mkdir voxforge
mkdir voxforge/wav
cd www.repository.voxforge1.org
# Unpack every downloaded .tgz; -exec runs tar from this directory, so the
# archives are extracted here.
find . -name '*.tgz' -exec tar zxvf '{}' \;
# Collect all recordings in one folder. --backup=t keeps clashing file names
# as numbered backups (name.wav.~1~, name.wav.~2~, ...) instead of overwriting.
find . -name '*.wav' -exec mv --backup=t '{}' ../voxforge/wav \;
# Concatenate every prompts-original file. The redirection belongs to the
# find command itself, so all cat output is appended to the same file.
find . -name 'prompts-original' -exec cat '{}' >> ../voxforge/sentences.txt \;
cd ../voxforge/wav
# Turn the numbered backups back into wav files: name.wav.~2~ -> name_2.wav.
rename 's/((?:\..+)?)\.~(\d+)~$/_$2$1/' *.~*~
# Discard recordings whose file names start with `ar` or `rp`; the matching
# prompt lines (prefixes `ar-` / `rp-`) are removed from sentences.txt below.
rm ar*.wav
rm rp*.wav
cd ../
# NOTE(review): `uniq -d` prints one copy of each line that occurs MORE THAN
# ONCE and drops lines that occur only once. If the goal is plain
# de-duplication, `sort -u` would be the usual choice -- confirm the intent.
sort sentences.txt | uniq -d > tmp && mv tmp sentences.txt
sed -i '/^ar-/d' sentences.txt
sed -i '/^rp-/d' sentences.txt

In [None]:
# Build a prompt-id -> transcription lookup from the merged prompt list.
# Every line is split at its first space: the part before it is the id, the
# rest (stripped) is the transcription. A later line with the same id
# overrides an earlier one.
with open('voxforge/sentences.txt') as f:
    prompt_lines = f.read().split('\n')

sentences = {}
for line in prompt_lines:
    prompt_id, _, text = line.partition(' ')
    sentences[prompt_id] = text.strip()

# Blank lines (e.g. the one after the final newline) produce an '' key.
# pop() with a default removes it without raising when no such line exists.
sentences.pop('', None)

In [None]:
# Pair every VoxForge recording with its transcription.
# The prompt id is the file name up to the first '.', with any '_<n>' suffix
# removed, and is looked up in the `sentences` mapping built earlier.
filepath = glob.glob('voxforge/wav/*.wav')
sentence = [
    sentences[wav.rsplit('/', 1)[-1].split('.')[0].split('_')[0]]
    for wav in filepath
]

In [None]:
# Write the (filepath, sentence) pairs as a tab-separated index without the row index.
voxforge_index = pd.DataFrame({'filepath': filepath, 'sentence': sentence})
voxforge_index.to_csv('voxforge/sentences.tsv', sep='\t', index=False)

In [None]:
# The raw prompt list has been replaced by sentences.tsv, so remove it before
# archiving; then zip voxforge/ and copy the archive to the corpus folder
# under /content/drive/My Drive.
!rm voxforge/sentences.txt
!zip -r voxforge.zip voxforge
!cp voxforge.zip '/content/drive/My Drive/TCC_data/corpus/'

## Código de Defesa do Consumidor

In [None]:
%%bash
#!/bin/bash
# Download the codigodefesaconsumidor16k archive and move its recordings into
# cod_def_cons/wav. The extracted codigodefesaconsumidor16k-master/ folder is
# left in place because a later cell opens the .txt files inside it.

wget https://gitlab.com/fb-audio-corpora/codigodefesaconsumidor16k/-/archive/master/codigodefesaconsumidor16k-master.zip
# -o overwrites files from a previous extraction instead of prompting, so the
# cell can be run again.
unzip -o codigodefesaconsumidor16k-master.zip
# -p creates the parent folder too and does not fail when the folders exist.
mkdir -p cod_def_cons/wav
mv codigodefesaconsumidor16k-master/*.wav cod_def_cons/wav

In [None]:
# Pair every recording in cod_def_cons/wav with the text of the same-named
# .txt file from the extracted archive folder.
files = glob.glob('cod_def_cons/wav/*.wav')

filepath, sentence = [], []
for wav_path in files:
    # File name up to the first '.', used to locate the matching transcript.
    stem = wav_path.rsplit('/', 1)[-1].split('.')[0]
    transcript_path = 'codigodefesaconsumidor16k-master/{}.txt'.format(stem)
    with open(transcript_path) as handle:
        transcript = handle.read().strip()
    filepath.append(wav_path)
    sentence.append(transcript)

In [None]:
# Write the (filepath, sentence) pairs as a tab-separated index without the row index.
cod_def_cons_index = pd.DataFrame({'filepath': filepath, 'sentence': sentence})
cod_def_cons_index.to_csv('cod_def_cons/sentences.tsv', sep='\t', index=False)

In [None]:
# Archive the prepared cod_def_cons/ folder and copy the zip to the corpus
# folder under /content/drive/My Drive.
!zip -r cod_def_cons.zip cod_def_cons
!cp cod_def_cons.zip '/content/drive/My Drive/TCC_data/corpus/'

## Constituição

In [None]:
%%bash
#!/bin/bash
# Download the constituicao16k archive and move its recordings into
# constituicao/wav. The extracted constituicao16k-master/ folder is left in
# place because a later cell opens the .txt files inside it.

wget https://gitlab.com/fb-audio-corpora/constituicao16k/-/archive/master/constituicao16k-master.zip
# -o overwrites files from a previous extraction instead of prompting, so the
# cell can be run again.
unzip -o constituicao16k-master.zip
# -p creates the parent folder too and does not fail when the folders exist.
mkdir -p constituicao/wav
mv constituicao16k-master/*.wav constituicao/wav/

In [None]:
# Pair every recording in constituicao/wav with the text of the same-named
# .txt file from the extracted archive folder.
files = glob.glob('constituicao/wav/*.wav')

filepath, sentence = [], []
for wav_path in files:
    # File name up to the first '.', used to locate the matching transcript.
    stem = wav_path.rsplit('/', 1)[-1].split('.')[0]
    transcript_path = 'constituicao16k-master/{}.txt'.format(stem)
    with open(transcript_path) as handle:
        transcript = handle.read().strip()
    filepath.append(wav_path)
    sentence.append(transcript)

In [None]:
# Write the (filepath, sentence) pairs as a tab-separated index without the row index.
constituicao_index = pd.DataFrame({'filepath': filepath, 'sentence': sentence})
constituicao_index.to_csv('constituicao/sentences.tsv', sep='\t', index=False)

In [None]:
# Archive the prepared constituicao/ folder and copy the zip to the corpus
# folder under /content/drive/My Drive.
!zip -r constituicao.zip constituicao
!cp constituicao.zip '/content/drive/My Drive/TCC_data/corpus/'

In [None]:
%%bash
#!/bin/bash
# Download the lapsbm16k archive and gather its recordings and transcripts,
# wherever they sit inside the extracted tree, into lapsbm/wav and lapsbm/txt.

wget https://gitlab.com/fb-audio-corpora/lapsbm16k/-/archive/master/lapsbm16k-master.zip
# -o overwrites files from a previous extraction instead of prompting, so the
# cell can be run again.
unzip -o lapsbm16k-master.zip
# -p creates the parent folder too and does not fail when the folders exist.
mkdir -p lapsbm/wav lapsbm/txt
find lapsbm16k-master/ -name '*.wav' -exec mv '{}' lapsbm/wav \;
find lapsbm16k-master/ -name '*.txt' -exec mv '{}' lapsbm/txt \;

In [None]:
# Pair every recording in lapsbm/wav with the text of the same-named .txt
# file in lapsbm/txt.
files = glob.glob('lapsbm/wav/*.wav')

filepath, sentence = [], []
for wav_path in files:
    # File name up to the first '.', used to locate the matching transcript.
    stem = wav_path.rsplit('/', 1)[-1].split('.')[0]
    transcript_path = 'lapsbm/txt/{}.txt'.format(stem)
    with open(transcript_path) as handle:
        transcript = handle.read().strip()
    filepath.append(wav_path)
    sentence.append(transcript)

In [None]:
# Write the (filepath, sentence) pairs as a tab-separated index without the row index.
lapsbm_index = pd.DataFrame({'filepath': filepath, 'sentence': sentence})
lapsbm_index.to_csv('lapsbm/sentences.tsv', sep='\t', index=False)

In [None]:
# The transcripts now live in sentences.tsv, so drop the txt folder before
# archiving; then zip lapsbm/ and copy the archive to the corpus folder under
# /content/drive/My Drive.
!rm -rf lapsbm/txt/
!zip -r lapsbm.zip lapsbm
!cp lapsbm.zip '/content/drive/My Drive/TCC_data/corpus/'