# Preprocess Common Voice dataset

In this notebook, I download a small speech dataset and store it in a csv file for easy manipulation.

In [2]:
#make directory to store the data and go there
import os

if not(os.path.isdir('../data')):
    os.mkdir('../data')

os.chdir('../data')

The files are downloaded from [The Common Voice portuguese dataset](https://commonvoice.mozilla.org/pt/datasets). The version downloaded is the Common Voice Delta Segment 16.1, with 5 hours and 90 different speakers.

In [6]:
#unzip file
import tarfile

fileslist = list(os.walk('.'))[0][2]

with tarfile.open(fileslist[0], 'r:gz') as tar_handle:
    tar_handle.extractall(path=os.getcwd())

In [18]:
#open pandas dataframe with the 'validated' file

import pandas as pd

raw_df1 = pd.read_csv('.\\cv-corpus-16.1-delta-2023-12-06\pt\\validated.tsv',delimiter='\t')
raw_df2 = pd.read_csv('.\\cv-corpus-16.1-delta-2023-12-06\pt\\other.tsv',delimiter='\t')

raw_df = pd.concat([raw_df1,raw_df2],ignore_index=True)

raw_df

Unnamed: 0,client_id,path,sentence,up_votes,down_votes,age,gender,accents,variant,locale,segment
0,28a0e5ec608af5cd213913d7983eff7c8a8f4dad4c47f6...,common_voice_pt_38600236.mp3,"Doze, treze, quatorze, quinze, dezesseis, deze...",3,0,,,,,pt,
1,3e030fe5ca1c7fd273d9f3ab2394bb93bb9b460b5ad12a...,common_voice_pt_38899159.mp3,"Bom dia, gostaria de falar com o Zé.",2,0,twenties,male,,Portuguese (Brasil),pt,
2,405c4b571432296f236aba3541f91aa1ae496d0a97d715...,common_voice_pt_38944527.mp3,"um, dois, três, quatro, cinco, seis, sete oito...",3,0,thirties,male,"Mineiro,Interior",Portuguese (Brasil),pt,
3,9f15e5d009052e2752891456f57a5b659dcab8e0bdde68...,common_voice_pt_38877240.mp3,O passeio foi agradável.,2,0,fifties,male,,,pt,
4,cb0e457a08c8e5b6bcba0731b1810349dd6b381a6d078a...,common_voice_pt_38566983.mp3,Deck de cartas ou cards,2,0,,,,,pt,
...,...,...,...,...,...,...,...,...,...,...,...
3389,e688244ae020b9896c8a30fb2819a7238b8e1563fecefc...,common_voice_pt_39584736.mp3,Nós agradecemos a ele,0,0,,,,,pt,
3390,e688244ae020b9896c8a30fb2819a7238b8e1563fecefc...,common_voice_pt_39584738.mp3,Nossa estratégia está alinhada com nossos obje...,0,0,,,,,pt,
3391,e688244ae020b9896c8a30fb2819a7238b8e1563fecefc...,common_voice_pt_39584739.mp3,Ainda tinha tempo para ir à papelaria comprar ...,0,0,,,,,pt,
3392,e688244ae020b9896c8a30fb2819a7238b8e1563fecefc...,common_voice_pt_39584741.mp3,Venture Capital é capital de risco.,0,0,,,,,pt,


In [19]:
#the client_id field indicates the speaker, so we group them by speaker
speakers = raw_df.client_id.unique()

print(speakers)
print(f'\nThis dataset has {len(speakers)} speakers.')

['28a0e5ec608af5cd213913d7983eff7c8a8f4dad4c47f62fc8ca0e3745eec8be99c1a7433d49b4518fb2263d559030b939dd6799e177958164ef94ee73bf5e93'
 '3e030fe5ca1c7fd273d9f3ab2394bb93bb9b460b5ad12ae45eac93f5c7c1ffcf438ef3a3a2676b470d4ef761cdae8fcf8a777dc91ae3e2665511fee6ac25fd1d'
 '405c4b571432296f236aba3541f91aa1ae496d0a97d715d922b58df7ca15365fdc05da4a1ec1b382c4e9f407544a09c818df3f7fdb766a93d01ecb452535c12c'
 '9f15e5d009052e2752891456f57a5b659dcab8e0bdde6897db3f3ffb98ae592dd9057b7990e4ebfe53c0aa80bcf07668069d0775ca07e3b7fd2687df7e90b0aa'
 'cb0e457a08c8e5b6bcba0731b1810349dd6b381a6d078a5e45eef9d7b2c8e8a544ad463687b6346a2977b71dc574c53ba45b65889e36db2af61bf36f4704fcde'
 'd2b517d9ef7aec6b63d41fefeee036f63e8342e743c4f3f2e47c85c98c06df33c402a8e43144c8916f4aeb570f852f65ccf950aa1eb244290300e00e0d3272d2'
 '5c1bf3b021aaea27558efe192d63172a57da33604898e930ef1b269be02f017eed9a93f837012b03af4f56e573de49fe74b6770ccdc905ea1b41dda3a3ea4128'
 '5e7d8d86471f5e3edf96c68b4f0b23d542376fc810cffa77840fe42531a34055167e55325e

In [26]:
processed_df = raw_df[['client_id','path']]

processed_df = processed_df.rename(columns={'client_id':'speaker'})

processed_df

Unnamed: 0,speaker,path
0,28a0e5ec608af5cd213913d7983eff7c8a8f4dad4c47f6...,common_voice_pt_38600236.mp3
1,3e030fe5ca1c7fd273d9f3ab2394bb93bb9b460b5ad12a...,common_voice_pt_38899159.mp3
2,405c4b571432296f236aba3541f91aa1ae496d0a97d715...,common_voice_pt_38944527.mp3
3,9f15e5d009052e2752891456f57a5b659dcab8e0bdde68...,common_voice_pt_38877240.mp3
4,cb0e457a08c8e5b6bcba0731b1810349dd6b381a6d078a...,common_voice_pt_38566983.mp3
...,...,...
3389,e688244ae020b9896c8a30fb2819a7238b8e1563fecefc...,common_voice_pt_39584736.mp3
3390,e688244ae020b9896c8a30fb2819a7238b8e1563fecefc...,common_voice_pt_39584738.mp3
3391,e688244ae020b9896c8a30fb2819a7238b8e1563fecefc...,common_voice_pt_39584739.mp3
3392,e688244ae020b9896c8a30fb2819a7238b8e1563fecefc...,common_voice_pt_39584741.mp3


In [27]:
processed_df['path'] = '..\\data.\\cv-corpus-16.1-delta-2023-12-06\\pt\\clips\\' + processed_df['path']

processed_df

Unnamed: 0,speaker,path
0,28a0e5ec608af5cd213913d7983eff7c8a8f4dad4c47f6...,..\data.\cv-corpus-16.1-delta-2023-12-06\pt\cl...
1,3e030fe5ca1c7fd273d9f3ab2394bb93bb9b460b5ad12a...,..\data.\cv-corpus-16.1-delta-2023-12-06\pt\cl...
2,405c4b571432296f236aba3541f91aa1ae496d0a97d715...,..\data.\cv-corpus-16.1-delta-2023-12-06\pt\cl...
3,9f15e5d009052e2752891456f57a5b659dcab8e0bdde68...,..\data.\cv-corpus-16.1-delta-2023-12-06\pt\cl...
4,cb0e457a08c8e5b6bcba0731b1810349dd6b381a6d078a...,..\data.\cv-corpus-16.1-delta-2023-12-06\pt\cl...
...,...,...
3389,e688244ae020b9896c8a30fb2819a7238b8e1563fecefc...,..\data.\cv-corpus-16.1-delta-2023-12-06\pt\cl...
3390,e688244ae020b9896c8a30fb2819a7238b8e1563fecefc...,..\data.\cv-corpus-16.1-delta-2023-12-06\pt\cl...
3391,e688244ae020b9896c8a30fb2819a7238b8e1563fecefc...,..\data.\cv-corpus-16.1-delta-2023-12-06\pt\cl...
3392,e688244ae020b9896c8a30fb2819a7238b8e1563fecefc...,..\data.\cv-corpus-16.1-delta-2023-12-06\pt\cl...


In [28]:
processed_df.to_csv('cv_rough.csv')