### Load the Common Language dataset to train a language detector

In [1]:
!pip install transformers datasets soundfile -q


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
from datasets import load_dataset,concatenate_datasets


# `datasets` warns that `trust_remote_code=True` will be mandatory to load this
# dataset from its next major release, so opt in explicitly here.
# NOTE(review): this flag allows code from the dataset repository to run
# locally -- only keep it for sources you trust.
dataset = load_dataset("common_language", trust_remote_code=True)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [3]:
### Pull out the three predefined splits and merge them into a single dataset
train, valid, test = (
    dataset[split_name] for split_name in ("train", "validation", "test")
)
# NOTE: `dataset` is rebound here from the split dictionary to the merged
# dataset, so this cell cannot be re-run without reloading the data first.
dataset = concatenate_datasets([train, valid, test])

In [4]:
### List the columns available in the merged dataset
dataset.column_names

['client_id', 'path', 'audio', 'sentence', 'age', 'gender', 'language']

In [5]:
### Columns to keep: the transcribed sentence and its language label
columns_to_use = ["sentence","language"]

In [6]:
### Here we have a shorter version of the dataset (only the columns we need)
dataset = dataset.select_columns(columns_to_use)
# `add_column` returns a new Dataset instead of modifying the existing one, so
# the result has to be assigned; otherwise the placeholder column is discarded.
# The "foo" placeholder is overwritten later with the language name.
dataset = dataset.add_column("sentiment", ["foo"] * len(dataset))
dataset

Dataset({
    features: ['sentence', 'language', 'sentiment'],
    num_rows: 34045
})

In [7]:
### Choose which languages to keep; the names must match the dataset's
### language label names exactly, since they are compared with `in` below
desired_languages = ["Spanish", "Portuguese", "Italian", "English", "French"]

In [8]:
### Build lookups between the dataset's class indices and the language names,
### restricted to the languages selected in `desired_languages`
languages = dataset.features["language"].names

# class index -> language name, in ascending index order
id2Lang = {
    idx: lang for idx, lang in enumerate(languages) if lang in desired_languages
}
# language name -> class index (inverse of the mapping above)
lang2Id = {lang: idx for idx, lang in id2Lang.items()}
# class indices of the selected languages, used to filter the rows
desired_languages_idx = list(id2Lang)

In [9]:
### Show the id2Lang mapping, the lang2Id mapping and the selected class indices
for lookup in (id2Lang, lang2Id, desired_languages_idx):
    print(lookup)

{11: 'English', 14: 'French', 22: 'Italian', 32: 'Portuguese', 38: 'Spanish'}
{'English': 11, 'French': 14, 'Italian': 22, 'Portuguese': 32, 'Spanish': 38}
[11, 14, 22, 32, 38]


In [10]:
def process_idx_language(example):
    """Store the language name of a row in its "sentiment" column.

    The integer class index found in ``example["language"]`` is translated to
    its name through ``id2Lang`` and the updated row is returned.
    """
    language_name = id2Lang[example["language"]]
    example["sentiment"] = language_name
    return example


# Keep only rows of the selected languages, then replace the numeric
# "language" column with the readable name stored under "sentiment".
dataset = dataset.filter(
    lambda row: row["language"] in desired_languages_idx
).map(process_idx_language, remove_columns=["language"])


In [11]:
### Preview the first 10 rows of the filtered dataset
dataset[:10]

{'sentence': ['It is a charity school whose fees are calculated on a means test.',
  'John looked out the window of the show-house, happy he had mortgage approval, but worried about the repayments.',
  'At least forty-one protesters were killed in the ensuing engagement.',
  'It was so small and undeveloped.',
  'Do you mean it?',
  'The boy was also saddened; his friend was in pursuit of his destiny.',
  'Presently he made a faint sound in his throat.',
  'What you eat today walks and talks tomorrow.',
  'Where did you find that apple?',
  'The agreed text was in three parts.'],
 'sentiment': ['English',
  'English',
  'English',
  'English',
  'English',
  'English',
  'English',
  'English',
  'English',
  'English']}

In [12]:
### Rename the "sentence" column to "text"
dataset = dataset.rename_column("sentence", "text")

In [13]:
### Print the column schema followed by the first 10 rows
print(
    f"Dataset have columns {dataset.features} first 10 rows:\n",
    dataset[:10],
    sep="\n",
)

Dataset have columns {'text': Value(dtype='string', id=None), 'sentiment': Value(dtype='string', id=None)} first 10 rows:

{'text': ['It is a charity school whose fees are calculated on a means test.', 'John looked out the window of the show-house, happy he had mortgage approval, but worried about the repayments.', 'At least forty-one protesters were killed in the ensuing engagement.', 'It was so small and undeveloped.', 'Do you mean it?', 'The boy was also saddened; his friend was in pursuit of his destiny.', 'Presently he made a faint sound in his throat.', 'What you eat today walks and talks tomorrow.', 'Where did you find that apple?', 'The agreed text was in three parts.'], 'sentiment': ['English', 'English', 'English', 'English', 'English', 'English', 'English', 'English', 'English', 'English']}


In [14]:
### Build contiguous label ids (0..n-1) following the order of `desired_languages`
# Only languages that were actually found in the dataset (i.e. present in
# `lang2Id`) receive a label, and duplicates are collapsed. Indexing
# `desired_languages` by `range(len(lang2Id))` instead would pick the wrong
# names whenever a requested language is missing or listed twice.
label_names = list(
    dict.fromkeys(lang for lang in desired_languages if lang in lang2Id)
)
label2id = {lang: idx for idx, lang in enumerate(label_names)}
id2label = {idx: lang for lang, idx in label2id.items()}

In [15]:
### Show both label mappings, name -> id first and id -> name second
for label_mapping in (label2id, id2label):
    print(label_mapping)

{'Spanish': 0, 'Portuguese': 1, 'Italian': 2, 'English': 3, 'French': 4}
{0: 'Spanish', 1: 'Portuguese', 2: 'Italian', 3: 'English', 4: 'French'}


In [16]:
import json

### Write the dataset to CSV, then the two label mappings to JSON files
dataset.to_csv(path_or_buf='./dataset.csv', index=False)
print("\nDataframe creation completed")

# NOTE: JSON object keys are always strings, so the integer keys of
# `id2label` are stored as "0", "1", ... and must be converted back to int
# by whoever reloads the file.
for json_name, mapping in (('label2id.json', label2id), ('id2label.json', id2label)):
    with open(json_name, 'w') as fp:
        json.dump(mapping, fp)

Creating CSV from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]


Dataframe creation completed
