In [73]:
#!pip install datasets transformers[sentencepiece]
#!pip install ipywidgets
#!pip install scikit-learn

# *Preprocessing*

Load the `common_language` dataset with the `load_dataset()` function.

In [74]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Hugging Face checkpoint name; used to load the tokenizer during preprocessing
# and the sequence-classification model in the Training section.
checkpoint = 'xlm-roberta-base'

# Name of the Hub dataset passed to load_dataset(); the trailing comment keeps
# the alternative dataset name the author noted.
dataset_name = 'common_language' #'amazon_reviews_multi'

# Optional cache directories (disabled, so the library's default cache is used).
# CACHE_DATASETS = './cache/datasets/'
# CACHE_PRETRAINED = './cache/pretrained/'

In [75]:
 #conda install -c conda-forge pysoundfile 

In [76]:
# Load every split of the dataset (a cached copy is reused when present) and
# print the resulting DatasetDict.
# Alternative kept for reference -- request only the two splits used later:
#   train_dataset, test_dataset = load_dataset(dataset_name, split=['train', 'test'])
dataset = load_dataset(dataset_name)
print(dataset)

Found cached dataset common_language (C:/Users/Deepak/.cache/huggingface/datasets/common_language/full/0.1.0/b27a81451e46e5e822b34bc81f15dfc4b70a94c78749fd8663068a580339ea42)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'age', 'gender', 'language'],
        num_rows: 22194
    })
    validation: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'age', 'gender', 'language'],
        num_rows: 5888
    })
    test: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'age', 'gender', 'language'],
        num_rows: 5963
    })
})


In [77]:
# Pick the two splits this notebook works with and show their schema and size.
# The 'validation' split of the DatasetDict is not referenced by any cell shown here.
train_dataset, test_dataset = dataset['train'], dataset['test']
print(train_dataset, test_dataset, sep='\n')

Dataset({
    features: ['client_id', 'path', 'audio', 'sentence', 'age', 'gender', 'language'],
    num_rows: 22194
})
Dataset({
    features: ['client_id', 'path', 'audio', 'sentence', 'age', 'gender', 'language'],
    num_rows: 5963
})


In [78]:
#pip install librosa

In [79]:
# Inspect one raw training example (all columns, including the decoded audio array).
train_dataset[0]

{'client_id': 'ara_trn_sp_12',
 'path': 'C:/Users/Deepak/.cache/huggingface/datasets/downloads/extracted/f0a00eebba52ef054ccd0c7ed721f1351399293ff030218da392cb047d874bed/common_voice_kpd/Arabic/train/ara_trn_sp_12/common_voice_ar_20401372.wav',
 'audio': {'path': 'C:/Users/Deepak/.cache/huggingface/datasets/downloads/extracted/f0a00eebba52ef054ccd0c7ed721f1351399293ff030218da392cb047d874bed/common_voice_kpd/Arabic/train/ara_trn_sp_12/common_voice_ar_20401372.wav',
  'array': array([0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
         2.66671705e-05, 2.05863598e-05, 1.20680525e-05], dtype=float32),
  'sampling_rate': 48000},
 'sentence': 'عليك أن تفي بوعدك.',
 'age': 'twenties',
 'gender': 'male',
 'language': 0}

In [80]:
# Fetch the example once: each `train_dataset[0]` access rebuilds the whole row
# (including the decoded audio array), so indexing inside the comprehension
# would repeat that work for every key.
first_example = train_dataset[0]

# Keep only the two fields that matter for text-based language identification.
new_dict_you_want = {key: first_example[key] for key in ('sentence', 'language')}
new_dict_you_want

{'sentence': 'عليك أن تفي بوعدك.', 'language': 0}

In [81]:
# XLM-R accepts at most 512 tokens per sequence (special tokens included).
# The checkpoint's config lists 514 position embeddings, but two of them are
# consumed by the padding offset, so truncating to 514 could still overflow
# the position-embedding table on very long inputs.
max_input_length = 512

# Fixed seed so the shuffled order is the same after every kernel restart.
shuffle_seed = 42

# Columns that are not needed once the sentences have been tokenized.
unused_columns = ['client_id', 'path', 'sentence', 'age', 'gender', 'audio']

# shuffle dataset
train_dataset = train_dataset.shuffle(seed=shuffle_seed)
test_dataset = test_dataset.shuffle(seed=shuffle_seed)

# tokenizer
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize(batch):
    """Tokenize the ``sentence`` column of a batch.

    Sequences longer than ``max_input_length`` tokens are truncated. No padding
    is requested here; batches are padded later by ``data_collator``.

    Args:
        batch: mapping with a ``'sentence'`` entry, as passed by ``Dataset.map``.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(batch['sentence'], truncation=True, max_length=max_input_length)


def _prepare_for_model(split):
    """Tokenize one split and reduce it to the columns the model consumes.

    Applies ``tokenize`` in batches, drops ``unused_columns``, renames
    ``language`` to ``labels`` and switches the output format to torch tensors.

    Args:
        split: a dataset split that still holds the raw columns.

    Returns:
        The processed split with ``labels``, ``input_ids`` and ``attention_mask``.
    """
    split = split.map(tokenize, batched=True)
    split = split.remove_columns(unused_columns)
    split = split.rename_column("language", "labels")
    # set_format works in place, so the split is returned explicitly afterwards.
    split.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
    return split


# collator
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# tokenize dataset (identical processing for both splits)
train_dataset = _prepare_for_model(train_dataset)
test_dataset = _prepare_for_model(test_dataset)

print(train_dataset)
print(test_dataset)

  0%|          | 0/23 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 22194
})
Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 5963
})


In [82]:
# NOTE(review): this cell re-creates `tokenizer`, `tokenize` and `data_collator`
# with the same definitions as the preprocessing cell above.
from transformers import DataCollatorWithPadding

# Tokenizer that matches the model checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize(batch):
    """Tokenize the ``sentence`` field of ``batch``, truncated to ``max_input_length`` tokens."""
    sentences = batch['sentence']
    encoded = tokenizer(sentences, truncation=True, max_length=max_input_length)
    return encoded


# Collator built on the same tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [83]:
# Target folders for the two processed splits (relative to the working directory).
training_input_path = f'dataset/train'
test_input_path = f'dataset/test'

# Write both splits to disk; the Training section reloads them from these paths.
train_dataset.save_to_disk(training_input_path)
test_dataset.save_to_disk(test_input_path)

# Training

In [84]:
import numpy as np
from datasets import load_from_disk, load_metric
from transformers import AutoModelForSequenceClassification
from transformers import  Trainer, TrainingArguments
from transformers.trainer_utils import get_last_checkpoint

In [85]:
# Model id derived from the checkpoint name. Not referenced by any other cell
# shown here (presumably meant as the Hub repository id -- TODO confirm).
hub_model_id = f'language-detection-fine-tuned-on-{checkpoint}'

In [86]:
 # load datasets
train_dataset = load_from_disk(training_input_path)
test_dataset = load_from_disk(test_input_path)

print(f"loaded train_dataset length is: {len(train_dataset)}")
print(f"loaded test_dataset length is: {len(test_dataset)}")

# define metrics and metrics function
metric = load_metric("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: pair ``(predictions, labels)``. ``predictions`` holds one
            score per class for every example; the class with the highest
            score (argmax over axis 1) is taken as the predicted label.

    Returns:
        The result of ``metric.compute`` for the predicted ids against ``labels``.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return metric.compute(predictions=predicted_ids, references=references)


# Prepare model labels - useful in inference API
labels = train_dataset.features["labels"].names
num_labels = len(labels)

# The model config expects integer class ids (label2id: str -> int,
# id2label: int -> str); stringified ids would be written to config.json as text.
label2id = {name: idx for idx, name in enumerate(labels)}
id2label = {idx: name for idx, name in enumerate(labels)}

loaded train_dataset length is: 22194
loaded test_dataset length is: 5963


In [87]:
train_dataset.features

{'labels': ClassLabel(names=['Arabic', 'Basque', 'Breton', 'Catalan', 'Chinese_China', 'Chinese_Hongkong', 'Chinese_Taiwan', 'Chuvash', 'Czech', 'Dhivehi', 'Dutch', 'English', 'Esperanto', 'Estonian', 'French', 'Frisian', 'Georgian', 'German', 'Greek', 'Hakha_Chin', 'Indonesian', 'Interlingua', 'Italian', 'Japanese', 'Kabyle', 'Kinyarwanda', 'Kyrgyz', 'Latvian', 'Maltese', 'Mangolian', 'Persian', 'Polish', 'Portuguese', 'Romanian', 'Romansh_Sursilvan', 'Russian', 'Sakha', 'Slovenian', 'Spanish', 'Swedish', 'Tamil', 'Tatar', 'Turkish', 'Ukranian', 'Welsh'], id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

In [88]:
# Download the pretrained checkpoint from the model hub and attach a
# classification head sized for the dataset's label set.
# NOTE(review): the classification head is newly initialised (see the warning
# printed below) and no cell shown in this notebook trains the model afterwards;
# `Trainer` and `TrainingArguments` are imported but never used.
# TODO: add the fine-tuning step before the model is saved.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint, 
    num_labels=num_labels, 
    label2id=label2id, 
    id2label=id2label,
#     cache_dir=CACHE_PRETRAINED
)
# tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.bias', 'roberta.pooler.dense.bias', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense

# Saving the model

In [89]:
# Folder where the tokenizer and model are stored on disk.
# NOTE(review): absolute, machine-specific path -- consider a location relative
# to the notebook so it runs on other machines.
pt_save_directory = "C:/Users/Deepak/Desktop/Trained_Models/XLM-Roberta-Fine-Tuned"

In [90]:
# Write tokenizer and model to the folder configured in `pt_save_directory`
# instead of repeating the path literal, so the location is defined once.
# NOTE(review): nothing in the cells shown trains `model` before this point, so
# what is saved is the base checkpoint plus an untrained classification head.
tokenizer.save_pretrained(pt_save_directory)
model.save_pretrained(pt_save_directory)

# Load the saved model from the directory

In [91]:
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification

# Reload model and tokenizer from the folder they were saved to; the path comes
# from `pt_save_directory` so saving and loading cannot drift apart.
model = AutoModelForSequenceClassification.from_pretrained(pt_save_directory)
tokenizer = AutoTokenizer.from_pretrained(pt_save_directory)

In [92]:
# The model predicts a language label, not a sentiment. "sentiment-analysis" is
# only an alias of the "text-classification" task, so the canonical task name is
# used here; the pipeline built is the same.
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)

In [93]:
# Try the classifier on a French sentence.
classifier("Nous sommes très heureux de vous présenter la bibliothèque Transformers.")

[{'label': 'Dhivehi', 'score': 0.029579907655715942}]

In [94]:
# Try the classifier on an English sentence.
classifier("I am Deepak")

[{'label': 'Dhivehi', 'score': 0.029410764575004578}]

In [95]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline

# Same saved model as above, loaded from `pt_save_directory` (single source of
# truth for the path) and wrapped in the pipeline class directly instead of
# going through the `pipeline()` factory.
tokenizer = AutoTokenizer.from_pretrained(pt_save_directory)
model = AutoModelForSequenceClassification.from_pretrained(pt_save_directory)

classifier = TextClassificationPipeline(model=model, tokenizer=tokenizer)

In [96]:
# Classify an English sentence with the explicitly constructed pipeline and
# print the returned list of {'label', 'score'} dicts.
res = classifier("I am Deepak")
print(res)

[{'label': 'Dhivehi', 'score': 0.029410764575004578}]
