<a href="https://colab.research.google.com/github/djalil21/sentiment-analysis-algerian-dialect/blob/master/BERTs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install and import

In [None]:
# Mount Google Drive at /content/drive; the dataset is read from there later
# in the notebook. This import only exists inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install transformers datasets emoji

In [None]:
# Upgrade accelerate (presumably required by the transformers Trainer used
# below — confirm). The runtime may need a restart after this upgrade.
!pip install --upgrade accelerate

In [None]:
# Install git-lfs system-wide.
# NOTE(review): nothing visible in this notebook pushes to a git/Hub remote —
# confirm this is still needed.
!apt-get install git-lfs

In [None]:
# Sanity check: displays True when PyTorch can see a CUDA GPU in this runtime.
import torch
torch.cuda.is_available()

In [None]:
import tensorflow as tf
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification, 
    DataCollatorWithPadding, 
    TrainingArguments, 
    Trainer, 
    pipeline
)
import re
import numpy as np
import pandas as pd
from emoji import EMOJI_DATA
import unicodedata
from sklearn.model_selection import train_test_split
from datasets import Dataset, load_metric
import pyarrow as pa

# Data

In [None]:
# Load the labelled corpus from Google Drive. `names` supplies the column
# headers, so the file's first row is read as data.
# NOTE(review): assumes DZ.csv has no header row — confirm.
file_name="DZ.csv"
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/PFE/' +file_name,names=["text","label"])

# Optional: uncomment to work on a 5000-row random subsample.
#df =df.sample(n=5000)

In [None]:
# Preview the first ten raw rows.
df.head(10)

Unnamed: 0,text,label
0,اسوء هاتف !,negative
1,عيوب لي فيه يحمى بزاف وباتري عيانة فيه في كامي...,negative
2,صراحة ميصلحش شريتو وندمة,negative
3,خاوتي سمحولي بصح هاتف 0 وثقيل فالريزو,negative
4,أخي الهاتف راه عندي و يسخن بزاف واش نديرله,negative
5,يودي عندي هاذ برطابل هو مليح بصح كان دير بيه م...,negative
6,يسخون,negative
7,بعد عامين من الاستخدام الريزو ناقص بزاف في الا...,negative
8,عندي هذا الهاتف صلى شعل ويطفى وحدو ما السبب؟,negative
9,شريت هذا كابة بصح كي نعيط بيه منقدرش نكوبي حتى...,negative


In [None]:
# Discard every row that has a missing text or label.
df = df.dropna()

In [None]:
# Binary task: keep only rows whose label is not "neutral".
df = df.loc[df["label"] != "neutral"]

In [None]:
# Class balance after filtering (the captured output shows more positive than
# negative rows).
df.label.value_counts()

positive    9748
negative    5574
Name: label, dtype: int64

In [None]:
# Emoji helpers
def is_emoji(s):
    """Return True when `s` is a key of the emoji package's EMOJI_DATA table."""
    known = s in EMOJI_DATA
    return known


def add_space(text):
    """Put one space in front of every emoji character of `text`.

    Each character is checked on its own with `is_emoji`; leading and trailing
    whitespace is stripped from the result.
    """
    pieces = [' ' + symbol if is_emoji(symbol) else symbol for symbol in text]
    return ''.join(pieces).strip()

In [None]:
# Define preprocessing util function

# Patterns are compiled once here instead of on every call.
_MENTION_RE = re.compile(r'@\S*')          # '@name', including one at the very end
_URL_RE = re.compile(r'https?://\S+')
_WHITESPACE_RE = re.compile(r'\s+')
# Tanwin fath, tanwin damm, tanwin kasr, fatha, damma, kasra, shadda, sukun
# (U+064B-U+0652) plus tatweel/kashida (U+0640).
_TASHKEEL_RE = re.compile('[\u064B-\u0652\u0640]')
_REPEAT_RE = re.compile(r'(.)\1+')
# Letters whose immediate doubling is reduced to one: waw, yeh, alef (in this order).
_DOUBLED_LETTERS = '\u0648\u064A\u0627'
# Orthographic variants folded onto one canonical letter.
_LETTER_MAP = str.maketrans({
    '\u0625': '\u0627',  # alef with hamza below -> alef
    '\u0623': '\u0627',  # alef with hamza above -> alef
    '\u0671': '\u0627',  # alef wasla            -> alef
    '\u0622': '\u0627',  # alef with madda       -> alef
    '\u0649': '\u064A',  # alef maksura          -> yeh
    '\u0624': '\u0621',  # waw with hamza        -> hamza
    '\u0626': '\u0621',  # yeh with hamza        -> hamza
    '\u0629': '\u0647',  # teh marbuta           -> heh
})


def text_preprocessing(text):
    """Normalise one raw comment before tokenization.

    Steps, in order: cast to str, NFC-normalise, drop '@mentions', decode
    '&amp;', drop http(s) URLs, lowercase, strip tashkeel/tatweel, collapse
    doubled waw/yeh/alef, fold letter variants, cap any character run at two,
    put a space before emoji (via `add_space`), and finally collapse whitespace.

    Args:
        text: the raw comment; any object is accepted and converted with str().

    Returns:
        The cleaned string, without leading/trailing or repeated whitespace.
    """
    text = str(text)
    # Normalize unicode encoding
    text = unicodedata.normalize('NFC', text)

    # Remove '@name' (the previous pattern required trailing whitespace and so
    # missed a mention at the end of the text)
    text = _MENTION_RE.sub(' ', text)

    # Replace '&amp;' with '&'
    text = text.replace('&amp;', '&')

    # Remove URLs
    text = _URL_RE.sub('', text)

    # Convert text to lowercase
    text = text.lower()

    # Remove tashkeel
    text = _TASHKEEL_RE.sub('', text)

    # Remove repetitions of the long vowels
    for letter in _DOUBLED_LETTERS:
        text = text.replace(letter * 2, letter)

    # Fold special Arabic letter variants
    text = text.translate(_LETTER_MAP)

    # Keep at most 2 repeats of any character
    text = _REPEAT_RE.sub(r'\1\1', text)

    # Emoji treatment
    text = add_space(text)

    # Collapse whitespace last, so the gaps left by removed mentions/URLs and
    # the spaces inserted before emoji do not survive as double spaces.
    return _WHITESPACE_RE.sub(' ', text).strip()

In [None]:
# Clean every comment in place with the preprocessing function defined above.
df['text'] = df['text'].apply(text_preprocessing)

In [None]:
# Encode the string labels as integers (negative -> 0, positive -> 1).
# Run once per freshly loaded df: values that are not one of the two keys,
# including already-encoded integers on a re-run, become NaN.
df['label'] = df['label'].map({"negative": 0, "positive": 1})

In [None]:
# Preview the cleaned text and the integer-encoded labels.
df.head()

Unnamed: 0,text,label
0,اسوء هاتف !,0
1,عيوب لي فيه يحمي بزاف وباتري عيانه فيه في كامي...,0
2,صراحه ميصلحش شريتو وندمه,0
3,خاوتي سمحولي بصح هاتف 0 وثقيل فالريزو,0
4,اخي الهاتف راه عندي و يسخن بزاف واش نديرله,0


In [None]:
# Hold out 20% of the rows for evaluation. The fixed seed makes the split, and
# therefore every metric reported below, reproducible across kernel restarts;
# stratifying on the label keeps the class ratio identical in both parts of
# this imbalanced dataset.
train, test = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])

In [None]:
# Convert both splits to Hugging Face datasets. preserve_index=False stops the
# shuffled pandas index from being stored as an extra `__index_level_0__` column.
train_ds = Dataset.from_pandas(train, split="train", preserve_index=False)
test_ds = Dataset.from_pandas(test, split="test", preserve_index=False)

# BERT base multilingual

In [None]:
# Release unused cached GPU memory before loading the next model.
torch.cuda.empty_cache()

In [None]:
# Checkpoint to fine-tune; the tokenizer is loaded from the same checkpoint.
NAME="bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(NAME)
# label2id is passed together with id2label so the saved config maps names and
# ids consistently (otherwise label2id keeps the generic LABEL_0/LABEL_1 names).
model = AutoModelForSequenceClassification.from_pretrained(
    NAME,
    num_labels=2,
    id2label={0: 'negative', 1: 'positive'},
    label2id={'negative': 0, 'positive': 1},
)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

In [None]:
def preprocess_function(examples, max_length=128):
    """Tokenize one batch of examples for sequence classification.

    Args:
        examples: batch mapping handed over by `Dataset.map(..., batched=True)`;
            only its "text" entry is read.
        max_length: sequences longer than this many tokens are truncated.

    Returns:
        The tokenizer output for the batch.

    No padding is applied here: the `DataCollatorWithPadding` given to the
    Trainer pads every batch to its own longest sequence, which avoids running
    the model over up to `max_length` padding tokens for short comments.
    """
    return tokenizer(examples["text"], truncation=True, max_length=max_length)

In [None]:
# Tokenize both splits in batches with the tokenizer of the current model.
tokenized_train = train_ds.map(preprocess_function, batched=True)
tokenized_test = test_ds.map(preprocess_function, batched=True)

Map:   0%|          | 0/12257 [00:00<?, ? examples/s]

Map:   0%|          | 0/3065 [00:00<?, ? examples/s]

In [None]:
# Display the columns and row count of the tokenized training split.
tokenized_train

Dataset({
    features: ['text', 'label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 12257
})

In [None]:
# Collator that pads the examples of each batch to a common length using this
# tokenizer's padding settings.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy and binary F1 for a Trainer evaluation pass.

    Args:
        eval_pred: pair `(logits, labels)`; the predicted class is the argmax
            of the logits over the last axis.

    Returns:
        dict with "accuracy" and "f1" as Python floats. F1 is the binary score
        of the positive class (label 1), the default of the `f1` metric that
        was loaded here before.

    The scores are computed directly with numpy instead of calling the
    deprecated `load_metric`, which reloaded both metric scripts on every
    evaluation.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    labels = np.asarray(labels)

    # Guard the empty case so np.mean does not return NaN.
    accuracy = float(np.mean(predictions == labels)) if labels.size else 0.0

    true_positive = int(np.sum((predictions == 1) & (labels == 1)))
    predicted_positive = int(np.sum(predictions == 1))
    actual_positive = int(np.sum(labels == 1))
    # F1 = 2*TP / (2*TP + FP + FN) = 2*TP / (predicted positives + actual positives)
    denominator = predicted_positive + actual_positive
    f1 = 2.0 * true_positive / denominator if denominator else 0.0
    return {"accuracy": accuracy, "f1": f1}


In [None]:
# Directory that receives the training checkpoints (it is the Trainer's
# output_dir) and, later, the saved pipeline.
repo_name = NAME+"_2k"
 
training_args = TrainingArguments(
   output_dir=repo_name,
   learning_rate=2e-5,
   per_device_train_batch_size=16,
   per_device_eval_batch_size=16,
   num_train_epochs=10,
   weight_decay=0.01,
   # NOTE(review): a checkpoint is written after every epoch and no
   # save_total_limit is set, so all ten are kept on disk.
   save_strategy="epoch",
)

# NOTE(review): no evaluation schedule is configured here; metrics are only
# produced by the explicit trainer.evaluate() call further down.
trainer = Trainer(
   model=model,
   args=training_args,
   train_dataset=tokenized_train,
   eval_dataset=tokenized_test,
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune the model; the displayed value is the TrainOutput summary.
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,0.4757
1000,0.353
1500,0.3088
2000,0.2336
2500,0.2172
3000,0.1686
3500,0.1474
4000,0.1298
4500,0.1091
5000,0.0762


TrainOutput(global_step=7670, training_loss=0.16306101868640646, metrics={'train_runtime': 2955.3635, 'train_samples_per_second': 41.474, 'train_steps_per_second': 2.595, 'total_flos': 8062380513868800.0, 'train_loss': 0.16306101868640646, 'epoch': 10.0})

In [None]:
# Score the fine-tuned model on the held-out split (eval_dataset); reports the
# loss plus the accuracy and F1 from compute_metrics.
trainer.evaluate()

  load_accuracy = load_metric("accuracy")


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

{'eval_loss': 0.9896340370178223,
 'eval_accuracy': 0.8518760195758565,
 'eval_f1': 0.8859869412355601,
 'eval_runtime': 23.0967,
 'eval_samples_per_second': 132.703,
 'eval_steps_per_second': 8.313,
 'epoch': 10.0}

In [None]:
# Build the pipeline around the model that was just fine-tuned. Calling
# pipeline('sentiment-analysis') without a model downloads the default English
# SST-2 DistilBERT checkpoint, so the save/reload cells that follow would
# persist and query that unrelated model instead of this one.
pipe = pipeline('sentiment-analysis', model=trainer.model, tokenizer=tokenizer, device=trainer.model.device)

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


Downloading (…)lve/main/config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Xformers is not installed correctly. If you want to use memorry_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [None]:
# Write the pipeline's model and tokenizer files into repo_name, the same
# directory the Trainer uses as output_dir.
pipe.save_pretrained(repo_name)

In [None]:
# Reload the pipeline from the files saved in repo_name and score two sample
# sentences (roughly "I liked it" / "I did not like it").
sentiment_model = pipeline('sentiment-analysis',model=repo_name)
sentiment_model(["عجبتني", "ماعجبتنيش"])

[{'label': 'POSITIVE', 'score': 0.6932519674301147},
 {'label': 'NEGATIVE', 'score': 0.6867501735687256}]

# DziriBERT



In [None]:
# Release unused cached GPU memory before loading the next model.
# NOTE(review): the previous `model`/`trainer` are still referenced at this
# point, so their tensors are presumably not freed by this call — confirm.
torch.cuda.empty_cache()

In [None]:
# Checkpoint to fine-tune; the tokenizer is loaded from the same checkpoint.
NAME="alger-ia/dziribert"
tokenizer = AutoTokenizer.from_pretrained(NAME)
# label2id is passed together with id2label so the saved config maps names and
# ids consistently (otherwise label2id keeps the generic LABEL_0/LABEL_1 names).
model = AutoModelForSequenceClassification.from_pretrained(
    NAME,
    num_labels=2,
    id2label={0: 'negative', 1: 'positive'},
    label2id={'negative': 0, 'positive': 1},
)

Downloading (…)okenizer_config.json:   0%|          | 0.00/176 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/620 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/498M [00:00<?, ?B/s]

Some weights of the model checkpoint at alger-ia/dziribert were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at alger-ia/dziribert and are

In [None]:
def preprocess_function(examples, max_length=128):
    """Tokenize one batch of examples for sequence classification.

    Args:
        examples: batch mapping handed over by `Dataset.map(..., batched=True)`;
            only its "text" entry is read.
        max_length: sequences longer than this many tokens are truncated.

    Returns:
        The tokenizer output for the batch.

    No padding is applied here: the `DataCollatorWithPadding` given to the
    Trainer pads every batch to its own longest sequence, which avoids running
    the model over up to `max_length` padding tokens for short comments.
    """
    return tokenizer(examples["text"], truncation=True, max_length=max_length)

In [None]:
# Re-tokenize both splits with the tokenizer of the current model.
tokenized_train = train_ds.map(preprocess_function, batched=True)
tokenized_test = test_ds.map(preprocess_function, batched=True)

Map:   0%|          | 0/12257 [00:00<?, ? examples/s]

Map:   0%|          | 0/3065 [00:00<?, ? examples/s]

In [None]:
# Display the columns and row count of the tokenized training split.
tokenized_train

Dataset({
    features: ['text', 'label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 12257
})

In [None]:
# Collator that pads the examples of each batch to a common length using this
# tokenizer's padding settings.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy and binary F1 for a Trainer evaluation pass.

    Args:
        eval_pred: pair `(logits, labels)`; the predicted class is the argmax
            of the logits over the last axis.

    Returns:
        dict with "accuracy" and "f1" as Python floats. F1 is the binary score
        of the positive class (label 1), the default of the `f1` metric that
        was loaded here before.

    The scores are computed directly with numpy instead of calling the
    deprecated `load_metric`, which reloaded both metric scripts on every
    evaluation.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    labels = np.asarray(labels)

    # Guard the empty case so np.mean does not return NaN.
    accuracy = float(np.mean(predictions == labels)) if labels.size else 0.0

    true_positive = int(np.sum((predictions == 1) & (labels == 1)))
    predicted_positive = int(np.sum(predictions == 1))
    actual_positive = int(np.sum(labels == 1))
    # F1 = 2*TP / (2*TP + FP + FN) = 2*TP / (predicted positives + actual positives)
    denominator = predicted_positive + actual_positive
    f1 = 2.0 * true_positive / denominator if denominator else 0.0
    return {"accuracy": accuracy, "f1": f1}


In [None]:
# Directory that receives the training checkpoints (it is the Trainer's
# output_dir) and, later, the saved pipeline. NAME contains a '/', so this is
# a nested path ("alger-ia/dziribert_2k").
repo_name = NAME+"_2k"
 
training_args = TrainingArguments(
   output_dir=repo_name,
   learning_rate=2e-5,
   per_device_train_batch_size=16,
   per_device_eval_batch_size=16,
   num_train_epochs=10,
   weight_decay=0.01,
   # NOTE(review): a checkpoint is written after every epoch and no
   # save_total_limit is set, so all ten are kept on disk.
   save_strategy="epoch",
)

# NOTE(review): no evaluation schedule is configured here; metrics are only
# produced by the explicit trainer.evaluate() call further down.
trainer = Trainer(
   model=model,
   args=training_args,
   train_dataset=tokenized_train,
   eval_dataset=tokenized_test,
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune the model; the displayed value is the TrainOutput summary.
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,0.3222
1000,0.2304
1500,0.178
2000,0.0852
2500,0.0763
3000,0.0401
3500,0.0275
4000,0.0282
4500,0.0198
5000,0.0138


TrainOutput(global_step=7670, training_loss=0.06938412503437703, metrics={'train_runtime': 2701.4809, 'train_samples_per_second': 45.371, 'train_steps_per_second': 2.839, 'total_flos': 8062380513868800.0, 'train_loss': 0.06938412503437703, 'epoch': 10.0})

In [None]:
# Score the fine-tuned model on the held-out split (eval_dataset); reports the
# loss plus the accuracy and F1 from compute_metrics.
trainer.evaluate()

{'eval_loss': 1.028145670890808,
 'eval_accuracy': 0.8792822185970636,
 'eval_f1': 0.9053708439897699,
 'eval_runtime': 23.3311,
 'eval_samples_per_second': 131.37,
 'eval_steps_per_second': 8.229,
 'epoch': 10.0}

In [None]:
# Build the pipeline around the model that was just fine-tuned. Calling
# pipeline('sentiment-analysis') without a model downloads the default English
# SST-2 DistilBERT checkpoint, so the save/reload cells that follow would
# persist and query that unrelated model instead of this one.
pipe = pipeline('sentiment-analysis', model=trainer.model, tokenizer=tokenizer, device=trainer.model.device)

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [None]:
# Write the pipeline's model and tokenizer files into repo_name, the same
# directory the Trainer uses as output_dir.
pipe.save_pretrained(repo_name)

In [None]:
# Reload the pipeline from the files saved in repo_name and score two sample
# sentences (roughly "I liked it" / "I did not like it").
sentiment_model = pipeline('sentiment-analysis',model=repo_name)
sentiment_model(["عجبتني", "ماعجبتنيش"])

[{'label': 'POSITIVE', 'score': 0.6932519674301147},
 {'label': 'NEGATIVE', 'score': 0.6867501735687256}]

# AraBERT



In [None]:
# Release unused cached GPU memory before loading the next model.
# NOTE(review): the previous `model`/`trainer` are still referenced at this
# point, so their tensors are presumably not freed by this call — confirm.
torch.cuda.empty_cache()

In [None]:
# Checkpoint to fine-tune; the tokenizer is loaded from the same checkpoint.
NAME="aubmindlab/bert-base-arabertv02"
tokenizer = AutoTokenizer.from_pretrained(NAME)
# label2id is passed together with id2label so the saved config maps names and
# ids consistently (otherwise label2id keeps the generic LABEL_0/LABEL_1 names).
model = AutoModelForSequenceClassification.from_pretrained(
    NAME,
    num_labels=2,
    id2label={0: 'negative', 1: 'positive'},
    label2id={'negative': 0, 'positive': 1},
)

Downloading (…)okenizer_config.json:   0%|          | 0.00/381 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/384 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/825k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.64M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/543M [00:00<?, ?B/s]

Some weights of the model checkpoint at aubmindlab/bert-base-arabertv02 were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification wer

In [None]:
def preprocess_function(examples, max_length=128):
    """Tokenize one batch of examples for sequence classification.

    Args:
        examples: batch mapping handed over by `Dataset.map(..., batched=True)`;
            only its "text" entry is read.
        max_length: sequences longer than this many tokens are truncated.

    Returns:
        The tokenizer output for the batch.

    No padding is applied here: the `DataCollatorWithPadding` given to the
    Trainer pads every batch to its own longest sequence, which avoids running
    the model over up to `max_length` padding tokens for short comments.
    """
    return tokenizer(examples["text"], truncation=True, max_length=max_length)

In [None]:
# Re-tokenize both splits with the tokenizer of the current model.
tokenized_train = train_ds.map(preprocess_function, batched=True)
tokenized_test = test_ds.map(preprocess_function, batched=True)

Map:   0%|          | 0/12257 [00:00<?, ? examples/s]

Map:   0%|          | 0/3065 [00:00<?, ? examples/s]

In [None]:
# Display the columns and row count of the tokenized training split.
tokenized_train

Dataset({
    features: ['text', 'label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 12257
})

In [None]:
# Collator that pads the examples of each batch to a common length using this
# tokenizer's padding settings.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy and binary F1 for a Trainer evaluation pass.

    Args:
        eval_pred: pair `(logits, labels)`; the predicted class is the argmax
            of the logits over the last axis.

    Returns:
        dict with "accuracy" and "f1" as Python floats. F1 is the binary score
        of the positive class (label 1), the default of the `f1` metric that
        was loaded here before.

    The scores are computed directly with numpy instead of calling the
    deprecated `load_metric`, which reloaded both metric scripts on every
    evaluation.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    labels = np.asarray(labels)

    # Guard the empty case so np.mean does not return NaN.
    accuracy = float(np.mean(predictions == labels)) if labels.size else 0.0

    true_positive = int(np.sum((predictions == 1) & (labels == 1)))
    predicted_positive = int(np.sum(predictions == 1))
    actual_positive = int(np.sum(labels == 1))
    # F1 = 2*TP / (2*TP + FP + FN) = 2*TP / (predicted positives + actual positives)
    denominator = predicted_positive + actual_positive
    f1 = 2.0 * true_positive / denominator if denominator else 0.0
    return {"accuracy": accuracy, "f1": f1}


In [None]:
# Directory that receives the training checkpoints (it is the Trainer's
# output_dir) and, later, the saved pipeline. NAME contains a '/', so this is
# a nested path ("aubmindlab/bert-base-arabertv02_2k").
repo_name = NAME+"_2k"
 
training_args = TrainingArguments(
   output_dir=repo_name,
   learning_rate=2e-5,
   per_device_train_batch_size=16,
   per_device_eval_batch_size=16,
   num_train_epochs=10,
   weight_decay=0.01,
   # NOTE(review): a checkpoint is written after every epoch and no
   # save_total_limit is set, so all ten are kept on disk.
   save_strategy="epoch",
)

# NOTE(review): no evaluation schedule is configured here; metrics are only
# produced by the explicit trainer.evaluate() call further down.
trainer = Trainer(
   model=model,
   args=training_args,
   train_dataset=tokenized_train,
   eval_dataset=tokenized_test,
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune the model; the displayed value is the TrainOutput summary.
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,0.4464
1000,0.3553
1500,0.3058
2000,0.2468
2500,0.2303
3000,0.1952
3500,0.165
4000,0.1562
4500,0.1367
5000,0.1212


TrainOutput(global_step=7670, training_loss=0.18257191209158966, metrics={'train_runtime': 2754.748, 'train_samples_per_second': 44.494, 'train_steps_per_second': 2.784, 'total_flos': 8062380513868800.0, 'train_loss': 0.18257191209158966, 'epoch': 10.0})

In [None]:
# Score the fine-tuned model on the held-out split (eval_dataset); reports the
# loss plus the accuracy and F1 from compute_metrics.
trainer.evaluate()

{'eval_loss': 0.8820879459381104,
 'eval_accuracy': 0.8574225122349103,
 'eval_f1': 0.8890017780035561,
 'eval_runtime': 23.09,
 'eval_samples_per_second': 132.742,
 'eval_steps_per_second': 8.315,
 'epoch': 10.0}

In [None]:
# Build the pipeline around the model that was just fine-tuned. Calling
# pipeline('sentiment-analysis') without a model downloads the default English
# SST-2 DistilBERT checkpoint, so the save/reload cells that follow would
# persist and query that unrelated model instead of this one.
pipe = pipeline('sentiment-analysis', model=trainer.model, tokenizer=tokenizer, device=trainer.model.device)

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [None]:
# Write the pipeline's model and tokenizer files into repo_name, the same
# directory the Trainer uses as output_dir.
pipe.save_pretrained(repo_name)

In [None]:
# Reload the pipeline from the files saved in repo_name and score two sample
# sentences (roughly "I liked it" / "I did not like it").
sentiment_model = pipeline('sentiment-analysis',model=repo_name)
sentiment_model(["عجبتني", "ماعجبتنيش"])

[{'label': 'POSITIVE', 'score': 0.6932519674301147},
 {'label': 'NEGATIVE', 'score': 0.6867501735687256}]

# MARBERT (MARBERTv2)


In [None]:
# Release unused cached GPU memory before loading the next model.
# NOTE(review): the previous `model`/`trainer` are still referenced at this
# point, so their tensors are presumably not freed by this call — confirm.
torch.cuda.empty_cache()

In [None]:
# Checkpoint to fine-tune; the tokenizer is loaded from the same checkpoint.
NAME="UBC-NLP/MARBERTv2"
tokenizer = AutoTokenizer.from_pretrained(NAME)
# label2id is passed together with id2label so the saved config maps names and
# ids consistently (otherwise label2id keeps the generic LABEL_0/LABEL_1 names).
model = AutoModelForSequenceClassification.from_pretrained(
    NAME,
    num_labels=2,
    id2label={0: 'negative', 1: 'positive'},
    label2id={'negative': 0, 'positive': 1},
)

Downloading (…)okenizer_config.json:   0%|          | 0.00/439 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/1.10M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/757 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/654M [00:00<?, ?B/s]

Some weights of the model checkpoint at UBC-NLP/MARBERTv2 were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initiali

In [None]:
def preprocess_function(examples, max_length=128):
    """Tokenize one batch of examples for sequence classification.

    Args:
        examples: batch mapping handed over by `Dataset.map(..., batched=True)`;
            only its "text" entry is read.
        max_length: sequences longer than this many tokens are truncated.

    Returns:
        The tokenizer output for the batch.

    No padding is applied here: the `DataCollatorWithPadding` given to the
    Trainer pads every batch to its own longest sequence, which avoids running
    the model over up to `max_length` padding tokens for short comments.
    """
    return tokenizer(examples["text"], truncation=True, max_length=max_length)

In [None]:
# Re-tokenize both splits with the tokenizer of the current model.
tokenized_train = train_ds.map(preprocess_function, batched=True)
tokenized_test = test_ds.map(preprocess_function, batched=True)

Map:   0%|          | 0/12257 [00:00<?, ? examples/s]

Map:   0%|          | 0/3065 [00:00<?, ? examples/s]

In [None]:
# Display the columns and row count of the tokenized training split.
tokenized_train

Dataset({
    features: ['text', 'label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 12257
})

In [None]:
# Collator that pads the examples of each batch to a common length using this
# tokenizer's padding settings.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy and binary F1 for a Trainer evaluation pass.

    Args:
        eval_pred: pair `(logits, labels)`; the predicted class is the argmax
            of the logits over the last axis.

    Returns:
        dict with "accuracy" and "f1" as Python floats. F1 is the binary score
        of the positive class (label 1), the default of the `f1` metric that
        was loaded here before.

    The scores are computed directly with numpy instead of calling the
    deprecated `load_metric`, which reloaded both metric scripts on every
    evaluation.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    labels = np.asarray(labels)

    # Guard the empty case so np.mean does not return NaN.
    accuracy = float(np.mean(predictions == labels)) if labels.size else 0.0

    true_positive = int(np.sum((predictions == 1) & (labels == 1)))
    predicted_positive = int(np.sum(predictions == 1))
    actual_positive = int(np.sum(labels == 1))
    # F1 = 2*TP / (2*TP + FP + FN) = 2*TP / (predicted positives + actual positives)
    denominator = predicted_positive + actual_positive
    f1 = 2.0 * true_positive / denominator if denominator else 0.0
    return {"accuracy": accuracy, "f1": f1}


In [None]:
# Directory that receives the training checkpoints (it is the Trainer's
# output_dir) and, later, the saved pipeline. NAME contains a '/', so this is
# a nested path ("UBC-NLP/MARBERTv2_2k").
repo_name = NAME+"_2k"
 
training_args = TrainingArguments(
   output_dir=repo_name,
   learning_rate=2e-5,
   per_device_train_batch_size=16,
   per_device_eval_batch_size=16,
   num_train_epochs=10,
   weight_decay=0.01,
   # NOTE(review): a checkpoint is written after every epoch and no
   # save_total_limit is set, so all ten are kept on disk.
   save_strategy="epoch",
)

# NOTE(review): no evaluation schedule is configured here; metrics are only
# produced by the explicit trainer.evaluate() call further down.
trainer = Trainer(
   model=model,
   args=training_args,
   train_dataset=tokenized_train,
   eval_dataset=tokenized_test,
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune the model; the displayed value is the TrainOutput summary.
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,0.4091
1000,0.2834
1500,0.2422
2000,0.1794
2500,0.174
3000,0.1334
3500,0.1063


Step,Training Loss
500,0.4091
1000,0.2834
1500,0.2422
2000,0.1794
2500,0.174
3000,0.1334
3500,0.1063
4000,0.0986
4500,0.076
5000,0.0568


TrainOutput(global_step=7670, training_loss=0.12702134143575697, metrics={'train_runtime': 2837.5495, 'train_samples_per_second': 43.196, 'train_steps_per_second': 2.703, 'total_flos': 8062380513868800.0, 'train_loss': 0.12702134143575697, 'epoch': 10.0})

In [None]:
# Score the fine-tuned model on the held-out split (eval_dataset); reports the
# loss plus the accuracy and F1 from compute_metrics.
trainer.evaluate()

  load_accuracy = load_metric("accuracy")


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

{'eval_loss': 0.8705043792724609,
 'eval_accuracy': 0.8835236541598694,
 'eval_f1': 0.9108168873344991,
 'eval_runtime': 22.2771,
 'eval_samples_per_second': 137.585,
 'eval_steps_per_second': 8.619,
 'epoch': 10.0}

In [None]:
# Build the pipeline around the model that was just fine-tuned. Calling
# pipeline('sentiment-analysis') without a model downloads the default English
# SST-2 DistilBERT checkpoint, so the save/reload cells that follow would
# persist and query that unrelated model instead of this one.
pipe = pipeline('sentiment-analysis', model=trainer.model, tokenizer=tokenizer, device=trainer.model.device)

In [None]:
# Write the pipeline's model and tokenizer files into repo_name, the same
# directory the Trainer uses as output_dir.
pipe.save_pretrained(repo_name)

In [None]:
# Reload the pipeline from the files saved in repo_name and score two sample
# sentences (roughly "I liked it" / "I did not like it").
sentiment_model = pipeline('sentiment-analysis',model=repo_name)
sentiment_model(["عجبتني", "ماعجبتنيش"])

[{'label': 'POSITIVE', 'score': 0.6932519674301147},
 {'label': 'NEGATIVE', 'score': 0.6867501735687256}]

# XLM-RoBERTa

In [None]:
# Release unused cached GPU memory before loading the next model.
# NOTE(review): the previous `model`/`trainer` are still referenced at this
# point, so their tensors are presumably not freed by this call — confirm.
torch.cuda.empty_cache()

In [None]:
# Checkpoint to fine-tune; the tokenizer is loaded from the same checkpoint.
NAME="xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(NAME)
# label2id is passed together with id2label so the saved config maps names and
# ids consistently (otherwise label2id keeps the generic LABEL_0/LABEL_1 names).
model = AutoModelForSequenceClassification.from_pretrained(
    NAME,
    num_labels=2,
    id2label={0: 'negative', 1: 'positive'},
    label2id={'negative': 0, 'positive': 1},
)

In [None]:
def preprocess_function(examples, max_length=128):
    """Tokenize one batch of examples for sequence classification.

    Args:
        examples: batch mapping handed over by `Dataset.map(..., batched=True)`;
            only its "text" entry is read.
        max_length: sequences longer than this many tokens are truncated.

    Returns:
        The tokenizer output for the batch.

    No padding is applied here: the `DataCollatorWithPadding` given to the
    Trainer pads every batch to its own longest sequence, which avoids running
    the model over up to `max_length` padding tokens for short comments.
    """
    return tokenizer(examples["text"], truncation=True, max_length=max_length)

In [None]:
# Re-tokenize both splits with the tokenizer of the current model.
tokenized_train = train_ds.map(preprocess_function, batched=True)
tokenized_test = test_ds.map(preprocess_function, batched=True)

In [None]:
# Display the columns and row count of the tokenized training split.
tokenized_train

In [None]:
# Collator that pads the examples of each batch to a common length using this
# tokenizer's padding settings.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy and binary F1 for a Trainer evaluation pass.

    Args:
        eval_pred: pair `(logits, labels)`; the predicted class is the argmax
            of the logits over the last axis.

    Returns:
        dict with "accuracy" and "f1" as Python floats. F1 is the binary score
        of the positive class (label 1), the default of the `f1` metric that
        was loaded here before.

    The scores are computed directly with numpy instead of calling the
    deprecated `load_metric`, which reloaded both metric scripts on every
    evaluation.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    labels = np.asarray(labels)

    # Guard the empty case so np.mean does not return NaN.
    accuracy = float(np.mean(predictions == labels)) if labels.size else 0.0

    true_positive = int(np.sum((predictions == 1) & (labels == 1)))
    predicted_positive = int(np.sum(predictions == 1))
    actual_positive = int(np.sum(labels == 1))
    # F1 = 2*TP / (2*TP + FP + FN) = 2*TP / (predicted positives + actual positives)
    denominator = predicted_positive + actual_positive
    f1 = 2.0 * true_positive / denominator if denominator else 0.0
    return {"accuracy": accuracy, "f1": f1}


In [None]:
# Directory that receives the training checkpoints (it is the Trainer's
# output_dir) and, later, the saved pipeline.
repo_name = NAME+"_2k"
 
training_args = TrainingArguments(
   output_dir=repo_name,
   learning_rate=2e-5,
   # Batch size is 8 here, versus 16 in the other model sections.
   per_device_train_batch_size=8,
   per_device_eval_batch_size=8,
   num_train_epochs=10,
   weight_decay=0.01,
   # NOTE(review): a checkpoint is written after every epoch and no
   # save_total_limit is set, so all ten are kept on disk.
   save_strategy="epoch",
)

# NOTE(review): no evaluation schedule is configured here; metrics are only
# produced by the explicit trainer.evaluate() call further down.
trainer = Trainer(
   model=model,
   args=training_args,
   train_dataset=tokenized_train,
   eval_dataset=tokenized_test,
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics,
)

In [None]:
# Release cached GPU memory once more right before training. torch is already
# imported in the first section, so the re-import is a harmless no-op.
import torch
torch.cuda.empty_cache()

In [None]:
# Fine-tune the model for the number of epochs set in training_args.
trainer.train()

In [None]:
# Score the fine-tuned model on the held-out split (eval_dataset), including
# the accuracy and F1 from compute_metrics.
trainer.evaluate()

In [None]:
# Build the pipeline around the model that was just fine-tuned. Calling
# pipeline('sentiment-analysis') without a model downloads the default English
# SST-2 DistilBERT checkpoint, so the save/reload cells that follow would
# persist and query that unrelated model instead of this one.
pipe = pipeline('sentiment-analysis', model=trainer.model, tokenizer=tokenizer, device=trainer.model.device)

In [None]:
# Write the pipeline's model and tokenizer files into repo_name, the same
# directory the Trainer uses as output_dir.
pipe.save_pretrained(repo_name)

In [None]:
# Reload the pipeline from the files saved in repo_name and score two sample
# sentences (roughly "I liked it" / "I did not like it").
sentiment_model = pipeline('sentiment-analysis',model=repo_name)
sentiment_model(["عجبتني", "ماعجبتنيش"])

In [None]:
# Remove the UBC-NLP/ output directory.
# NOTE(review): rmdir only deletes empty directories, so this fails while
# anything is still stored under UBC-NLP/ — confirm the intent.
!rmdir UBC-NLP/

In [None]:
# Recursively delete the multilingual-BERT output directory, presumably to free
# disk space. This removes everything saved under that repo_name.
!rm -r bert-base-multilingual-cased_2k/