<a href="https://colab.research.google.com/github/fouad89/arabicNLP/blob/main/AraBERT_intent.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#installing dependencies

In [11]:
import torch

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))
    !nvidia-smi

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla P100-PCIE-16GB
Fri Jun 10 15:16:55 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   49C    P0    31W / 250W |      2MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------

In [12]:
#!pip install optuna==2.3.0
!pip install transformers==4.2.1
!pip install farasapy
!pip install pyarabic
!git clone https://github.com/aub-mind/arabert

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
fatal: destination path 'arabert' already exists and is not an empty directory.


In [13]:
# !git clone https://github.com/elnagara/HARD-Arabic-Dataset
# !git clone https://github.com/mahmoudnabil/ASTD
# !git clone https://github.com/nora-twairesh/AraSenti
# !git clone https://github.com/mohamedadaly/LABR
# !wget http://homepages.inf.ed.ac.uk/wmagdy/Resources/ArSAS.zip
# !unzip ArSAS.zip
# !unrar x '/content/HARD-Arabic-Dataset/data/unbalanced-reviews.rar'
# !unzip '/content/HARD-Arabic-Dataset/data/balanced-reviews.zip'
!git clone https://github.com/fouad89/arabicNLP

fatal: destination path 'arabicNLP' already exists and is not an empty directory.


In [14]:
!mkdir data
!mkdir train

mkdir: cannot create directory ‘data’: File exists
mkdir: cannot create directory ‘train’: File exists


#Creating training datasets

In [15]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [16]:
import pandas as pd
import numpy as np

from tqdm import tqdm_notebook as tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

all_datasets= []

In [17]:
class Dataset:
    def __init__(
        self,
        name,
        train,
        test,
        label_list,
    ):
        self.name = name
        self.train = train
        self.test = test
        self.label_list = label_list

In [18]:
DATA_COLUMN = "text"
LABEL_COLUMN = "label"

## Intent Detection Original

In [33]:
# reading the data
file_path = "/content/drive/MyDrive/Thesis implementation/annotated_data/augmented_data.csv"
df = pd.read_csv(file_path, index_col=0)
df = df[['text', 'intent', 'strength']]
label_mapping = {
    0: 'NEG',
    1: 'POS'
}
label_list = ['NEG', 'POS']

# intent detection
df.intent.value_counts()
df_intent = df[['text', 'intent']]
df_intent['intent'] = df_intent['intent'].map(label_mapping)
print(df_intent.head())
# strength of intent
df_strength = df.loc[df['intent']==1,['text', 'strength']]
df_strength['strength'] = df_strength['strength'].map(label_mapping)
# splitting the data to train and test
intent_train, intent_test = train_test_split(df_intent, test_size=0.2,
                                              random_state=42, stratify=df_intent['intent'])

strength_train, strength_test = train_test_split(df_strength, test_size=0.2,
                                                 random_state=42, stratify=df_strength['strength'])
# convert to dataset
intent_dataset = Dataset('intent', intent_train, intent_test, label_list)
all_datasets.append(intent_dataset)
strength_dataset = Dataset('strength', strength_train, strength_test, label_list)
all_datasets.append(strength_dataset)

                                                      text intent
Column1                                                          
0.0      حين أهاجر موسم التعب هذا، سأعيد ترميم ما يشبه ...    NEG
1.0      @DrTabarakAhmed1 فد مرة تبارك ما كتبت اريد اها...    NEG
2.0      @DrTabarakAhmed1 خل اراجع تغريداتج اتأكد صار ك...    NEG
3.0                      @h_aw25 اكدر اهاجر بس اني ما اريد    NEG
4.0      راح أهاجر وأنسى كل هذا العذاب\nلأن حيل العيشه ...    POS


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [32]:
intent_dataset.train

Unnamed: 0_level_0,text,intent
Column1,Unnamed: 1_level_1,Unnamed: 2_level_1
138.0,بلادي وان جارت علي عزيزه.....قومي وان شحوا علي...,NEG
14.0,@alithesecond1 امنيتي الجديدة اهاجر,POS
2756.0,@alalam_news المالكي سبب من الفساد للعراق ما ف...,NEG
3731.0,يكفل الدستور العراقي حق التعلم المجاني. اين نح...,NEG
1041.0,حرامات ما عدنا زارا بالعراق 💔 كل ما اسافر اشتر...,NEG
...,...,...
3332.0,@soziraq شكد جميله تخبلين والله لو اعرفج او ان...,NEG
227.0,@EdyCohen حيا الله إسرائيل شعب ودولة محافظة عل...,NEG
480.0,أميرة زنكنة: هجرة الشباب من العراق النفطي وصمة...,NEG
243.0,@FatenJawad4 حوار شوووو كل حكوماتنا ع نفس فساد...,NEG


##HARD - Balanced

In [None]:
df_HARD = pd.read_csv("arabicNLP/data/Tweets.txt", sep="\t", header=None)

# df_HARD = df_HARD[["review","rating"]]  # we are interested in rating and review only
df_HARD.columns = [DATA_COLUMN, LABEL_COLUMN]
print(df_HARD[LABEL_COLUMN].value_counts())
# # code rating as +ve if > 3, -ve if less, no 3s in dataset

# hard_map = {
#     5: 'POS',
#     4: 'POS',
#     2: 'NEG',
#     1: 'NEG'
# }
label_encoder = LabelEncoder()
# df_HARD['label'] = label_encoder.fit_transform(df_HARD['label'])

# mapping display
# encoder_map = dict(zip( label_encoder.transform(label_encoder.classes_), label_encoder.classes_))
label_list_HARD = ['NEG', 'POS']

df_HARD = df_HARD[df_HARD.label.isin(label_list_HARD)]


# print classes
# print(encoder_map)
# print(label_encoder.classes_ )
# df_HARD[LABEL_COLUMN] = df_HARD[LABEL_COLUMN].apply(lambda x: hard_map[x])
train_HARD, test_HARD = train_test_split(df_HARD, test_size=0.2, random_state=42)

# convert to dataset
data_Hard = Dataset("HARD", train_HARD, test_HARD, label_list_HARD)
all_datasets.append(data_Hard)
# df_HARD

OBJ        6470
NEG        1642
NEUTRAL     805
POS         777
Name: label, dtype: int64


In [None]:
data_Hard.train.label.value_counts()

NEG    1322
POS     613
Name: label, dtype: int64

#Trainer

In [35]:
from arabert.preprocess import ArabertPreprocessor
import numpy as np
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix, precision_score , recall_score

from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer, BertTokenizer
from transformers.data.processors import SingleSentenceClassificationProcessor
from transformers import Trainer , TrainingArguments
from transformers.trainer_utils import EvaluationStrategy
from transformers.data.processors.utils import InputFeatures
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from sklearn.utils import resample
import logging
import torch
# import optuna 

In [36]:
logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)

In [37]:
for x in all_datasets:
  print(x.name)

intent
strength


You can choose which model, and dataset from here along with the max sentence length

In [None]:
dataset_name = 'intent'
model_name = 'aubmindlab/bert-base-arabertv02'
task_name = 'classification'
max_len = 256

In [39]:
for d in all_datasets:
  if d.name==dataset_name:
    selected_dataset = d
    print('Dataset found')
    break

Dataset found


In [40]:
arabert_prep = ArabertPreprocessor(model_name.split("/")[-1])

selected_dataset.train[DATA_COLUMN] = selected_dataset.train[DATA_COLUMN].apply(lambda x:   arabert_prep.preprocess(x))
selected_dataset.test[DATA_COLUMN] = selected_dataset.test[DATA_COLUMN].apply(lambda x:   arabert_prep.preprocess(x))  

In [41]:
selected_dataset.train

Unnamed: 0_level_0,text,intent
Column1,Unnamed: 1_level_1,Unnamed: 2_level_1
138.0,بلادي وان جارت علي عزيزه . . قومي وان شحوا علي...,NEG
14.0,[مستخدم] امنيتي الجديدة اهاجر,POS
2756.0,[مستخدم] المالكي سبب من الفساد للعراق ما فاض ع...,NEG
3731.0,يكفل الدستور العراقي حق التعلم المجاني . اين ن...,NEG
1041.0,حرامات ما عدنا زارا بالعراق كل ما اسافر اشتري ...,NEG
...,...,...
3332.0,[مستخدم] شكد جميله تخبلين والله لو اعرفج او ان...,NEG
227.0,[مستخدم] حيا الله إسرائيل شعب ودولة محافظة على...,NEG
480.0,أميرة زنكنة : هجرة الشباب من العراق النفطي وصم...,NEG
243.0,[مستخدم] حوار شوو كل حكوماتنا ع نفس فساد واحد ...,NEG


In [42]:
class BERTDataset(Dataset):
    def __init__(self, text, target, model_name, max_len, label_map):
      super(BERTDataset).__init__()
      self.text = text
      self.target = target
      self.tokenizer_name = model_name
      self.tokenizer = AutoTokenizer.from_pretrained(model_name)
      self.max_len = max_len
      self.label_map = label_map
      

    def __len__(self):
      return len(self.text)

    def __getitem__(self,item):
      text = str(self.text[item])
      text = " ".join(text.split())


        
      input_ids = self.tokenizer.encode(
          text,
          add_special_tokens=True,
          max_length=self.max_len,
          truncation='longest_first'
      )     
    
      attention_mask = [1] * len(input_ids)

      # Zero-pad up to the sequence length.
      padding_length = self.max_len - len(input_ids)
      input_ids = input_ids + ([self.tokenizer.pad_token_id] * padding_length)
      attention_mask = attention_mask + ([0] * padding_length)    
      
      return InputFeatures(input_ids=input_ids, attention_mask=attention_mask, label=self.label_map[self.target[item]])

In [45]:
label_map = { v:index for index, v in enumerate(selected_dataset.label_list) }
print(label_map)
intent_train_dataset = BERTDataset(selected_dataset.train[DATA_COLUMN].to_list(),selected_dataset.train['intent'].to_list(),model_name,max_len,label_map)
intent_test_dataset = BERTDataset(selected_dataset.test[DATA_COLUMN].to_list(),selected_dataset.test['intent'].to_list(),model_name,max_len,label_map)



{'NEG': 0, 'POS': 1}


In [46]:
train_dataset.label_map

{'NEG': 0, 'POS': 1}

In [47]:
def model_init():
    return AutoModelForSequenceClassification.from_pretrained(model_name, return_dict=True, num_labels=len(label_map))

In [48]:
def compute_metrics(p): #p should be of type EvalPrediction
  preds = np.argmax(p.predictions, axis=1)
  assert len(preds) == len(p.label_ids)
  #print(classification_report(p.label_ids,preds))
  #print(confusion_matrix(p.label_ids,preds))

  macro_f1_pos_neg = f1_score(p.label_ids,preds,average='macro',labels=[0,1])
  macro_f1 = f1_score(p.label_ids,preds,average='macro')
  macro_precision = precision_score(p.label_ids,preds,average='macro')
  macro_recall = recall_score(p.label_ids,preds,average='macro')
  acc = accuracy_score(p.label_ids,preds)
  return {
      'macro_f1' : macro_f1,
      'macro_f1_pos_neg' : macro_f1_pos_neg,  
      'macro_precision': macro_precision,
      'macro_recall': macro_recall,
      'accuracy': acc
  }

#Regular Training

This paert allows you to do a regular training with no hyper parameter optimization

In [49]:
training_args = TrainingArguments("./train")
training_args.evaluate_during_training = True
training_args.adam_epsilon = 1e-8
training_args.learning_rate = 5e-5
training_args.fp16 = True
training_args.per_device_train_batch_size = 16
training_args.per_device_eval_batch_size = 16
training_args.gradient_accumulation_steps = 2
training_args.num_train_epochs= 5


steps_per_epoch = (len(selected_dataset.train)// (training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps))
total_steps = steps_per_epoch * training_args.num_train_epochs
print(steps_per_epoch)
print(total_steps)
#Warmup_ratio
warmup_ratio = 0.1
training_args.warmup_steps = total_steps*warmup_ratio # or you can set the warmup steps directly 

training_args.evaluation_strategy = EvaluationStrategy.EPOCH
# training_args.logging_steps = 200
training_args.save_steps = 100000 #don't want to save any model, there is probably a better way to do this :)
training_args.seed = 42
training_args.disable_tqdm = False
training_args.lr_scheduler_type = 'cosine'

116
580


In [50]:
trainer = Trainer(
    model = model_init(),
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

Downloading:   0%|          | 0.00/543M [00:00<?, ?B/s]

Some weights of the model checkpoint at aubmindlab/bert-base-arabertv02 were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification wer

In [51]:
trainer.train()

Epoch,Training Loss,Validation Loss,Macro F1,Macro F1 Pos Neg,Macro Precision,Macro Recall,Accuracy,Runtime,Samples Per Second
0,No log,0.318349,0.77917,0.77917,0.799985,0.763484,0.861141,7.67,121.122
1,No log,0.282101,0.841621,0.841621,0.825814,0.862181,0.888052,7.6685,121.144
2,No log,0.230129,0.883504,0.883504,0.880082,0.887066,0.921421,7.6652,121.196
3,No log,0.225309,0.9037,0.9037,0.896125,0.911958,0.934338,7.6669,121.171
4,0.209200,0.22949,0.898109,0.898109,0.888227,0.909226,0.930032,7.6595,121.287


TrainOutput(global_step=580, training_loss=0.18757101913978314, metrics={'train_runtime': 507.1602, 'train_samples_per_second': 1.144, 'total_flos': 3855818601947136, 'epoch': 5.0})

In [52]:
predictions = trainer.predict(intent_test_dataset)

In [53]:
print(predictions.predictions.shape, predictions.label_ids.shape)


(929, 2) (929,)


In [54]:
print(predictions.label_ids)

[0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 1 0 0 1 0 1 0 0 0 1 0 0 0 0 0 1 0 0
 1 1 1 0 0 1 0 0 0 1 1 0 0 0 1 1 1 0 0 1 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0
 0 0 0 0 0 1 1 0 0 0 1 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 1 1 0 0 0 0
 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0
 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 0 1 0 0 1 1 1 0 0 0 1 0
 0 0 0 0 1 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0 1 0 0 1 0
 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1 0 1 0 1 1
 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 1 0 1 0 0 0 1
 0 0 0 0 1 0 0 0 1 1 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0
 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0
 1 0 1 0 1 0 0 1 1 1 0 0 1 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0
 0 0 0 0 0 1 1 0 0 0 1 0 0 0 1 0 1 0 0 1 1 0 0 1 0 1 1 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 1 1 0 0 0 0 0 

Unnamed: 0,text,target,predictions


In [56]:
reversed_label_map = {v:k for (k,v) in train_dataset.label_map.items()}

In [57]:
results_df = pd.DataFrame(columns=['text', 'target', 'predictions'])
results_df
predictions = trainer.predict(intent_test_dataset)
results_df.text = test_dataset.text
results_df.target = test_dataset.target
results_df.predictions = predictions.label_ids
results_df.predictions = results_df.predictions.map(reversed_label_map)

results_df

Unnamed: 0,text,target,predictions
0,للحديث عن منظومة الاخلاق والقيم والنفاق العالم...,NEG,NEG
1,ها هم أبناء بلدي الغيارى ، شعب مهجر مشرد ممزق ...,NEG,NEG
2,وصوتك وطن ، وأنا مغترب أحن !,NEG,NEG
3,[مستخدم] وإنك الروح التي أهرب إليها دون علمك ....,NEG,NEG
4,[مستخدم] عذرا عزيزي ان مسافر يوم الاحد ستكون ع...,NEG,NEG
...,...,...,...
924,اريد اهاجر وترك . . الدجاجات [رابط],POS,POS
925,[مستخدم] يش غير مقصره والله هم أهل التقصير علا...,POS,POS
926,اكو مؤتمر Project management بالكويت والبحرين ...,NEG,NEG
927,مسافر باجر نبلش شغل النوم شلون بي,NEG,NEG


In [62]:
results_df[results_df['target']!=results_df['predictions']]


Unnamed: 0,text,target,predictions


In [None]:
reversed_label_map

{0: 'NEG', 1: 'POS'}

In [68]:
trainer.save_model("/content/drive/MyDrive/Thesis implementation/Arabert_fine-tuned/intent")

# Strength Classification

In [71]:
dataset_name = 'strength'
model_name = 'aubmindlab/bert-base-arabertv02'
task_name = 'classification'
max_len = 256
for d in all_datasets:
  if d.name==dataset_name:
    selected_dataset = d
    print('Dataset found')
    break

arabert_prep = ArabertPreprocessor(model_name.split("/")[-1])

selected_dataset.train[DATA_COLUMN] = selected_dataset.train[DATA_COLUMN].apply(lambda x:   arabert_prep.preprocess(x))
selected_dataset.test[DATA_COLUMN] = selected_dataset.test[DATA_COLUMN].apply(lambda x:   arabert_prep.preprocess(x))

label_map = { v:index for index, v in enumerate(selected_dataset.label_list) }
print(label_map)
strength_train_dataset = BERTDataset(selected_dataset.train[DATA_COLUMN].to_list(),selected_dataset.train['strength'].to_list(),model_name,max_len,label_map)
strength_test_dataset = BERTDataset(selected_dataset.test[DATA_COLUMN].to_list(),selected_dataset.test['strength'].to_list(),model_name,max_len,label_map)

Dataset found
{'NEG': 0, 'POS': 1}


In [72]:
strength_trainer = Trainer(
    model = model_init(),
    args = training_args,
    train_dataset = strength_train_dataset,
    eval_dataset=strength_train_dataset,
    compute_metrics=compute_metrics,
)
strength_trainer.train()

Some weights of the model checkpoint at aubmindlab/bert-base-arabertv02 were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification wer

Epoch,Training Loss,Validation Loss,Macro F1,Macro F1 Pos Neg,Macro Precision,Macro Recall,Accuracy,Runtime,Samples Per Second
1,No log,0.611336,0.511135,0.511135,0.774145,0.558879,0.681067,6.4827,121.401
2,No log,0.556192,0.67459,0.67459,0.681466,0.670701,0.709022,6.4829,121.395
3,No log,0.385295,0.729048,0.729048,0.870635,0.712559,0.792884,6.4823,121.408
4,No log,0.170663,0.935446,0.935446,0.957762,0.920996,0.942821,6.4824,121.405
5,No log,0.098699,0.979022,0.979022,0.985577,0.973404,0.98094,6.4855,121.347


TrainOutput(global_step=125, training_loss=0.44864358520507813, metrics={'train_runtime': 132.0236, 'train_samples_per_second': 0.947, 'total_flos': 817139497989120, 'epoch': 5.0})

In [73]:
strength_trainer.save_model("/content/drive/MyDrive/Thesis implementation/Arabert_fine-tuned/strength")

In [87]:
strength_results_df = pd.DataFrame(columns=['text', 'target', 'predictions'])
strength_results_df
strength_predictions = strength_trainer.predict(strength_test_dataset)
strength_results_df.text = strength_test_dataset.text
strength_results_df.target = strength_test_dataset.target
strength_results_df.predictions = strength_predictions.label_ids.tolist()
strength_results_df.predictions = strength_results_df.predictions.map({0:'NEG', 1:'POS'})

strength_results_df

Unnamed: 0,text,target,predictions
0,[مستخدم] راح اهاجر للسويد,POS,POS
1,ادعو جميع العراقيين الهجرة الى دولة الارجنتين ...,NEG,NEG
2,[مستخدم] ادعيلي اطلع من اول عشرين ع العالم حته...,POS,POS
3,# روسيا بيها لجوء,POS,POS
4,ماعرف شلون اريد اسافر وحدي واني لحد هسه اخاف ا...,NEG,NEG
...,...,...,...
192,[مستخدم] احم احم البركة بولد المسؤولين الجبناء...,NEG,NEG
193,# مزاجي _ يتعدل _ لما أسافر وما أرجع بعد,NEG,NEG
194,[مستخدم] اني عراقيه ولطلب لجوء من دوله إسرائيل,POS,POS
195,امنيتي اسافر الى حبيبتي # سويسرا هع شوف السبب ...,NEG,NEG


In [88]:
strength_results_df[strength_results_df['target']!=strength_results_df['predictions']]

Unnamed: 0,text,target,predictions
