# Weakly Supervised Learning on IMDB Reviews

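Fine-tuning BERT (base, uncased) on the weak labels of the IMDB reviews dataset. The model is trained and validated on the `weak_labels` column and tested on the `label` column.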

## Setup

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

sns.set_style('white')
sns.set_context('notebook')
%matplotlib inline

## Loading Dataset

In [2]:
# Read both JSON files from the working directory in one pass:
# `train_df` feeds training, `temp_df` is split further in the next cell.
train_df, temp_df = (
    pd.read_json(json_path)
    for json_path in ('./train.json', './valid.json')
)

In [3]:
# Positional split of `temp_df`: the first `split_at` rows become the
# validation frame, everything after them becomes the test frame.
# Both frames get a fresh 0..n-1 index.
split_at = 20000
valid_df = temp_df.iloc[:split_at].reset_index(drop=True)
test_df = temp_df.iloc[split_at:].reset_index(drop=True)

In [4]:
from datasets import Dataset, DatasetDict

# Train and validation keep the `weak_labels` column; test keeps `label`.
weak_cols = ['text', 'weak_labels']

train_ds = Dataset.from_pandas(train_df[weak_cols], split='train')
valid_ds = Dataset.from_pandas(valid_df[weak_cols], split='valid')
test_ds = Dataset.from_pandas(test_df[['text', 'label']], split='test')

# Bundle the three splits so they can be processed together.
ds = DatasetDict({
    'train': train_ds,
    'valid': valid_ds,
    'test': test_ds,
})

In [5]:
# Drop the helper index column from the train split.
# NOTE(review): the column is presumably added by `Dataset.from_pandas`
# because `train_df` carries a non-default index — confirm.
# The membership check keeps this cell idempotent: re-running it after the
# column has already been removed is a no-op instead of an error.
if '__index_level_0__' in ds['train'].column_names:
    ds['train'] = ds['train'].remove_columns(['__index_level_0__'])

In [6]:
# Expose the weak labels of the train and validation splits under the column
# name `label`, the same name the test split already uses.
# The membership check keeps this cell idempotent: once a split has been
# renamed, re-running the cell skips it instead of raising on the missing
# `weak_labels` column.
for split_name in ('train', 'valid'):
    if 'weak_labels' in ds[split_name].column_names:
        ds[split_name] = ds[split_name].rename_column('weak_labels', 'label')

In [7]:
# Show the splits with their feature names and row counts.
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25624
    })
    valid: Dataset({
        features: ['text', 'label'],
        num_rows: 20000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 6368
    })
})

## Tokenization

In [8]:
from transformers import AutoTokenizer

# Identifier of the pretrained checkpoint used for the tokenizer.
model_ckpt = 'bert-base-uncased'
# Load the tokenizer that belongs to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [9]:
def tokenize_ds(field):
    '''Tokenize the 'text' entry of a dataset example or batch.

    Uses the notebook-level `tokenizer` with truncation enabled and returns
    whatever the tokenizer produces for the given text.
    '''
    texts = field['text']
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [10]:
# Apply `tokenize_ds` to every split of `ds` in batched mode; the result
# keeps the original columns and adds the tokenizer outputs.
ds_encoded = ds.map(tokenize_ds, batched=True)

  0%|          | 0/26 [00:00<?, ?ba/s]

  0%|          | 0/20 [00:00<?, ?ba/s]

  0%|          | 0/7 [00:00<?, ?ba/s]

In [11]:
# Show the encoded splits to check which columns tokenization added.
ds_encoded

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 25624
    })
    valid: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 20000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 6368
    })
})

## Modeling

In [12]:
from transformers import DataCollatorWithPadding

# Collator built around the same tokenizer; it is handed to the Trainer
# so batches are padded when they are assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [13]:
from sklearn.metrics import accuracy_score, f1_score

def compute_scores(preds):
    '''Score a (logits, labels) pair produced during evaluation.

    The predicted class is the index of the largest logit along the last
    axis. Returns a dict with the keys 'Accuracy' and 'F1 Score'.
    '''
    scores, gold = preds
    predicted = np.argmax(scores, axis=-1)
    return {
        'Accuracy': accuracy_score(gold, predicted),
        'F1 Score': f1_score(gold, predicted),
    }

In [14]:
# Class index -> class name, and the inverse mapping derived from it so the
# two dictionaries cannot drift apart.
id2label = dict(enumerate(('negative', 'positive')))
label2id = {name: idx for idx, name in id2label.items()}

In [15]:
from transformers import AutoModelForSequenceClassification, TrainingArguments

In [16]:
# Load the pretrained checkpoint with a two-class classification head and
# attach the readable label names to the model configuration.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [17]:
# Fine-tuning configuration.
# NOTE(review): no `report_to` is passed; the recorded run output contains a
# wandb login line, so an experiment tracker appears to be picked up
# implicitly — confirm this is intended.
training_args = TrainingArguments(
    output_dir="./bert-base-uncased-imdb",
    # Optimisation settings.
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluation and checkpointing both happen once per epoch.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Logging and publishing.
    logging_steps=500,
    log_level="error",
    push_to_hub=False,
)

In [18]:
from transformers import Trainer

# Wire the model, the training configuration, the encoded splits and the
# metric function together. Training uses the train split; evaluation during
# training uses the valid split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds_encoded['train'],
    eval_dataset=ds_encoded['valid'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_scores,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [19]:
# Run fine-tuning; the returned training summary is shown as the cell output.
trainer.train()



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: Currently logged in as: [33me_hossam96[0m. Use [1m`wandb login --relogin`[0m to force relogin


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss,Accuracy,F1 score
1,0.4446,0.430442,0.79065,0.855606
2,0.3461,0.448653,0.7971,0.864571


TrainOutput(global_step=3204, training_loss=0.40822319859422546, metrics={'train_runtime': 2901.4306, 'train_samples_per_second': 17.663, 'train_steps_per_second': 1.104, 'total_flos': 1.333559472985056e+16, 'train_loss': 0.40822319859422546, 'epoch': 2.0})

In [20]:
# Run the trained model on the encoded test split and keep the full result
# for the cells below; the last line displays its metrics dictionary.

test_outs = trainer.predict(ds_encoded['test'])

test_outs.metrics

{'test_loss': 0.49959537386894226,
 'test_Accuracy': 0.7407349246231156,
 'test_F1 Score': 0.7915667213735639,
 'test_runtime': 98.4085,
 'test_samples_per_second': 64.71,
 'test_steps_per_second': 4.044}

In [21]:
# Predicted class = index of the largest model output along the last axis;
# the reference labels come from the same prediction result.
preds = np.argmax(test_outs.predictions, axis=-1)
labels = test_outs.label_ids

In [22]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 of the test predictions.
report = classification_report(labels, preds)
print(report)

              precision    recall  f1-score   support

           0       0.95      0.50      0.66      3146
           1       0.67      0.97      0.79      3222

    accuracy                           0.74      6368
   macro avg       0.81      0.74      0.72      6368
weighted avg       0.81      0.74      0.73      6368

