# Weakly Learning on IMDB Reviews

Learning using BERT base uncased over the weak labels of the IMDB dataset

## Setup

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

sns.set_style('white')
sns.set_context('notebook')
%matplotlib inline

## Loading Dataset

In [2]:
train_df = pd.read_json('./train.json')
temp_df = pd.read_json('./valid.json')

In [3]:
train_df.head()

Unnamed: 0,text,label,weak_labels
1,This movie is a great. The plot is very true t...,1,1
2,"George P. Cosmatos' ""Rambo: First Blood Part I...",0,1
5,While this movie's style isn't as understated ...,1,0
7,"really awful... lead actor did OK... the film,...",0,0
9,Home Room deals with a Columbine-like high-sch...,1,1


In [4]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12506 entries, 1 to 26542
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   text         12506 non-null  object
 1   label        12506 non-null  int64 
 2   weak_labels  12506 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 390.8+ KB


In [5]:
temp_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13053 entries, 0 to 28706
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   text         13053 non-null  object
 1   label        13053 non-null  int64 
 2   weak_labels  13053 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 407.9+ KB


In [6]:
valid_df = temp_df.iloc[:10000, :].reset_index(drop=True)
test_df = temp_df.iloc[10000:, :].reset_index(drop=True)

In [7]:
from datasets import Dataset, DatasetDict

train_ds = Dataset.from_pandas(train_df[['text', 'weak_labels']], split='train')
valid_ds = Dataset.from_pandas(valid_df[['text', 'weak_labels']], split='valid')
test_ds = Dataset.from_pandas(test_df[['text', 'label']], split='test')

ds = DatasetDict({'train': train_ds, 'valid': valid_ds, 'test': test_ds})

In [8]:
ds['train'] = ds['train'].remove_columns(['__index_level_0__'])

In [9]:
ds['train'] = ds['train'].rename_column('weak_labels', 'label')
ds['valid'] = ds['valid'].rename_column('weak_labels', 'label')

In [10]:
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 12506
    })
    valid: Dataset({
        features: ['text', 'label'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 3053
    })
})

## Tokenization

In [11]:
from transformers import AutoTokenizer

model_ckpt = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

Downloading tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [12]:
def tokenize_ds(field):
    '''Tokenize examples from dataset.'''
    return tokenizer(field['text'], truncation=True)

In [13]:
ds_encoded = ds.map(tokenize_ds, batched=True)

  0%|          | 0/13 [00:00<?, ?ba/s]

  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

In [14]:
ds_encoded

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 12506
    })
    valid: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3053
    })
})

## Modeling

In [15]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [16]:
from sklearn.metrics import accuracy_score, f1_score

def compute_scores(preds):
    '''Compute scores of transformers predictions.'''
    logits, labels = preds
    pred = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, pred)
    f1 = f1_score(labels, pred)
    return {'Accuracy': acc, 'F1 Score': f1}

In [17]:
id2label = {0: 'negative', 1: 'positive'}
label2id = {'negative': 0, 'positive': 1}

In [18]:
from transformers import AutoModelForSequenceClassification, TrainingArguments

In [19]:
model = (AutoModelForSequenceClassification
         .from_pretrained(model_ckpt, num_labels=2, 
                          id2label=id2label, label2id=label2id))

Downloading pytorch_model.bin:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [20]:
training_args = TrainingArguments(output_dir="./bert-base-uncased-imdb", 
                                  learning_rate=2e-5, 
                                  per_device_train_batch_size=16, 
                                  per_device_eval_batch_size=16, 
                                  num_train_epochs=2, 
                                  weight_decay=0.01, 
                                  evaluation_strategy="epoch", 
                                  save_strategy="epoch", 
                                  load_best_model_at_end=True, 
                                  logging_steps=500, 
                                  log_level="error", 
                                  push_to_hub=False)

In [21]:
from transformers import Trainer

trainer = Trainer(model=model, 
                  args=training_args, 
                  train_dataset=ds_encoded['train'], 
                  eval_dataset=ds_encoded['valid'], 
                  tokenizer=tokenizer, 
                  data_collator=data_collator, 
                  compute_metrics=compute_scores)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [22]:
trainer.train()



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss,Accuracy,F1 score
1,0.3542,0.33145,0.8687,0.896361
2,0.22,0.385913,0.8658,0.894247


TrainOutput(global_step=1564, training_loss=0.27681667054705605, metrics={'train_runtime': 1435.493, 'train_samples_per_second': 17.424, 'train_steps_per_second': 1.09, 'total_flos': 6314628328647840.0, 'train_loss': 0.27681667054705605, 'epoch': 2.0})

In [23]:
# testing the model

test_outs = trainer.predict(ds_encoded['test'])

test_outs.metrics

{'test_loss': 0.28623396158218384,
 'test_Accuracy': 0.8869963969865706,
 'test_F1 Score': 0.9127686472819218,
 'test_runtime': 45.7788,
 'test_samples_per_second': 66.69,
 'test_steps_per_second': 4.172}

In [24]:
preds = np.argmax(test_outs[0], axis=-1)
labels = test_outs[1]

In [25]:
from sklearn.metrics import classification_report

print(classification_report(labels, preds))

              precision    recall  f1-score   support

           0       0.93      0.77      0.84      1179
           1       0.87      0.96      0.91      1874

    accuracy                           0.89      3053
   macro avg       0.90      0.86      0.88      3053
weighted avg       0.89      0.89      0.88      3053

