# BERT Word Selection
In this notebook, I take a basic implementation of BERT for Sentiment Analysis and try to improve it the accuracy using text selection or summarization.

## Setup

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
pip install datasets

[0mNote: you may need to restart the kernel to use updated packages.


In [3]:
pip install transformers

[0mNote: you may need to restart the kernel to use updated packages.


In [8]:
pip install gensim==3.8.3

[0mNote: you may need to restart the kernel to use updated packages.


In [64]:
from datasets import load_dataset, load_metric
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer, pipeline, AdamW
from torch.utils.data import DataLoader
import numpy as np
import pandas as pd
from gensim.summarization import summarize
import nltk
import spacy
import re

nlp = spacy.load("en_core_web_sm")
nltk.download('punkt')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Load the dataset

In [10]:
raw_datasets = load_dataset("imdb")

Downloading builder script:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

Downloading and preparing dataset imdb/plain_text (download: 80.23 MiB, generated: 127.02 MiB, post-processed: Unknown size, total: 207.25 MiB) to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1...


Downloading data:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset imdb downloaded and prepared to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [11]:
train_dataset = raw_datasets['train'].shuffle(seed=42).select(range(2000))
test_dataset = raw_datasets['test'].shuffle(seed=42).select(range(1000))
print(len(train_dataset))
print(len(test_dataset))

2000
1000


## Load the models

In [12]:
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [80]:
# Defining metrics to be used for all models
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    metric = load_metric("accuracy")
    return metric.compute(predictions=predictions, references=labels)


training_args = TrainingArguments("test_trainer",evaluation_strategy="epoch")

## Basic Method

In [43]:
def tokenize_dataset(dataset,tokenizer):
    tokenized_dataset = []
    for item in dataset:
        tokenized = tokenizer(item["text"],padding="max_length", truncation=True)
        item.update(tokenized)
        tokenized_dataset.append(item)
    return tokenized_dataset

tokenized_train = tokenize_dataset(train_dataset,tokenizer)
tokenized_test = tokenize_dataset(test_dataset,tokenizer)

trainer = Trainer(
    model=model, 
    args=training_args, 
    train_dataset=tokenized_train, 
    eval_dataset=tokenized_test,
    compute_metrics = compute_metrics
)

In [44]:
trainer.train()
trainer.evaluate(tokenized_test)



Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.500848,0.878
2,No log,0.669341,0.87
3,No log,0.71682,0.882




TrainOutput(global_step=375, training_loss=0.06979254659016927, metrics={'train_runtime': 369.5475, 'train_samples_per_second': 16.236, 'train_steps_per_second': 1.015, 'total_flos': 1578666332160000.0, 'train_loss': 0.06979254659016927, 'epoch': 3.0})

The accuracy received from the basic method is 88.2%

## Word Selection
In this section:
- I use regex to remove HTML tagging.
- I use the Gensim Library to summarize the text to 512 token after checking token count.
- I use the tokenizer to prepare the text for training

In [86]:
from gensim.summarization import summarize

# Select the most representative sentence for each review using TextRank
def process_reviews(dataset, tokenizer, word_count=512):
    reviews = []
    for item in dataset:
        text = item['text']
        
        # Define a regular expression pattern to match HTML tags
        html_tags_pattern = re.compile(r'<.*?>')

        # Replace HTML tags with an empty string
        text = re.sub(html_tags_pattern, ' ', text)
        
        # Check if over word count and summarize
        if len(text.strip().split()) > word_count:
            text = summarize(text, word_count=word_count)
            item.update({'text': text})
        
        # Tokenize the text
        tokenized = tokenizer(text, padding="max_length", truncation=True)
        item.update(tokenized)
        reviews.append(item)
    return reviews

# Select the most representative sentence for each review in train and test dataset
tokenized_train = process_reviews(train_dataset, tokenizer)
tokenized_test = process_reviews(test_dataset, tokenizer)

In [87]:
# Define the training arguments
training_args = TrainingArguments(
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=500,
    evaluation_strategy='steps',
    eval_steps=500,
    save_total_limit=1,
    save_strategy='steps',
    save_steps=500
)

In [88]:
trainer = Trainer(
    model=model, 
    args=training_args, 
    train_dataset=tokenized_train, 
    eval_dataset=tokenized_test,
    compute_metrics = compute_metrics
)

In [90]:
trainer.train()
trainer.evaluate(tokenized_test)

Step,Training Loss,Validation Loss


{'eval_loss': 1.0537279844284058,
 'eval_accuracy': 0.888,
 'eval_runtime': 16.6963,
 'eval_samples_per_second': 59.894,
 'eval_steps_per_second': 0.479,
 'epoch': 3.0}

The accuracy for the test data is 88.8% which is 0.6% greater than the training data accuracy of 88.2%. Therefore, the summarization method is just as good if not better than the basic method.

# ChatGPT API

I tried using the ChatGPT API to summarize the text and then use that for tokenization and sentiment analysis, however I ran out of credits on the plan I was on and was unable to use the API. Below is the code I had planned to use.

In [47]:
pip install openai

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting openai
  Downloading openai-0.27.4-py3-none-any.whl (70 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m70.3/70.3 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: openai
Successfully installed openai-0.27.4
[0mNote: you may need to restart the kernel to use updated packages.


In [57]:
import openai
openai.api_key = "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"

def summarize_chatgpt(prompt, model, length=512):
    response = openai.Completion.create(
      engine=model,
      prompt=prompt,
      max_tokens=length,
      n=1,
      stop=None,
      temperature=0.5,
    )

    summary = response.choices[0].text.strip()
    return summary


In [58]:
# Select the most representative sentence for each review using TextRank
def process_reviews_chatgpt(dataset, tokenizer):
    start_prompt = """I am performing sentiment analyis on the text below. 
    Summarize the text so that it captures the key information to be used in sentiment analysis and obeys a 512 token count. 
    Remove any HTML tags and ensure that the returned text is grammatically correct.
    Only return the summarized in your response.

    START OF TEXT TO BE SUMMARIZED
    -------------------------------
    """
    end_prompt = """----------------------------
    END OF TEXT TO BE SUMMARIZED"""
    model = "davinci"
    reviews = []
    for item in dataset:
        
        # Process text using text
        prompt = start_prompt + item['text'] + end_prompt        
        text = summarize_chatgpt(prompt, model)
        
        # Tokenize the text
        tokenized = tokenizer(text, padding="max_length", truncation=True)
        item.update(tokenized)
        reviews.append(item)
    return reviews

# Select the most representative sentence for each review in train and test dataset
tokenized_train = process_reviews_chatgpt(train_dataset, tokenizer)
tokenized_test = process_reviews_chatgpt(test_dataset, tokenizer)

RateLimitError: You exceeded your current quota, please check your plan and billing details.

In [51]:
training_args = TrainingArguments("test_trainer",evaluation_strategy="epoch")



trainer = Trainer(
    model=model, 
    args=training_args, 
    train_dataset=tokenized_train, 
    eval_dataset=tokenized_test,
    compute_metrics = compute_metrics
)

In [53]:
trainer.evaluate(tokenized_test)



{'eval_loss': 0.784658670425415,
 'eval_accuracy': 0.887,
 'eval_runtime': 18.6305,
 'eval_samples_per_second': 53.675,
 'eval_steps_per_second': 3.382,
 'epoch': 3.0}

In [52]:
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.760983,0.862
2,No log,0.70105,0.888
3,No log,0.784659,0.887




TrainOutput(global_step=375, training_loss=0.042518933614095054, metrics={'train_runtime': 370.2278, 'train_samples_per_second': 16.206, 'train_steps_per_second': 1.013, 'total_flos': 1578666332160000.0, 'train_loss': 0.042518933614095054, 'epoch': 3.0})