# BERT Review Sentiment

Testing out BERT by predicting the sentiment of ten thousand randomly sampled English-language Steam reviews.

First, we will need to install some Python packages.

In [2]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.15.0-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 9.4 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 56.9 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 73.5 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 71.2 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 4.4 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  A

We can then import and set up the required modules and define some constants.

In [3]:
# imports
from csv import reader as csv_reader
from google.colab import drive
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
from transformers import BertForSequenceClassification
import nltk
import numpy as np
import pandas as pd
import tensorflow as tf

# constants
PATH_GDRIVE = '/content/drive'
# Project folder on the mounted Drive. Anchoring it to PATH_GDRIVE keeps the
# paths valid regardless of the notebook's current working directory.
PATH_PROJECT = PATH_GDRIVE + '/MyDrive/MSc Dissertation/review_sentiments'
PATH_DATA = PATH_PROJECT + '/dataset_10k.csv'
# NOTE(review): this used to be an exact copy of PATH_DATA, so saving a model
# here would have overwritten the dataset. The name below is a placeholder
# chosen during review -- rename it if a different location is preferred.
PATH_MODEL = PATH_PROJECT + '/bert_model'
KEY_TEXT = 'text'        # DataFrame column holding the review text
KEY_LABEL = 'polarity'   # DataFrame column holding the 0/1 sentiment label
BERT_MODEL = 'bert-base-uncased'

# module initialisation
# Mount Google Drive at PATH_GDRIVE; the dataset path points inside this mount.
drive.mount(PATH_GDRIVE)
# Download the NLTK 'stopwords' corpus backing the `nltk.corpus.stopwords` import.
nltk.download('stopwords')

Mounted at /content/drive
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

We will then create functions to load, split and tokenize the review data.

In [4]:
def load_data(path=None, random_state=None):
    """Read the review CSV and return it as a shuffled DataFrame.

    Each non-empty CSV row is expected to hold the review text in column 0
    and an integer polarity in column 1.

    Args:
        path: CSV file to read. Defaults to ``PATH_DATA`` when omitted.
        random_state: Seed forwarded to ``DataFrame.sample`` so the shuffle
            can be reproduced. ``None`` (the default) keeps the previous
            unseeded behaviour.

    Returns:
        A ``pandas.DataFrame`` with columns ``KEY_TEXT`` and ``KEY_LABEL``,
        rows in shuffled order and the index reset to 0..n-1.

    Raises:
        ValueError: if a polarity cell cannot be parsed as an integer.
        IndexError: if a non-empty row has fewer than two columns.
    """
    if path is None:
        path = PATH_DATA
    texts, labels = [], []
    # newline='' hands line-ending handling to the csv module, so newlines
    # embedded in quoted review text are preserved instead of being rewritten.
    with open(path, 'r', encoding='utf-8', newline='') as f:
        for review in csv_reader(f, delimiter=','):
            if not review:
                # csv.reader yields [] for blank lines; skip them rather than
                # failing on review[0].
                continue
            texts.append(review[0])        # review text (string)
            labels.append(int(review[1]))  # review polarity (int: 0, 1)
    reviews = pd.DataFrame({KEY_TEXT: texts, KEY_LABEL: labels})
    return reviews.sample(frac=1, random_state=random_state).reset_index(drop=True)

def split_and_tokenize_data(data, test_size=0.5, random_state=None):
    """Split the reviews into train/validation sets and tokenize both for BERT.

    Args:
        data: DataFrame with a ``KEY_TEXT`` column of review strings and a
            ``KEY_LABEL`` column of labels.
        test_size: Share of the rows held out for validation, forwarded to
            ``train_test_split``. Defaults to 0.5, the value that was
            previously hard-coded.
        random_state: Seed forwarded to ``train_test_split`` so the split can
            be reproduced. ``None`` (the default) keeps the previous unseeded
            behaviour.

    Returns:
        A ``(train_data, val_data)`` tuple of ``tf.data.Dataset`` objects,
        each yielding ``(features, label)`` pairs where ``features`` is the
        tokenizer output converted to a plain dict.
    """
    # split data
    train_x, val_x, train_y, val_y = train_test_split(
        data[KEY_TEXT].tolist(),
        data[KEY_LABEL].tolist(),
        test_size=test_size,
        random_state=random_state,
    )
    tokenizer = BertTokenizer.from_pretrained(BERT_MODEL)

    def to_dataset(texts, labels):
        # Tokenize with truncation and padding enabled, then pair the
        # encodings with their labels as a TensorFlow dataset.
        encodings = tokenizer(texts, truncation=True, padding=True)
        return tf.data.Dataset.from_tensor_slices((dict(encodings), labels))

    # encode and format the data
    train_data = to_dataset(train_x, train_y)
    val_data = to_dataset(val_x, val_y)
    return train_data, val_data



# NOTE(review): the datasets built below are tf.data.Dataset objects, while
# BertForSequenceClassification is the PyTorch model class; the TensorFlow
# counterpart (TFBertForSequenceClassification) is presumably what is needed
# here -- confirm before enabling this line.
#model = BertForSequenceClassification.from_pretrained(BERT_MODEL, num_labels=2)

data = load_data()
# Bind both splits to names so later cells can train and evaluate on them;
# previously the returned datasets were only echoed and then lost.
train_data, val_data = split_and_tokenize_data(data)
train_data, val_data


Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

(<TensorSliceDataset shapes: ({input_ids: (512,), token_type_ids: (512,), attention_mask: (512,)}, ()), types: ({input_ids: tf.int32, token_type_ids: tf.int32, attention_mask: tf.int32}, tf.int32)>,
 <TensorSliceDataset shapes: ({input_ids: (512,), token_type_ids: (512,), attention_mask: (512,)}, ()), types: ({input_ids: tf.int32, token_type_ids: tf.int32, attention_mask: tf.int32}, tf.int32)>)