# BERT Review Sentiment

Testing out BERT by predicting the sentiment of ten thousand randomly sampled English-language Steam reviews.

First, we will need to install any necessary Python packages.

In [None]:
!pip install transformers

We can then import and set up the required modules and initialise some constant variables.

In [None]:
# imports
from csv import reader as csv_reader
from google.colab import drive
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
from transformers import BertForSequenceClassification
import nltk
import numpy as np
import pandas as pd
import tensorflow as tf

# constants
# Mount point handed to drive.mount() below.
PATH_GDRIVE = '/content/drive'
# CSV file opened by load_data(). The path is relative, so it resolves against
# the current working directory.
# NOTE(review): presumably relies on the working directory being /content so
# that it lands inside the mounted drive -- confirm.
PATH_DATA = 'drive/MyDrive/MSc Dissertation/review_sentiments/dataset_50k.csv'
# NOTE(review): not referenced by the code visible in this notebook; presumably
# the location where the trained model is to be saved -- confirm.
PATH_MODEL = 'drive/MyDrive/MSc Dissertation/review_sentiments/model_50k'
# Column names of the DataFrame built by load_data().
KEY_TEXT = 'text'
KEY_LABEL = 'polarity'
# Checkpoint name passed to the from_pretrained() calls.
BERT_MODEL = 'bert-base-uncased'

# module initialisation
# Both calls run as side effects when this cell executes.
drive.mount(PATH_GDRIVE)
nltk.download('stopwords')

We will then create functions to load, split and tokenize the review data. Our dataset will be split into three sets: a training set and a validation set, which are used to fit and tune the predictive model, and a held-out test set, which is used to evaluate the trained model.

In [None]:
def load_data():
    """Load the review dataset from PATH_DATA into a shuffled DataFrame.

    Every non-empty CSV row is read as ``text,polarity``: column 0 is kept as
    the review text and column 1 is parsed as an integer label.

    Returns:
        A pandas DataFrame with the columns KEY_TEXT and KEY_LABEL. The rows
        are in random order and the index is renumbered from 0.

    Raises:
        IndexError: if a non-empty row has fewer than two columns.
        ValueError: if the second column of a row is not an integer.
    """
    texts, labels = [], []
    # newline='' hands line-ending handling to the csv module, as its
    # documentation requires, so newlines embedded in quoted review text are
    # read back unchanged.
    with open(PATH_DATA, 'r', encoding='utf-8', newline='') as f:
        for review in csv_reader(f, delimiter=','):
            if not review:
                # the csv reader yields [] for a blank line (e.g. a trailing
                # one); there is nothing to load from it
                continue
            texts.append(review[0])  # review text (string)
            labels.append(int(review[1]))  # review polarity (int: 0, 1)
    data = pd.DataFrame.from_dict({KEY_TEXT: texts, KEY_LABEL: labels})
    # sample(frac=1) shuffles every row; drop the old index afterwards
    return data.sample(frac=1).reset_index(drop=True)

def split_data(data):
    """Partition *data* into training, validation and test subsets.

    Half of the rows are set aside as the test set. The remaining half is
    then divided 80/20 into a training set and a validation set.

    Returns:
        A ``(train, validation, test)`` tuple.
    """
    remainder, held_out = train_test_split(data, test_size=0.5)
    fit_part, val_part = train_test_split(remainder, test_size=0.2)
    return fit_part, val_part, held_out

def train_model(data, tokenizer, model):
    """Placeholder for model training; nothing is implemented yet.

    All three arguments are accepted but ignored, and None is returned.
    """
    return None

def split_and_tokenize_data(data):
    """Split *data* 80/20 and encode both parts as TensorFlow datasets.

    The KEY_TEXT and KEY_LABEL columns of *data* are converted to lists and
    split into a training part and a validation part. Each part is tokenized
    with the pre-trained BERT_MODEL tokenizer and wrapped in a
    ``tf.data.Dataset`` of ``(encoded features dict, label)`` pairs.

    Args:
        data: DataFrame with KEY_TEXT and KEY_LABEL columns.

    Returns:
        A ``(train_dataset, validation_dataset)`` tuple.
    """
    # split data
    train_x, val_x, train_y, val_y = train_test_split(
        data[KEY_TEXT].tolist(),
        data[KEY_LABEL].tolist(),
        test_size=0.2,
    )
    # load tokenizer
    tokenizer = BertTokenizer.from_pretrained(BERT_MODEL)

    def to_dataset(texts, labels):
        # Named helper rather than lambdas: the original bound one to the name
        # `format`, which shadowed the builtin.
        encodings = tokenizer(texts, truncation=True, padding=True)
        return tf.data.Dataset.from_tensor_slices((dict(encodings), labels))

    # encode and format the data
    train_data = to_dataset(train_x, train_y)
    val_data = to_dataset(val_x, val_y)
    return train_data, val_data

def main():
    """Load and split the review data, then load pre-trained BERT objects.

    The three data subsets, the tokenizer and the classifier are created but
    not used any further yet.

    NOTE(review): BertForSequenceClassification is presumably the PyTorch
    model class of transformers, whereas split_and_tokenize_data builds
    tf.data datasets -- confirm which framework is intended.
    """
    # load and split the data
    train_set, val_set, test_set = split_data(load_data())
    # load a pre-trained tokenizer and classifier
    bert_tokenizer = BertTokenizer.from_pretrained(BERT_MODEL)
    bert_classifier = BertForSequenceClassification.from_pretrained(
        BERT_MODEL,
        num_labels=2,
    )

# run the pipeline when this cell is executed
main()