In [1]:
import torch
import torch.nn as nn # neural network module
import torch.optim as optim # optimizer module
from torch.utils.data import DataLoader, Dataset, TensorDataset, RandomSampler, SequentialSampler # utilities
import torchvision.transforms as transforms # torchvision (computer vision)
import torchaudio # torchaudio (audio processing)
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup, AdamW
import requests
from dotenv import load_dotenv
import os
import re 
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize as tokenize
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score
import numpy as np
from keras.preprocessing.sequence import pad_sequences

In [2]:
# STEP 1: IMPORT AND PREPROCESS DATA

# Source file (read as JSON Lines: one record per line) and the only two
# columns the rest of the notebook uses.
REVIEWS_PATH = 'Movies_and_TV.json'
REVIEW_COLUMNS = ["overall", "reviewText"]

# training data: full frame first, then the rating/text projection
train_df_unfiltered = pd.read_json(REVIEWS_PATH, lines=True)
train_df = train_df_unfiltered.loc[:, REVIEW_COLUMNS]

In [3]:
# preprocessing functions

# The English stop-word set is built once and reused; building it inside
# preprocess_text repeated identical work for every single review.
_STOP_WORDS_CACHE = None


def _english_stop_words():
    """Return the NLTK English stop words as a set, building it on first use."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        _STOP_WORDS_CACHE = set(stopwords.words('english'))
    return _STOP_WORDS_CACHE


# cleans up text in reviews
# str -> str
def preprocess_text(text):
    """Normalise one review into a space-separated string of content words.

    Steps, in order: coerce to ``str``, strip URLs, drop every character that
    is not an ASCII letter or whitespace, lower-case, tokenize, and remove
    English stop words.

    Args:
        text: The raw review. Any object is accepted because it is passed
            through ``str()`` first.

    Returns:
        The remaining tokens joined by single spaces ('' if nothing is left).
    """
    text = str(text)
    text = re.sub(r'http\S+', '', text)  # remove URLs
    # The flags MUST be passed by keyword: the fourth positional parameter of
    # re.sub is `count`, so passing re.I|re.A positionally capped the number
    # of replacements at 258 and never applied the flags at all.
    text = re.sub(r'[^a-zA-Z\s]', '', text, flags=re.I | re.A)  # remove numbers and special characters
    text = text.lower()
    stop_words = _english_stop_words()
    tokens = [word for word in tokenize(text) if word not in stop_words]
    return ' '.join(tokens)

def shift_score(score):
    """Return ``int(score) - 1``.

    Used to turn the ``overall`` rating column into zero-based class labels
    (presumably ratings 1-5 become 0-4; confirm against the dataset).
    """
    rating = int(score)
    return rating - 1
    
# Clean the review text, then overwrite the column with the cleaned version.
cleaned_reviews = train_df['reviewText'].apply(preprocess_text)
train_df['reviewText'] = cleaned_reviews

# Shift the ratings down by one so they can serve as zero-based labels.
zero_based_ratings = train_df['overall'].apply(shift_score)
train_df['overall'] = zero_based_ratings

print(train_df.head())

   overall                                         reviewText
0        4  sorry didnt purchase years ago first came good...
1        4  believe tell receive blessing watching video c...
2        4  seen x live many times early days recent reuni...
3        4  excited finally live concert video x ive seen ...
4        4  x one best punk bands ever dont even like call...


In [4]:
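# STEP 2: CREATE TEST AND VALIDATION DATASETS (currently disabled: the sklearn split in the next cell is commented out, and the train/validation split is done with random_split when the DataLoaders are built)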

In [5]:
# split data into training data and testing/validating data
#train_df, test_df = train_test_split(train_df, test_size=0.2, random_state=19)
#train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=19)
#print(train_df.head())
#print(test_df.head())
#print(val_df.head())
#print(train_df.shape[0]) # should have 64% of rows
#print(test_df.shape[0]) # should have 20% of rows
#print(val_df.shape[0]) # should have 16% of rows

In [6]:
# STEP 2: TOKENIZE DATA FOR BERT

# initialize the Bert tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Maximum sequence length in tokens, special tokens included. This is the one
# place to change it: both the tokenizer truncation and the padding read it.
max_len = 512
# Padding id comes from the tokenizer so that the fill value used by
# pad_sequences and the attention-mask test below can never disagree.
pad_token_id = tokenizer.pad_token_id

# tokenizer on every reviewText, special tokens to indicate CLS, SEP, etc. for bert.
# Truncation happens here with the same max_len, so every encoded sequence is
# already <= max_len and pad_sequences below only ever pads.
train_df['input_ids'] = train_df['reviewText'].apply(
    lambda x: tokenizer.encode(x, add_special_tokens=True, max_length=max_len, truncation=True)
)

# pad sequences shorter than max_len at the end ("post") with the pad token id
input_ids_padded = pad_sequences(
    train_df['input_ids'].tolist(),
    maxlen=max_len,
    dtype="long",
    truncating="post",
    padding="post",
    value=pad_token_id,
)
train_df['input_ids'] = input_ids_padded.tolist()

# attention mask: 1.0 for real tokens, 0.0 for padding (computed on the whole
# padded array at once instead of element by element in Python)
train_df['attention_masks'] = (input_ids_padded != pad_token_id).astype(float).tolist()
#print(train_df[['overall', 'input_ids', 'attention_masks']].head())

In [7]:
# convert lists to PyTorch-specific tensors
input_ids = torch.tensor(train_df['input_ids'].values.tolist())
attention_masks = torch.tensor(train_df['attention_masks'].values.tolist())
labels = torch.tensor(train_df['overall'].values)

# create the DataLoader for training and validation sets
batch_size = 16

dataset = TensorDataset(input_ids, attention_masks, labels)
train_size = int(0.8 * len(dataset)) # change this to make more or less of set for training/validation
val_size = len(dataset) - train_size

# Seed the split so the same rows land in train/validation on every run;
# without a generator the membership changes each time the cell executes and
# validation accuracy is not comparable between runs. 19 is the random_state
# used by the disabled train_test_split cell.
split_seed = 19
split_generator = torch.Generator().manual_seed(split_seed)
train_dataset, val_dataset = torch.utils.data.random_split(
    dataset, [train_size, val_size], generator=split_generator
)

# training batches are drawn in random order; validation batches in dataset order
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
val_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=batch_size)

In [8]:
# STEP 3: INITIALIZE MODEL (BERT)

# pick the GPU when CUDA is available, otherwise stay on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# pretrained encoder plus a freshly initialised classification head
classifier_options = dict(
    num_labels=5,  # number of unique sentiment classes
    output_attentions=False,
    output_hidden_states=False,
)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', **classifier_options)

# move the model to the chosen device (last expression, so the cell shows its repr)
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [9]:
# STEP 4: TRAIN THE MODEL

learning_rate = 2e-5
epsilon = 1e-8

# optimize parameters
# torch.optim.AdamW replaces the deprecated transformers.AdamW. weight_decay is
# set to 0.0 explicitly because that was the transformers default, whereas
# torch's own default is 0.01.
optimizer = optim.AdamW(model.parameters(), lr=learning_rate, eps=epsilon, weight_decay=0.0)

epochs = 4 # number of passes through the entire training dataset
# one scheduler step is taken per batch, so the schedule spans batches * epochs
total_steps = len(train_dataloader) * epochs

# scheduler initialization: no warmup, then linear decay of the learning rate
# over total_steps
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# training function
# NULL -> NULL
def train():
    """Run one pass over ``train_dataloader`` and print the mean batch loss.

    Uses the notebook-level ``model``, ``optimizer``, ``scheduler`` and
    ``device``. For every batch: forward pass with labels (so the model
    returns a loss), backward pass, gradient-norm clipping at 1.0, then one
    optimizer step and one scheduler step. Returns nothing.
    """
    model.train()  # training mode (dropout active)
    running_loss = 0

    for batch in train_dataloader:
        # each batch is (input ids, attention mask, labels); move all to the device
        ids, mask, targets = [tensor.to(device) for tensor in batch]

        model.zero_grad()  # drop gradients left over from the previous batch

        result = model(ids, attention_mask=mask, labels=targets)
        batch_loss = result.loss
        running_loss += batch_loss.item()

        batch_loss.backward()
        # cap the global gradient norm at 1.0 to guard against exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

    mean_loss = running_loss / len(train_dataloader)
    print(f"Training loss: {mean_loss}")

# validating function
# NULL -> NULL
def validate():
    """Score the model on ``val_dataloader`` and print the accuracy.

    Uses the notebook-level ``model`` and ``device``. The predicted class for
    each example is the argmax of its logits; predictions and true labels are
    collected across all batches and compared with ``accuracy_score``.
    Returns nothing.
    """
    model.eval()  # evaluation mode (dropout off)
    predicted, expected = [], []

    # gradient tracking is switched off by no_grad, not by eval()
    with torch.no_grad():
        for batch in val_dataloader:
            ids, mask, targets = [tensor.to(device) for tensor in batch]

            logits = model(ids, attention_mask=mask).logits
            predicted += logits.argmax(dim=1).cpu().tolist()
            expected += targets.cpu().tolist()

    accuracy = accuracy_score(expected, predicted)
    print(f"Validation Accuracy: {accuracy}")

# one training pass followed by one validation pass per epoch
for epoch_number in range(1, epochs + 1):
    print(f"Epoch {epoch_number}/{epochs}")
    train()
    validate()



Epoch 1/4


KeyboardInterrupt: 

worst video ive ever seen


                     comment  sentiment
1  worst video ive ever seen          0
