In [1]:
#import libs
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import Dataset, DataLoader
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#create model with base model: bert-large-cased
model_name = 'bert-large-cased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=1)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
#dataset class, to store review texts and scores
class Review_Score_Dataset(Dataset):

    def __init__(self, texts, scores, tokenizer, max_length=512):
        super().__init__()
        self.tokenizer = tokenizer
        self.texts = texts
        self.scores = scores
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, index):
        text = self.texts[index]
        score = self.scores[index]
        inputs = self.tokenizer(text, max_length=self.max_length, padding='max_length', truncation=True, return_tensors='pt')
        input_ids = inputs['input_ids'].squeeze()
        attention_mask = inputs['attention_mask'].squeeze()
        return{
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'score': torch.tensor(score, dtype=torch.float)
        }

In [4]:
df = pd.read_csv('Dataset/All_critic.csv')

In [8]:
train_dataset = Review_Score_Dataset(
    texts=df['review'].tolist(),
    scores=df['score'].tolist(),
    tokenizer=tokenizer
)

valid_dataset = Review_Score_Dataset(
    texts=df['review'].tolist(),
    scores=df['score'].tolist(),
    tokenizer=tokenizer
)

In [9]:
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=16, shuffle=True)

In [10]:
#training parameters
optimizer = AdamW(model.parameters(), lr=0.00001)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
loss_fn = torch.nn.MSELoss()



In [11]:
number_epochs = 20
train_losses = []
val_losses = []
counter = 0
for epoch in range(number_epochs):

    model.train()
    total_train_loss = 0
    max_step = 100
    for batch in train_dataloader:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_masks = batch['attention_mask'].to(device)
        scores = batch['score'].to(device)

        outputs = model(input_ids, attention_mask=attention_masks)
        loss = loss_fn(outputs.logits.squeeze(), scores)

        loss.backward()
        optimizer.step()
        total_train_loss += loss.item()
        max_step -= 1
        if max_step < 0:
            break

    avg_train_loss = total_train_loss / 100
    train_losses.append(avg_train_loss)

    model.eval()
    total_val_loss = 0

    with torch.no_grad():
        max_step = 50
        for batch in valid_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_masks = batch['attention_mask'].to(device)
            scores = batch['score'].to(device)

            outputs = model(input_ids, attention_mask=attention_masks)
            loss = loss_fn(outputs.logits.squeeze(), scores)

            total_val_loss += loss.item()
            max_step -= 1
            if max_step < 0:
                break
    avg_val_loss = total_val_loss / 50
    val_losses.append(avg_val_loss)
    print(f'Epoch {epoch+1}/{number_epochs} | Train Loss: {avg_train_loss:.3f} | Val Loss: {avg_val_loss:.3f}')

Epoch 1/20 | Train Loss: 5056.258 | Val Loss: 4435.960
Epoch 2/20 | Train Loss: 4000.771 | Val Loss: 3930.302
Epoch 3/20 | Train Loss: 3770.712 | Val Loss: 3600.186


KeyboardInterrupt: 

In [42]:
#save model
model.save_pretrained('FineTunedModels/bert/')
tokenizer.save_pretrained('FineTunedModels/bert/tokenlizer')

('FineTunedModels/bert/tokenlizer\\tokenizer_config.json',
 'FineTunedModels/bert/tokenlizer\\special_tokens_map.json',
 'FineTunedModels/bert/tokenlizer\\vocab.txt',
 'FineTunedModels/bert/tokenlizer\\added_tokens.json')

In [None]:
#reload model
from transformers import AutoModel, AutoTokenizer
model = AutoModel.from_pretrained('FineTunedModels/bert/')
tokenizer = AutoTokenizer.from_pretrained('FineTunedModels/bert/tokenlizer')