In [None]:
# import libs
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW

from transformers import BertModel, BertTokenizer

import pandas as pd

In [None]:
# Load the pretrained BERT encoder and its tokenizer from local directories
# (both paths are relative to the notebook's working directory).
bert_model = BertModel.from_pretrained('OriginalModels/bert_large_cased')
bert_tokenlizer = BertTokenizer.from_pretrained('OriginalModels/bert_large_cased_tokenlizer')

In [None]:
# Custom model: a BERT encoder followed by an MLP regression head
class BERTandRegressionModel(nn.Module):
    """BERT encoder with a four-layer MLP head that predicts one scalar per input.

    The head consumes the encoder's ``pooler_output`` and narrows it through
    ``hidden_size -> 768 -> 512 -> 128 -> 1``; each hidden layer is
    Linear -> BatchNorm1d -> LeakyReLU.

    Args:
        base_model: Encoder module. It is called as
            ``base_model(input_ids, attention_mask=..., token_type_ids=...)`` and
            must return an object exposing ``pooler_output``.
        hidden_size: Width of ``pooler_output``. When ``None`` (default) it is
            read from ``base_model.config.hidden_size``; if the encoder has no
            such attribute, 1024 (the bert_large_cased pooler width) is used.
    """

    def __init__(self, base_model, hidden_size=None):
        super().__init__()
        self.bert = base_model

        if hidden_size is None:
            # Infer the pooler width from the encoder so the head is not tied to
            # bert-large; 1024 remains the fallback for encoders without a config.
            encoder_config = getattr(base_model, 'config', None)
            hidden_size = getattr(encoder_config, 'hidden_size', 1024)

        # Layer attribute names (fc*/bn*/relu*) are kept stable because other cells
        # and previously pickled checkpoints refer to them.
        self.fc1 = nn.Linear(hidden_size, 768)
        self.bn1 = nn.BatchNorm1d(768)
        self.relu1 = nn.LeakyReLU()

        self.fc2 = nn.Linear(768, 512)
        self.bn2 = nn.BatchNorm1d(512)
        self.relu2 = nn.LeakyReLU()

        self.fc3 = nn.Linear(512, 128)
        self.bn3 = nn.BatchNorm1d(128)
        self.relu3 = nn.LeakyReLU()

        self.fc4 = nn.Linear(128, 1)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        """Run the encoder and the regression head.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Optional mask forwarded to the encoder.
            token_type_ids: Optional segment ids forwarded to the encoder.

        Returns:
            Output of the final ``Linear(128, 1)`` layer, i.e. one value per row
            of ``pooler_output``.
        """
        bert_output = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        y = bert_output.pooler_output

        y = self.relu1(self.bn1(self.fc1(y)))
        y = self.relu2(self.bn2(self.fc2(y)))
        y = self.relu3(self.bn3(self.fc3(y)))

        return self.fc4(y)

In [None]:
# Dataset that tokenizes review texts on the fly and pairs them with scores
class Review_Score_Dataset(Dataset):
    """Map-style dataset yielding tokenized reviews together with their scores.

    Args:
        texts: Indexable collection of review strings.
        scores: Indexable collection of numeric scores, aligned with ``texts``.
        tokenlizer: Tokenizer callable. It is invoked with ``max_length``,
            ``padding='max_length'``, ``truncation=True`` and
            ``return_tensors='pt'`` and must return a mapping containing
            ``input_ids``, ``attention_mask`` and ``token_type_ids``.
        max_length: Length every sample is padded/truncated to (default 512).
    """

    def __init__(self, texts, scores, tokenlizer, max_length=512):
        super().__init__()
        self.tokenlizer = tokenlizer
        self.texts = texts
        self.scores = scores
        self.max_length = max_length

    def __len__(self):
        """Number of samples, taken from the length of ``texts``."""
        return len(self.texts)

    def __getitem__(self, index):
        """Tokenize sample ``index`` and return it as a dict of tensors.

        Returns:
            Dict with ``input_ids``, ``attention_mask``, ``token_type_ids``
            (each with the tokenizer's leading batch axis removed) and
            ``score`` as a float tensor.
        """
        text = self.texts[index]
        score = self.scores[index]
        inputs = self.tokenlizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        # squeeze(0) removes only the batch axis added by return_tensors='pt'.
        # A bare squeeze() would also collapse the sequence axis when it has
        # length 1 (e.g. max_length=1), yielding 0-d tensors.
        return {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            'token_type_ids': inputs['token_type_ids'].squeeze(0),
            'score': torch.tensor(score, dtype=torch.float)
        }

In [None]:
# Build the training set from the critic and user review files.
dfc = pd.read_csv('Dataset/All_critic.csv')
# User scores are multiplied by 10 before merging
# (presumably to put them on the critic scale — confirm against the data).
dfu = pd.read_csv('Dataset/All_user.csv').assign(score=lambda frame: frame['score'] * 10)
df = pd.concat([dfc, dfu], ignore_index=True)
train_dataset = Review_Score_Dataset(
    df['review'].tolist(),
    df['score'].tolist(),
    tokenlizer=bert_tokenlizer,
)

In [None]:
# Wrap the pretrained encoder with the regression head. The encoder is stored by
# reference (no copy), so fine-tuning `bertregmodel` also updates `bert_model`.
bertregmodel = BERTandRegressionModel(bert_model)

In [None]:
# Move the model to its device first: the torch.optim notes recommend that
# parameters already live in their final location when the optimizer is built.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
bertregmodel = bertregmodel.to(device)

# Two parameter groups so the pretrained encoder and the freshly initialised head
# can be trained with different learning rates.
bert_params = list(bertregmodel.bert.parameters())
# Everything outside the `bert.` submodule belongs to the regression head.
# Selecting by name also picks up the BatchNorm1d affine weights/biases (bn1..bn3),
# which a hand-written list of only fc1..fc4 leaves out of the optimizer entirely.
regression_params = [
    param
    for name, param in bertregmodel.named_parameters()
    if not name.startswith('bert.')
]
param_groups = [
    {'params': bert_params, 'lr': 0.0001},  # Smaller learning rate for BERT
    {'params': regression_params, 'lr': 0.0005}  # Higher learning rate for regression layers
]

optimizer = AdamW(param_groups)

# Mean squared error between predicted and target scores.
loss_fn = torch.nn.MSELoss()

In [None]:
# Batches of 16 samples, reshuffled on every pass over the dataset.
# NOTE(review): drop_last keeps its default (False), so a trailing batch of size 1 is
# possible; BatchNorm1d in training mode rejects such a batch — confirm whether
# drop_last=True is wanted.
dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)

In [None]:
# Training configuration.
epoch_num = 20
max_steps_per_epoch = 50  # each epoch trains on at most this many shuffled batches
train_losses = []  # mean training loss per epoch
valid_losses = []  # stays empty: no validation pass is run (see TODO below)

for epoch in range(epoch_num):

    bertregmodel.train()
    total_train_loss = 0.0
    steps_done = 0

    for batch in dataloader:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_masks = batch['attention_mask'].to(device)
        scores = batch['score'].to(device)

        outputs = bertregmodel(input_ids, attention_mask=attention_masks)
        # Drop only the trailing feature axis of the model output so it lines up
        # with the per-sample targets, whatever the batch size.
        loss = loss_fn(outputs.squeeze(-1), scores)

        loss.backward()
        optimizer.step()
        total_train_loss += loss.item()

        # The cap is checked after the step so no extra batch is fetched
        # (and tokenized) once the limit is reached.
        steps_done += 1
        if steps_done >= max_steps_per_epoch:
            break

    # Average over the steps actually taken: the dataloader may yield fewer
    # batches than the cap. max(..., 1) guards against an empty dataloader.
    avg_train_loss = total_train_loss / max(steps_done, 1)
    train_losses.append(avg_train_loss)

    # TODO: add a validation pass over a held-out DataLoader and record it in
    # `valid_losses`. The previously disabled validation code iterated the training
    # dataloader, so it would not have measured generalisation.
    print(f'Epoch {epoch+1}/{epoch_num} | Train Loss: {avg_train_loss:.3f}')

In [None]:
# Persist the fine-tuned model by passing the whole module object to torch.save
# (not just its state_dict).
# NOTE(review): loading such a file presumably requires the BERTandRegressionModel
# class to be importable at load time — confirm, or consider saving state_dict() instead.
torch.save(bertregmodel, 'FineTunedModels/bertv1')