In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import nltk
from nltk.corpus import stopwords
from collections import Counter
import string

# Fetch the NLTK resources this notebook asks for, one call per resource id.
nltk.download(info_or_id='stopwords')  # backs the nltk.corpus.stopwords import above
nltk.download(info_or_id='punkt')

In [None]:
import pandas as pd
import json
import torch
from tabulate import tabulate
import numpy as np
import re
import nltk
from nltk.stem import PorterStemmer
from tqdm.notebook import tqdm

# Prefer the GPU whenever CUDA is usable; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

## Process Data

In [None]:
# Load the raw reviews. The file is read line by line, one JSON object per line.
file_path = '../data/IMDB_reviews.json'

# Decode explicitly as UTF-8 rather than relying on the platform's default
# encoding, and skip blank lines so a stray empty/trailing line cannot make
# json.loads raise.
with open(file_path, 'r', encoding='utf-8') as file:
    data = [json.loads(line) for line in file if line.strip()]

df = pd.DataFrame(data)

In [None]:
def clean_text(text):
    """Strip URLs and e-mail-like tokens from ``text`` and collapse whitespace.

    The substitutions are applied in order:

    1. anything starting at ``http`` up to the next whitespace is removed;
    2. anything starting at ``www`` up to the next whitespace is removed;
    3. any token containing ``@`` (plus at most one following whitespace
       character) is removed;
    4. every run of whitespace becomes a single space.

    The result is not stripped, so a single leading/trailing space may remain.

    Args:
        text: The raw review text (a ``str``).

    Returns:
        The cleaned string.
    """
    substitutions = (
        (r'http\S+', ''),      # URLs with a scheme
        (r'www\S+', ''),       # bare "www..." links
        (r'\S*@\S*\s?', ''),   # e-mail addresses / @-tokens
        (r'\s+', ' '),         # normalize whitespace
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned
# Register tqdm's pandas integration so `progress_apply` is available and
# shows a labelled progress bar.
tqdm.pandas(desc="Cleaning Text")

# Clean every review and store the result in a new column next to the raw text.
df['cleaned_review_text'] = df.review_text.progress_apply(clean_text)

In [None]:
# Destination of the cleaned dataset; adjust to your desired file path.
json_file_path = '../data/cleandata.json'

# Persist the DataFrame as one JSON record per line.
df.to_json(path_or_buf=json_file_path, lines=True, orient='records')


In [None]:
# Reload the preprocessed reviews: parse each line of the file as one JSON record.
file_path = '../data/cleandata.json'
with open(file_path, 'r') as file:
    data = [json.loads(record) for record in file]

df = pd.DataFrame(data)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
# Encode the `is_spoiler` column into the `label` column used for training.
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df.is_spoiler)

# Stratified 80/10/10 split: hold out 20% of the rows first, then cut that
# hold-out in half to obtain the validation and test sets.
train_df, temp_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df['label'],
    random_state=42,
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    stratify=temp_df['label'],
    random_state=42,
)

In [None]:
# Extract plain Python lists of texts and labels from each split.
train_txt, train_label = train_df['cleaned_review_text'].tolist(), train_df['label'].tolist()
val_txt, val_label = val_df['cleaned_review_text'].tolist(), val_df['label'].tolist()
test_txt, test_label = test_df['cleaned_review_text'].tolist(), test_df['label'].tolist()

## Set up training

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Checkpoint and tokenization settings. The trailing comments keep the
# alternative (Longformer) values that can be swapped in.
model_path = "google-bert/bert-base-uncased"  # "allenai/longformer-base-4096"
model_name = 'bert'  # short tag used in the cached dataset file names
context_len = 512   # 4096

tokenizer = AutoTokenizer.from_pretrained(model_path)

# Use the Auto class that is actually imported above: the previous
# `BertForSequenceClassification` name was never imported, so this cell raised
# NameError on a fresh kernel. The classification head gets one output per
# distinct training label, and the model is moved to the notebook-wide
# `device` instead of a hard-coded 'cuda:0'.
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=len(set(train_label)),
).to(device)

In [None]:
from data_utils import *

# Tokenize the input text for each split.
# NOTE(review): `down_sample` and `create_dataset` come from data_utils via the
# wildcard import. The original comment says preprocessed results are loaded if
# the given path exists — presumably create_dataset caches to that .pt file;
# confirm against data_utils.

# `down_sample` yields the indices of the training examples to keep.
new_train_id = down_sample(train_label)
print('Number of training samples', len(new_train_id))

train_data = create_dataset(
    [train_txt[i] for i in new_train_id],
    [train_label[i] for i in new_train_id],
    tokenizer,
    f'review_{model_name}_{context_len}_train.pt',
    max_len=context_len,
    num_cpus=8,
)
val_data = create_dataset(
    val_txt,
    val_label,
    tokenizer,
    f'review_{model_name}_{context_len}_val.pt',
    max_len=context_len,
    num_cpus=8,
)
test_data = create_dataset(
    test_txt,
    test_label,
    tokenizer,
    f'review_{model_name}_{context_len}_test.pt',
    max_len=context_len,
    num_cpus=8,
)

In [None]:
# Batch sizes: 16 for (shuffled) training, 32 for validation and test.
train_batch_size, eval_batch_size = 16, 32

train_loader = make_dataloader(train_data, train_batch_size, shuffle=True)
val_loader = make_dataloader(val_data, eval_batch_size)
test_loader = make_dataloader(test_data, eval_batch_size)

## Model Training

In [None]:
from sklearn.metrics import f1_score
from tqdm import tqdm
import numpy as np
import torch
import torch.nn.functional as F
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup, get_constant_schedule_with_warmup
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from collections import Counter, defaultdict

In [None]:
# Training schedule: one scheduler step per batch over all epochs.
max_epochs = 10
total_steps = len(train_loader) * max_epochs

# Optimize only the parameters that still require gradients.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = AdamW(trainable_params, lr=2e-5, eps=1e-8)

loss_fn = nn.CrossEntropyLoss()

# Warm up over the first 10% of the steps. The scheduler documents
# `num_warmup_steps` as an integer, so convert explicitly instead of passing
# the float produced by 0.1 * total_steps.
num_warmup_steps = int(0.1 * total_steps)
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=num_warmup_steps,
                                            num_training_steps=total_steps)

# Validation runs every `val_step` training steps within an epoch.
val_step = 5000

In [None]:
# Fine-tune the classifier.
#
# Each epoch iterates over train_loader; every `val_step` steps the model is
# scored on val_loader and the weights with the best micro-F1 so far are saved
# to 'best_val_model.pt'. A checkpoint is also written at the end of each epoch.
#
# NOTE(review): validation only triggers when (step + 1) is a multiple of
# val_step, so if an epoch has fewer than val_step batches no
# 'best_val_model.pt' is ever written — confirm against len(train_loader).

# Follow whatever device the model already lives on instead of hard-coding it.
train_device = next(model.parameters()).device

model.zero_grad()
best_score = 0
for e in range(max_epochs):
    print(f'Training epoch {e+1}')  # 1-based epoch number (was off by one)
    # from_pretrained returns the model in eval mode; switch to train mode so
    # dropout is active while fine-tuning.
    model.train()
    total_train_loss = 0
    for step, batch in enumerate(tqdm(train_loader)):
        input_ids = batch[0].to(train_device)
        input_mask = batch[1].to(train_device)
        labels = batch[2].to(train_device)

        logits = model(input_ids, attention_mask=input_mask).logits
        loss = loss_fn(logits, labels)
        total_train_loss += loss.item()

        loss.backward()
        # Clip the global gradient norm to 1.0 before the update.
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        model.zero_grad()

        if (step+1) % val_step == 0:
            # Periodic validation: disable dropout and gradient tracking.
            model.eval()
            class_pred = []
            val_labels = []  # separate name so the training `labels` is not clobbered
            with torch.no_grad():
                for val_batch in tqdm(val_loader):
                    # Trim each batch to its longest sequence.
                    # NOTE(review): assumes val_batch[1] is a 0/1 attention mask
                    # with padding on the right — confirm against make_dataloader.
                    batch_max_len = int(val_batch[1].sum(dim=1).max())
                    val_ids = val_batch[0][:, :batch_max_len].to(train_device)
                    val_mask = val_batch[1][:, :batch_max_len].to(train_device)
                    val_logits = model(val_ids, attention_mask=val_mask).logits
                    class_pred.extend(val_logits.argmax(dim=-1).cpu().tolist())
                    val_labels.extend(val_batch[2].tolist())

            micro, macro = acc(class_pred, val_labels)
            print(f'Micro F1: {micro}, Macro F1: {macro}')
            if micro > best_score:
                best_score = micro
                torch.save(model.state_dict(), 'best_val_model.pt')
            # Back to training mode for the remaining batches of the epoch.
            model.train()

    # Report the mean loss that was previously accumulated but never shown.
    print(f'Average training loss: {total_train_loss / max(1, len(train_loader))}')
    torch.save(model.state_dict(), f'epoch_{e}_model.pt')

# Leave the model in eval mode so later inference cells run without dropout.
model.eval()

## Load and Test Model

In [None]:
# Restore the weights that achieved the best validation micro-F1 during training.
# The checkpoint is deserialized onto the CPU first (avoids a GPU memory surge
# and works on CPU-only hosts); load_state_dict then copies the values into the
# model's existing parameters.
model.load_state_dict(torch.load('best_val_model.pt', map_location='cpu'))

In [None]:
# Score the restored model on the held-out test split.

# Use the model's own device rather than a hard-coded 'cuda:0'.
eval_device = next(model.parameters()).device
# Make inference independent of whatever mode earlier cells left the model in.
model.eval()

class_pred = []
labels = []
with torch.no_grad():
    for batch in tqdm(test_loader):
        # Trim each batch to its longest sequence.
        # NOTE(review): assumes batch[1] is a 0/1 attention mask with padding
        # on the right — confirm against make_dataloader.
        batch_max_len = int(batch[1].sum(dim=1).max())
        input_ids = batch[0][:, :batch_max_len].to(eval_device)
        input_mask = batch[1][:, :batch_max_len].to(eval_device)

        logits = model(input_ids, attention_mask=input_mask).logits
        preds = logits.argmax(dim=-1)
        class_pred.extend(preds.cpu().tolist())
        labels.extend(batch[2].tolist())

# `acc` returns the (micro, macro) F1 pair, as unpacked in the training loop.
print(acc(class_pred, labels))