In [None]:
from accelerate import Accelerator, DistributedType

from transformers import BertForSequenceClassification, BertTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup
import datasets
import torch
import torch.nn
from torch.utils.data import Dataset, DataLoader, SequentialSampler, RandomSampler
from tqdm.notebook import tqdm

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split

In [None]:
# Training hyperparameters shared by the cells below.
epochs = 5         # number of full passes over the training dataloader
lr = 2e-5          # learning rate handed to the optimizer
batch_size = 32    # examples per batch for both dataloaders

# Load data

In [None]:
# Tokenizer for the 'bert-base-uncased' checkpoint; the model cell below loads
# the same checkpoint name, so vocabulary and weights match.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# Read the corpus, discard incomplete rows, and binarize the label:
# a raw target of 4 becomes class 1, every other value becomes class 0.
df = (
    pd.read_csv('data/filtered.csv')
    .dropna()
    .assign(target=lambda frame: (frame['target'] == 4).astype('int64'))
)

# First split: 80% for training, 20% held out.
train_df, holdout_df = train_test_split(df, test_size=0.2, random_state=2020)
# Second split: the held-out rows are divided 80/20 into evaluation and test.
eval_df, test_df = train_test_split(holdout_df, test_size=0.2, random_state=2020)

In [None]:
class SentimentDataset(Dataset):
    """Map-style dataset that tokenizes one DataFrame row per item.

    Args:
        tokenizer: Callable tokenizer (Hugging Face style) that accepts the
            keyword arguments used in ``__getitem__`` and returns a mapping
            with 'input_ids', 'attention_mask' and 'token_type_ids' tensors,
            each carrying a leading batch dimension of size 1.
        df: DataFrame with a 'text' column and a 'target' column holding the
            integer class label.
        max_length: Every example is truncated or padded to exactly this many
            tokens.
    """

    def __init__(self, tokenizer, df, max_length=100):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows (examples) in the backing DataFrame."""
        return len(self.df)

    def __getitem__(self, index):
        """Tokenize the row at positional ``index`` and return model inputs.

        Returns:
            dict with 1-D tensors 'input_ids', 'attn_mask' and
            'token_type_ids' of length ``max_length``, plus a scalar
            ``torch.long`` tensor 'labels'.
        """
        row = self.df.iloc[index]

        # Calling the tokenizer directly replaces the deprecated encode_plus.
        # token_type_ids is requested explicitly because the returned dict
        # below always reads that key.
        encoding = self.tokenizer(
            row['text'],
            add_special_tokens=True,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_token_type_ids=True,
            return_tensors='pt',
        )

        # squeeze(0) removes only the batch dimension added by
        # return_tensors='pt'. A bare squeeze() would also drop the sequence
        # dimension when max_length == 1.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attn_mask': encoding['attention_mask'].squeeze(0),
            'token_type_ids': encoding['token_type_ids'].squeeze(0),
            'labels': torch.tensor(row['target'], dtype=torch.long),
        }

In [None]:
# Training batches are drawn in a fresh random order on every pass.
train_dataset = SentimentDataset(tokenizer, train_df)
train_sample = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sample, batch_size=batch_size)

# Evaluation walks the dataset in order, so batch i always covers the same
# eval_df rows: predictions can be aligned with the frame and repeated runs
# see identical batches.
eval_dataset = SentimentDataset(tokenizer, eval_df)
eval_sampler = SequentialSampler(eval_dataset)
eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=batch_size)

# Build model

In [None]:
# Use the GPU when one is visible; otherwise fall back to the CPU so the
# notebook still runs (slowly) instead of raising on `.to('cuda')`.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Pretrained BERT encoder with a 2-way classification head (num_labels=2).
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model = model.to(device)

In [None]:
# The schedule spans every batch of every epoch.
total_training_steps = epochs * len(train_dataloader)

# NOTE(review): transformers' AdamW is reported as deprecated upstream in
# favour of torch.optim.AdamW -- confirm against the installed version.
optimizer = AdamW(model.parameters(), lr=lr)

# Linear warmup over the first 100 steps, then the schedule continues until
# total_training_steps.
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=100,
    num_training_steps=total_training_steps,
)

# Build training loop

In [None]:
# Send batches to whichever device already holds the model's weights, instead
# of assuming 'cuda', so the loop works wherever the model was placed.
device = next(model.parameters()).device

for epoch in tqdm(range(epochs)):
    model.train()

    for batch in tqdm(train_dataloader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attn_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        labels = batch['labels'].to(device)

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        # Passing labels makes the model return the loss as its first output.
        outputs = model(
            input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask,
            labels=labels,
        )
        loss = outputs[0]
        loss.backward()

        # Cap the global gradient norm at 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        # The learning-rate schedule advances once per batch.
        lr_scheduler.step()