In [None]:
from process_tweets import get_data, give_emoji_free_text, CustomDataset

import pandas as pd
import copy
import matplotlib.pyplot as plt
import torch
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

In [None]:
train_file = "../data/train_en.tsv"
test_file = "../data/dev_en.tsv"


def _read_tweets(tsv_path):
    """Read a tab-separated tweet file and add a 'clean text' column.

    The new column is the result of applying ``give_emoji_free_text`` to
    every value of the file's 'text' column.
    """
    frame = pd.read_csv(tsv_path, sep="\t", skiprows=0, encoding="utf-8")
    frame["clean text"] = frame["text"].apply(give_emoji_free_text)
    return frame


train_df = _read_tweets(train_file)
test_df = _read_tweets(test_file)

# Only the fourth of the six values returned by get_data is used in this notebook.
_, _, _, longest_sent, _, _ = get_data(train_file, test_file)


### Additional pip installs I had to run

```
pip install emoji
pip install nltk
pip install transformers[torch]
pip install accelerate -U
```

In [None]:
# Tokenizer for the BERT checkpoint used in this notebook

from transformers import AutoTokenizer

TOKENIZER_CHECKPOINT = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

In [None]:
# Wrap each dataframe in a CustomDataset; both splits share the same
# tokenizer and the same longest_sent value.
_shared_dataset_kwargs = dict(longest_sent=longest_sent, tokenizer=tokenizer)
hate_train_dataset = CustomDataset(data=train_df, **_shared_dataset_kwargs)
hate_test_dataset = CustomDataset(data=test_df, **_shared_dataset_kwargs)

In [None]:
# Convert to DataLoader
# (DataLoader and torch are already imported in the notebook's first cell.)
BATCH_SIZE = 10
SHUFFLE_SEED = 42

# Shuffle the training set every epoch so mini-batches are not tied to the
# order of rows in the TSV file. The seeded generator keeps the shuffle
# reproducible when the notebook is re-run from a fresh kernel.
hate_train_dataloader = DataLoader(
    hate_train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    generator=torch.Generator().manual_seed(SHUFFLE_SEED),
)

# Evaluation data is iterated in its original order.
hate_test_dataloader = DataLoader(hate_test_dataset, batch_size=BATCH_SIZE)

In [None]:
# Load the pretrained checkpoint with a sequence-classification head
from transformers import AutoModelForSequenceClassification

MODEL_CHECKPOINT = "bert-base-cased"
NUM_LABELS = 2

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=NUM_LABELS,
)

In [None]:
# AdamW over all model parameters
from torch.optim import AdamW

LEARNING_RATE = 5e-5
optimizer = AdamW(params=model.parameters(), lr=LEARNING_RATE)

In [None]:
# Linear learning-rate schedule with no warmup, spanning every training step
from transformers import get_scheduler

num_epochs = 10
steps_per_epoch = len(hate_train_dataloader)
num_training_steps = num_epochs * steps_per_epoch

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

In [None]:
# Fine-tuning loop: one progress-bar tick per optimizer step across all epochs.
progress_bar = tqdm(range(num_training_steps))

# NOTE(review): this list is not appended to in the visible part of the cell;
# presumably it is meant to collect per-epoch training accuracy — confirm.
bert_train_acc_epoch = []
model.train()  # switch the model to training mode before iterating
for epoch in range(num_epochs):
    for batch in hate_train_dataloader:
        # Start every step from cleared gradients.

        model.zero_grad()
        # Move every value in the batch dict onto the device the model was moved to.
        batch = {k: v.to(device) for k, v in batch.items()}
        # The batch dict is unpacked directly into the model call.
        # NOTE(review): assumes the batch carries labels so that outputs.loss
        # is populated — confirm against CustomDataset.
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        # Apply the update, advance the LR schedule by one step, then reset
        # gradients for the next batch.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)