In [1]:
# Load the IPython extension named "blackcellmagic" into this kernel session.
%load_ext blackcellmagic

In [2]:
import numpy as np
import os
import pandas as pd
import torch

from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from transformers import (
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments,
)




In [3]:
# Read the CSV at ../data/train_raw.csv (path relative to the notebook's working
# directory) into the DataFrame `raw`.
raw = pd.read_csv("../data/train_raw.csv")

In [4]:
# Display the (rows, columns) shape of the loaded frame.
raw.shape

(593791, 5)

In [5]:
# Preview the first rows of the loaded frame.
raw.head()

Unnamed: 0,tweetid,user_display_name,tweet_text,clean_text,troll_or_not
0,1127654960915734528,James albert,RT sports9511: How to watch online : San Diego...,RT sports9511 How to watch online San Diego Pa...,1
1,693094686974570496,Lily Mann,How Much Ongoing Support -- and What Kinds -- ...,How Much Ongoing Support and What Kinds Should...,1
2,964240875722301440,曲剑明,＠null It is 21:50 CET now,null It is CET now,1
3,869125810334179328,春天里,": #rtl ehm,sorry. I'll show up this night, if ...",#rtl ehmsorry Ill show up this night if you do...,1
4,485477116125773824,7d87a814b4c26497e4e7a13047bc7f52452cb2566a939f...,dine getting on his girlfriend page its feels ...,dine getting on his girlfriend page its feels ...,1


In [6]:
# Model inputs come from the "clean_text" column, targets from "troll_or_not".
# read_csv turns empty CSV fields into float NaN, which the tokenizer cannot
# encode, so missing texts become "" and every entry is forced to str.
X = raw["clean_text"].fillna("").astype(str).tolist()
# .tolist() yields plain Python scalars instead of NumPy scalar objects.
y = raw["troll_or_not"].tolist()


# Hold out 20% of the rows. Stratifying on the labels keeps the class ratio the
# same in both splits, and the fixed random_state makes the split repeatable.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    X, y, random_state=42, test_size=0.2, stratify=y
)


In [None]:
# Fetch the fast tokenizer for the "distilbert-base-uncased" checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

# Encode the training texts first, then the test texts, with the same settings
# (truncation and padding both enabled).
train_encodings, test_encodings = (
    tokenizer(texts, truncation=True, padding=True)
    for texts in (train_texts, test_texts)
)

In [8]:
class tweetsdataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    ``encodings`` is a mapping from field name to a per-example sequence, and
    ``labels`` is an indexable sequence holding one label per example.
    """

    def __init__(self, encodings, labels):
        """Store the encodings mapping and the label sequence as given."""
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        Each encoding field is converted to a tensor under its own key, and
        the example's label is added last under the key ``"labels"``.
        """
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample


# Wrap each split's encodings and labels in a dataset object (train first,
# then test).
train_dataset, test_dataset = (
    tweetsdataset(train_encodings, train_labels),
    tweetsdataset(test_encodings, test_labels),
)

In [9]:
def compute_metrics(pred):
    """Compute binary classification metrics for one evaluation pass.

    The metrics are computed with NumPy so the function depends only on names
    this notebook imports; the previous body called scikit-learn helpers that
    were never imported and raised NameError on first use.

    Parameters
    ----------
    pred : object
        Must expose ``label_ids`` (the true class ids) and ``predictions``
        (per-class scores whose last axis indexes the classes).

    Returns
    -------
    dict
        ``accuracy``, ``f1``, ``precision`` and ``recall`` as Python floats.
        Class ``1`` is treated as the positive class. A ratio whose
        denominator is zero (for example, no positive predictions) is
        reported as ``0.0``.
    """
    labels = np.asarray(pred.label_ids)
    # The predicted class is the index of the largest score on the last axis.
    preds = np.asarray(pred.predictions).argmax(-1)

    predicted_positive = preds == 1
    actual_positive = labels == 1
    true_positives = int(np.sum(predicted_positive & actual_positive))
    n_predicted_positive = int(np.sum(predicted_positive))
    n_actual_positive = int(np.sum(actual_positive))

    precision = (
        true_positives / n_predicted_positive if n_predicted_positive else 0.0
    )
    recall = true_positives / n_actual_positive if n_actual_positive else 0.0
    # F1 = 2PR / (P + R), which simplifies to 2*TP / (predicted + actual positives).
    f1_denominator = n_predicted_positive + n_actual_positive
    f1 = 2 * true_positives / f1_denominator if f1_denominator else 0.0
    # Guard the empty case so the mean of an empty array never yields NaN.
    acc = float(np.mean(preds == labels)) if labels.size else 0.0

    return {
        'accuracy': acc,
        'f1': float(f1),
        'precision': float(precision),
        'recall': float(recall)
    }

In [None]:
%%time

training_args = TrainingArguments(
    output_dir="../results",  # output directory
    overwrite_output_dir=True,
    num_train_epochs=3,  # total number of training epochs
    per_device_train_batch_size=8,  # batch size per device during training
    per_device_eval_batch_size=8,  # batch size for evaluation
    warmup_steps=1000,  # number of warmup steps for learning rate scheduler
    weight_decay=0.01,  # strength of weight decay
    logging_dir="../logs",  # directory for storing logs
    logging_steps=5,
    save_steps=10000,
    learning_rate=5e-5,
    do_train=True,
    do_eval=True,
    evaluate_during_training=True,
    # eval_steps=1 ran a complete pass over eval_dataset after every single
    # optimizer step, so evaluation dwarfed training. Evaluate periodically
    # instead; tune this to the total number of optimizer steps on your hardware.
    eval_steps=2000,
    save_total_limit=1,
    # 8 accumulated batches of 8 examples give 64 examples per optimizer step
    # on each device while only holding 8 in memory at a time.
    gradient_accumulation_steps=8,
)

# Load the pretrained "distilbert-base-uncased" weights with a
# sequence-classification head, using the library's default label count.
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

trainer = Trainer(
    model=model,  # the instantiated Transformers model to be trained
    args=training_args,  # training arguments, defined above
    compute_metrics=compute_metrics,  # metrics reported at each evaluation
    train_dataset=train_dataset,  # training dataset
    eval_dataset=test_dataset,  # held-out split, evaluated during training
)

trainer.train()


In [11]:
# Run an evaluation pass with the trainer (configured with eval_dataset=test_dataset
# and compute_metrics) and display the returned result.
trainer.evaluate()

HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=1856.0, style=ProgressStyle(description_…




KeyboardInterrupt: 

In [None]:
%load_ext tensorboard
%tensorboard --logdir logs

In [None]:
# Directory (relative to the notebook's working directory) that receives the
# fine-tuned artifacts.
ft_model = "finetuned/state_trolls"
# Save the trainer's model and the tokenizer into the same directory, so both
# can later be loaded from one path.
trainer.save_model(ft_model)
tokenizer.save_pretrained(ft_model)
