In [None]:
# Import dependencies
import pandas as pd
import numpy as np
import re
import random
from pathlib import Path, PurePath
from typing import List, Collection

import git
from sklearn.model_selection import train_test_split

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification, RobertaConfig
from transformers import Trainer, TrainingArguments
from transformers import EarlyStoppingCallback
from datasets import load_metric

In [None]:
# Report CUDA availability and the properties of every visible GPU, pick the
# default device (the highest-indexed GPU, otherwise the CPU), then release
# any cached CUDA memory.
cuda_ok = torch.cuda.is_available()
print(cuda_ok)
if cuda_ok:
    gpu_count = torch.cuda.device_count()
    for gpu_idx in range(gpu_count):
        print(torch.cuda.get_device_properties(gpu_idx))
    device = torch.device(f'cuda:{gpu_count - 1}')
else:
    device = torch.device('cpu')
torch.cuda.empty_cache()

In [None]:
# Resolve the project/data locations and the mixed-precision flag for the
# runtime the notebook is executing in (Google Colab vs. a local git checkout).
running_in_colab = 'google.colab' in str(get_ipython())
if running_in_colab:
    # Colab: data lives on the mounted Google Drive; fp16 is switched on.
    from google.colab import drive
    use_fp16 = True
    drive.mount('/content/drive')
    PROJECT_ROOT = PurePath('')
    DATA_ROOT = Path('/content/drive/MyDrive/twitter-datasets')
else:
    # Local run: the project root is the top level of the enclosing git repo.
    use_fp16 = False
    repo = git.Repo(PurePath(), search_parent_directories=True)
    PROJECT_ROOT = repo.git.rev_parse("--show-toplevel")
    DATA_ROOT = PurePath(PROJECT_ROOT, "data")


def _read_lines(file_name):
    """Return every line (line endings kept) of the UTF-8 file `file_name` under DATA_ROOT."""
    with open(PurePath(DATA_ROOT, file_name), 'r', encoding='utf-8') as fp:
        return fp.readlines()


# Small ("sub") and full training sets for each class, plus the test set.
train_pos_sub = _read_lines('train_pos.txt')
train_neg_sub = _read_lines('train_neg.txt')
train_pos_full = _read_lines('train_pos_full.txt')
train_neg_full = _read_lines('train_neg_full.txt')
test_full = _read_lines('test_data.txt')

def load_dataset(ratio=0.01):
    """Select the positive/negative training tweets to work with.

    Args:
        ratio: Either the string ``'full'`` (complete training files), the
            string ``'sub'`` (the small training files), or a number in
            ``(0, 1]`` giving the fraction of the full positive set to draw
            at random from each class.

    Returns:
        A ``(positive, negative)`` pair of lists of raw tweet lines. For a
        numeric ratio both lists have the same length.

    Raises:
        AttributeError: If ``ratio`` is not one of the accepted values
            (wrong type, unknown string, NaN, or a number outside ``(0, 1]``).
    """
    error_msg = 'The input should be \'full\', \'sub\', or a (float) number between 0 and 1'

    if isinstance(ratio, str):
        if ratio == 'full':
            return train_pos_full, train_neg_full
        if ratio == 'sub':
            return train_pos_sub, train_neg_sub
        raise AttributeError(error_msg)

    # bool is an int subclass but never a meaningful ratio; reject it together
    # with every other non-numeric type instead of relying on `assert`, which
    # disappears under `python -O`.
    if isinstance(ratio, bool) or not isinstance(ratio, (int, float)):
        raise AttributeError(error_msg)

    ratio = float(ratio)
    # Written as a chained comparison so that NaN is rejected as well.
    if not (0 < ratio <= 1):
        raise AttributeError(error_msg)

    # The sample size is driven by the positive set; cap it at the size of the
    # negative set so random.sample cannot fail when the classes are unequal.
    num_samples = min(int(ratio * len(train_pos_full)), len(train_neg_full))
    return random.sample(train_pos_full, num_samples), random.sample(train_neg_full, num_samples)

# Seed Python's RNG first so the random 5% subsample drawn by load_dataset
# (via random.sample) is the same after every kernel restart / re-run.
random.seed(42)
data_pos, data_neg = load_dataset(0.05)
print(len(data_pos), len(data_neg), len(test_full))

In [None]:
# The pre-processing function besides tokenizers
def cleaning(text_list: list) -> List:
    """Strip markup-like noise from each text and drop texts that end up empty.

    For every string: leading whitespace is removed, then anything enclosed in
    angle brackets, every line break and every single or double quote
    character is deleted. Strings that are empty afterwards are discarded; the
    relative order of the remaining ones is kept.

    Args:
        text_list: The raw strings to clean.

    Returns:
        A new list with the cleaned, non-empty strings.
    """
    noise_pattern = r'(<.*?>)|(\r\n|\r|\n)|(\'|\")'
    cleaned = []
    for raw in text_list:
        stripped = re.sub(noise_pattern, '', raw.lstrip())
        if stripped != "":
            cleaned.append(stripped)
    return cleaned

# Clean all three sets and de-duplicate the training tweets.
# dict.fromkeys keeps the first occurrence of each tweet in its original
# position, so the resulting order is deterministic; list(set(...)) would
# order the strings by their randomised hashes and therefore differently on
# every kernel restart.
data_pos = list(dict.fromkeys(cleaning(data_pos)))
data_neg = list(dict.fromkeys(cleaning(data_neg)))
data_test = cleaning(test_full)
print(len(data_pos), len(data_neg), len(data_test))

In [None]:
# The maximum, minimum number of words in tweets
# ... and empty entries
def get_min_max(text_list: list) -> (int, int, List):
    """Measure texts in whitespace-separated words and locate the empty ones.

    Args:
        text_list: The strings to measure.

    Returns:
        A ``(min_len, max_len, zero_len_idx)`` tuple: the smallest and largest
        word count found, and the 0-based positions of the texts that contain
        no words at all (empty or whitespace-only). For an empty input the
        result is ``(0, 0, [])``.
    """
    word_counts = [len(t.split()) for t in text_list]
    if not word_counts:
        return 0, 0, []
    zero_len_idx = [idx for idx, count in enumerate(word_counts) if count == 0]
    # Use the real extrema instead of a fixed starting value, so the minimum is
    # also right when every text is longer than any hard-coded sentinel.
    return min(word_counts), max(word_counts), zero_len_idx

# Drop everything up to the first comma of each test line and keep the rest as the tweet text.
test = [s.split(',', 1)[-1] for s in data_test]
min_test, max_test, zero_len_idx_test = get_min_max(test)
print(min_test, max_test, zero_len_idx_test, len(test))
# Keep exactly the tweets that are NOT listed in zero_len_idx_test. Filtering on
# `!= ""` would keep whitespace-only tweets that get_min_max already counted as
# empty, leaving more texts to predict than there are non-empty ids.
_zero_len_positions = set(zero_len_idx_test)
test_text = [t for pos, t in enumerate(test) if pos not in _zero_len_positions]
print(len(test_text))

In [None]:
# The ids of the items in test_text.
# Ids are taken to be the 1-based line positions in data_test, while
# zero_len_idx_test holds 0-based positions, so compare in position space and
# convert afterwards. A list comprehension also keeps the ids in ascending
# order, which iterating over a set does not guarantee.
_empty_positions = set(zero_len_idx_test)
test_id = [pos + 1 for pos in range(len(data_test)) if pos not in _empty_positions]

In [None]:
# Config pre-trained tokenizers and models
# model_type: key used to look up the model/tokenizer/config classes in MODEL_CLASSES.
model_type = 'roberta'
# pretrained_model_name: checkpoint identifier passed to from_pretrained(); also
# used as a directory name for the local model snapshot and the trainer output.
pretrained_model_name = 'roberta-base'
# num_classes: number of output labels (positive / negative).
num_classes = 2

In [None]:
# Registry mapping a model family name to its (model, tokenizer, config) classes.
# Only RoBERTa is registered here.
MODEL_CLASSES = {
    'roberta': (RobertaForSequenceClassification, RobertaTokenizer, RobertaConfig),
}

# Pick the class triple for the configured model_type.
model_class, tokenizer_class, config_class = MODEL_CLASSES[model_type]

In [None]:
# Local directory that holds a snapshot of the tokenizer, config and model.
MODEL_ROOT = PurePath(PROJECT_ROOT, ".pretrained_models", pretrained_model_name)
# Load the tokenizer by checkpoint name and store a copy under MODEL_ROOT.
tokenizer = tokenizer_class.from_pretrained(pretrained_model_name)
tokenizer.save_pretrained(MODEL_ROOT)

# Load the config by checkpoint name, adapt it to this task, then store it.
config = config_class.from_pretrained(pretrained_model_name)
config.num_labels = num_classes
# NOTE(review): use_fp16 is an fp16 flag but is written to the config's
# use_bfloat16 field; fp16 and bfloat16 are different formats - confirm intent.
config.use_bfloat16 = use_fp16
config.problem_type = "single_label_classification"
config.save_pretrained(MODEL_ROOT)

# Load the model with the adapted config and store it next to the tokenizer.
model = model_class.from_pretrained(pretrained_model_name, config=config)
model.save_pretrained(MODEL_ROOT)

In [None]:
# Shuffle the dataset and split it to train- and validation- set
# A fixed random_state makes the shuffle/split identical on every run, so
# validation scores from different runs are comparable.
SPLIT_SEED = 42
all_texts = data_pos + data_neg
all_labels = [1]*len(data_pos) + [0]*len(data_neg)  # 1 = positive, 0 = negative
train_texts, val_texts, train_labels, val_labels = train_test_split(
    all_texts, all_labels, test_size=.2, random_state=SPLIT_SEED
)
len(train_texts), len(val_texts)

In [None]:
class TweetDataset(Dataset):
    """Torch dataset of tokenized texts paired with their labels.

    All texts are tokenized once, at construction time, and are truncated /
    padded to exactly ``max_length``.

    Args:
        text_list: The texts to encode.
        labels_list: One label per text, in the same order.
        max_length: Length every encoded text is truncated/padded to.
        tokenizer: Callable used to encode the texts; defaults to the
            module-level ``tokenizer`` that exists when the class is defined.
    """

    def __init__(self, text_list: list, labels_list: list, max_length: int, tokenizer=tokenizer):
        self.labels = labels_list
        self.encodings = tokenizer(
            text_list,
            max_length=max_length,
            padding='max_length',
            truncation=True,
        )

    def __len__(self):
        # One row of input ids per encoded text.
        return len(self.encodings.input_ids)

    def __getitem__(self, idx):
        """Return the encoded fields of sample ``idx`` plus its label, all as tensors."""
        item = {}
        for key, val in self.encodings.items():
            item[key] = torch.tensor(val[idx])
        item['labels'] = torch.tensor(self.labels[idx])
        return item

In [None]:
# Encode datasets: tokenize the training and validation texts to a common length.
# NOTE(review): max_test is the largest whitespace-separated WORD count among the
# test tweets (from get_min_max), while max_length presumably counts tokenizer
# tokens - confirm that tweets are not being truncated by this limit.
train_dataset = TweetDataset(train_texts, train_labels, max_length=max_test)
val_dataset = TweetDataset(val_texts, val_labels, max_length=max_test)

In [None]:
# Load back model
# Re-create config and model from the local snapshot under MODEL_ROOT; this
# replaces the `config` and `model` objects created when the snapshot was written.
config = config_class.from_pretrained(MODEL_ROOT)
model = model_class.from_pretrained(MODEL_ROOT, config=config)
# ... and print model structure
print(model)

In [None]:
# Specify frozen layers: the embeddings and the first 75% of the encoder layers.
if 'roberta-base' in pretrained_model_name:
    num_layers = 12
elif 'roberta-large' in pretrained_model_name:
    num_layers = 24
else:
    # Any other checkpoint: take the depth from the loaded model's config
    # instead of leaving num_layers undefined.
    num_layers = model.config.num_hidden_layers

# The trailing '.' is essential: parameter names are matched by substring, and
# a bare 'layer.1' would also match 'layer.10', 'layer.11', ... and silently
# freeze layers that are meant to stay trainable.
frozen_layers = ['embeddings'] + ['layer.' + str(i) + '.' for i in range(int(num_layers*0.75))]

for name, param in model.named_parameters():
    if any(frozen_name in name for frozen_name in frozen_layers):
        param.requires_grad = False

In [None]:
# Evaluation metrics
# Accuracy metric object; compute_metrics calls its .compute() method.
# NOTE(review): load_metric may be deprecated/removed in newer `datasets`
# releases - confirm against the installed version.
metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    """Score a batch of evaluation outputs with the module-level ``metric``.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the class axis of ``logits``
            is the last one.

    Returns:
        Whatever ``metric.compute`` returns for the arg-max predictions
        compared against ``labels``.
    """
    logits, labels = eval_pred
    predicted_classes = np.asarray(logits).argmax(axis=-1)
    return metric.compute(references=labels, predictions=predicted_classes)

In [None]:
# Everything the trainer writes (checkpoints and logs) goes under this directory.
trainer_dir = PurePath(PROJECT_ROOT, '.trainer', pretrained_model_name)

training_args = TrainingArguments(
    # --- output locations / checkpoint handling ---
    output_dir=trainer_dir,
    logging_dir=PurePath(trainer_dir, 'logs'),
    overwrite_output_dir=True,
    save_total_limit=3,
    load_best_model_at_end=True,
    # --- optimisation schedule ---
    num_train_epochs=1,
    learning_rate=1e-5,
    warmup_steps=5,
    weight_decay=0.01,
    # --- batching / precision ---
    per_device_train_batch_size=1500,
    per_device_eval_batch_size=1500,
    fp16=use_fp16,
    # --- evaluation / logging cadence ---
    evaluation_strategy="steps",
    logging_steps=100,
)

In [None]:
# Assemble the trainer from the (partially frozen) model, the training
# arguments, both datasets and the accuracy callback.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    # Early stopping is configured with a patience of 2.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)


In [None]:
# Run fine-tuning with the configuration assembled in `trainer`.
trainer.train()

In [None]:
# Save the trained model under MODEL_ROOT.
# NOTE(review): MODEL_ROOT is the directory the pretrained snapshot was written
# to earlier, so this presumably overwrites those files - confirm that is intended.
trainer.save_model(MODEL_ROOT)

In [None]:
# Make predictions
# Switch to inference mode so dropout is disabled; training may have left the
# model in train mode.
model.eval()
# Send the inputs to the device the model actually lives on. The notebook-level
# `device` is the highest-indexed GPU, which need not be where the model is.
model_device = next(model.parameters()).device

test_loader = DataLoader(test_text, batch_size=1000)
batch_predictions = []
with torch.no_grad():
    for test_data in test_loader:
        inputs = tokenizer(test_data, truncation=True, padding='max_length', max_length=max_test, return_tensors="pt")
        inputs = inputs.to(model_device)
        logit = model(**inputs).logits
        # argmax over raw logits selects the same class as argmax over softmax.
        batch_predictions.append(torch.argmax(logit, dim=-1))

# One 1-D tensor of class indices; stays a (possibly empty) integer tensor so
# later cells can keep calling tensor methods on it.
if batch_predictions:
    predictions = torch.cat(batch_predictions, 0)
else:
    predictions = torch.tensor([], dtype=torch.long, device=model_device)

In [None]:
# Make the predictions be compatible with the submission
pred = predictions.int().tolist()

# zero_len_idx_test holds 0-based positions, whereas submission ids are taken to
# be 1-based line positions, so both id groups are derived from positions here.
empty_positions = sorted(set(zero_len_idx_test))
_empty_lookup = set(empty_positions)
nonempty_ids = [pos + 1 for pos in range(len(data_test)) if pos not in _empty_lookup]
empty_ids = [pos + 1 for pos in empty_positions]

# Fail early with a readable message if the model did not score exactly the
# non-empty tweets (otherwise DataFrame construction fails less clearly).
if len(pred) != len(nonempty_ids):
    raise ValueError(
        'Got %d predictions for %d non-empty test tweets' % (len(pred), len(nonempty_ids))
    )

pred_id = nonempty_ids + empty_ids
# Tweets with no words were not scored by the model: guess their class at random.
pred_est = pred + [random.choice([0, 1]) for _ in empty_ids]
# Submission format uses -1 for the negative class instead of 0.
pred_est = [p if p == 1 else -1 for p in pred_est]
pred_dict = {'Id': pred_id, 'Prediction': pred_est}
pred_df = pd.DataFrame(pred_dict)
pred_df.to_csv('./submission.csv', index=False)