In [6]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import torch
import random
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer,RobertaTokenizer, LongformerTokenizer,  BigBirdTokenizerFast
from transformers import BertForSequenceClassification, RobertaForSequenceClassification, LongformerForSequenceClassification, BigBirdForSequenceClassification

In [8]:
# Select the device PyTorch will run on.
# The model is too large to train on a CPU: if no GPU is detected, fix the CUDA
# setup or move to a machine that has a GPU.
if not torch.cuda.is_available():
    # CPU fallback.
    print('No GPU available, using the CPU instead.()')
    device = torch.device("cpu")
else:
    # Route PyTorch work to the GPU and report what was found.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))


There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce GTX 1070


In [2]:
# Build the tokenizer/model registries and, if a checkpoint file exists, load the
# saved training state from it.
# Several warnings are printed while the classification models are created; they
# appear because the models have not been trained yet and can be ignored.

checkpoint_path = "Model.pth"  # Change to your preferred location

# Built unconditionally: the registries are needed to pick a tokenizer and a model
# whether training starts from scratch or resumes from a checkpoint.
# Every entry is instantiated eagerly, so all four pretrained models are loaded
# even though only one of them is selected later.
tokenizer_options = {
    "bert": BertTokenizer.from_pretrained('bert-base-uncased'),
    "roberta": RobertaTokenizer.from_pretrained('roberta-base'),
    "longformer": LongformerTokenizer.from_pretrained('allenai/longformer-base-4096'),
    "bigbird":  BigBirdTokenizerFast.from_pretrained('l-yohai/bigbird-roberta-base-mnli')
}

model_options = {
    "bert": BertForSequenceClassification.from_pretrained('bert-base-uncased'),
    "roberta": RobertaForSequenceClassification.from_pretrained('roberta-base'),
    "longformer": LongformerForSequenceClassification.from_pretrained('allenai/longformer-base-4096'),
    "bigbird": BigBirdForSequenceClassification.from_pretrained('l-yohai/bigbird-roberta-base-mnli')
}

if os.path.exists(checkpoint_path):
    # torch.load unpickles the file, so only load checkpoints from a trusted source.
    # map_location="cpu" lets a checkpoint that was saved on a GPU be read on any machine.
    checkpoint = torch.load(checkpoint_path, map_location="cpu")
    start_epoch = checkpoint['epoch']
    losses = checkpoint['losses']
    # No `model` or `optimizer` has been created at this point in the notebook, so
    # their state dicts are kept in `checkpoint` instead of being applied here.
    # Once both objects exist, restore them with
    #   model.load_state_dict(checkpoint['model_state_dict'])
    #   optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    print("Loaded checkpoint.")
    print("Model/optimizer state is held in `checkpoint` until those objects are created.")
    

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [3]:
# Summarize each input down to this many tokens; 512 is the default for BERT use.
# Longformer can accept 4096 tokens, so TextRank may be skipped when it is selected.
IS_TEXT_RANK = True
TEXT_RANK_LENGTH = 512

MODEL = "bert" # Choose from "bert", "roberta", "longformer", "bigbird"
seed_val = 42

# Forward slashes on purpose: inside a normal string literal "\t" is the TAB escape,
# so "data\train.csv" / "data\test.csv" would contain a tab instead of a separator.
TRAIN_DATA_PATH = "data/train.csv"
TEST_DATA_PATH = "data/test.csv"


In [7]:
# Use the selected tokenizer and model.
# Fail immediately on an unknown MODEL value instead of silently binding None.
if MODEL not in tokenizer_options or MODEL not in model_options:
    raise ValueError(
        "Unknown MODEL %r; choose one of %s" % (MODEL, sorted(model_options))
    )
tokenizer = tokenizer_options[MODEL]
model = model_options[MODEL]

# If a checkpoint dict was loaded earlier in the session, restore its weights into
# the model that was just selected. `globals().get` keeps this cell usable when no
# checkpoint was loaded and the name `checkpoint` was never bound.
_resume_state = globals().get("checkpoint")
if _resume_state is not None:
    model.load_state_dict(_resume_state['model_state_dict'])

# Set the seed value all over the place to make this reproducible.
# NOTE(review): the models in `model_options` were instantiated before these seeds
# are set, so any random weight initialisation done at that time is not covered.
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [None]:
# Read the training and test sets from the CSV paths set in the configuration cell.
train = pd.read_csv(TRAIN_DATA_PATH)
test = pd.read_csv(TEST_DATA_PATH)

In [None]:
# Print the (rows, columns) of `train`, then preview its first five rows.
print(train.shape)
train.head(n=5)

In [None]:
# Print the (rows, columns) of `test`, then preview its first five rows.
# Assumes `test` is a DataFrame at this point.
print(test.shape)
test.head(n=5)