In [9]:
import torch
from torchtext import data

# Libraries
import matplotlib.pyplot as plt
import pandas as pd
import torch

# Preliminaries
from torchtext.data import Field, TabularDataset, BucketIterator, Iterator

# Models
import torch.nn as nn
from transformers import RobertaTokenizer
from transformers import BertTokenizer, BertForSequenceClassification

# Training
import torch.optim as optim

# Evaluation
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns

In [18]:
# tokenizer = RobertaTokenizer.from_pretrained("roberta-base", max_model_input_sizes={"roberta-base": 2024})   
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Model parameter
MAX_SEQ_LEN = 512
PAD_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
UNK_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.unk_token)
device = None
destination_folder = "modeling"

In [24]:
def custom_encode(text):
    return tokenizer.encode(text, max_length=512, truncation=True)

In [31]:
# Fields
label_field = Field(sequential=False, tokenize=custom_encode, use_vocab=False, batch_first=True, dtype=torch.float)
text_field = Field(use_vocab=False, tokenize=custom_encode, lower=False, include_lengths=False, batch_first=True,
					fix_length=MAX_SEQ_LEN, pad_token=PAD_INDEX, unk_token=UNK_INDEX)
fields = [("id", None), ("title", text_field), ("text", text_field), ("label", label_field)]
# TabularDataset
print("here")
train, valid, test = TabularDataset.splits(path="data/", train="train.csv", validation="validate.csv",
											test="test.csv", format="CSV", fields=fields, skip_header=True)
# Iterators
print("here2")
train_iter = BucketIterator(train, batch_size=16, sort_key=lambda x: len(x.text),
							device=device, train=True, sort=True, sort_within_batch=True)
print("here3")
valid_iter = BucketIterator(valid, batch_size=16, sort_key=lambda x: len(x.text),
							device=device, train=True, sort=True, sort_within_batch=True)
print("here4")
test_iter = Iterator(test, batch_size=16, device=device, train=False, shuffle=False, sort=False)


here


KeyboardInterrupt: 

In [27]:
class BERT(nn.Module):
	def __init__(self):
		super(BERT, self).__init__()
		options_name = "bert-base-uncased"
		self.encoder = BertForSequenceClassification.from_pretrained(options_name)

	def forward(self, text, label):
		loss, text_fea = self.encoder(text, labels=label)[:2]
		return loss, text_fea

In [28]:
# Save and Load Functions
def save_checkpoint(save_path, model, valid_loss):
	if save_path == None:
		return
	state_dict = {"model_state_dict": model.state_dict(),
					"valid_loss": valid_loss}
	torch.save(state_dict, save_path)
	print("Model saved to ==> {}".format(save_path))

def load_checkpoint(load_path, model):
	if load_path==None:
		return
	state_dict = torch.load(load_path, map_location=device)
	print("Model loaded from <== {}".format(load_path))
	model.load_state_dict(state_dict["model_state_dict"])
	return state_dict["valid_loss"]

def save_metrics(save_path, train_loss_list, valid_loss_list, global_steps_list):
	if save_path == None:
		return
	state_dict = {"train_loss_list": train_loss_list,
					"valid_loss_list": valid_loss_list,
					"global_steps_list": global_steps_list}
	torch.save(state_dict, save_path)
	print("Model saved to ==> {}".format(save_path))

def load_metrics(load_path):
	if load_path==None:
		return    
	state_dict = torch.load(load_path, map_location=device)
	print("Model loaded from <== {}".format(load_path))
	return state_dict["train_loss_list"], state_dict["valid_loss_list"], state_dict["global_steps_list"]

In [29]:
# Training Function
def train(model,
			optimizer,
			criterion = nn.BCELoss(),
			train_loader = train_iter,
			valid_loader = valid_iter,
			num_epochs = 5,
			eval_every = len(train_iter) // 2,
			file_path = destination_folder,
			best_valid_loss = float("Inf")):
    
	# initialize running values
	running_loss = 0.0
	valid_running_loss = 0.0
	global_step = 0
	train_loss_list = []
	valid_loss_list = []
	global_steps_list = []

	# training loop
	model.train()
	for epoch in range(num_epochs):
		for (labels, title, text, titletext), _ in train_loader:
			labels = labels.type(torch.LongTensor)           
			labels = labels.to(device)
			titletext = titletext.type(torch.LongTensor)  
			titletext = titletext.to(device)
			output = model(titletext, labels)
			loss, _ = output

			optimizer.zero_grad()
			loss.backward()
			optimizer.step()

			# update running values
			running_loss += loss.item()
			global_step += 1

			# evaluation step
			if global_step % eval_every == 0:
				model.eval()
				with torch.no_grad():
                    # validation loop
					for (labels, title, text, titletext), _ in valid_loader:
						labels = labels.type(torch.LongTensor)           
						labels = labels.to(device)
						titletext = titletext.type(torch.LongTensor)  
						titletext = titletext.to(device)
						output = model(titletext, labels)
						loss, _ = output
						valid_running_loss += loss.item()
                # evaluation
				average_train_loss = running_loss / eval_every
				average_valid_loss = valid_running_loss / len(valid_loader)
				train_loss_list.append(average_train_loss)
				valid_loss_list.append(average_valid_loss)
				global_steps_list.append(global_step)

				# resetting running values
				running_loss = 0.0                
				valid_running_loss = 0.0
				model.train()

				# print progress
				print("Epoch [{}/{}], Step [{}/{}], Train Loss: {:.4f}, Valid Loss: {:.4f}"
						.format(epoch+1, num_epochs, global_step, num_epochs*len(train_loader),
								average_train_loss, average_valid_loss))
                # checkpoint
				if best_valid_loss > average_valid_loss:
					best_valid_loss = average_valid_loss
					save_checkpoint(file_path + "/" + "model.pt", model, best_valid_loss)
					save_metrics(file_path + "/" + "metrics.pt", train_loss_list, valid_loss_list, global_steps_list)
    
	save_metrics(file_path + "/" + "metrics.pt", train_loss_list, valid_loss_list, global_steps_list)
	print("Finished Training!")

In [30]:
model = BERT().to(device)
optimizer = optim.Adam(model.parameters(), lr=2e-5)

train(model=model, optimizer=optimizer)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

ValueError: could not convert string to float: 'FAKE'

In [None]:
train_loss_list, valid_loss_list, global_steps_list = load_metrics(destination_folder + "/metrics.pt")
plt.plot(global_steps_list, train_loss_list, label="Train")
plt.plot(global_steps_list, valid_loss_list, label="Valid")
plt.xlabel("Global Steps")
plt.ylabel("Loss")
plt.legend()
plt.show()