In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
!pip install transformers
%cd drive/My\ Drive/NLP

/content/drive/My Drive/NLP


In [1]:
from transformers import *
import pandas as pd
import numpy as np
import random
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import tensorflow as tf
import torch
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# If there's a GPU available...
if torch.cuda.is_available():    
    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")
    print('There is/are %d GPU(s) available.' % torch.cuda.device_count())
    print('Using GPU:', torch.cuda.get_device_name(0))
# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

#device = torch.device("cpu")

# Set the seed value all over the place to make this reproducible. Somehow this isn't working!
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

There is/are 1 GPU(s) available.
Using GPU: Tesla P100-PCIE-16GB


In [0]:
# Uncomment the model(s) you want to use
MODELS = [#(BertModel,                           BertTokenizer,       'bert-base-uncased'),
          #(BertForSequenceClassification,       BertTokenizer,       'bert-large-uncased'),
          #(OpenAIGPTModel,                      OpenAIGPTTokenizer,  'openai-gpt'),
          #(GPT2Model,                           GPT2Tokenizer,       'gpt2'),
          #(CTRLModel,                           CTRLTokenizer,       'ctrl'),
          #(TransfoXLModel,                      TransfoXLTokenizer,  'transfo-xl-wt103'),
          #(XLNetModel,                          XLNetTokenizer,      'xlnet-base-cased'),
          #(XLNetForSequenceClassification,      XLNetTokenizer,      'xlnet-large-cased'),
          #(XLMModel,                            XLMTokenizer,        'xlm-mlm-enfr-1024'),
          #(XLMForSequenceClassification,        XLMTokenizer,        'xlm-mlm-enfr-1024'),
          #(RobertaModel,                        RobertaTokenizer,    'roberta-base'),
          (RobertaForSequenceClassification,    RobertaTokenizer,    'roberta-base'),
          #(XLMRobertaModel,                     XLMRobertaTokenizer, 'xlm-roberta-base'),
          #(XLMRobertaForSequenceClassification, XLMRobertaTokenizer, 'xlm-roberta-base'),
         ]
DATAPATH = "data/FC/full_train_1.csv"

# For the first sub-task

In [0]:
class ClassificationDataset(Dataset):
  def __init__(self, corpus, tokenizer_class, pretrained_weights, max_len):
    # Dropping NaN rows
    corpus.dropna(subset=['Text'], inplace=True)
    self.corpus = corpus.reset_index()
    # Tokenising sentences
    self.corpus = self.corpus.dropna(subset=['Text'])
    self.tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
    self.corpus['Enc_Text'] = [self.tokenizer.encode_plus(sent, max_length=max_len, 
                                                      pad_to_max_length='right')['input_ids'] 
                           for sent in self.corpus['Text']]
    # Weights for cross-entropy loss
    self.weights = torch.tensor(self.corpus['Gold'].value_counts(normalize=True).tolist()).to(device)
    print(self.corpus['Gold'].value_counts(normalize=True))

  def __len__(self):
    return len(self.corpus)

  def __getitem__(self, idx):
    if torch.is_tensor(idx):
      idx = idx.tolist()
    S = self.corpus['Text'][idx]
    X = torch.tensor(self.corpus['Enc_Text'][idx]).to(device)
    y = torch.tensor(self.corpus['Gold'][idx]).to(device)
    # Sample is sentence, embedding, gold label
    sample = (S, X, y)
    return sample

In [0]:
def set_worker_seed(worker_id):
  random.seed(seed_val)
  np.random.seed(seed_val)
  torch.manual_seed(seed_val)
  torch.cuda.manual_seed(seed_val)
  torch.cuda.manual_seed_all(seed_val)

In [0]:
master_corpus = pd.read_csv(DATAPATH)

In [0]:
master_corpus

Unnamed: 0,Index,Text,Gold
0,1.00001,Third Democratic presidential debate Septembe...,0
1,1.00002,"On the policy front, Bernie Sanders claimed hi...",0
2,1.00003,Joe Biden misrepresented recent history when h...,0
3,1.00004,Here's a look at some of the assertions in the...,0
4,1.00005,"It killed 22 people, and injured many more, we...",0
...,...,...,...
22170,590.00061,Contact transpo@gmu.edu with questions.,0
22171,590.00062,Campus Fire Safety Month September is Campus ...,0
22172,590.00063,"Review the university's Fire Safety Plan, whic...",0
22173,590.00064,Contact Meredith Muckerman at 703-993-9715 or ...,0


In [0]:
train_corpus, test_corpus = train_test_split(master_corpus, stratify=master_corpus['Gold'])
train_corpus.to_csv("data/FC/train_train_1.csv", index=False)
test_corpus.to_csv("data/FC/train_val_1.csv", index=False)

In [0]:
train_corpus = pd.read_csv("data/FC/train_train_1.csv")
test_corpus = pd.read_csv("data/FC/train_val_1.csv")

In [0]:
for model_class, tokenizer_class, pretrained_weights in MODELS:
  # Loading the data and splitting it
  # master_corpus = master_corpus
  # train_corpus, test_corpus = master_corpus, master_corpus

  train_dataset = ClassificationDataset(train_corpus, tokenizer_class, pretrained_weights, max_len=512)
  test_dataset = ClassificationDataset(test_corpus, tokenizer_class, pretrained_weights, max_len=512)
  train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=8, shuffle=True, worker_init_fn=set_worker_seed)
  test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=8, shuffle=False, worker_init_fn=set_worker_seed)

  model = model_class.from_pretrained(pretrained_weights, num_labels=2, output_hidden_states=False, output_attentions=False)
  model.to(device)
  criterion = nn.CrossEntropyLoss(weight=train_dataset.weights)

  # Number of training epochs (authors recommend between 2 and 4)
  epochs = 20

  """For XLNet
  param_optimizer = list(model.named_parameters())
  no_decay = ['bias', 'gamma', 'beta']
  optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
  ]
  # This variable contains all of the hyperparemeter information our training loop needs
  optimizer = AdamW(optimizer_grouped_parameters,
                    lr=1e-5)
  """
  """ For BERT """
  optimizer = AdamW(model.parameters(),
                    lr = 1e-5, # args.learning_rate - default is 5e-5, 1e-5 worked best for me
                    eps = 1e-8) # args.adam_epsilon  - default is 1e-8.

  # Total number of training steps is number of batches * number of epochs.
  total_train_steps = len(train_loader) * epochs
  # Create the learning rate scheduler.
  scheduler = get_linear_schedule_with_warmup(optimizer, 
                                              num_warmup_steps = 0, # Default value in run_glue.py
                                              num_training_steps = total_train_steps)
  
  for epoch in range(epochs):
    running_loss = 0.0
    total_loss = 0.0
    model.train()

    train_preds = None
    train_labels = None

    for i, data in enumerate(train_loader):
      _, inputs, labels = data
      optimizer.zero_grad()
      outputs = model(inputs) # labels=b_labels)
      loss = criterion(outputs[0], labels)
      
      running_loss += loss.item()
      total_loss += loss.item()

      if train_preds is None or train_labels is None:
        train_preds = np.argmax(outputs[0].detach().cpu().numpy(), axis=1).flatten()
        train_labels = labels.cpu().numpy().flatten()
      else:
        train_preds = np.concatenate((train_preds, np.argmax(outputs[0].detach().cpu().numpy(), axis=1).flatten()))
        train_labels = np.concatenate((train_labels, labels.cpu().numpy().flatten()))

      # Clip the norm of the gradients to 1.0.
      # This is to help prevent the "exploding gradients" problem.
      nn.utils.clip_grad_norm_(model.parameters(), 1.0)

      loss.backward()
      optimizer.step()
      scheduler.step()

      if i % 100 == 99:    # print every 100 mini-batches
        print('[%d, %5d] loss: %.5f' % (epoch + 1, i + 1, running_loss / 100))
        running_loss = 0.0
    
    print("Training loss in epoch %d is %.5f" % (epoch + 1, total_loss / len(train_loader)))
    print("Training accuracy in epoch %d is %.5f" % (epoch + 1, accuracy_score(train_labels, train_preds) * 100))
    print("Training precision in epoch %d is %.5f" % (epoch + 1, precision_score(train_labels, train_preds) * 100))
    print("Training recall in epoch %d is %.5f" % (epoch + 1, recall_score(train_labels, train_preds) * 100))
    print("Training F1-score in epoch %d is %.5f" % (epoch + 1, f1_score(train_labels, train_preds) * 100))

    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()
    # Tracking variables 
    test_loss = 0.0

    test_preds = None
    test_labels = None

    with torch.no_grad():
      for data in test_loader:
        _, inputs, labels = data
        outputs = model(inputs) # labels=b_labels)
        loss = criterion(outputs[0], labels)
      
        test_loss += loss.item()
        if test_preds is None or test_labels is None:
          test_preds = np.argmax(outputs[0].detach().cpu().numpy(), axis=1).flatten()
          test_labels = labels.cpu().numpy().flatten()
        else:
          test_preds = np.concatenate((test_preds, np.argmax(outputs[0].detach().cpu().numpy(), axis=1).flatten()))
          test_labels = np.concatenate((test_labels, labels.cpu().numpy().flatten()))

    print("Test loss in epoch %d is %.5f" % (epoch + 1, test_loss / len(test_loader)))
    print("Test accuracy in epoch %d is %.5f" % (epoch + 1, accuracy_score(test_labels, test_preds) * 100))
    print("Test precision in epoch %d is %.5f" % (epoch + 1, precision_score(test_labels, test_preds) * 100))
    print("Test recall in epoch %d is %.5f" % (epoch + 1, recall_score(test_labels, test_preds) * 100))
    print("Test F1-score in epoch %d is %.5f" % (epoch + 1, f1_score(test_labels, test_preds) * 100))

0    0.927949
1    0.072051
Name: Gold, dtype: float64
0    0.92803
1    0.07197
Name: Gold, dtype: float64
[1,   100] loss: 0.07751
[1,   200] loss: 0.03603


In [0]:
# Saving model
model.save_pretrained('fc-models/roberta-base-fc-32b')

In [0]:
# Loading model
model = AutoModelForSequenceClassification.from_pretrained("xlnet-base-cf-all")

In [0]:
# To generate labels for final model
for model_class, tokenizer_class, pretrained_weights in MODELS:
  # Loading the data and splitting it
  master_corpus = pd.read_csv(DATAPATH)
  master_dataset = ClassificationDataset(master_corpus, tokenizer_class, pretrained_weights, max_len=512)
  master_loader = torch.utils.data.DataLoader(master_dataset, batch_size=8, shuffle=False)
  # criterion = nn.CrossEntropyLoss(weight=master_dataset.weights)

  model.to(device)
  
  FP = []
  FN = []

  test_loss = 0
  test_preds = None
  test_labels = None
  for i, data in enumerate(master_loader):
    sent, inp, _ = data
    outputs = model(inp)
    # loss = criterion(outputs[0], labels)
      
    # test_loss += loss.item()
    preds = np.argmax(outputs[0].detach().cpu().numpy(), axis=1).flatten()
    # labels = labels.cpu().numpy().flatten()
    if test_preds is None:# or test_labels is None:
      test_preds = preds.copy()
      # test_labels = labels.copy()
    else:
      test_preds = np.concatenate((test_preds, preds))
      # test_labels = np.concatenate((test_labels, labels))

    #for i in range(preds.shape[0]):
    #  if preds[i].item() is 0 and labels[i].item() is 1:
    #    FN.append(sent[i])
    #  elif preds[i].item() is 1 and labels[i].item() is 0:
    #    FP.append(sent[i])

  #print("Test loss in epoch %d is %.5f" % (1, test_loss / len(master_loader)))
  #print("Test accuracy in epoch %d is %.5f" % (1, accuracy_score(test_labels, test_preds) * 100))
  #print("Test precision in epoch %d is %.5f" % (1, precision_score(test_labels, test_preds) * 100))
  #print("Test recall in epoch %d is %.5f" % (1, recall_score(test_labels, test_preds) * 100))
  #print("Test F1-score in epoch %d is %.5f" % (1, f1_score(test_labels, test_preds) * 100))

  np.save("xlnet-base-cf-all/xlnet-base-cf-all-preds.npy", test_preds)
  for i in FP:
    print(i)
  print()
  for i in FN:
    print(i)






In [0]:
print(len(FP))
print(len(FN))

31
75


In [0]:
print(FP[np.argmin([len(x) for x in FP])])
print(FP[np.argmax([len(x) for x in FP])])
print(FN[np.argmin([len(x) for x in FN])])
print(FN[np.argmax([len(x) for x in FN])])

I thought that if I was just doing what the doctor said, I'd be fine.
As part of his vetting in late 2014, in which he and his wife almost leased a home in the trendy Penn Quarter neighborhood of Washington D.C., Landon said he "felt an obligation to help" when he realized the sitting Fed governors were doing the jobs of two or three people. "You wish there were a more certain path to the job," he said..
Even if they could, I would still oppose them on moral grounds.
It's become fashionable to tell a disability story in a hopeful arc, where the heroine may have moments of discouragement or fear, but comes out into full life at the end - into mainstream schools, love and romance, full participation in the social world, and these stories have become so pervasive that if they were to spread to aliens they'd find them familiar.
