In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

# paths
train_file_OD = '/content/gdrive/MyDrive/Data/trainOD.csv'

train_file_ID = '/content/gdrive/MyDrive/Data/trainID.csv'
dev_file_ID = '/content/gdrive/MyDrive/Data/devID.csv'
test_file_ID = '/content/gdrive/MyDrive/Data/testID.csv'

test_trump = '/content/gdrive/MyDrive/Data/testtrump.csv'
test_police = '/content/gdrive/MyDrive/Data/testpolice.csv'
test_education = '/content/gdrive/MyDrive/Data/testeducation.csv'
test_immigration = '/content/gdrive/MyDrive/Data/testimmigration.csv'
test_economy = '/content/gdrive/MyDrive/Data/testeconomy.csv'


!pip install transformers

import random
import torch
import torch.nn as nn
from transformers import BertTokenizerFast, BertForSequenceClassification, LongformerTokenizer, LongformerForSequenceClassification
from transformers import Trainer, TrainingArguments, AdamW
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import classification_report
import csv
import gc
gc.collect()
torch.cuda.empty_cache()

Mounted at /content/gdrive
Collecting transformers
  Downloading transformers-4.12.3-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 4.0 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 33.3 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 44.6 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.1.1-py3-none-any.whl (59 kB)
[K     |████████████████████████████████| 59 kB 7.3 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 38.7 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingfac

In [2]:
def read_corpus(corpus_file):
    '''Read in data set and returns docs and labels'''
    documents = []
    labels = []
    with open(corpus_file, encoding='utf-8') as f:
        lines = csv.reader(f, delimiter=',')
        for line in lines:
            line[0] = line[0].replace('timescontent.com', '').replace('MATP', '').replace('Reprint', '').replace('â€', '' ).replace('â€¢', '').replace('Â', '').replace('™️', '').replace('Herald', '')
            documents.append(line[0])
            labels.append(line[-1])
    return documents, labels

In [3]:
# in domain
'''
X_train, Y_train = read_corpus(train_file_ID)
X_dev, Y_dev = read_corpus(dev_file_ID)
X_test, Y_test = read_corpus(test_file_ID)
'''


# out of domain
X_train, Y_train = read_corpus(train_file_OD)
X_dev, Y_dev = read_corpus(dev_file_ID)
X_test, Y_test = read_corpus(test_file_ID)

X_test_trump, Y_test_trump = read_corpus(test_trump)
X_test_police, Y_test_police = read_corpus(test_police)
X_test_education, Y_test_education = read_corpus(test_education)
X_test_immigration, Y_test_immigration = read_corpus(test_immigration)
X_test_economy, Y_test_economy = read_corpus(test_economy)


In [4]:
model_name = "bert-base-uncased"
max_length = 512

tokenizer = BertTokenizerFast.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2).to('cuda')

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [5]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor([self.labels[idx]])
        return item

    def __len__(self):
        return len(self.labels)

tokens_train = tokenizer(X_train, truncation=True, padding=True, max_length=max_length)
tokens_val = tokenizer(X_dev, truncation=True, padding=True, max_length=max_length)

encoder = LabelBinarizer()
Y_train_bin = encoder.fit_transform(Y_train)
Y_dev_bin = encoder.fit_transform(Y_dev)
Y_test_bin = encoder.fit_transform(Y_test)
Y_test_trump_bin = encoder.fit_transform(Y_test_trump)
Y_test_police_bin = encoder.fit_transform(Y_test_police)
Y_test_education_bin = encoder.fit_transform(Y_test_education)
Y_test_immigration_bin = encoder.fit_transform(Y_test_immigration)
Y_test_economy_bin = encoder.fit_transform(Y_test_economy)

# convert our tokenized data into a torch Dataset
train_dataset = Dataset(tokens_train, Y_train_bin)
valid_dataset = Dataset(tokens_val, Y_dev_bin)

In [6]:
training_args = TrainingArguments(
    output_dir = './results' ,          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=8,  # batch size per device during training
    per_device_eval_batch_size=20,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    load_best_model_at_end=True,     # load the best model when finished training (default metric is loss)
    # but you can specify `metric_for_best_model` argument to change to accuracy or other metric
    logging_steps=250,               # log & save weights each logging_steps
    evaluation_strategy="steps"     # evaluate each `logging_steps`
)

trainer = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset, put in train_datasetQ1 or train_datasetQ2
    eval_dataset=valid_dataset           # evaluation dataset, put in valid_datasetQ1 or valid_datasetQ2
)

In [7]:
trainer.train()
trainer.evaluate()

***** Running training *****
  Num examples = 10707
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 4017


Step,Training Loss,Validation Loss
250,0.5976,0.881276
500,0.4599,1.215627
750,0.4298,0.716057
1000,0.3685,0.93402
1250,0.3875,0.780823
1500,0.2975,1.253944
1750,0.3216,1.12969
2000,0.3018,1.572225
2250,0.2978,1.403896
2500,0.2931,1.635852


***** Running Evaluation *****
  Num examples = 2000
  Batch size = 20
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 20
Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 20
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 20
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 20
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 20
Saving model checkpoint to ./results/checkpoint-1500
Configuration saved in ./results/checkpoint-1500/config.json
Model weights saved in ./results/checkpoint-1500/pytorch_model.bin
***** Running Evaluation *****
  N

{'epoch': 3.0,
 'eval_loss': 0.9340200424194336,
 'eval_runtime': 136.7218,
 'eval_samples_per_second': 14.628,
 'eval_steps_per_second': 0.731}

In [9]:
def get_prediction(text):
    # prepare our text into tokenized sequence
    inputs = tokenizer(text, padding=True, truncation=True, max_length=max_length, return_tensors="pt").to("cuda")
    # perform inference to our model
    outputs = model(**inputs)
    # get output probabilities by doing softmax
    probs = outputs[0].softmax(1)
    # executing argmax function to get the candidate label
    return probs.argmax()

Whole test set:

In [10]:
preds = []
for x in X_test:
  preds.append(get_prediction(x).tolist())

print(classification_report(Y_test_bin, preds, zero_division=True, digits=3))

              precision    recall  f1-score   support

           0      0.511     0.942     0.662       995
           1      0.651     0.107     0.184      1006

    accuracy                          0.522      2001
   macro avg      0.581     0.525     0.423      2001
weighted avg      0.581     0.522     0.422      2001



Trump set:

In [11]:
preds = []
for x in X_test_trump:
  preds.append(get_prediction(x).tolist())

print(classification_report(Y_test_trump_bin, preds, zero_division=True, digits=3))

              precision    recall  f1-score   support

           0      0.506     0.875     0.641      1370
           1      0.504     0.129     0.206      1344

    accuracy                          0.506      2714
   macro avg      0.505     0.502     0.424      2714
weighted avg      0.505     0.506     0.426      2714



Police set:

In [12]:
preds = []
for x in X_test_police:
  preds.append(get_prediction(x).tolist())

print(classification_report(Y_test_police_bin, preds, zero_division=True, digits=3))

              precision    recall  f1-score   support

           0      0.541     0.976     0.696       123
           1      0.667     0.056     0.103       108

    accuracy                          0.545       231
   macro avg      0.604     0.516     0.399       231
weighted avg      0.600     0.545     0.418       231



Education set:

In [13]:
preds = []
for x in X_test_education:
  preds.append(get_prediction(x).tolist())

print(classification_report(Y_test_education_bin, preds, zero_division=True, digits=3))

              precision    recall  f1-score   support

           0      0.302     1.000     0.464        70
           1      1.000     0.110     0.198       182

    accuracy                          0.357       252
   macro avg      0.651     0.555     0.331       252
weighted avg      0.806     0.357     0.272       252



Immigration set:

In [14]:
preds = []
for x in X_test_immigration:
  preds.append(get_prediction(x).tolist())

print(classification_report(Y_test_immigration_bin, preds, zero_division=True, digits=3))

              precision    recall  f1-score   support

           0      0.580     0.936     0.716       109
           1      0.588     0.119     0.198        84

    accuracy                          0.580       193
   macro avg      0.584     0.527     0.457       193
weighted avg      0.583     0.580     0.490       193



Economy set:

In [15]:
preds = []
for x in X_test_economy:
  preds.append(get_prediction(x).tolist())

print(classification_report(Y_test_economy_bin, preds, zero_division=True, digits=3))

              precision    recall  f1-score   support

           0      0.411     0.987     0.580        77
           1      0.938     0.121     0.214       124

    accuracy                          0.453       201
   macro avg      0.674     0.554     0.397       201
weighted avg      0.736     0.453     0.354       201

