In [None]:
pip install transformers



In [None]:
import csv
from transformers import AutoTokenizer,AutoModelForSequenceClassification,TrainingArguments,Trainer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
import torch
from transformers import DistilBertModel, DistilBertTokenizer
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import preprocessing
from sklearn.linear_model import SGDClassifier
from collections import defaultdict
import numpy as np

ModuleNotFoundError: ignored

In [None]:
# Location of the three CSV splits and the columns read from every row.
DATA_DIR = '/content/drive/MyDrive/LT_project_data'
ARGUMENT_COL = 2   # column collected into the *_arguments lists
FRAME_COL = -1     # last column, collected into the *_frames lists
TOPIC_COL = 4      # column collected into the *_topics lists


def load_split(csv_path):
    """Read one CSV split into three parallel lists.

    The first row is treated as a header and skipped. Completely blank rows
    are ignored instead of raising ``IndexError``.

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        A tuple ``(arguments, frames, topics)`` of equally long lists of str.
    """
    arguments, frames, topics = [], [], []
    # newline='' lets the csv module handle quoted fields that contain
    # embedded line breaks, as recommended by the csv documentation.
    with open(csv_path, newline='') as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # skip the header row (tolerates an empty file)
        for row in reader:
            if not row:
                continue
            arguments.append(row[ARGUMENT_COL])
            frames.append(row[FRAME_COL])
            topics.append(row[TOPIC_COL])
    return arguments, frames, topics


train_arguments, train_frames, train_topics = load_split(f'{DATA_DIR}/Train.csv')
val_arguments, val_frames, val_topics = load_split(f'{DATA_DIR}/Validation.csv')
test_arguments, test_frames, test_topics = load_split(f'{DATA_DIR}/Test.csv')

In [None]:
# # REMOVE 'Other' and 'Irrelevant' classes

EXCLUDED_FRAMES = ('Other', 'Irrelevant')


def drop_excluded_frames(arguments, frames, topics, excluded=EXCLUDED_FRAMES):
    """Drop every sample whose frame label is in ``excluded``.

    Args:
        arguments: Argument texts of one split.
        frames: Frame labels, parallel to ``arguments``.
        topics: Topics, parallel to ``arguments``.
        excluded: Frame labels to remove.

    Returns:
        ``(samples, arguments, frames, topics)`` where ``samples`` is the list
        of kept ``(argument, frame, topic)`` tuples and the other three are the
        corresponding parallel lists. All four are empty lists when nothing
        survives the filter (the previous inline code raised ``IndexError``).
    """
    samples = [
        sample for sample in zip(arguments, frames, topics)
        if sample[1] not in excluded
    ]
    if not samples:
        return samples, [], [], []
    # Transpose once instead of once per column.
    kept_arguments, kept_frames, kept_topics = (list(col) for col in zip(*samples))
    return samples, kept_arguments, kept_frames, kept_topics


train, train_arguments, train_frames, train_topics = drop_excluded_frames(
    train_arguments, train_frames, train_topics)
val, val_arguments, val_frames, val_topics = drop_excluded_frames(
    val_arguments, val_frames, val_topics)
test, test_arguments, test_frames, test_topics = drop_excluded_frames(
    test_arguments, test_frames, test_topics)

In [None]:
# # REMOVE IMPORTANT FEATURES PER TOPIC

TOP_K_FEATURES = 50     # number of highest-weighted TF-IDF features kept per topic
MASK_TOKEN = '[MASK]'   # replacement written in place of a topic keyword

# Fit a linear topic classifier on TF-IDF features of the training arguments;
# its per-class weights tell us which words are most indicative of each topic.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(train_arguments)
le = preprocessing.LabelEncoder()
le.fit(train_topics)
Y = le.transform(train_topics)

# 'log_loss' is the current name of the logistic-regression loss (formerly
# 'log'). random_state makes the selected keyword lists reproducible.
clf = SGDClassifier(loss='log_loss', random_state=0)
clf.fit(X, Y)

# Look the vocabulary up once instead of once per selected feature.
feature_names = vectorizer.get_feature_names_out()

# topic -> its TOP_K_FEATURES highest-weighted feature names (ascending weight)
labeldict = defaultdict(list)
for i in range(clf.coef_.shape[0]):
    top_indices = np.argsort(clf.coef_[i])[-TOP_K_FEATURES:]
    labeldict[le.classes_[i]].extend(feature_names[top_indices].tolist())

# Set view of the same keywords for O(1) membership tests while masking.
topic_keyword_sets = {topic: frozenset(words) for topic, words in labeldict.items()}


def mask_topic_keywords(arguments, topics, keywords_by_topic, mask_token=MASK_TOKEN):
    """Replace each whitespace-separated word that is a keyword of its topic.

    Args:
        arguments: Argument texts.
        topics: Topic of each argument, parallel to ``arguments``.
        keywords_by_topic: Mapping from topic to a collection of keywords;
            topics missing from the mapping are left unmasked.
        mask_token: String written in place of every matching word.

    Returns:
        A new list with the masked texts, words re-joined by single spaces.
    """
    # NOTE(review): matching is exact on whitespace-split words, while the
    # TF-IDF vocabulary is produced by the vectorizer's own tokenizer; words
    # that differ in case or carry attached punctuation will presumably not
    # match. Kept as-is to preserve the existing masking behaviour -- confirm.
    masked = []
    for text, topic in zip(arguments, topics):
        keywords = keywords_by_topic.get(topic, frozenset())
        masked.append(' '.join(
            mask_token if word in keywords else word for word in text.split()
        ))
    return masked


# Keep the (unmasked argument, topic) pairs before the texts are overwritten.
train = list(zip(train_arguments, train_topics))
val = list(zip(val_arguments, val_topics))
test = list(zip(test_arguments, test_topics))

train_arguments = mask_topic_keywords(train_arguments, train_topics, topic_keyword_sets)
val_arguments = mask_topic_keywords(val_arguments, val_topics, topic_keyword_sets)
test_arguments = mask_topic_keywords(test_arguments, test_topics, topic_keyword_sets)



In [None]:
# Frame label -> integer class id. Ids 18 and 19 belong to the two classes
# that the "REMOVE 'Other' and 'Irrelevant'" cell filters out of every split.
labeldict = {
    'Morality': 0,
    'Quality of Life': 1,
    'Crime and punishment': 2,
    'International relations and reputation': 3,
    'Fairness and equality': 4,
    'Cultural identity': 5,
    'Political': 6,
    'Capacity and resources': 7,
    'Security and defense': 8,
    'Health and Safety': 9,
    'Economic': 10,
    'Climate and environment': 11,
    'Historical': 12,
    'Policy prescription and evaluation': 13,
    'Education': 14,
    'Technology and innovation': 15,
    'Legality, constitutionality and jurisprudence': 16,
    'Public opinion': 17,
    'Irrelevant': 18,
    'Other': 19,
}

# Integer-encode the frame labels of each split (KeyError on an unknown label).
train_frames_bin = [labeldict[frame] for frame in train_frames]
val_frames_bin = [labeldict[frame] for frame in val_frames]
test_frames_bin = [labeldict[frame] for frame in test_frames]

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Tokenize the train and validation texts with truncation and padding enabled.
train_encodings, val_encodings = (
    tokenizer(texts, truncation=True, padding=True)
    for texts in (train_arguments, val_arguments)
)

class Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    ``encodings`` is a mapping from field name to a per-sample sequence and
    ``labels`` is a sequence with one label per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of samples is defined by the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        # One tensor per encoding field for sample `idx` ...
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # ... plus the label wrapped in a list, i.e. a tensor of shape (1,).
        sample["labels"] = torch.tensor([self.labels[idx]])
        return sample

# Wrap the tokenized texts and their integer frame ids for training / evaluation.
train_dataset = Dataset(encodings=train_encodings, labels=train_frames_bin)
val_dataset = Dataset(encodings=val_encodings, labels=val_frames_bin)


In [None]:
model_name = "bert-base-uncased"
max_length = 512
# Fall back to CPU instead of crashing when no GPU is attached to the runtime.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# 18 output classes: label ids 0-17 (the two excluded classes are never used).
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=18).to(device)

training_args = TrainingArguments(
    output_dir = './results' ,          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=8,  # batch size per device during training
    per_device_eval_batch_size=20,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    load_best_model_at_end=True,     # load the best model when finished training (default metric is loss)
    # but you can specify `metric_for_best_model` argument to change to accuracy or other metric
    logging_steps=50,                # log every 50 steps
    # NOTE(review): recent transformers releases rename this argument to
    # `eval_strategy` -- confirm against the installed version.
    evaluation_strategy="steps",     # evaluate every `eval_steps`
    eval_steps=50,                   # made explicit (previously inherited from logging_steps)
    # Checkpoints must be written for `load_best_model_at_end` to have anything
    # to reload: the default save_steps (500) exceeds the ~492 total steps of
    # this run, so no checkpoint was ever saved and the option did nothing.
    save_steps=50,
    save_total_limit=2,              # cap the number of checkpoints kept on disk
)

trainer = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
)

trainer.train()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Step,Training Loss,Validation Loss
50,2.9382,2.918811
100,2.8564,2.862777
150,2.783,2.817663
200,2.6866,2.787793
250,2.5843,2.739985
300,2.3444,2.541488
350,2.1815,2.448352
400,1.9992,2.515989
450,1.9108,2.369


***** Running Evaluation *****
  Num examples = 152
  Batch size = 20
***** Running Evaluation *****
  Num examples = 152
  Batch size = 20
***** Running Evaluation *****
  Num examples = 152
  Batch size = 20
***** Running Evaluation *****
  Num examples = 152
  Batch size = 20
***** Running Evaluation *****
  Num examples = 152
  Batch size = 20
***** Running Evaluation *****
  Num examples = 152
  Batch size = 20
***** Running Evaluation *****
  Num examples = 152
  Batch size = 20
***** Running Evaluation *****
  Num examples = 152
  Batch size = 20
***** Running Evaluation *****
  Num examples = 152
  Batch size = 20


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=492, training_loss=2.4294472826205618, metrics={'train_runtime': 380.7551, 'train_samples_per_second': 10.282, 'train_steps_per_second': 1.292, 'total_flos': 899437126813020.0, 'train_loss': 2.4294472826205618, 'epoch': 3.0})

In [None]:
def get_prediction(text):
    """Predict the class id for ``text`` with the module-level ``model``.

    Args:
        text: A string (or list of strings) accepted by the module-level
            ``tokenizer``.

    Returns:
        A 0-dim tensor holding the argmax over the softmax probabilities.
        The argmax is taken over the flattened output, so pass one text at a time.
    """
    # Trainer leaves the model in training mode; switch to eval mode so
    # dropout is disabled and predictions are deterministic.
    model.eval()
    # prepare our text into tokenized sequence, on the same device as the model
    inputs = tokenizer(text, padding=True, truncation=True, max_length=max_length, return_tensors="pt").to(model.device)
    # perform inference without tracking gradients (saves memory and time)
    with torch.no_grad():
        outputs = model(**inputs)
    # get output probabilities by doing softmax over the class dimension
    probs = outputs[0].softmax(1)
    # executing argmax function to get the candidate label
    return probs.argmax()

# Predict one class id per test argument (each 0-dim tensor -> Python int).
preds = [get_prediction(argument).tolist() for argument in test_arguments]

# Per-class precision / recall / F1 against the integer-encoded gold frames.
report = classification_report(test_frames_bin, preds, zero_division=True, digits=3)
print(report)

              precision    recall  f1-score   support

           0      1.000     0.125     0.222         8
           1      0.182     0.364     0.242        11
           2      1.000     0.000     0.000         4
           3      1.000     0.000     0.000         1
           4      0.143     0.059     0.083        17
           5      0.188     0.600     0.286         5
           6      0.333     0.714     0.455         7
           7      1.000     0.000     0.000         9
           8      0.000     1.000     0.000         0
           9      0.000     0.000     0.000         1
          10      0.500     0.125     0.200         8
          11      1.000     0.020     0.040        49
          12      0.385     0.417     0.400        12
          13      1.000     0.059     0.111        17
          15      0.062     0.750     0.115         4
          16      0.783     0.562     0.655        32
          17      1.000     0.000     0.000         5

    accuracy              