In [None]:
# Install OpenPrompt into the notebook runtime (no version is pinned here).
!pip install openprompt

In [None]:
import os 
import sys
import csv
import torch
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification,TrainingArguments,Trainer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from transformers import DistilBertModel, DistilBertTokenizer
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import preprocessing
from sklearn.linear_model import SGDClassifier
from collections import defaultdict
from google.colab import drive

# Mount Google Drive at /content/drive so files stored there can be read.
drive.mount('/content/drive')
# Add the project folder on Drive to the module search path.
# NOTE(review): this path spells the folder 'My Drive' (with a space) while the
# CSV paths used when loading the dataset spell it 'MyDrive' — confirm both resolve.
sys.path.append('/content/drive/My Drive/frame')

In [None]:
# load dataset
DATA_DIR = '/content/drive/MyDrive/frame/Final data'


def _read_split(csv_path):
    """Read one split CSV and return ``(arguments, frames, topics)`` lists.

    The first row is treated as a header and skipped. For every other row the
    argument text is taken from column 2, the topic from column 4 and the
    frame label from the last column.
    """
    arguments, frames, topics = [], [], []
    # newline='' is what the csv module expects, so that quoted fields
    # containing line breaks are parsed as a single field.
    with open(csv_path, newline='') as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # skip the header; do not crash on an empty file
        for row in reader:
            if not row:  # csv.reader yields [] for blank lines
                continue
            arguments.append(row[2])
            frames.append(row[-1])
            topics.append(row[4])
    return arguments, frames, topics


train_arguments, train_frames, train_topics = _read_split(os.path.join(DATA_DIR, 'Train.csv'))
val_arguments, val_frames, val_topics = _read_split(os.path.join(DATA_DIR, 'Validation.csv'))
test_arguments, test_frames, test_topics = _read_split(os.path.join(DATA_DIR, 'Test.csv'))

In [None]:
# Optional REMOVE IMPORTANT FEATURES PER TOPIC
#
# Fits a TF-IDF + linear classifier that predicts the topic from the argument
# text, collects the highest-weighted vocabulary terms of every topic, and
# replaces those terms with '[MASK]' in the train / validation / test arguments.

TOP_K_TOPIC_WORDS = 50  # number of highest-weighted terms collected per topic


def _mask_topic_words(pairs, words_by_topic):
    """Return the texts of ``pairs`` with topic words replaced by '[MASK]'.

    ``pairs`` is an iterable of ``(text, topic)``. Each text is split on
    whitespace and re-joined with single spaces; a token is replaced when it
    is exactly equal to one of the words stored for that text's topic.
    Topics without an entry in ``words_by_topic`` are left unmasked.
    """
    masked_texts = []
    for text, topic in pairs:
        topic_words = words_by_topic.get(topic, frozenset())
        masked_texts.append(
            ' '.join('[MASK]' if token in topic_words else token for token in text.split())
        )
    return masked_texts


vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(train_arguments)
le = preprocessing.LabelEncoder()
le.fit(train_topics)
Y = le.transform(train_topics)

# The logistic loss is called 'log_loss' in current scikit-learn; the older
# spelling 'log' is only accepted by older releases. Try the new name first
# and fall back so the cell runs on either.
try:
    clf = SGDClassifier(loss='log_loss')
    clf.fit(X, Y)
except ValueError:
    clf = SGDClassifier(loss='log')
    clf.fit(X, Y)

# Fetch the vocabulary once (the accessor rebuilds it on every call) and
# support both the current and the legacy accessor name.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

labeldict = defaultdict(list)
for i in range(clf.coef_.shape[0]):
    # argsort is ascending, so the last K indices carry the largest weights
    top_indices = np.argsort(clf.coef_[i])[-TOP_K_TOPIC_WORDS:]
    labeldict[le.classes_[i]].extend(str(feature_names[j]) for j in top_indices)

# Sets give constant-time membership tests while masking.
topic_word_sets = {topic: frozenset(words) for topic, words in labeldict.items()}

train = list(zip(train_arguments, train_topics))
val = list(zip(val_arguments, val_topics))
test = list(zip(test_arguments, test_topics))

# NOTE(review): the collected terms come from TfidfVectorizer's own
# tokenisation (default settings), while masking compares raw whitespace-split
# tokens; tokens that differ only by case or attached punctuation are therefore
# not masked — confirm this is acceptable.
train_arguments = _mask_topic_words(train, topic_word_sets)
val_arguments = _mask_topic_words(val, topic_word_sets)
test_arguments = _mask_topic_words(test, topic_word_sets)

In [None]:
# Convert the labels to numbers
# Frame name (as it appears in the CSV files) -> integer class id.
labeldict = {
    'Morality': 0,
    'Quality of Life': 1,
    'Crime and punishment': 2,
    'International relations and reputation': 3,
    'Fairness and equality': 4,
    'Cultural identity': 5,
    'Political': 6,
    'Capacity and resources': 7,
    'Security and defense': 8,
    'Health and Safety': 9,
    'Economic': 10,
    'Climate and environment': 11,
    'Historical': 12,
    'Policy prescription and evaluation': 13,
    'Education': 14,
    'Technology and innovation': 15,
    'Legality, constitutionality and jurisprudence': 16,
    'Public opinion': 17,
    'Irrelevant': 18,
    'Other': 19,
}

# A frame name missing from the mapping raises KeyError.
train_frames_bin = [labeldict[frame] for frame in train_frames]
val_frames_bin = [labeldict[frame] for frame in val_frames]
test_frames_bin = [labeldict[frame] for frame in test_frames]

In [None]:
# Determine classes
from openprompt.data_utils import InputExample


def _to_examples(texts, labels):
    """Wrap parallel text / label lists as InputExamples, using the position as guid."""
    return [
        InputExample(guid=idx, text_a=text, label=label)
        for idx, (text, label) in enumerate(zip(texts, labels))
    ]


input_train = _to_examples(train_arguments, train_frames_bin)
input_test = _to_examples(test_arguments, test_frames_bin)
input_val = _to_examples(val_arguments, val_frames_bin)

# 20 categories including irrelevant and other.
# The verbalizer ties each label id to the class at that position in this list,
# so the order must follow the integer ids assigned to the frames in
# `labeldict` (Morality = 0 ... Other = 19), not alphabetical order.
classes = [
    "morality",                                      # 0
    "quality of life",                               # 1
    "crime and punishment",                          # 2
    "international relations and reputation",        # 3
    "fairness and equality",                         # 4
    "cultural identity",                             # 5
    "political",                                     # 6
    "capacity and resources",                        # 7
    "security and defense",                          # 8
    "health and safety",                             # 9
    "economic",                                      # 10
    "climate and environment",                       # 11
    "historical",                                    # 12
    "policy prescription and evaluation",            # 13
    "education",                                     # 14
    "technology and innovation",                     # 15
    "legality constitutionality and jurisprudence",  # 16
    "public opinion",                                # 17
    "irrelevant",                                    # 18
    "other",                                         # 19
]

In [None]:
# Load BERT pre-trained language model
from openprompt.plms import load_plm
# load_plm's result is unpacked into the language model, its tokenizer, the
# model config and a tokenizer wrapper class; the wrapper class is what the
# PromptDataLoader calls receive as `tokenizer_wrapper_class`.
plm, tokenizer, model_config, WrapperClass = load_plm("bert", "bert-base-cased")

In [None]:
# Construct templates
from openprompt.prompts import ManualTemplate
# The template text combines the example's `text_a` field with a {"mask"} slot
# at the end, which is the position the model is asked to fill.
# NOTE(review): {"placeholder":"text_a"} appears twice in the template, so the
# argument text is inserted twice into every prompt — confirm the duplication
# is intended, since it roughly doubles the input length.
promptTemplate = ManualTemplate(
    text = '{"placeholder":"text_a"} The frame of {"placeholder":"text_a"} is {"mask"}',
    tokenizer = tokenizer,
)

In [None]:
# Construct the verbalizer
from openprompt.prompts import ManualVerbalizer

# One entry per class name (20 in total): the word(s) the masked position is
# expected to produce for that class.
promptVerbalizer = ManualVerbalizer(
    classes=classes,
    label_words={
        "capacity and resources": ["knowledge"],
        "climate and environment": ["climate"],
        "crime and punishment": ["crime"],
        "cultural identity": ["minority"],
        "economic": ["tax"],
        "education": ["school"],
        "fairness and equality": ["sex", "gender"],
        "health and safety": ["food"],
        "historical": ["history"],
        "international relations and reputation": ["country"],
        "irrelevant": ["irrelevant"],
        "legality constitutionality and jurisprudence": ["law"],
        "morality": ["moral"],
        "other": ["other"],
        "policy prescription and evaluation": ["policy"],
        "political": ["political"],
        "public opinion": ["publicity"],
        "quality of life": ["quality"],
        "security and defense": ["safe"],
        "technology and innovation": ["technology"],
    },
    tokenizer=tokenizer,
)

In [None]:
# Use the GPU when one is present and fall back to CPU otherwise, instead of
# assuming CUDA and crashing on a CPU-only runtime.
use_cuda = torch.cuda.is_available()

# Combine PLM, template and verbalizer into a prompt model
from openprompt import PromptForClassification
promptModel = PromptForClassification(
    template = promptTemplate,
    plm = plm,
    verbalizer = promptVerbalizer,
)
if use_cuda:
    promptModel = promptModel.cuda()

In [None]:
# Construct a dataloader
from openprompt import PromptDataLoader
# Shuffle the training examples every epoch; without it the model sees the
# examples in file order on each pass.
# NOTE(review): sequence length, batch size and truncation are left at the
# library defaults here, while the evaluation loader sets max_seq_length=256,
# batch_size=4 and truncate_method="head" explicitly — confirm the mismatch
# between training and evaluation preprocessing is intended.
data_loader_train = PromptDataLoader(
    dataset = input_train,
    tokenizer = tokenizer,
    template = promptTemplate,
    tokenizer_wrapper_class = WrapperClass,
    shuffle = True,
)

In [None]:
# Use Pytorch to train the data
# transformers' bundled AdamW is deprecated in favour of the PyTorch
# implementation, so the optimizer is taken from torch.optim.
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

NUM_EPOCHS = 25
LEARNING_RATE = 1e-5
ADAM_EPS = 1e-6   # default epsilon of the transformers AdamW this replaces
LOG_EVERY = 100   # print the running loss once per this many steps

loss_func = torch.nn.CrossEntropyLoss()
no_decay = ['bias', 'LayerNorm.weight']
# it's always good practice to set no decay to bias and LayerNorm parameters
optimizer_grouped_parameters = [
    {'params': [p for n, p in promptModel.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in promptModel.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=LEARNING_RATE, eps=ADAM_EPS)

# NOTE(review): promptModel.train() is never called in this cell — confirm the
# dropout state the model is in during training is the intended one.
for epoch in range(NUM_EPOCHS):
    tot_loss = 0
    for step, inputs in enumerate(data_loader_train):
        if use_cuda:
            inputs = inputs.cuda()
        logits = promptModel(inputs)
        labels = inputs['label']
        loss = loss_func(logits, labels)
        loss.backward()
        tot_loss += loss.item()
        optimizer.step()
        optimizer.zero_grad()
        # running mean of the loss over the steps seen so far in this epoch
        if step % LOG_EVERY == 1:
            print("Epoch {}, average loss: {}".format(epoch, tot_loss/(step+1)), flush=True)


In [None]:
# Evaluate
# NOTE: despite its name, this loader is built from the *test* examples
# (input_test); the variable name is kept for compatibility.
data_loader_val = PromptDataLoader(dataset = input_test,
        tokenizer = tokenizer,
        template = promptTemplate,
        tokenizer_wrapper_class=WrapperClass, max_seq_length=256, decoder_max_length=3,
    batch_size=4,shuffle=False, teacher_forcing=False, predict_eos_token=False,
    truncate_method="head")

# Put the model in inference mode and switch autograd off: no gradients are
# needed for scoring, and tracking them only costs memory.
promptModel.eval()
allpreds = []
alllabels = []
with torch.no_grad():
    for step, inputs in enumerate(data_loader_val):
        if use_cuda:
            inputs = inputs.cuda()
        logits = promptModel(inputs)
        labels = inputs['label']
        alllabels.extend(labels.cpu().tolist())
        # predicted class = index of the largest logit
        allpreds.extend(torch.argmax(logits, dim=-1).cpu().tolist())

acc = sum([int(i==j) for i,j in zip(allpreds, alllabels)])/len(allpreds)
print(acc)
# zero_division=1: a class with no predicted/true samples scores 1 instead of warning
print(classification_report(alllabels, allpreds, zero_division=1, digits=3))