In [None]:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

import checklist
from checklist.editor import Editor
from checklist.perturb import Perturb
from checklist.test_types import INV
import csv
import spacy
import numpy as np

from tqdm import tqdm
from sklearn.metrics import accuracy_score


## Model setup

In [None]:

def set_seed(seed):
    """
    Seed the random number generators used in this notebook for reproducibility.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU, plus every CUDA device when CUDA is available), then asks cuDNN for
    deterministic kernels and disables its auto-tuner.

    Args:
        seed (int): Seed value handed to every generator.
    """
    import random  # local import: the notebook's import cell does not bring in `random`

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        # manual_seed_all seeds every CUDA device, including the current one.
        torch.cuda.manual_seed_all(seed)
    # The attribute name must be spelled exactly `deterministic`; a misspelled
    # name is accepted silently and leaves cuDNN in its default mode.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


def load_model_and_tokenizer(name="qwen"):
    """
    Load the causal language model and tokenizer registered under ``name``.

    Args:
        name (str): Short model key; one of ``"qwen"``, ``"aya"`` or ``"yi"``.

    Returns:
        tuple: ``(model, tokenizer)`` as returned by the respective
        ``from_pretrained`` calls.

    Raises:
        AssertionError: If ``name`` is not a known key.
    """
    path_dict = {
        "qwen" : "Qwen/Qwen1.5-7B-Chat",
        # NOTE(review): aya-101 may be an encoder-decoder checkpoint, in which
        # case AutoModelForCausalLM would not be the right loader - confirm.
        "aya"  : "CohereForAI/aya-101",
        "yi"   : "01-ai/Yi-6B-Chat",
    }

    assert name in path_dict, "unknown model"

    # Resolve the checkpoint from the requested key instead of always loading Qwen.
    model_path = path_dict[name]

    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype="auto")

    return model, tokenizer


# Pick the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
# Seed before any model loading or data sampling so later random draws are repeatable.
set_seed(42)

# `tokenizer` and `device` are read as globals by the inference helpers further down.
model, tokenizer = load_model_and_tokenizer(name="qwen")

model = model.to(device)


## Create dataset

### Sentiment analysis

In [None]:
# Load and parse airline tweets

def read_tweets(path):
    """
    Read the airline tweets CSV and return the tweet texts with integer labels.

    Only the ``text`` and ``airline_sentiment`` columns are used. Sentiment
    strings are encoded as negative = 0, neutral = 1, positive = 2.

    Args:
        path (str): Path to the CSV file (must have a header row).

    Returns:
        tuple: ``(texts, labels)`` where ``texts`` is a list of str and
        ``labels`` is an integer NumPy array of the same length.

    Raises:
        KeyError: If a required column is missing or a sentiment value is not
            one of ``negative`` / ``neutral`` / ``positive``.
    """
    mapping = {'negative': 0, 'positive': 2, 'neutral': 1}

    tdata = []
    labels = []
    # `with` guarantees the handle is closed; newline='' is what the csv module
    # expects so that quoted fields containing line breaks are parsed correctly.
    with open(path, newline='', encoding='utf-8') as handle:
        for row in csv.DictReader(handle):
            tdata.append(row['text'])
            labels.append(mapping[row['airline_sentiment']])

    return tdata, np.array(labels).astype(int)

# Tweet texts and their integer sentiment labels (see read_tweets for the encoding).
data, labels = read_tweets('./Tweets.csv')

# spaCy pipeline used to parse the tweets for the perturbations that take parsed input.
nlp = spacy.load('en_core_web_sm')
# `sentences` is an alias of `data` (same list object), not a copy.
sentences = data
# Parse every tweet up front; the result is materialised as a list.
parsed_data = list(nlp.pipe(sentences))


#### Named Entity Recognition (NER) test using INVariance

In [None]:

# Change location
# Operates on the spaCy-parsed tweets. inference_inv later treats the first
# string of each entry in `.data` as the reference sentence.
# NOTE(review): presumably `nsamples` caps the number of perturbed examples and
# `n` the number of variants per example - confirm against the checklist docs.
perturb_location_data = Perturb.perturb(parsed_data, Perturb.change_location, nsamples=1000, n=5).data

# Change names
perturb_names_data = Perturb.perturb(parsed_data, Perturb.change_names, nsamples=1000, n=5).data


#### Robustness using INVariance

In [None]:

import string

def random_string(n):
    """Return ``n`` characters drawn (with replacement) from ASCII letters and digits using NumPy's global RNG."""
    alphabet = list(string.ascii_letters + string.digits)
    drawn = np.random.choice(alphabet, n)
    return ''.join(drawn)

def random_url(n=6):
    """Build a fake short link: ``https://t.co/`` followed by ``n`` random characters."""
    suffix = random_string(n)
    return 'https://t.co/%s' % suffix

def random_handle(n=6):
    """Build a fake user handle: ``@`` followed by ``n`` random characters."""
    suffix = random_string(n)
    return '@%s' % suffix

def add_irrelevant(sentence):
    """
    Produce variants of ``sentence`` with irrelevant tokens attached.

    Five random URLs and five random handles are generated once. Each of them,
    plus a fixed ``@airline `` mention, is put in front of the sentence, and
    the same URLs/handles are then appended after it.

    Returns:
        list[str]: 11 prefixed variants followed by 10 suffixed variants.
    """
    noise = [random_url(n=6) for _ in range(5)]
    noise.extend(random_handle() for _ in range(5))

    prefixed = ['%s %s' % (token, sentence) for token in ['@airline '] + noise]
    suffixed = ['%s %s' % (sentence, token) for token in noise]
    return prefixed + suffixed


In [None]:

# Sample-size argument passed as `nsamples` to every perturbation in this cell.
N = 1000

# Add randomly generated URLs and handles
perturb_irrelevant_data = Perturb.perturb(sentences, add_irrelevant, nsamples=N).data

# Punctuation perturbation (the only one in this cell that takes the spaCy-parsed tweets)
perturb_punc_data = Perturb.perturb(parsed_data, Perturb.punctuation, nsamples=N).data
# Add typos (1, 2 and 5 typos per sentence)
perturb_typo1_data = Perturb.perturb(sentences, Perturb.add_typos, nsamples=N, typos=1).data
perturb_typo2_data = Perturb.perturb(sentences, Perturb.add_typos, nsamples=N, typos=2).data
perturb_typo5_data = Perturb.perturb(sentences, Perturb.add_typos, nsamples=N, typos=5).data

# Contract or expand contractions
perturb_contract_data = Perturb.perturb(sentences, Perturb.contractions, nsamples=N).data


#### Negation using Minimum Functionality Test (MFT)

In [None]:

# Template editor holding the lexicons that the negation / SRL templates below fill in.
editor = Editor()
# NOTE(review): bare attribute access whose result is discarded; presumably it
# forces the editor's text generator to load. No other visible cell refers to
# `editor.tg` - confirm whether this line is still needed.
editor.tg

# Airline-related nouns used as the subject/object of the templated sentences.
air_noun = ['flight', 'seat', 'pilot', 'staff', 'service', 'customer service', 'aircraft', 'plane', 'food', 'cabin crew', 'company', 'airline', 'crew']
editor.add_lexicon('air_noun', air_noun)

# Adjective lexicons, one per sentiment polarity.
pos_adj = ['good', 'great', 'excellent', 'amazing', 'extraordinary', 'beautiful', 'fantastic', 'nice', 'incredible', 'exceptional', 'awesome', 'perfect', 'fun', 'happy', 'adorable', 'brilliant', 'exciting', 'sweet', 'wonderful']
neg_adj = ['awful', 'bad', 'horrible', 'weird', 'rough', 'lousy', 'unhappy', 'average', 'difficult', 'poor', 'sad', 'frustrating', 'hard', 'lame', 'nasty', 'annoying', 'boring', 'creepy', 'dreadful', 'ridiculous', 'terrible', 'ugly', 'unpleasant']
neutral_adj = ['American', 'international',  'commercial', 'British', 'private', 'Italian', 'Indian', 'Australian', 'Israeli', ]
editor.add_lexicon('pos_adj', pos_adj, overwrite=True)
editor.add_lexicon('neg_adj', neg_adj, overwrite=True )
editor.add_lexicon('neutral_adj', neutral_adj, overwrite=True)

# Verb lexicons, split by polarity and tense. The past-tense positive list has
# no counterpart for 'recommend', so it is one entry shorter than the present list.
pos_verb_present = ['like', 'enjoy', 'appreciate', 'love',  'recommend', 'admire', 'value', 'welcome']
neg_verb_present = ['hate', 'dislike', 'regret',  'abhor', 'dread', 'despise' ]
neutral_verb_present = ['see', 'find']
pos_verb_past = ['liked', 'enjoyed', 'appreciated', 'loved', 'admired', 'valued', 'welcomed']
neg_verb_past = ['hated', 'disliked', 'regretted',  'abhorred', 'dreaded', 'despised']
neutral_verb_past = ['saw', 'found']
editor.add_lexicon('pos_verb_present', pos_verb_present, overwrite=True)
editor.add_lexicon('neg_verb_present', neg_verb_present, overwrite=True)
editor.add_lexicon('neutral_verb_present', neutral_verb_present, overwrite=True)
editor.add_lexicon('pos_verb_past', pos_verb_past, overwrite=True)
editor.add_lexicon('neg_verb_past', neg_verb_past, overwrite=True)
editor.add_lexicon('neutral_verb_past', neutral_verb_past, overwrite=True)
# Combined (present + past) verb lexicons per polarity.
editor.add_lexicon('pos_verb', pos_verb_present+ pos_verb_past, overwrite=True)
editor.add_lexicon('neg_verb', neg_verb_present + neg_verb_past, overwrite=True)
editor.add_lexicon('neutral_verb', neutral_verb_present + neutral_verb_past, overwrite=True)


In [None]:
### Simple negations
# Each dataset below is built from several templates and then down-sampled to N
# sentences without replacement. np.random.choice(..., replace=False) needs at
# least N generated sentences; unlike the "negation at the end" neutral set,
# there is no length guard here.
N = 1000
# Fill-in values for the template slots; slots not passed explicitly
# (e.g. {air_noun}, {pos_adj}) come from the editor's lexicons.
neg = ['I can\'t say I', 'I don\'t', 'I would never say I', 'I don\'t think I', 'I didn\'t' ]
it1 = ['This', 'That', 'The']
nt1 = ['is not', 'isn\'t']
benot1 = ['is not',  'isn\'t', 'was not', 'wasn\'t']
the1 = ['this', 'that', 'the']

# Simple negations of positive statements (should be negative)
# NOTE(review): `{a:...}` presumably prefixes the filled word with the matching
# article (checklist template syntax) - confirm.
neg1_negative_data = editor.template('{it} {air_noun} {nt} {pos_adj}.', it=it1, nt=nt1)
neg1_negative_data += editor.template('{it} {benot} {a:pos_adj} {air_noun}.', it=it1, benot=benot1)
neg1_negative_data += editor.template('{neg} {pos_verb_present} {the} {air_noun}.', neg=neg, the=the1)
# NOTE(review): `neg=neg` is passed although this template has no {neg} slot.
neg1_negative_data += editor.template('No one {pos_verb_present}s {the} {air_noun}.', neg=neg, the=the1)
neg1_negative_data.data = list(np.random.choice(neg1_negative_data.data, N, replace=False))

print(len(neg1_negative_data.data))

# Simple negations of negative statements (should be positive/neutral)
neg1_positive_data = editor.template('{it} {air_noun} {nt} {neg_adj}.', it=it1, nt=nt1)
neg1_positive_data += editor.template('{it} {benot} {a:neg_adj} {air_noun}.', it=it1, benot=benot1)
neg1_positive_data += editor.template('{neg} {neg_verb_present} {the} {air_noun}.', neg=neg, the=the1)
# NOTE(review): `neg=neg` is passed although this template has no {neg} slot.
neg1_positive_data += editor.template('No one {neg_verb_present}s {the} {air_noun}.', neg=neg, the=the1)
neg1_positive_data.data = list(np.random.choice(neg1_positive_data.data, N, replace=False))

print(len(neg1_positive_data.data))

# Simple negations of neutral statement (should be neutral)
# There is no "No one ..." template for the neutral case.
neg1_neutral_data = editor.template('{it} {air_noun} {nt} {neutral_adj}.', it=it1, nt=nt1)
neg1_neutral_data += editor.template('{it} {benot} {a:neutral_adj} {air_noun}.', it=it1, benot=benot1)
neg1_neutral_data += editor.template('{neg} {neutral_verb_present} {the} {air_noun}.', neg=neg, the=the1)
neg1_neutral_data.data = list(np.random.choice(neg1_neutral_data.data, N, replace=False))

print(len(neg1_neutral_data.data))


In [None]:
### Negation at the end
# Sentences of the form "I thought X would be <adj>, but it was not."
N = 1000
it2 = ['this', 'that', 'the']
# NOTE(review): `nt2` is passed as `nt=` below, but none of these templates has an {nt} slot.
nt2 = ['is not', 'isn\'t']
the2 = ['this', 'that', 'the']

# All airline nouns except 'pilot'; presumably excluded because the adjective
# templates refer back to the noun with "it".
air_noun_it = [x for x in editor.lexicons['air_noun'] if x != 'pilot']

# I thought x was positive, but it was not (should be negative)
neg2_negative_data = editor.template('I thought {it} {air_noun} would be {pos_adj}, but it {neg}.', air_noun=air_noun_it, neg=['was not', 'wasn\'t'], it=it2, nt=nt2)
neg2_negative_data += editor.template('I thought I would {pos_verb_present} {the} {air_noun}, but I {neg}.', neg=['did not', 'didn\'t'], the=the2)
neg2_negative_data.data = list(np.random.choice(neg2_negative_data.data, N, replace=False))

print(len(neg2_negative_data.data))

# I thought x was negative, but it was not (should be neutral or positive)
neg2_positive_data = editor.template('I thought {it} {air_noun} would be {neg_adj}, but it {neg}.', air_noun=air_noun_it, neg=['was not', 'wasn\'t'], it=it2, nt=nt2)
neg2_positive_data += editor.template('I thought I would {neg_verb_present} {the} {air_noun}, but I {neg}.', neg=['did not', 'didn\'t'], the=the2)
neg2_positive_data.data = list(np.random.choice(neg2_positive_data.data, N, replace=False))

print(len(neg2_positive_data.data))

# But it was not (neutral) should still be neutral
neg2_neutral_data = editor.template('I thought {it} {air_noun} would be {neutral_adj}, but it {neg}.', air_noun=air_noun_it, neg=['was not', 'wasn\'t'], it=it2, nt=nt2)
neg2_neutral_data += editor.template('I thought I would {neutral_verb_present} {the} {air_noun}, but I {neg}.', neg=['did not', 'didn\'t'], the=the2)
# Only down-sample when at least N sentences were generated; otherwise keep them all
# (sampling without replacement would fail on a smaller pool).
if len(neg2_neutral_data.data) >= N: neg2_neutral_data.data = list(np.random.choice(neg2_neutral_data.data, N, replace=False))

print(len(neg2_neutral_data.data))


In [None]:
### Negation with neutral in the middle
# Sentences of the form "<negation>, given <neutral filler>, that <statement>."
N = 1000
# Neutral filler phrases inserted between the negation and the statement.
neutral =['that I am from Brazil', 'my history with airplanes', 'all that I\'ve seen over the years', 'the time that I\'ve been flying', 'it\'s a Tuesday']
neg3 = ['I don\'t think', 'I can\'t say', 'I wouldn\'t say']
it3 = ['this', 'that', 'the']
be3 = ['is', 'was']
i3 = ['I', 'we']
the3 = ['this', 'that', 'the']

# Negation of positive with neutral stuff in the middle (should be negative)
# NOTE(review): `new_neg` reads `neg` from the simple-negations cell and is not
# used anywhere else in the visible notebook.
new_neg = neg[:-1]
neg3_negative_data = editor.template('{neg}, given {neutral}, that {it} {air_noun} {be} {pos_adj}.', neutral=neutral, neg=neg3, it=it3, be=be3)
neg3_negative_data += editor.template('{neg}, given {neutral}, that {it} {be} {a:pos_adj} {air_noun}.', neutral=neutral,  neg=neg3, it=it3, be=be3)
neg3_negative_data += editor.template('{neg}, given {neutral}, that {i} {pos_verb_present} {the} {air_noun}.', neutral=neutral,  neg=neg3, i=i3, the=the3)
# Down-sample to N sentences without replacement (requires at least N generated sentences).
neg3_negative_data.data = list(np.random.choice(neg3_negative_data.data, N, replace=False))

print(len(neg3_negative_data.data))

# Negation of negative with neutral stuff in the middle (should be positive or neutral)
neg3_positive_data = editor.template('{neg}, given {neutral}, that {it} {air_noun} {be} {neg_adj}.', neutral=neutral, neg=neg3, it=it3, be=be3)
neg3_positive_data += editor.template('{neg}, given {neutral}, that {it} {be} {a:neg_adj} {air_noun}.', neutral=neutral,  neg=neg3, it=it3, be=be3)
neg3_positive_data += editor.template('{neg}, given {neutral}, that {i} {neg_verb_present} {the} {air_noun}.', neutral=neutral,  neg=neg3, i=i3, the=the3)
neg3_positive_data.data = list(np.random.choice(neg3_positive_data.data, N, replace=False))

print(len(neg3_positive_data.data))

# Negation of neutral with neutral in the middle, should still be neutral
neg3_neutral_data = editor.template('{neg}, given {neutral}, that {it} {air_noun} {be} {neutral_adj}.', neutral=neutral, neg=neg3, it=it3, be=be3)
neg3_neutral_data += editor.template('{neg}, given {neutral}, that {it} {be} {a:neutral_adj} {air_noun}.', neutral=neutral,  neg=neg3, it=it3, be=be3)
neg3_neutral_data += editor.template('{neg}, given {neutral}, that {i} {neutral_verb_present} {the} {air_noun}.', neutral=neutral,  neg=neg3, i=i3, the=the3)
neg3_neutral_data.data = list(np.random.choice(neg3_neutral_data.data, N, replace=False))

print(len(neg3_neutral_data.data))


#### SRL using Minimum Functionality Test (MFT)

In [None]:
### Author sentiment is more important
# Each sentence contrasts the author's opinion with someone else's opposite
# opinion; the expected label follows the author's opinion.
N = 1000

# Positive sentiment
# `change` toggles whether the two clauses are joined with " but" or only a comma.
change = [' but', '']
templates = ['Some people think you are {neg_adj},{change} I think you are {pos_adj}.', 'I think you are {pos_adj},{change} some people think you are {neg_adj}.',
             'I had heard you were {neg_adj},{change} I think you are {pos_adj}.', 'I think you are {pos_adj},{change} I had heard you were {neg_adj}.',]
# NOTE(review): `unroll=True` presumably flattens the per-template results into
# one flat list of sentences - confirm against the checklist docs.
author_positive_data = editor.template(templates, change=change, unroll=True)

templates = ['{others} {neg_verb_present} you,{change} I {pos_verb_present} you.', 'I {pos_verb_present} you,{change} {others} {neg_verb_present} you.',]
others = ['some people', 'my parents', 'my friends', 'people']
author_positive_data += editor.template(templates, others=others, change=change, unroll=True)
# Down-sample to N sentences without replacement (requires at least N generated sentences).
author_positive_data.data = list(np.random.choice(author_positive_data.data, N, replace=False))

print(len(author_positive_data.data))

# Negative sentiment
# Same templates with the polarities swapped: the author holds the negative opinion.
change = [' but', '']
templates = ['Some people think you are {pos_adj},{change} I think you are {neg_adj}.', 'I think you are {neg_adj},{change} some people think you are {pos_adj}.',
             'I had heard you were {pos_adj},{change} I think you are {neg_adj}.', 'I think you are {neg_adj},{change} I had heard you were {pos_adj}.', ]
author_negative_data = editor.template(templates, change=change, unroll=True)

templates = ['{others} {pos_verb_present} you,{change} I {neg_verb_present} you.', 'I {neg_verb_present} you,{change} {others} {pos_verb_present} you.',]
others = ['some people', 'my parents', 'my friends', 'people']
author_negative_data += editor.template(templates, others=others, change=change, unroll=True)
author_negative_data.data = list(np.random.choice(author_negative_data.data, N, replace=False))

print(len(author_negative_data.data))


In [None]:
### Parsing sentiment in question "yes" form
# The author asks a question about their own opinion and answers "Yes", so the
# expected sentiment equals the polarity of the adjective/verb in the question.
# Note: the inference/evaluation helpers encode labels as
# 0 = positive, 1 = negative, 2 = neutral.
N = 1000

# Positive question answered "Yes" (should be positive)
q_yes_positive_data = editor.template('Do I think {it} {air_noun} {be} {pos_adj}? Yes', it=['that', 'this', 'the'], be=['is', 'was'])
q_yes_positive_data += editor.template('Do I think {it} {be} {a:pos_adj} {air_noun}? Yes', it=['it', 'this', 'that'], be=['is', 'was'])
q_yes_positive_data += editor.template('Did {i} {pos_verb_present} {the} {air_noun}? Yes', i=['I', 'we'], the=['this', 'that', 'the'])
# Down-sample to N sentences without replacement (requires at least N generated sentences).
q_yes_positive_data.data = list(np.random.choice(q_yes_positive_data.data, N, replace=False))

print(len(q_yes_positive_data.data))

# Negative question answered "Yes" (should be negative)
q_yes_negative_data = editor.template('Do I think {it} {air_noun} {be} {neg_adj}? Yes', it=['that', 'this', 'the'], be=['is', 'was'])
q_yes_negative_data += editor.template('Do I think {it} {be} {a:neg_adj} {air_noun}? Yes', it=['it', 'this', 'that'], be=['is', 'was'])
q_yes_negative_data += editor.template('Did {i} {neg_verb_present} {the} {air_noun}? Yes', i=['I', 'we'], the=['this', 'that', 'the'])
q_yes_negative_data.data = list(np.random.choice(q_yes_negative_data.data, N, replace=False))

# Report the size of the negative set (previously the positive set was printed twice).
print(len(q_yes_negative_data.data))

# Neutral question answered "Yes" (should be neutral)
q_yes_neutral_data = editor.template('Do I think {it} {air_noun} {be} {neutral_adj}? Yes', it=['that', 'this', 'the'], be=['is', 'was'])
q_yes_neutral_data += editor.template('Do I think {it} {be} {a:neutral_adj} {air_noun}? Yes', it=['it', 'this', 'that'], be=['is', 'was'])
q_yes_neutral_data += editor.template('Did {i} {neutral_verb_present} {the} {air_noun}? Yes', i=['I', 'we'], the=['this', 'that', 'the'])
q_yes_neutral_data.data = list(np.random.choice(q_yes_neutral_data.data, N, replace=False))

print(len(q_yes_neutral_data.data))


In [None]:
### Parsing sentiment in question "no" form (0: negative, 1: neutral, 2: positive)
# NOTE(review): the encoding in the heading above matches read_tweets, but the
# inference/evaluation helpers use 0 = positive, 1 = negative, 2 = neutral.
# The author answers "No", so the expected sentiment is the negation of the
# polarity asked about.
N = 1000

# Negative question
# (positive adjective/verb answered with "No" -> should be negative)
q_no_negative_data = editor.template('Do I think {it} {air_noun} {be} {pos_adj}? No', it=['that', 'this', 'the'], be=['is', 'was'])
q_no_negative_data += editor.template('Do I think {it} {be} {a:pos_adj} {air_noun}? No', it=['it', 'this', 'that'], be=['is', 'was'])
q_no_negative_data += editor.template('Did {i} {pos_verb_present} {the} {air_noun}? No', i=['I', 'we'], the=['this', 'that', 'the'])
# Down-sample to N sentences without replacement (requires at least N generated sentences).
q_no_negative_data.data = list(np.random.choice(q_no_negative_data.data, N, replace=False))

print(len(q_no_negative_data.data))

# Positive or neutral question
# (negative adjective/verb answered with "No" -> should be positive or neutral)
q_no_positive_data = editor.template('Do I think {it} {air_noun} {be} {neg_adj}? No', it=['that', 'this', 'the'], be=['is', 'was'])
q_no_positive_data += editor.template('Do I think {it} {be} {a:neg_adj} {air_noun}? No', it=['it', 'this', 'that'], be=['is', 'was'])
q_no_positive_data += editor.template('Did {i} {neg_verb_present} {the} {air_noun}? No', i=['I', 'we'], the=['this', 'that', 'the'])
q_no_positive_data.data = list(np.random.choice(q_no_positive_data.data, N, replace=False))

print(len(q_no_positive_data.data))

# Neutral question
q_no_neutral_data = editor.template('Do I think {it} {air_noun} {be} {neutral_adj}? No', it=['that', 'this', 'the'], be=['is', 'was'])
q_no_neutral_data += editor.template('Do I think {it} {be} {a:neutral_adj} {air_noun}? No', it=['it', 'this', 'that'], be=['is', 'was'])
q_no_neutral_data += editor.template('Did {i} {neutral_verb_present} {the} {air_noun}? No', i=['I', 'we'], the=['this', 'that', 'the'])
q_no_neutral_data.data = list(np.random.choice(q_no_neutral_data.data, N, replace=False))

print(len(q_no_neutral_data.data))


## Inference

### Run model

In [None]:

def response_from_generate(model, messages):
    """
    Ask the model for a one-token answer via ``.generate()`` and map it to a label.

    Uses the module-level ``tokenizer`` and ``device``.

    Args:
        model: Model exposing ``generate``.
        messages (list[dict]): Chat messages (``role`` / ``content``) that are
            rendered with the tokenizer's chat template.

    Returns:
        int: 0 for ``A``, 1 for ``B``, 2 for ``C``.

    Raises:
        KeyError: If the generated token is not ``A``, ``B`` or ``C``.
    """
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    model_inputs = tokenizer([text], return_tensors="pt").to(device)
    prompt_length = model_inputs.input_ids.shape[1]

    # Pass the full encoding (including the attention mask) and decode greedily,
    # so the answer is the most likely token, as in response_from_forward,
    # even if the checkpoint's default generation settings enable sampling.
    generated_ids = model.generate(**model_inputs, max_new_tokens=1, do_sample=False)
    # Keep only the newly generated token(s), dropping the echoed prompt.
    new_token_ids = generated_ids[:, prompt_length:]

    # Strip surrounding whitespace so answers like " A" are still recognised.
    response = tokenizer.batch_decode(new_token_ids, skip_special_tokens=True)[0].strip()

    output_mapping = {'A' : 0, 'B' : 1, 'C' : 2}

    if response not in output_mapping:
        raise KeyError(f"unexpected model response {response!r}; expected one of {sorted(output_mapping)}")

    return output_mapping[response]


def response_from_forward(model, messages):
    """
    Pick the answer by comparing next-token logits of the options ``A``/``B``/``C``.

    Uses the module-level ``tokenizer`` and ``device``.

    Args:
        model: Callable model whose output exposes ``.logits``; the code
            indexes it as ``logits[0, -1, token_id]``.
        messages (list[dict]): Chat messages (``role`` / ``content``) that are
            rendered with the tokenizer's chat template.

    Returns:
        int: 0 if ``A`` has the highest logit, 1 for ``B``, 2 for ``C``.

    Raises:
        ValueError: If the tokenizer has no id for one of the option tokens.
    """
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    model_inputs = tokenizer([text], return_tensors="pt").to(device)

    # Look the option ids up instead of hard-coding 32..34, which is only
    # valid for one tokenizer's vocabulary.
    # NOTE(review): some tokenizers use a different surface form for a
    # sentence-initial letter (e.g. with a leading-space marker) - confirm
    # when switching model families.
    option_ids = tokenizer.convert_tokens_to_ids(["A", "B", "C"])
    if any(token_id is None for token_id in option_ids):
        raise ValueError("tokenizer has no id for one of the option tokens 'A', 'B', 'C'")

    # Inference only: no autograd graph is needed, which saves memory.
    with torch.no_grad():
        output = model(model_inputs.input_ids)

    # Logits at the last position, restricted to A (positive), B (negative), C (neutral).
    option_logits = output.logits[0, -1, option_ids]
    response = torch.argmax(option_logits).item()

    return response


def inference_inv(model, data, inference_mode='forward'):
    """
    Run the model over invariance (INV) test groups.

    Every item of ``data`` is a list of prompts. The model's answer to the
    first prompt of a group is recorded as that group's reference ("gold")
    label; the answers to the remaining prompts are the group's predictions.

    Args:
        model: Model handed to the response helpers.
        data: Iterable of lists of prompt strings.
        inference_mode (str): ``'generate'`` answers via ``.generate()``;
            ``'forward'`` picks the answer from the output logits.

    Returns:
        tuple: ``(gold_labels, pred_labels)`` - one gold label per non-empty
        group and one list of predicted labels per group.
    """

    system_message = "Give the sentiment of the user's prompt. Please only respond with A (positive), B (negative) or C (neutral)."

    gold_labels = []
    pred_labels = []

    for sentences in tqdm(data):
        responses = []

        for user_prompt in sentences:
            messages = [
                {"role": "system", "content": system_message},
                {"role": "user", "content": user_prompt}
            ]

            if inference_mode == 'forward':
                response = response_from_forward(model, messages)
            elif inference_mode == 'generate':
                response = response_from_generate(model, messages)
            else:
                assert False, 'unknown inference mode'

            responses.append(response)

        # First answer is the reference, the rest are the perturbed predictions.
        if responses:
            gold_labels.append(responses[0])
        pred_labels.append(responses[1:])

    return gold_labels, pred_labels


def inference_mft(model, data, inference_mode='forward'):
    """
    Run the model over a flat list of prompts for a Minimum Functionality Test.

    Unlike ``inference_inv`` there is no reference sentence: every prompt is
    classified on its own and the caller compares the predictions against the
    expected label(s).

    Args:
        model: Model handed to the response helper.
        data: Sequence of prompt strings.
        inference_mode (str): ``'forward'`` (default) picks the answer from the
            output logits; ``'generate'`` answers via ``.generate()``.

    Returns:
        list: One predicted label per prompt, in input order.

    Raises:
        ValueError: If ``inference_mode`` is not ``'forward'`` or ``'generate'``.
    """
    if inference_mode == 'forward':
        respond = response_from_forward
    elif inference_mode == 'generate':
        respond = response_from_generate
    else:
        raise ValueError(f"unknown inference mode: {inference_mode!r}")

    system_message = "Give the sentiment of the user's prompt. Please only respond with A (positive), B (negative) or C (neutral)."
    pred_labels = []

    # Show one example prompt for orientation; skip it for empty input
    # instead of failing on data[0].
    if len(data) > 0:
        print(data[0])

    for user_prompt in data:
        messages = [
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_prompt}
        ]

        pred_labels.append(respond(model, messages))

    return pred_labels


# gold_labels, pred_labels = inference_inv(model, perturb_location_data, inference_mode='forward')

### Evaluate

In [None]:

def evaluate_inv(gold_labels, pred_labels):
    """
    Score an invariance test: fraction of perturbed predictions equal to their group's gold label.

    Args:
        gold_labels: One reference label per group.
        pred_labels: One list of predicted labels per group, aligned with
            ``gold_labels`` by position.

    Returns:
        The value of ``accuracy_score`` over all (gold, prediction) pairs.
    """
    # Pair every prediction with the gold label of the group it belongs to.
    pairs = [
        (gold_labels[group_idx], prompt_label)
        for group_idx, sentence_labels in enumerate(pred_labels)
        for prompt_label in sentence_labels
    ]
    y_true = [gold for gold, _ in pairs]
    y_pred = [pred for _, pred in pairs]

    return accuracy_score(y_true, y_pred)


def evaluate_mft(pred_labels, expected_output=None):
    """
    Fraction of predictions that match the expected label(s).

    Labels follow the model's answer encoding: 0 = positive, 1 = negative,
    2 = neutral.

    Args:
        pred_labels: Sequence of predicted labels.
        expected_output: A single acceptable label, or a list / tuple / set of
            labels when several answers count as correct.

    Returns:
        float: Accuracy in [0, 1], or ``nan`` when ``pred_labels`` is empty
        (there is nothing to score).
    """
    # Accept any common collection of labels, not only `list`.
    if isinstance(expected_output, (list, tuple, set, frozenset)):
        accepted = list(expected_output)
    else:
        accepted = [expected_output]

    total = len(pred_labels)
    if total == 0:
        return float('nan')

    n_correct = sum(1 for label in pred_labels if label in accepted)

    return n_correct / total

# evaluate_inv(gold_labels, pred_labels)


## Testing area

In [None]:
# Sentiment INV tests: NER (location / name changes) followed by robustness
# perturbations. For every suite the model's answer on the original sentence
# is the reference and accuracy is the share of perturbed sentences that keep
# that answer. Each result line is labelled with the suite it belongs to.
inv_suites = [
    # Sentiment NER INV
    ('NER - change location', perturb_location_data),
    ('NER - change names', perturb_names_data),
    # Sentiment Robustness INV
    ('Robustness - irrelevant URLs/handles', perturb_irrelevant_data),
    ('Robustness - punctuation', perturb_punc_data),
    ('Robustness - 1 typo', perturb_typo1_data),
    ('Robustness - 2 typos', perturb_typo2_data),
    ('Robustness - 5 typos', perturb_typo5_data),
    ('Robustness - contractions', perturb_contract_data),
]

for suite_name, suite_data in inv_suites:
    gold_labels, pred_labels = inference_inv(model, suite_data, inference_mode='forward')
    print(f'{suite_name} - Accuracy: {evaluate_inv(gold_labels, pred_labels):.2f}')


In [None]:
# Sentiment Negation MFT
# Level 1
pred_labels = inference_mft(model, neg1_negative_data.data)
accuracy = evaluate_mft(pred_labels, expected_output=1)
print(f'Accuracy: {accuracy:.2f}\n')

pred_labels = inference_mft(model, neg1_positive_data.data)
accuracy = evaluate_mft(pred_labels, expected_output=[0, 2])
print(f'Accuracy: {accuracy:.2f}\n')

pred_labels = inference_mft(model, neg1_neutral_data.data)
accuracy = evaluate_mft(pred_labels, expected_output=2)
print(f'Accuracy: {accuracy:.2f}\n')

# Level 2
pred_labels = inference_mft(model, neg2_negative_data.data)
accuracy = evaluate_mft(pred_labels, expected_output=1)
print(f'Accuracy: {accuracy:.2f}\n')

pred_labels = inference_mft(model, neg2_positive_data.data)
accuracy = evaluate_mft(pred_labels, expected_output=[0, 2])
print(f'Accuracy: {accuracy:.2f}\n')

pred_labels = inference_mft(model, neg2_neutral_data.data)
accuracy = evaluate_mft(pred_labels, expected_output=2)
print(f'Accuracy: {accuracy:.2f}\n')

# Level 3
pred_labels = inference_mft(model, neg3_negative_data.data)
accuracy = evaluate_mft(pred_labels, expected_output=1)
print(f'Accuracy: {accuracy:.2f}\n')

pred_labels = inference_mft(model, neg3_positive_data.data)
accuracy = evaluate_mft(pred_labels, expected_output=[0, 2])
print(f'Accuracy: {accuracy:.2f}\n')

pred_labels = inference_mft(model, neg3_neutral_data.data)
accuracy = evaluate_mft(pred_labels, expected_output=2)
print(f'Accuracy: {accuracy:.2f}\n')


In [None]:
# Sentiment SRL MFT

# TODO
