# Load libraries and dataset


In [None]:
# Import relevant packages
import torch
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sn
import statsmodels.stats.multicomp as multi

!pip install transformers
from transformers import pipeline, set_seed
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

In [None]:
# Load data
# `data` holds the underspecified sentences and `control_data` their specified
# counterparts. Later cells pair the two frames row by row through their 'text'
# columns and group `data` by its 'class' and 'subclass' columns.
data = pd.read_csv('scriptie_data.csv')
control_data = pd.read_csv('scriptie_control_data.csv')

# Recognize underspecification

To evaluate a different model, change the "model" string in the first line of the cell below.

In [None]:
# Build the Hugging Face pipeline that answers the recognition prompt below.
# NOTE(review): 'google/flan-t5-xxl' is loaded elsewhere in this notebook family as a
# seq2seq model (AutoModelForSeq2SeqLM), while 'text-generation' is the causal-LM
# task. Confirm this task/model pairing produces the intended output; the
# 'text2text-generation' task may be needed for Flan-T5 — TODO verify.
generator = pipeline('text-generation', model='google/flan-t5-xxl')
# Seed the random number generators so repeated runs give the same results.
set_seed(42)

def find_prediction_of_underspecified(sentence, control_sentence):
  """Ask the model which of two sentences is more unclear and grade its answer.

  The underspecified sentence and its control are presented as options A and B
  in a randomly chosen order, so the correct letter varies between calls.

  Args:
    sentence: The underspecified sentence.
    control_sentence: The counterpart of ``sentence`` used as the control.

  Returns:
    "Correct-A" or "Correct-B" when the model picked the letter assigned to
    the underspecified sentence, "Incorrect-A" or "Incorrect-B" when it picked
    the control's letter, and "Incorrect-Neither" when the last character of
    the generated text is neither "A" nor "B".
  """
  # Imported locally: the notebook's import cell does not import `random`,
  # so the original call to random.randint raised NameError.
  import random

  # 1 -> the control is shown as option A, 0 -> the underspecified sentence is.
  control_first = random.randint(0, 1)
  if control_first:
    first_sentence, second_sentence = control_sentence, sentence
    underspecified_label = "B"
  else:
    first_sentence, second_sentence = sentence, control_sentence
    underspecified_label = "A"

  textje = (
    f'Here are two sentences. A: "{first_sentence}". B: "{second_sentence}". '
    'Which one of these is more unclear? '
    'Please respond by outputting only A or B. Answer:'
  )
  answer = generator(textje, max_new_tokens=1, num_return_sequences=1)[0]['generated_text']

  # Only the final character is graded. Slicing (instead of indexing) yields ""
  # for an empty generation rather than raising IndexError.
  chosen_label = answer[-1:]
  if chosen_label == underspecified_label:
    return "Correct-" + chosen_label
  if chosen_label in ("A", "B"):
    return "Incorrect-" + chosen_label
  return "Incorrect-Neither"

# Grade every (underspecified, control) sentence pair; rows of `data` and
# `control_data` are paired by position.
outputs = [
  find_prediction_of_underspecified(sentence, control_sentence)
  for sentence, control_sentence in zip(data['text'], control_data['text'])
]
# The column name must match the one read by the significance-test cell
# ("Flan-T5-xxl recognition"); the previous name "Flan-T5 recognition" made
# that cell fail with a KeyError.
data['Flan-T5-xxl recognition'] = outputs
control_data['Flan-T5-xxl recognition'] = outputs

Test whether the accuracy is significantly higher than chance (one-sided binomial test against p = 0.5)

In [None]:
import scipy.stats as stats

# For each model's recognition column, print the per-class accuracy and the
# p-value of a one-sided binomial test of that accuracy against 50% chance.
correct_labels = ["Correct-A", "Correct-B"]
# The loop variable is deliberately not called `model`: that global name is
# used for the loaded network in the self-correction cell.
for model_column in ["Flan-T5-xxl recognition", "GPT2-xl recognition", "XLNet-large-cased recognition", "OPT-13b recognition"]:
  print(model_column)
  subdivision = 'class'
  for klasje in data[subdivision].unique():
    newdata = data[data[subdivision] == klasje]
    # Number of rows in this class that the model graded as correct.
    n_correct = int(newdata[model_column].isin(correct_labels).sum())
    print(klasje, "accuracy:", n_correct/len(newdata))
    # stats.binom_test is deprecated and removed in recent SciPy releases;
    # binomtest returns a result object whose .pvalue is the same quantity.
    test_result = stats.binomtest(n_correct, n=len(newdata), p=0.5, alternative='greater')
    print(klasje, "significance:", test_result.pvalue)
  print("---------------")

# Self-correct underspecification

In [None]:
# Load the seq2seq model and its tokenizer from the same checkpoint id so the
# two cannot drift apart when the checkpoint is changed.
checkpoint_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_name)

def generate_reasoning(sentence):
  """Prompt the model to explain why ``sentence`` is underspecified.

  Args:
    sentence: The sentence to embed in the prompt.

  Returns:
    The decoded text of the first generated sequence, with special tokens
    skipped.
  """
  prompt_head = "Here is a sentence: \""
  prompt_tail = "\". This sentence is underspecified. Please explain why this sentence is underspecified. Answer:"
  encoded_prompt = tokenizer(prompt_head + sentence + prompt_tail, return_tensors="pt")
  # Generation is capped at 200 new tokens.
  generated_ids = model.generate(**encoded_prompt, max_new_tokens=200)
  decoded = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
  return decoded[0]

def generate_specified(sentence):
  """Prompt the model to rewrite ``sentence`` in a less underspecified form.

  Args:
    sentence: The sentence to embed in the prompt.

  Returns:
    The decoded text of the first generated sequence, with special tokens
    skipped.
  """
  prompt_head = "Here is a sentence: \""
  prompt_tail = "\". This sentence is underspecified. Please formulate a version of this sentence that is less underspecified. Answer:"
  encoded_prompt = tokenizer(prompt_head + sentence + prompt_tail, return_tensors="pt")
  # Generation is capped at 200 new tokens.
  generated_ids = model.generate(**encoded_prompt, max_new_tokens=200)
  decoded = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
  return decoded[0]

# Draw 10 random rows per subclass. DataFrameGroupBy.sample keeps every column
# (including 'subclass') and the original row labels, and avoids the deprecated
# groupby.apply behaviour of operating on the grouping column.
sample = data.groupby('subclass').sample(n=10)
# Fetch the matching control rows.
# NOTE(review): this uses the sampled index labels as row positions, which
# assumes `data` has the default 0..n-1 index and is row-aligned with
# `control_data` — confirm.
control_sample = control_data.iloc[sample.index.values]
# Materialise the triples once so both passes iterate over the same examples.
examples = list(zip(sample['text'], control_sample['text'], sample['subclass']))

# First pass: the model's explanation of why each sentence is underspecified.
for sentence, control_sentence, subclass in examples:
  output = generate_reasoning(sentence)
  print(f'When prompted with the sentence "{sentence}", whose specified counterpart is "{control_sentence}" and whose subclass is {subclass}, the model\'s reasoning for this being underspecified is "{output}"\n')
print("\n\n---------------\n\n")
# Second pass: the model's attempt at a less underspecified rewrite.
for sentence, control_sentence, subclass in examples:
  output = generate_specified(sentence)
  print(f'When prompted with the sentence "{sentence}", whose specified counterpart is "{control_sentence}" and whose subclass is {subclass}, the model\'s specified phrasing of this sentence is "{output}"\n')