# Install libraries and load data

In [None]:
# Import relevant packages
import torch
import seaborn
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import statsmodels.stats.multicomp as multi
import statsmodels.api as sm

!pip install lmppl
import lmppl

!pip install transformers
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from transformers import XLNetLMHeadModel, XLNetTokenizer

In [None]:
# Load data: the underspecified sentences and the control sentences, one CSV each
data, control_data = (
    pd.read_csv(csv_path)
    for csv_path in ('scriptie_data.csv', 'scriptie_control_data.csv')
)

# Calculate perplexities

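For each of the cells below, change the model-name string to score the sentences with the smaller version of that model, and change the name of the result column accordingly so the existing perplexity column is not overwritten.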

Calculate Flan-T5 perplexity

In [None]:
# Flan-T5 is scored through lmppl's encoder-decoder wrapper.
scorer = lmppl.EncoderDecoderLM('google/flan-t5-xxl')


def calculate_FlanT5_perplexity(sentence):
    """Return the perplexity of `sentence` scored as the output for an empty input text.

    The module-level `scorer` is looked up at call time, so this must be called
    while `scorer` is still bound to the Flan-T5 model.
    """
    perplexities = scorer.get_perplexity(input_texts=[""], output_texts=[sentence])
    return perplexities[0]


# Score every sentence in both data sets and store the result as a new column.
for frame in (data, control_data):
    frame['Flan-T5-xxl perplexity'] = frame['text'].apply(calculate_FlanT5_perplexity)

Calculate OPT perplexity

In [None]:
# OPT is scored through lmppl's `LM` wrapper; this rebinds the module-level `scorer`.
scorer = lmppl.LM('facebook/opt-13b')


def calculate_OPT_perplexity(sentence):
    """Return the perplexity that the current `scorer` assigns to `sentence`."""
    perplexities = scorer.get_perplexity(input_texts=[sentence])
    return perplexities[0]


# Score every sentence in both data sets and store the result as a new column.
for frame in (data, control_data):
    frame['OPT-13b perplexity'] = frame['text'].apply(calculate_OPT_perplexity)

Calculate XLNet perplexity

In [None]:
XLNet_tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
XLNet_model = XLNetLMHeadModel.from_pretrained('xlnet-large-cased')


def calculate_XLNet_perplexity(sentence):
    """Return exp(loss), where loss is what XLNet reports when the sentence's
    own token ids are passed as the labels."""
    token_ids = XLNet_tokenizer.encode(sentence, return_tensors='pt')
    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        model_output = XLNet_model(token_ids, labels=token_ids)
        # With labels supplied, the first element of the output is the loss.
        sentence_perplexity = torch.exp(model_output[0]).item()
    return sentence_perplexity


# Score every sentence in both data sets and store the result as a new column.
for frame in (data, control_data):
    frame['XLNet-large-cased perplexity'] = frame['text'].apply(calculate_XLNet_perplexity)

Calculate GPT2 perplexity

In [None]:
GPT2_model = GPT2LMHeadModel.from_pretrained('gpt2-xl', output_hidden_states=True)
GPT2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2-xl')
GPT2_tokenizer.padding_side = "left" # Very Important
GPT2_tokenizer.pad_token = GPT2_tokenizer.eos_token

def calculate_GPT2_perplexity(sentence):
    """Return the GPT-2 perplexity of `sentence`.

    The perplexity is exp of the mean cross-entropy between the logits at
    positions 0..n-2 and the token ids at positions 1..n-1, i.e. each token is
    predicted from the tokens before it.
    """
    inputs = GPT2_tokenizer.encode(sentence, return_tensors='pt')
    # Inference only: without no_grad() every call builds an autograd graph for
    # the whole model, which wastes memory and time. The value is unchanged.
    with torch.no_grad():
        outputs = GPT2_model(inputs)
        # Drop the last position: there is no following token to compare it with.
        logits = outputs.logits[:, :-1, :]
        loss = torch.nn.functional.cross_entropy(logits.reshape(-1, logits.shape[-1]), inputs[:, 1:].reshape(-1))
        perplexity = torch.exp(loss).item()
    return perplexity
data['GPT2-xl perplexity'] = data['text'].apply(calculate_GPT2_perplexity)
control_data['GPT2-xl perplexity'] = control_data['text'].apply(calculate_GPT2_perplexity)

In [None]:
# `pipeline` and `set_seed` are not imported by the import cell at the top of the
# notebook, so they are imported here; otherwise this cell raises NameError on a
# fresh kernel.
from transformers import pipeline, set_seed
import random

generator = pipeline('text-generation', model='facebook/opt-350m')
set_seed(42)
# Only draw positions that exist in both frames.
n_pairs = min(len(data), len(control_data))
for i in range(5):
  # randrange excludes the upper bound; randint(0, len(data)) could return
  # len(data) itself, which is one past the last row.
  j = random.randrange(n_pairs)
  # .iloc selects by position, so the lookup does not depend on the index labels.
  textje = "Here are two sentences. A: \"" + control_data['text'].iloc[j] + "\" and B: \"" + data['text'].iloc[j] + "\". Which one of these is more semantically underspecified? Please answer by saying only 'A' or 'B'. Answer:"
  print(generator(textje, max_new_tokens=5, num_return_sequences=1))

# Visualization and significance tests

Violin plots

In [None]:
def violinplotter(statistic, subdivision):
  """Draw and save split violin plots of `statistic` per `subdivision` value.

  Rows from the global `data` are flagged as underspecified and rows from the
  global `control_data` as not underspecified; the flag is used as the hue.
  The figure is written to '<statistic>_by_<subdivision>.png'.

  Parameters:
    statistic: name of the column plotted on the y-axis.
    subdivision: name of the column whose values form the x-axis groups.
  """
  # .assign returns new frames. The previous `temp_data = data` was only an
  # alias, so the helper column was silently added to the global frames (and
  # ended up in the CSV files when they were saved).
  temp_data = data.assign(underspecified=True)
  temp_control_data = control_data.assign(underspecified=False)
  # Row labels carry no meaning for the plot; renumbering avoids duplicate labels.
  temp = pd.concat([temp_data, temp_control_data], ignore_index=True)
  plt.figure(figsize=(10,6))
  # Pass the frame by keyword: `data` is keyword-only in some seaborn versions.
  ax = seaborn.violinplot(data=temp, x=subdivision, y=statistic, hue='underspecified', cut=0)
  #ax.set_ylim(0,1500)
  plt.xticks(rotation = 90)
  plt.savefig(statistic + "_by_" + subdivision + '.png', bbox_inches='tight')
violinplotter('Average word frequency', 'subclass')

Heatmap with significance tests

In [None]:
# Collect, per model and per class, the Mann-Whitney U p-value between the
# underspecified and control perplexities, plus the direction of the mean difference.
models = ['XLNet-base-cased perplexity', 'XLNet-large-cased perplexity', "OPT-125m perplexity", "OPT-13b perplexity", "GPT2-base perplexity", "GPT2-xl perplexity", "Flan-T5-base perplexity", "Flan-T5-xxl perplexity"]
subdivision = "class"
classes = data[subdivision].unique()
classes.sort()

matrix = []
labels = []
for model in models:
  p_values = []
  directions = []
  for klasje in classes:
    # Select each group once and reuse it for both the test and the means.
    target_scores = data.loc[data[subdivision] == klasje, model]
    control_scores = control_data.loc[control_data[subdivision] == klasje, model]
    _, p = stats.mannwhitneyu(target_scores, control_scores)
    p_values.append(p)
    # ">" means the underspecified sentences have the higher mean perplexity.
    directions.append(">" if target_scores.mean() > control_scores.mean() else "<")
  matrix.append(p_values)
  labels.append(directions)

# Plot heatmap: one row per model, one column per class, cell colour = p-value
row_names = ["XLNet base", "XLNet large", "OPT 125m", "OPT 13b", "GPT2 base", "GPT2-xl", "Flan-T5 base", "Flan-T5 xxl"]
df_cm = pd.DataFrame(matrix, index=row_names, columns=classes)
green = seaborn.light_palette("seagreen", reverse=True, as_cmap=True)
# Values above vmax (p > 0.05) are drawn in a separate colour.
green.set_over('tomato')
plt.figure(figsize=(7, 7))
plt.tight_layout()
heatmap = seaborn.heatmap(
    df_cm,
    annot=labels,
    fmt='',
    cmap=green,
    vmin=0,
    vmax=0.05,
    cbar_kws={'label': 'p'},
)
heatmap.set(xlabel=subdivision, ylabel='Model')
fig = heatmap.get_figure()
fig.tight_layout()
fig.savefig("Perplexity_signifiance" + subdivision + ".png")

In [None]:
# Recompute the p-value matrix and direction labels per model and per class.
models = ['XLNet-base-cased perplexity', 'XLNet-large-cased perplexity', "OPT-125m perplexity", "OPT-13b perplexity", "GPT2-base perplexity", "GPT2-xl perplexity", "Flan-T5-base perplexity", "Flan-T5-xxl perplexity"]
relevant = data
relevant_control = control_data
classes = sorted(relevant['class'].unique().tolist())

matrix = []
labels = []
for model in models:
  comparisons = []
  for klasje in classes:
    scores = relevant.loc[relevant['class'] == klasje, model]
    control_scores = relevant_control.loc[relevant_control['class'] == klasje, model]
    # mannwhitneyu returns (statistic, p-value); only the p-value is kept.
    p = stats.mannwhitneyu(scores, control_scores)[1]
    q = ">" if scores.mean() > control_scores.mean() else "<"
    comparisons.append((p, q))
  matrix.append([p_value for p_value, _ in comparisons])
  labels.append([direction for _, direction in comparisons])

# Logistic regression fits

Fit logistic regression model

In [None]:
models = ['XLNet-base-cased perplexity', 'XLNet-large-cased perplexity', "OPT-125m perplexity", "OPT-13b perplexity", "GPT2-base perplexity", "GPT2-xl perplexity", "Flan-T5-base perplexity", "Flan-T5-xxl perplexity"]
# "verschil" (difference): perplexity of a sentence minus the perplexity of the
# row with the same index label in the other frame.
for model in models:
  data[model + " verschil"] = data[model] - control_data[model]
  control_data[model + " verschil"] = control_data[model] - data[model]
data['underspecifiedness'] = [1] * len(data)
# Sized by control_data itself; the previous len(data) raises a length-mismatch
# error as soon as the two frames have different numbers of rows.
control_data['underspecifiedness'] = [0] * len(control_data)

# Fit multiple models and store the summaries
summaries = []
#for perpl in ['XLNet-base-cased perplexity', 'XLNet-large-cased perplexity', "OPT-125m perplexity", "OPT-13b perplexity", "GPT2-base perplexity", "GPT2-xl perplexity", "Flan-T5-base perplexity", "Flan-T5-xxl perplexity"]:
#for perpl in ['XLNet-large-cased perplexity',  "OPT-13b perplexity", "GPT2-xl perplexity", "Flan-T5-xxl perplexity"]:
for perpl in ["Flan-T5-xxl perplexity"]:
    # Dependent variable: what the model predicted. For underspecified rows a
    # "Correct-..." recognition means it predicted 1; for control rows a
    # "Correct-..." recognition means it predicted 0.
    data['prediction_underspecifiedness'] = [
        1 if dingetje.split('-')[0] == "Correct" else 0
        for dingetje in data['Flan-T5-xxl recognition']
    ]
    control_data['prediction_underspecifiedness'] = [
        0 if dingetje.split('-')[0] == "Correct" else 1
        for dingetje in control_data['Flan-T5-xxl recognition']
    ]

    all_data = pd.concat([data, control_data])
    #all_data = all_data[all_data['class'] == 1]

    # Predictors, in the order in which the coefficients are reported (x1..x5).
    predictor_columns = [
        'length in words',
        'average concreteness',
        'average AoA',
        'Average word frequency',
        perpl + " verschil",
    ]
    X = np.column_stack([all_data[column] for column in predictor_columns])
    y = all_data["prediction_underspecifiedness"]
    # The earlier `formula=` keyword was dropped: sm.Logit called with arrays
    # does not interpret a formula string, so it never influenced the fit.
    # NOTE(review): X contains no constant column, so no intercept is estimated.
    # If an intercept is intended, add one (e.g. sm.add_constant) and extend the
    # row labels of the LaTeX table accordingly.
    model = sm.Logit(y, X)
    result = model.fit()
    summaries.append(result.summary())

# Calculate average coefficients
average_coefs = []
for summary in summaries:
  # tables[1] is the coefficient table; skip its header row and the name column.
  coefs = summary.tables[1].data[1:]
  coefs = [[float(text) for text in row[1:]] for row in coefs]
  average_coefs.append(coefs)

# Element-wise mean over the fitted models.
average_coefs = np.average(average_coefs, 0).tolist()

Print data as LaTeX table

In [None]:
def generate_latex_table(text):
    """Wrap pre-formatted coefficient rows in a LaTeX table environment.

    Parameters:
        text: newline-separated rows, each already formatted as LaTeX cells
            ("a & b & ..."). Rows are labelled, in order, with the five fixed
            variable names below; rows beyond the fifth are dropped.

    Returns:
        The complete LaTeX source of the table as a string.
    """
    lines = text.split('\n')
    # The last two header cells are the bounds of the 95% confidence interval
    # (0.025 and 0.975); they previously read "0.0.25" and "0.095".
    table_header = r"""
\begin{table}[ht]
\centering
\caption{Average regression coefficients}
\begin{tabular}{lcccccc}
Variable & Coefficient & Standard Error & $z$ & $P>|z|$ & $[0.025$ & $0.975]$\\
\hline
"""
    table_footer = r"""
\end{tabular}
\end{table}
"""
    variable_names = ["Sentence length", "Avg. concreteness", "Avg. age of acq.", "Avg. word freq.", "Perplexity"]
    # zip stops at the shorter sequence, so at most five rows are emitted.
    table_rows = [variable + " & " + line for line, variable in zip(lines, variable_names)]

    latex_table = table_header + "\n".join(table_rows) + table_footer
    return latex_table

# Format each coefficient row as LaTeX cells rounded to 4 decimals; rows are
# separated by the LaTeX row terminator followed by a newline.
formatted_rows = []
for coefs_row in average_coefs:
    cells = [str(round(value, 4)) for value in coefs_row]
    formatted_rows.append(" & ".join(cells))
text = "\\\\\n".join(formatted_rows)
print(generate_latex_table(text))

# Miscellaneous

In [None]:
# Save data: write both frames, including every column added above, to CSV
# without the index column.
for frame, csv_path in (
    (data, 'scriptie_data.csv'),
    (control_data, 'scriptie_control_data.csv'),
):
    frame.to_csv(csv_path, index=False)