In [29]:
# import relevant packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import sys
sys.path.append("../src/")
sys.path.append("..")

import torch
from bert import *

In [30]:
# instantiate the project's BERTBase wrapper and restore the trained model from disk
bert = BERTBase()
model = torch.load(
    '../trained_models/model.ffnn.binder.5k.50epochs.0.5dropout.lr1e-4.hsize300'
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [31]:
# load in noisy dataset (good, bad, and nonsensical nice of you sentences)
noisy_df = pd.read_csv('../data/processed/nice_of_you_davinci-002_scores.csv')
noisy_df = noisy_df.drop(noisy_df.columns[0], axis = 1) # drop the first (index-like) column

# Remove sentences containing "being" (get rid of passive verbs) or one of the
# malformed "-ing" spellings below. All filters are combined into one mask.
# NOTE(review): these are plain substring matches, so 'bing' also removes
# sentences containing words such as "climbing" -- confirm that is intended.
noisy_unwanted = ['being', 'bing', 'stoping', 'naping', 'seing']
noisy_is_unwanted = noisy_df['sentence'].str.contains('|'.join(noisy_unwanted))
# .copy() makes the filtered frame independent, so the column assignments below
# do not raise SettingWithCopyWarning.
noisy_df = noisy_df[~noisy_is_unwanted].copy()

# Group key: the (noun, verb, adjective) triple shared by all constructions of a sentence.
noisy_df["words"] = noisy_df["noun"] + " " + noisy_df["verb"] + " " + noisy_df["adjective"] # for grouping
noisy_mean = noisy_df.groupby("words")["probs"].mean()

# prob_diff = a sentence's score minus the mean score of its word-triple group
# (vectorized lookup instead of a row-wise apply).
noisy_df["prob_diff"] = noisy_df["probs"] - noisy_df["words"].map(noisy_mean)
noisy_df = noisy_df.drop(columns = ["words"])


In [32]:
# preview the first rows of the noisy dataset, including the derived prob_diff column
noisy_df.head()

Unnamed: 0,sentence,noun,n_score,verb,v_score,adjective,a_score,total score,construction,rating,probs,prob_diff
0,it's sour of the queen to act,queen,2,act,2,sour,1,5,nice-of-you,okay,-48.43031,-2.58358
1,it's sour for the queen to act,queen,2,act,2,sour,1,5,hard-for,okay,-47.118937,-1.272207
2,it's sour that the queen is acting,queen,2,act,2,sour,1,5,important-that,okay,-46.076011,-0.229281
3,the queen is sour to act,queen,2,act,2,sour,1,5,GAI,okay,-41.761663,4.085067
4,it's outstanding of the horse to taste that,horse,1,taste,-10,outstanding,2,-7,nice-of-you,bad,-50.774481,-1.227295


In [33]:
# perform a grammaticality analysis on subjects
# split the sentences by the score assigned to their subject noun
noisy_good = noisy_df.loc[noisy_df["n_score"].eq(2)]
noisy_bad = noisy_df.loc[noisy_df["n_score"].eq(-10)]
noisy_okay = noisy_df.loc[noisy_df["n_score"].eq(1)]

fig = go.Figure()

# one box trace per subject class, added in the order good -> okay -> bad;
# every point is drawn next to its box and shows its sentence on hover
for trace_name, subset in (
    ("good subject", noisy_good),
    ("okay subject", noisy_okay),
    ("bad subject", noisy_bad),
):
    box = go.Box(
        x = subset["construction"],
        y = subset["prob_diff"],
        name = trace_name,
        hovertext = subset["sentence"],
        boxpoints = "all",
        jitter = 0.3,
    )
    fig.add_trace(box)

fig.update_layout(
    title_text = "Deviation From Sentence Mean (No VERB/ADJ control)",
    yaxis_title = 'scoring',
    boxmode = 'group', # group together boxes of the different traces for each value of x
)
fig.show()

In [34]:
# plot base grammaticality judgement vs. comparison with sentence mean
# Columns are referenced by name, and `labels` is keyed by those same column
# names, so the axis titles are renamed regardless of how plotly resolves
# Series arguments.
fig = px.scatter(
    noisy_df,
    x = "probs",
    y = "prob_diff",
    color = "construction",
    labels = {"probs": "GPT evaluation", "prob_diff": "construction deviation"},
    title = "Comparing Grammaticality and Construction Fit (No VERB/ADJ control)",
    hover_data = ["sentence"], # show the full sentence when hovering a point
)

fig.show()

In [35]:
# load in nice dataset (all constructions take the adjective "nice" and the verb "do")
nice_df = pd.read_csv('../data/processed/varying_subject_davinci-002_scores.csv')
# drop the first two columns by position, then the verb/adjective columns that
# are not used for this dataset
nice_df = (
    nice_df
    .drop(nice_df.columns[0:2], axis = 1)
    .drop(columns = ["verb", "v_score", "adjective", "a_score", "total score", "rating"])
)

# mean score per subject noun, taken over all of that noun's rows
nice_mean = nice_df.groupby("noun")["probs"].mean()

# prob_diff = a sentence's score minus the mean score for its noun
# (vectorized lookup instead of a row-wise apply)
nice_df["prob_diff"] = nice_df["probs"] - nice_df["noun"].map(nice_mean)

In [36]:
# preview the first rows of the nice dataset, including the derived prob_diff column
nice_df.head()

Unnamed: 0,sentence,noun,n_score,construction,probs,prob_diff
0,It's nice of the person to do that,person,2,nice-of-you,-27.225583,2.415579
1,It's nice for the person to do that,person,2,hard-for,-28.123272,1.517891
2,It's nice that the person is doing that,person,2,important-that,-28.429098,1.212064
3,The person is nice to do that,person,2,GAI,-34.781772,-5.14061
4,It's nice of the child to do that,child,2,nice-of-you,-29.714565,1.459728


In [37]:
# perform a grammaticality analysis on subjects
# split the sentences by the score assigned to their subject noun
nice_good = nice_df.loc[nice_df["n_score"].eq(2)]
nice_bad = nice_df.loc[nice_df["n_score"].eq(-10)]
nice_okay = nice_df.loc[nice_df["n_score"].eq(1)]

fig = go.Figure()

# one box trace per subject class, added in the order good -> okay -> bad;
# every point is drawn next to its box and shows its sentence on hover
for trace_name, subset in (
    ("good subject", nice_good),
    ("okay subject", nice_okay),
    ("bad subject", nice_bad),
):
    box = go.Box(
        x = subset["construction"],
        y = subset["prob_diff"],
        name = trace_name,
        hovertext = subset["sentence"],
        boxpoints = "all",
        jitter = 0.3,
    )
    fig.add_trace(box)

fig.update_layout(
    yaxis_title = 'scoring',
    title_text = "Deviation From Sentence Mean (Fixed VERB/ADJ)",
    boxmode = 'group', # group together boxes of the different traces for each value of x
)
fig.show()

In [38]:
# plot base grammaticality judgement vs. comparison with sentence mean
# Columns are referenced by name, and `labels` is keyed by those same column
# names, so the axis titles are renamed regardless of how plotly resolves
# Series arguments.
fig = px.scatter(
    nice_df,
    x = "probs",
    y = "prob_diff",
    color = "construction",
    labels = {"probs": "GPT evaluation", "prob_diff": "construction deviation"},
    title = "Comparing Grammaticality and Construction Fit (Fixed VERB/ADJ)",
    hover_data = ["sentence"], # show the full sentence when hovering a point
)
fig.show()

In [39]:
# load in okay dataset (all constructions take "good" adjectives and verbs)
okay_df = pd.read_csv('../data/processed/varying_subject_good_verb_adj_davinci-002_scores.csv')
okay_df = okay_df.drop(okay_df.columns[:2], axis = 1) # drop the first two columns by position

# Remove sentences containing "being" or one of the malformed "-ing" spellings
# below (plain substring matches), using a single combined mask.
okay_unwanted = ['being', 'seing', 'stoping', 'naping']
okay_is_unwanted = okay_df['sentence'].str.contains('|'.join(okay_unwanted))
# .copy() makes the filtered frame independent, so the column assignments below
# do not raise SettingWithCopyWarning.
okay_df = okay_df[~okay_is_unwanted].copy()

# Group key: the (noun, verb, adjective) triple shared by all constructions of a sentence.
okay_df["words"] = okay_df["noun"] + " " + okay_df["verb"] + " " + okay_df["adjective"] # for grouping
okay_mean = okay_df.groupby("words")["probs"].mean()

# prob_diff = a sentence's score minus the mean score of its word-triple group
# (vectorized lookup instead of a row-wise apply).
okay_df["prob_diff"] = okay_df["probs"] - okay_df["words"].map(okay_mean)
okay_df = okay_df.drop(columns = ["v_score", "a_score", "total score", "words"])

In [40]:
# preview the first rows of the okay dataset, including the derived prob_diff column
okay_df.head()

Unnamed: 0,sentence,noun,n_score,verb,adjective,construction,rating,probs,prob_diff
0,It's nutty of the singer to launch that,singer,2,launch,nutty,nice-of-you,good,-51.849281,-1.868152
1,It's nutty for the singer to launch that,singer,2,launch,nutty,hard-for,good,-50.389603,-0.408473
2,It's nutty that the singer is launching that,singer,2,launch,nutty,important-that,good,-49.914448,0.066682
3,The singer is nutty to launch that,singer,2,launch,nutty,GAI,good,-47.771187,2.209943
4,It's marvellous of the direction to guard that,direction,-10,guard,marvellous,nice-of-you,bad,-55.414163,-2.924979


In [41]:
# perform a grammaticality analysis on subjects
# split the sentences by the score assigned to their subject noun
okay_good = okay_df[okay_df["n_score"] == 2]
okay_bad = okay_df[okay_df["n_score"] == -10]
okay_okay = okay_df[okay_df["n_score"] == 1]

fig = go.Figure()

# one box trace per subject class, added in the order good -> okay -> bad;
# every point is drawn next to its box and shows its sentence on hover
for trace_name, subset in (
    ("good subject", okay_good),
    ("okay subject", okay_okay),
    ("bad subject", okay_bad),
):
    fig.add_trace(go.Box(
        x = subset["construction"], y = subset["prob_diff"],
        name = trace_name, hovertext = subset["sentence"],
        boxpoints = "all", jitter = 0.3,
    ))

fig.update_layout(
    yaxis_title='scoring',
    # prob_diff for this dataset is measured against the mean of each
    # noun + verb + adjective group, i.e. the sentence mean, matching the
    # titles of the other two box plots
    title_text = "Deviation From Sentence Mean (Reasonable VERB/ADJ)",
    boxmode='group' # group together boxes of the different traces for each value of x
)
fig.show()

In [42]:
# plot base grammaticality judgement vs. comparison with sentence mean
# Columns are referenced by name, and `labels` is keyed by those same column
# names, so the axis titles are renamed regardless of how plotly resolves
# Series arguments.
fig = px.scatter(
    okay_df,
    x = "probs",
    y = "prob_diff",
    color = "construction",
    labels = {"probs": "GPT evaluation", "prob_diff": "construction deviation"},
    title = "Comparing Grammaticality and Construction Fit (Reasonable VERB/ADJ)",
    hover_data = ["sentence"], # show the full sentence when hovering a point
)
fig.show()