# BabyBERTa, RoBERTa, and Adjectives


The following are some initial experiments with BabyBERTa ([paper](https://aclanthology.org/2021.conll-1.49/)).

# Setting up

1. Import libraries
2. Download data
3. Download code for models
4. Load BabyBERTa

In [None]:
!pip install openai
!pip install tiktoken
!pip install transformers
import pandas as pd

In [None]:
# Clone code
!git clone https://github.com/forrestdavis/PublicModelsAPI.git

In [None]:
# Load BabyBERTa
import os
import sys

modelType = 'roberta'
modelName = 'phueb/BabyBERTa-1'

# Make the cloned PublicModelsAPI checkout importable. The clone cell runs
# `git clone` without a target directory, so the checkout is expected in the
# current working directory; the original hard-coded location is kept as a
# fallback so existing setups keep working.
for repoPath in (os.path.abspath("PublicModelsAPI"), "/home/ubuntu/PublicModelsAPI/"):
    if os.path.isdir(repoPath) and repoPath not in sys.path:
        sys.path.append(repoPath)
from src.models import models

# load_models takes a {'models': {type: [names]}} config; only one model is
# requested here, and the first returned entry is used.
run_config = {'models': {modelType: [modelName]}}
BabyBERTa = models.load_models(run_config)[0]


In [None]:
# Retrieve data
!wget -O frames.xlsx https://www.dropbox.com/s/w06gbjq28ug7zg0/PythonAnnotatedAdjectivesWithFrames.xlsx?dl=0
!wget -O replacements.xlsx https://www.dropbox.com/s/23o4gqracovwcv1/replacements.xlsx?dl=0

In [None]:
# Read the annotated frames spreadsheet, turning missing cells into ''
import pandas as pd
frames = pd.read_excel("frames.xlsx").fillna('')

In [None]:
# Adjectives, class, etc.
adjectives = frames['adjective'].unique().tolist()

# Keep only the first row of every adjective so its class can be read off in
# a single pass, instead of re-filtering the whole table once per adjective.
# Row order is preserved, so adjectives are visited in first-appearance order.
firstRows = frames.drop_duplicates(subset='adjective', keep='first')

adj2class = {}  # adjective -> adjective_class taken from its first row
class2adj = {}  # adjective_class -> list of adjectives
for adj, c in zip(firstRows['adjective'].tolist(),
                  firstRows['adjective_class'].tolist()):
    if adj == '':
        # rows without an adjective carry the empty string; ignore them
        continue
    adj2class[adj] = c
    class2adj.setdefault(c, []).append(adj)

In [None]:
# Display the column labels of the frames table.
frames.columns

In [None]:
# Display gloss, stem and part_of_speech for the rows whose 'right' column is 'gerund'.
frames[frames['right'] == 'gerund'][['gloss', 'stem', 'part_of_speech']]

In [None]:
# Hand-made replacements: map each original line ('og' column) to its
# pre-masked version ('masked' column).

replace = pd.read_excel('replacements.xlsx')
og, masked = (replace[column].tolist() for column in ('og', 'masked'))
REPLACEMENTS = {original: replacement for original, replacement in zip(og, masked)}

In [None]:
# Get the relevant rows of data: one selection per value of the 'right'
# column (with an extra condition on 'self' or '+2' for the first two).
inf = frames[(frames['self'] == 'adj') & (frames['right'] == 'infin')]
NPinf = frames[(frames['right'] == 'NP') & (frames['+2'] == 'inf')]
gerund = frames[frames['right'] == 'gerund']
forXP = frames[frames['right'] == 'for XP']
toXP = frames[frames['right'] == 'to XP']
withXP = frames[frames['right'] == 'with XP']
ofXP = frames[frames['right'] == 'of XP']

subset = pd.concat([inf, NPinf, gerund, forXP, toXP, withXP, ofXP])

# Get sentences to check
gloss = subset['gloss'].tolist()
adjectives = subset['adjective'].tolist()
# Frame labels with 'NP' renamed to 'NPinf'.
# NOTE(review): this list is not written back into subset — confirm intent.
types = ['NPinf' if t == 'NP' else t for t in subset['right'].tolist()]

# Build the masked version of every gloss:
#   - adjective string absent from the gloss    -> 'NA' (row dropped below)
#   - adjective string occurs more than once    -> hand-made entry in REPLACEMENTS
#   - adjective string occurs exactly once      -> replaced by MASKTOKEN
# Matching is on substrings, exactly as str.count / str.replace do it.
masked = []
for g, a in zip(gloss, adjectives):
    if a not in g:
        masked.append('NA')
    elif g.count(a) != 1:
        masked.append(REPLACEMENTS[g])
    else:
        masked.append(g.replace(a, 'MASKTOKEN'))

subset['masked'] = masked
subset = subset[subset['masked'] != 'NA']
# Drop any column whose label contains 'Unnamed'
subset = subset[[col for col in subset.columns if 'Unnamed' not in col]]

In [None]:
# Write the subset as a tab-separated file, without the index column.
subset.to_csv('AnnotatedSubsetForModels.tsv', sep='\t', index=False)

In [None]:
# Get the average ppl of each adjective class in each frame (BabyBERTa)
# results maps adjective class -> one average per masked sentence, in the
# order of the rows of subset.
results = {}
for sent, og_adjective in zip(subset['masked'].tolist(), subset['adjective'].tolist()):
    assert 'MASKTOKEN' in sent, sent
    for adjClass, classAdjectives in class2adj.items():
        classScores = results.setdefault(adjClass, [])
        # Fill the mask with every adjective of the class except the one the
        # sentence originally contained.
        batch = [sent.replace('MASKTOKEN', target)
                 for target in classAdjectives
                 if target != og_adjective]
        if not batch:
            # The class holds nothing but the original adjective: record NaN
            # so the per-class lists stay aligned with the sentences instead
            # of dividing by zero.
            classScores.append(float('nan'))
            continue
        out = BabyBERTa.get_by_sentence_perplexity(batch)
        # o[1] is taken to be the sentence-level score — TODO confirm against
        # PublicModelsAPI. The accumulator is deliberately not called `sum`,
        # which would shadow the builtin for the rest of the session.
        total = 0
        for o in out:
            total += o[1]
        classScores.append(total / len(out))

# Get adjective probability

In [None]:
# Reload the saved subset and the class -> adjectives table.
# Class2Adj.tsv has one column per adjective class, listing its adjectives.
data = pd.read_csv('AnnotatedSubsetForModels.tsv', sep='\t')
classTable = pd.read_csv('Class2Adj.tsv', sep='\t')
adj2class = {}  # adjective -> adjective class
class2adj = {}  # adjective class -> list of adjectives
for adjClass in classTable.columns:
    # Columns shorter than the longest one are padded with NaN by read_csv;
    # drop the padding so it is never treated as an adjective.
    adjs = classTable[adjClass].dropna().tolist()
    class2adj[adjClass] = adjs
    for adj in adjs:
        adj2class[adj] = adjClass

In [None]:
# SentData maps adjective -> list of (mask word position, adjective, filled sentence),
# one entry per masked sentence.
SentData = {}
for sent in data['masked'].tolist():
    # Find MASKTOKEN word position
    words = sent.split(' ')
    if 'MASKTOKEN' not in words:
        # Without a standalone MASKTOKEN there is no word position to score;
        # fail loudly rather than silently falling back to position 0.
        raise ValueError(f"MASKTOKEN is not a standalone word in: {sent}")
    maskIDX = words.index('MASKTOKEN')
    for adj in adj2class:
        filled = sent.replace('MASKTOKEN', adj)
        assert adj == filled.split(' ')[maskIDX], f"{filled.split(' ')} {adj}"
        # Store the mask position itself, not a leftover loop variable.
        SentData.setdefault(adj, []).append((maskIDX, adj, filled))

In [None]:
# Get Results
# For every adjective, score its filled sentences in chunks of batchSize and
# keep the probability found at the stored word position.
ResultData = {}
batchSize = 400
for adj, pairs in SentData.items():
    adjProbs = []
    ResultData[adj] = adjProbs
    for start in range(0, len(pairs), batchSize):
        chunk = pairs[start:start + batchSize]
        positions = [entry[0] for entry in chunk]
        targets = [entry[1] for entry in chunk]
        sents = [entry[2] for entry in chunk]
        probabilities = BabyBERTa.get_aligned_words_probabilities(sents)
        # one aligned result per input sentence is required
        assert len(positions) == len(probabilities)
        for position, target, aligned in zip(positions, targets, probabilities):
            # the word aligned at the stored position must be the inserted adjective
            assert aligned[position].word == target, f"{aligned[position].word} - {target}"
            adjProbs.append(aligned[position].prob)

In [None]:
# Compile results
# Add one probability column per adjective to a copy of data, then reshape to
# long format: one row per (original row, target adjective) with its prob.
baseCols = list(data.columns)
targetCols = [*ResultData]
results = data.copy()
for adj, probs in ResultData.items():
    results[adj] = probs

results = pd.melt(
    results,
    id_vars=baseCols,
    value_vars=targetCols,
    var_name='target',
    value_name='prob',
)

In [None]:
# Write the long-format results as a tab-separated file, without the index column.
results.to_csv('BabyBERTaResults.tsv', sep='\t', index=False)

In [None]:
# Retrieve pre-compiled results
!wget -O BabyBERTa.tsv https://www.dropbox.com/s/315a2et75nimyo2/BabyBERTa.tsv?dl=0
!wget -O RoBERTa.tsv https://www.dropbox.com/s/zozf6mwv3a5jdyl/RoBERTa.tsv?dl=0

In [None]:
# Load pre-compiled results
def _attach_results(base, modelResults):
    """Return a copy of base carrying every column of modelResults.

    Columns are assigned positionally (as plain lists), so modelResults must
    have exactly as many rows as base; base keeps its own index. Working on
    an explicit copy avoids assigning into a slice of another frame.
    """
    merged = base.copy()
    for head in modelResults.columns.tolist():
        merged[head] = modelResults[head].tolist()
    return merged


BabyResults = pd.read_csv('BabyBERTa.tsv', sep='\t')
RoResults = pd.read_csv('RoBERTa.tsv', sep='\t')

# Tag every row with the model that produced it
RoResults.insert(0, 'model', 'roberta')
BabyResults.insert(0, 'model', 'babyberta')

# Annotation columns to carry over: everything except columns whose label
# contains 'Unnamed' or 'index'
header = [col for col in subset.columns.tolist()
          if 'Unnamed' not in col and 'index' not in col]

Baby = _attach_results(subset[header], BabyResults)
Ro = _attach_results(subset[header], RoResults)
BabyResults = Baby
RoResults = Ro

In [None]:
# Stack the two models' result tables; each row carries its 'model' label.
combined = pd.concat([BabyResults, RoResults])

In [None]:
# Establish by-item winner
# For every row, the winner is the adjective class whose column holds the
# smallest value; ties go to the class listed first in class2adj. A row with
# no comparable value (e.g. all NaN) gets ''.

winner = []
for _, row in combined.iterrows():
    bestValue = float('inf')  # real infinity rather than a large magic number
    bestClass = ''
    for adjClass in class2adj:
        if row[adjClass] < bestValue:
            bestValue = row[adjClass]
            bestClass = adjClass
    winner.append(bestClass)
# Column 39 is the historical position of 'prefer'; clamp it so the insert
# does not fail on a table with fewer columns.
combined.insert(min(39, len(combined.columns)), 'prefer', winner)


In [None]:
# Keep only the BabyBERTa rows (this rebinds `data`), then tabulate the
# model / right / prefer columns via groupby + value_counts.
data = combined[combined['model'] =='babyberta']
value_counts = data[['model', 'right', 'prefer']].groupby(['model', 'right', 'prefer']).value_counts()

In [None]:
# Display the tabulated counts.
value_counts

In [None]:
# Percentage of rows whose preferred class equals the annotated adjective class
sameClassCount = (data['adjective_class'] == data['prefer']).astype(int).sum()
rowCount = len(data['prefer'].tolist())
f"{100*sameClassCount/rowCount}% have same prediction as original adjective class"

In [None]:
# Display the first 10 rows of the BabyBERTa results.
data.head(10)