BERT tutorial
https://mccormickml.com/2019/07/22/BERT-fine-tuning/

GAP preprocessing example
https://www.kaggle.com/code/sunilcube/text-data-gendered-pronoun-resolution

Transformer Explanation: https://nlp.seas.harvard.edu/2018/04/03/attention.html http://nlp.seas.harvard.edu/annotated-transformer/

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import time
import string
import re
from collections import Counter
import random
from typing import *
import csv

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

SEED = 10

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.benchmark = True
torch.backends.cudnn.deterministic = True

# Display the entire text
pd.set_option("display.max_colwidth", None)

In [2]:
import transformers
from transformers import (
    Trainer,
    TrainingArguments,
    BertTokenizer,
    BertModel
)

In [3]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu")
device

device(type='cuda')

# 1.

In [5]:
train_path = "../../model/data/train.tsv"
valid_path = "../../model/data/dev.tsv"

In [6]:
def read_dataset(path: str) -> List[Dict]:
    samples: List[Dict] = []
    pron_counter = Counter()
    with open(path) as f:
        next(f)
        for line in f:
            (
                id,
                text,
                pron,
                p_offset,
                entity_A,
                offset_A,
                is_coref_A,
                entity_B,
                offset_B,
                is_coref_B,
                url,
            ) = line.strip().split("\t")
            pron_counter[pron.lower()] += 1
            samples.append(
                {
                    "id": id,
                    "text": text,
                    "pron": pron,
                    "p_offset": int(p_offset),
                    "entity_A": entity_A,
                    "offset_A": int(offset_A),
                    "is_coref_A": is_coref_A,
                    "entity_B": entity_B,
                    "offset_B": int(offset_B),
                    "is_coref_B": is_coref_B,
                    "url": url,
                }
            )
    print(pron_counter)
    return samples, pron_counter

In [7]:
train_dataset, train_pron_counter = read_dataset(train_path)
valid_dataset, valid_pron_counter = read_dataset(valid_path)

Counter({'his': 904, 'her': 773, 'he': 610, 'she': 555, 'him': 157})
Counter({'her': 140, 'his': 108, 'he': 93, 'she': 87, 'him': 26})


In [8]:
# train_dataset[0]

In the training dataset there is a slightly bias towards the male pronouns (1671 M vs 1328 F).

In [9]:
train_gender_pron_counter = {
    "F": train_pron_counter['her'] + train_pron_counter['she'],
    "M": train_pron_counter['his'] + train_pron_counter['him'] + train_pron_counter['he']
}
train_gender_pron_counter

{'F': 1328, 'M': 1671}

The validation dataset is perfectly balanced between gender pronouns (227 pronous for both Female and Male).

In [9]:
valid_gender_pron_counter = {
    "F": valid_pron_counter['her'] + valid_pron_counter['she'],
    "M": valid_pron_counter['his'] + valid_pron_counter['him'] + valid_pron_counter['he']
}
valid_gender_pron_counter

{'F': 227, 'M': 227}

In [10]:
from matplotlib.pyplot import figure
def plot_freq(frequencies: dict, title: str = "plot"):
    """
    A bar chart with frequency of tokens.
    """
    figure(figsize=(8, 4), dpi=80)

    words = list(frequencies.keys())
    freq = list(frequencies.values())

    plt.bar(range(len(frequencies)), freq, tick_label=words)
    plt.xticks(rotation=45)
    plt.title(title)
    plt.show()

In [11]:
# plot_freq(train_pron_counter)
# plot_freq(valid_pron_counter)

In [10]:
df_train = pd.DataFrame(train_dataset)
df_valid = pd.DataFrame(valid_dataset)

In [11]:
def clean_text(text):
    return text.translate(str.maketrans("`", "'"))

In [12]:
df_test = df_valid.copy()

In [13]:
df_train['text'] = df_train['text'].map(clean_text)
df_train['entity_A'] = df_train['entity_A'].map(clean_text) 
df_train['entity_B'] = df_train['entity_B'].map(clean_text) 

df_valid['text'] = df_valid['text'].map(clean_text)
df_valid['entity_A'] = df_valid['entity_A'].map(clean_text)
df_valid['entity_B'] = df_valid['entity_B'].map(clean_text)

In [43]:
# train_clean_path = "../../model/data/train_clean.tsv"
# valid_clean_path = "../../model/data/valid_clean.tsv"
# df_train.to_csv(path_or_buf=train_clean_path, sep="\t", index=False)
# df_valid.to_csv(path_or_buf=valid_clean_path, sep="\t", index=False)

In [18]:
prova = read_dataset(train_clean_path)

Counter({'his': 904, 'her': 773, 'he': 610, 'she': 555, 'him': 157})


In [44]:
# p = pd.read_csv(filepath_or_buffer=train_clean_path, sep="\t")
# v = pd.read_csv(filepath_or_buffer=valid_clean_path, sep="\t")

In [12]:
df_train_entities = df_train[['entity_A', 'is_coref_A', 'entity_B', 'is_coref_B']]
df_train_entities

Unnamed: 0,entity_A,is_coref_A,entity_B,is_coref_B
0,Cheryl Cassidy,TRUE,Pauline,FALSE
1,MacKenzie,TRUE,Bernard Leach,FALSE
2,Angeloz,FALSE,De la Sota,TRUE
3,Hell,FALSE,Henry Rosenthal,TRUE
4,Kitty Oppenheimer,FALSE,Rivera,TRUE
...,...,...,...,...
2994,Martin,TRUE,Robert Brandon,FALSE
2995,Arthur Davies,TRUE,John Frederick Mowbray-Clarke,FALSE
2996,Katharine Anthony,FALSE,Madge Jenison,FALSE
2997,Carole,TRUE,Lillian Grey,FALSE


In [17]:
# Dataframe contiaining sentences where neither A nor B entities are the right coreference entities to the pronoun
df_train_neither_ent = df_train.loc[(df_train['is_coref_A'] == "FALSE") & (df_train['is_coref_B'] == "FALSE")]

df_train_A_ent = df_train.loc[df_train['is_coref_A'] == "TRUE"]
df_train_B_ent = df_train.loc[df_train['is_coref_B'] == "TRUE"]

print("# sentences:", df_train.shape[0])
print(f"# sentences neither: {df_train_neither_ent.shape[0]} ({((df_train_neither_ent.shape[0] / df_train.shape[0]) * 100):.2f}%)")
print(f"# sentences A pronoun: {df_train_A_ent.shape[0]} ({((df_train_A_ent.shape[0] / df_train.shape[0]) * 100):.2f}%)")
print(f"# sentences B pronoun: {df_train_B_ent.shape[0]} ({((df_train_B_ent.shape[0] / df_train.shape[0]) * 100):.2f}%)")

# sentences: 2999
# sentences neither: 315 (10.50%)
# sentences A pronoun: 1331 (44.38%)
# sentences B pronoun: 1353 (45.12%)


In [18]:
# Dataframe contiaining sentences where neither A nor B entities are the right coreference entities to the pronoun
df_valid_neither_ent = df_valid.loc[(df_valid['is_coref_A'] == "FALSE") & (df_valid['is_coref_B'] == "FALSE")]

df_valid_A_ent = df_valid.loc[df_valid['is_coref_A'] == "TRUE"]
df_valid_B_ent = df_valid.loc[df_valid['is_coref_B'] == "TRUE"]

print("# sentences:", df_valid.shape[0])
print(f"# sentences neither: {df_valid_neither_ent.shape[0]} ({((df_valid_neither_ent.shape[0] / df_valid.shape[0]) * 100):.2f}%)")
print(f"# sentences A pronoun: {df_valid_A_ent.shape[0]} ({((df_valid_A_ent.shape[0] / df_valid.shape[0]) * 100):.2f}%)")
print(f"# sentences B pronoun: {df_valid_B_ent.shape[0]} ({((df_valid_B_ent.shape[0] / df_valid.shape[0]) * 100):.2f}%)")

# sentences: 454
# sentences neither: 62 (13.66%)
# sentences A pronoun: 187 (41.19%)
# sentences B pronoun: 205 (45.15%)


The datasets are pretty balanced with respect the A B pronouns; but we have fewer examples of "neither".

In [19]:
df_train_text_only = df_train['text']
df_train_text_only.sample()

1779    Jeanine Basinger (born 3 February 1936), a film historian, is Corwin-Fuller Professor of Film Studies and Founder and Curator of The Cinema Archives at Wesleyan University, Middletown, Connecticut. She is also a Trustee of the American Film Institute (which awarded her an honorary degree, a Doctorate of Humane Letters, on June 7, 2006 ), a member of the Steering Committee of the National Center for Film and Video Preservation, and one of the Board of Advisors for the Association of Independent Video and Filmmakers.
Name: text, dtype: object

In [20]:
# df_train_text_only.map(lambda sentence: len(tokenizer.tokenize(sentence))).max()

In [21]:
df_train_text_lens = df_train_text_only.map(lambda sentence: len(sentence))

print(f"Mean lenght: {df_train_text_lens.mean():.2f}")
print(f"Min lenght: {df_train_text_lens.min():.2f}")
print(f"Max lenght: {df_train_text_lens.max():.2f}")

Mean lenght: 430.92
Min lenght: 69.00
Max lenght: 1347.00


In [22]:
df_valid_text_only = df_valid['text']
df_valid_text_only.sample()

125    It soon moved to Castle Hill, home of Lord and Lady Fortescue at Filleigh in North Devon until the end of the war, when in 1945 St Peters moved back to its old home in Seaford, now vacated by the Army, and resumed normal service. In 1956 Pat and Marjorie retired and Basil Talbot, an assistant Headmaster, a member of the team from the 1930s briefly took over but he retired through ill health.
Name: text, dtype: object

In [23]:
df_valid_text_lens = df_valid_text_only.map(lambda sentence: len(sentence))

print(f"Mean lenght: {df_valid_text_lens.mean():.2f}")
print(f"Min lenght: {df_valid_text_lens.min():.2f}")
print(f"Max lenght: {df_valid_text_lens.max():.2f}")

Mean lenght: 426.39
Min lenght: 147.00
Max lenght: 1012.00


In [24]:
df_train_text_lens[df_train_text_lens.map(lambda lenght: lenght == 1347)]

2342    1347
Name: text, dtype: int64

In [25]:
df_train.iloc[2342]

id                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      

In [26]:
df_train.iloc[2342]['text'][624:626]

'he'

In [27]:
df_train.loc[(df_train['is_coref_A'] == "FALSE") & (df_train['offset_B'] > 512)].shape

(80, 11)

In [28]:
df_train.loc[df_train['p_offset'] > 512]

Unnamed: 0,id,text,pron,p_offset,entity_A,offset_A,is_coref_A,entity_B,offset_B,is_coref_B,url
91,train-92,"After a few years of almost no work -- although he was a guest star on Lou Grant and Charlie's Angels in the late 1970s, he once summed up the 1970s as ''I cried and did a lot of gardening'' -- he was hired in 1979 for his best-known role, self-made millionaire Palmer Cortlandt on ABC's long-running soap opera All My Children. Initially hired for only one year, he remained on contract through 2009. For much of his first decade on the show, Palmer was a ruthless villain, totally possessive of his daughter, Nina and violently threatening his ex-wife Daisy with being attacked by dobermans when she came back from the dead.",she,598,Nina,511,FALSE,Daisy,554,TRUE,http://en.wikipedia.org/wiki/James_Mitchell_(actor)
157,train-158,"On 17 June 2005, after 12 years at Birmingham, Bennett transferred to Leeds United who already had Scottish international goalkeeper Neil Sullivan as first-choice goalkeeper. Despite playing the pre-season friendlies, he was limited to four league appearances during the 2005-06 season, obtained deputising for the injured Sullivan. In July 2006, Bennett transferred for an undisclosed fee to newly promoted Premiership club Sheffield United, signing on a two-year deal to provide competition to the Blades first-choice goalkeeper, Paddy Kenny. He played the first game of his second spell at the club at Bramall Lane against Reading on 16 September 2006.",He,545,Bennett,347,TRUE,Paddy Kenny,532,FALSE,http://en.wikipedia.org/wiki/Ian_Bennett_(footballer)
209,train-210,"In 1851 Barlow was in England, where he published a short work ''Industry on Christian Principles, London, 1851. He published at London ''Letteratura Dantesca: Remarks on the Reading of the 114th Verse of the 7th Canto of the Paradise of the ''Divina Commedia'''' (1857), and two years afterwards ''Francesca da Rimini, her Lament and Vindication; with a brief Notice of the Malatesti'' (1859, 2nd edition, 1875). An Italian translation, ''Francesca da Rimini, suo Lamento e Difesa,'' &c., in Filippo Scolari's ''Esercitazioni Dantesche,'' appeared at Venice in 1865. Barlow published in 1862 ''Il Gran Rifiuto, what it was, who made it, and how fatal to Dante Allighieri,'' on verses 58 to 63 of the 3rd canto of the Inferno; an Italian translation by G. G(uiscardi) appeared at Naples in 1864. Barlow also issued in 1862 ''Il Conte Ugolino e l'Arcivescovo Ruggieri: a Sketch from the Pisan Chronicles,'' and a fragment of English history, entitled ''The Young King and Bertrand de Born,'' from which the author deduced an amended reading in line 135 of the 28th canto of the ''Inferno.'' In 1864 Barlow published the final result of his work on the ''Divina Commedia,'' ''Critical, Historical, and Philosophical Contributions to the Study of the ''Divina Commedia.''''",his,1135,Bertrand de Born,971,FALSE,Barlow,1098,TRUE,http://en.wikipedia.org/wiki/Henry_Clark_Barlow
274,train-275,"Homer follows one of the raccoons into the family's home under a tree stump and prepares to take them out, despite Bart warning him that he always loses fights with animals, as his battle with the earthworms proved--but Homer claims that that was phased withdrawal, but after seeing that their family is basically the raccoon version of his family, Homer cannot bring himself to do it. During a windy break, Lisa can't inhale any smoke. She realizes her only alternative is to actually smoke a cigarette, and picks one up. Right as she is about to smoke it, her father arrives and takes it away, by throwing it on the ground, squishing it with his foot, and then shooting it with a gun several times, and he is shown to be putting the gun away into his jacket. (which is also filled with other guns and weapons) Appalled, he goes to tell Marge that Lisa needs to be taken out of the ballet academy, but discovers how proud she is of Lisa; Homer is unable bear to destroy Marge's happiness.",her,558,Marge,838,FALSE,Lisa,849,TRUE,http://en.wikipedia.org/wiki/Smoke_on_the_Daughter
290,train-291,"Llewellyn failed to take a wicket in this first Test and was promptly omitted from the remainder of the series but responded by performing impressively in the 1897--98 and 1898--99 Currie Cups, which led to his recall to the national team for the first Test of the 1898--99 series against England. Llewellyn impressed by taking five wickets but was surprisingly left out of the second Test. At the end of the 1898--99 series Llewellyn, perturbed by the actions of the selectors and seeking financial security, left South Africa to play for English county side Hampshire County Cricket Club as a professional, on the recommendation of South African team-mate Major Robert Poore, an ex-Hampshire cricketer on military assignment. He would star for Hampshire for over a decade, scoring 8772 runs at 27.58 and snaring 711 wickets at 24.66.",He,728,Llewellyn,425,FALSE,Robert Poore,664,TRUE,http://en.wikipedia.org/wiki/Charlie_Llewellyn
...,...,...,...,...,...,...,...,...,...,...,...
2849,train-2850,"A different appraisal, two months later, noting the governor had recently survived an impeachment attempt, said that rather than improving social services and generating employment, after two years in power Akande had implemented massive staff lay offs in the public service, and had caused virtual collapse of public infrastructure. On December 24, 2001, Akande's supporter Bola Ige, the minister of justice, was murdered in his house in Ibadan. The newspaper This Day said that the murder could have been linked to the feud between Akande and the deputy governor Iyiola Omisore. The murder followed another murder the previous week of Osun State legislator Odunayo Olagbaju, who was bludgeoned to death outside his home.",his,713,Akande,534,FALSE,Iyiola Omisore,565,FALSE,http://en.wikipedia.org/wiki/Adebisi_Akande
2868,train-2869,"1998: The Pasukan Gerakan Khas and the Grup Gerak Khas were deployed to provide security and were on standby for hostage rescue, close protection and counter-terrorism duties during the 1998 Commonwealth Games held at National Stadium, Bukit Jalil, Kuala Lumpur on 11 to 21 September 1998. 20 September 1998: In the twilight hours, by orders from the then Prime Minister to the Inspector General of Police, Tan Sri Rahim Noor, 69th Commando PGK operatives led by Inspector Mazlan arrested the ex-Deputy Prime Minister Dato' Sri Anwar Ibrahim in his home 18 days after his ejection from the Cabinet, for inciting anti-Mahathir reforms in Kuala Lumpur.",his,545,Inspector Mazlan,463,FALSE,Dato' Sri Anwar Ibrahim,518,TRUE,http://en.wikipedia.org/wiki/Pasukan_Gerakan_Khas
2890,train-2891,"After several months of captivity Duane is released by Granny Ruth, who is preparing to take everyone on a road trip to the home of her ex-husband Doctor Hal Rockwell, who will help in giving birth to Belial's equally misshapen girlfriend Eve's babies; before leaving for the trip, Granny Ruth sternly tells Duane to stay away from Belial, who has stopped speaking to Duane telepathically after Duane's attempt to put them back together. While traveling via bus to Hal's house in Peachtree County the group stop at a drug store, where Granny Ruth meets local sheriff Andrew Griffin while Duane, attempting to wriggle out a bus window, meets the sheriff's daughter Opal, who he tries to convince help him and Belial escape.",he,674,Hal,465,FALSE,Andrew Griffin,567,FALSE,http://en.wikipedia.org/wiki/Basket_Case_3:_The_Progeny
2921,train-2922,"''In the early spring he stood above the heights of Miles Canyon ... the line 'I have gazed on naked grandeur where there's nothing else to gaze on' came into his mind and again he hammered out a complete poem, ''The Call of the Wild''. Conversations with locals led Service to write about things he had not seen (some of which had not actually happened) as well. He did not set foot in Dawson City until 1908, arriving in the Klondike ten years after the Gold Rush when his renown as a writer was already established. After having collected enough poems for a book, Service ''sent the poems to his father, who had emigrated to Toronto, and asked him to find a printing house so they could make it into a booklet. He enclosed a cheque to cover the costs and intended to give these booklets away to his friends in Whitehorse'' for Christmas. His father took the manuscript to William Briggs in Toronto, whose employees loved the book. ''The foreman and printers recited the ballads while they worked. A salesman read the proofs out loud as they came off the typesetting machines.'' An ''enterprising salesman sold 1700 copies in advance orders from galley proofs.'' The publisher ''sent Robert's cheque back to him and offered a ten percent royalty contract for the book.'",him,1210,William Briggs,875,FALSE,Robert,1186,TRUE,http://en.wikipedia.org/wiki/Songs_of_a_Sourdough


In [29]:
train_sentences = [sentence.split() for sentence in df_train_text_only]

In [30]:
# train_sentences = list(df_train_text_only)

In [17]:
# Utility function taken from the 'evaluate.py' script
def flat_list(l: List[List[Any]]) -> List[Any]:
    """
    Returns
    -------
        A single list containing all elements that
        were in the input list.
        
    Parameters
    ----------
    l: List[List[Any]]
        A list of lists of any type
    """
    return [_e for e in l for _e in e]

In [18]:
def freq_most_common_tokens(dataset_text: List[List[str]], n: int = 20) -> dict:
    """
    Returns
    -------
        The first n common tokens and their frequencies, where the tokens are
        retrieved from the list 'dataset_text'.
        
    Parameters
    ----------
    dataset_text: List[List[str]]
        A list of lists of strings. 
        In this case each nested list is a sentence.
    
    n: int
        Indicates how many tokens to consider.
        If it is a negative number, 
        the function returns the frequencies of all the tokens in the dataset.
    
    """
    # The input is flattened
    tokens = flat_list(dataset_text)  

    # If negative number, return the frequency of all the tokens
    if n <= -1:
        return dict(Counter(tokens).most_common(len(Counter(tokens))))
    else:
        return dict(Counter(tokens).most_common(n))

In [27]:
train_freq = freq_most_common_tokens(df_train_text_only, n=-1)

In [28]:
a = "Cai` com'"

In [29]:
df_train_text_only = df_train_text_only.map(clean_text)

In [30]:
df_train_text_only.sample(10)

1606                                                                                                                                                                                                                          The magazine also listed Pam Bouvier seventh on their list of worst Bond girls, saying Carey Lowell ''fumbled this attempt at giving 007 a modern, independent counterpart by turning her into a nagging pest.'' Norman Wilner of MSN considered Licence to Kill the second worst Bond film, above only A View to a Kill, but defended Dalton, saying he ''got a raw deal.
46                                                                                                                                                                                                                                                      The Movie was either opening or closing night at more than half of those festivals. Director Dyanna Taylor's film about her grandmother, the photographer Dorothea Lang

In [31]:
# list(train_freq.keys())[-500:]

In [32]:
df_train['text'].sample(5)

1372                                                                                                                                                                                                                                                                                            Felicia Rudolphina Scatcherd (1862 -- March 12, 1927) was a journalist and spiritualist. Felicia Scatcherd was born to Watson Scatcherd and Emily Frances Crofton. She lived with her parents in London until her mother's death in 1901.
329                                                                                                                                                 In 1806, still commanding his cavalry division (23rd, 29th and 30th Dragoon Regiments), he was present at the siege and capture of the fortress of Gaeta, on the west coast of Italy. During the second French invasion of Portugal in 1809, Mermet led a division under Marshal Nicolas Soult. He fought at the First Battle of Porto on 

# 2.

In [33]:
# ` -> '

Autocast https://wandb.ai/wandb_fc/tips/reports/How-to-use-Autocast-in-PyTorch--VmlldzoyMTk4NTky

Optimization https://towardsdatascience.com/optimize-pytorch-performance-for-speed-and-memory-efficiency-2022-84f453916ea6

In [14]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [15]:
df_train_text_lens = df_train_text_only.map(lambda sentence: len(tokenizer.tokenize(sentence)))

print(f"Mean lenght: {df_train_text_lens.mean():.2f}")
print(f"Min lenght: {df_train_text_lens.min():.2f}")
print(f"Max lenght: {df_train_text_lens.max():.2f}")

NameError: name 'df_train_text_only' is not defined

In [None]:
sent = df_train['text'][792]
sent

In [None]:
row = df_train.iloc[444]
print(row['offset_A'], row['entity_A'], row['is_coref_A'])
print(row['offset_B'], row['entity_B'], row['is_coref_B'])
print(row['p_offset'], row['pron'])
row['text']

In [None]:
# The order is important because we want that the pronoun comes after all the 
# coreferenced entities in the output, even if B could come after the pronoun. 
break_points = sorted([
    ("A", row['offset_A'], row['entity_A']),
    ("B", row['offset_B'], row['entity_B']),
    ("P", row['p_offset'], row['pron'])
], key=lambda x: x[0])

tokens, spans, current_pos = [], {}, 0
for name, offset, text in break_points:
    tokens.extend(tokenizer.tokenize(row["text"][current_pos:offset]))
    # Make sure we do not get it wrong
    assert row["text"][offset:offset+len(text)] == text
    # Tokenize the target
    tmp_tokens = tokenizer.tokenize(row["text"][offset:offset+len(text)])
    
    # [num_tokens until entity, num_tokens including the entity]
    spans[name] = [len(tokens), len(tokens) + len(tmp_tokens) - 1] # inclusive
    print("BEFORE", tokens)
    
    # In the last iteration, the pronoun is appended to the end
    tokens.extend(tmp_tokens)
    # print()
    print("AFTER", tokens)
    current_pos = offset + len(text)
# print(tokens)
tokens.extend(tokenizer.tokenize(row["text"][current_pos:offset]))
# print("\n",tokens)
# The pronoun is a single token, so the span is the same
assert spans["P"][0] == spans["P"][1]
print("\n", tokens)
off = spans["A"] + spans["B"] + [spans["P"][0]]
print(off)

In [39]:
len(tokenizer.tokenize(row['text'][316:316+len(row['entity_A'])]))

4

In [40]:
a = sorted([
    (144, 500, 3),
    (10, 20, 30),
    (100, 200, 300)
], key=lambda x: x[1])
a

[(10, 20, 30), (100, 200, 300), (144, 500, 3)]

1. Create dataset with text and offsets of entities and pronoun
2. Get contextualized embeddings from Bert
3. Select through the offsets the embeddings of enities and pronoun
4. Concat somehow the embeddings and pass them to a MLP + softmax to retrieve the probabilities
5. The probabilities are about 3 classes: M F N

In [16]:
def get_class_label(is_coref_A: str, is_coref_B: str):
    if is_coref_A == "TRUE" or is_coref_A is True:
        return 0
    elif is_coref_B == "TRUE" or is_coref_B is True:
        return 1
    else:
        return 2

In [17]:
FEMININE = 0
MASCULINE = 1
UNKNOWN = 2

def get_gender(pronoun: str):
    gender_mapping = {
        'she': FEMININE,
        'her': FEMININE,
        'he': MASCULINE,
        'his': MASCULINE,
        'him': MASCULINE,
    }
    
    return gender_mapping.get(pronoun.lower(), UNKNOWN)

In [19]:
# class PrepareDataFrame:
    
#     def __init__(self, dataset: List[Dict]):
#         self.df = pd.DataFrame(dataset)
        
#         self.df['text'] = self.df['text'].map(clean_text)
# #         self._extract_target(self.df)
    
#     @staticmethod
#     def clean_text(text):
#         return text.translate(str.maketrans("`", "'"))
    
#     def __str__(self):
#         return self.df
  

In [20]:
df_train['text'][2]

"He had been reelected to Congress, but resigned in 1990 to accept a post as Ambassador to Brazil. De la Sota again ran for governor of C*rdoba in 1991. Defeated by Governor Angeloz by over 15%, this latter setback was significant because it cost De la Sota much of his support within the Justicialist Party (which was flush with victory in the 1991 mid-terms), leading to President Carlos Menem 's endorsement of a separate party list in C*rdoba for the 1993 mid-term elections, and to De la Sota's failure to regain a seat in Congress."

In [21]:
# tokenizer.tokenize(df_train['text'][2])

In [18]:
df_train['target'] = [get_class_label(is_coref_A, is_coref_B) for is_coref_A, is_coref_B in zip(df_train['is_coref_A'],  df_train['is_coref_B'])]
df_valid['target'] = [get_class_label(is_coref_A, is_coref_B) for is_coref_A, is_coref_B in zip(df_valid['is_coref_A'],  df_valid['is_coref_B'])]

In [19]:
class GAPDataset(Dataset):
    """Custom GAP Dataset class"""
    def __init__(self, df, tokenizer, labeled=True):
        self.df = df
        
        self.labeled = labeled
        self.tokenizer = tokenizer
        self.offsets, self.tokens = [], []
        
        if labeled:
            self.labels = df.target.values.astype("uint8")
        
        
        self._convert_tokens_to_ids()
        
#     @staticmethod
#     def get_class_label(is_coref_A: str, is_coref_B: str):
#         if is_coref_A == "TRUE":
#                 return 0
#         elif is_coref_B == "TRUE":
#             return 1
#         else:
#             return 2
    
    def _convert_tokens_to_ids(self):
        CLS = [self.tokenizer.cls_token]
        SEP = [self.tokenizer.sep_token]
        
        for _, row in self.df.iterrows():
            tokens, offsets = self._tokenize(row)
            self.offsets.append(offsets)
            self.tokens.append(self.tokenizer.convert_tokens_to_ids(
                CLS + tokens + SEP))
    
    def _tokenize(self, row):
        # The order is important because we want that the pronoun comes after all the 
        # coreferenced entities in the output, even if B could come after the pronoun. 
        break_points = sorted([
            ("A", row['offset_A'], row['entity_A']),
            ("B", row['offset_B'], row['entity_B']),
            ("P", row['p_offset'], row['pron'])
        ], key=lambda x: x[0])

        tokens, spans, current_pos = [], {}, 0
        for name, offset, text in break_points:
            tokens.extend(self.tokenizer.tokenize(row["text"][current_pos:offset]))
            # Make sure we do not get it wrong
            assert row["text"][offset:offset+len(text)] == text
            # Tokenize the target
            tmp_tokens = self.tokenizer.tokenize(row["text"][offset:offset+len(text)])

            # [num_tokens until entity, num_tokens including the entity]
            spans[name] = [len(tokens), len(tokens) + len(tmp_tokens) - 1] # inclusive 
            # In the last iteration, the pronoun is appended to the end
            tokens.extend(tmp_tokens)
            current_pos = offset + len(text)
    
        tokens.extend(self.tokenizer.tokenize(row["text"][current_pos:offset]))

        # The pronoun is a single token, so the span is the same
        assert spans["P"][0] == spans["P"][1]
        return tokens, (spans["A"] + spans["B"] + [spans["P"][0]])
    
    
    def __len__(self):
        return len(self.tokens)

    def __getitem__(self, idx):
        if self.labeled:
            return self.tokens[idx], self.offsets[idx], self.labels[idx]
        return self.tokens[idx], self.offsets[idx], None

In [81]:
train_ds = GAPDataset(df_train[:100], tokenizer)
valid_ds = GAPDataset(df_valid[:50], tokenizer)

In [80]:
df_train[:10]

Unnamed: 0,id,text,pron,p_offset,entity_A,offset_A,is_coref_A,entity_B,offset_B,is_coref_B,url,target
0,train-1,"Zoe Telford -- played the police officer girlfriend of Simon, Maggie. Dumped by Simon in the final episode of series 1, after he slept with Jenny, and is not seen again. Phoebe Thomas played Cheryl Cassidy, Pauline's friend and also a year 11 pupil in Simon's class. Dumped her boyfriend following Simon's advice after he wouldn't have sex with her but later realised this was due to him catching crabs off her friend Pauline.",her,274,Cheryl Cassidy,191,True,Pauline,207,False,http://en.wikipedia.org/wiki/List_of_Teachers_(UK_TV_series)_characters,0
1,train-2,"He grew up in Evanston, Illinois the second oldest of five children including his brothers, Fred and Gordon and sisters, Marge (Peppy) and Marilyn. His high school days were spent at New Trier High School in Winnetka, Illinois. MacKenzie studied with Bernard Leach from 1949 to 1952. His simple, wheel-thrown functional pottery is heavily influenced by the oriental aesthetic of Shoji Hamada and Kanjiro Kawai.",His,284,MacKenzie,228,True,Bernard Leach,251,False,http://en.wikipedia.org/wiki/Warren_MacKenzie,0
2,train-3,"He had been reelected to Congress, but resigned in 1990 to accept a post as Ambassador to Brazil. De la Sota again ran for governor of C*rdoba in 1991. Defeated by Governor Angeloz by over 15%, this latter setback was significant because it cost De la Sota much of his support within the Justicialist Party (which was flush with victory in the 1991 mid-terms), leading to President Carlos Menem 's endorsement of a separate party list in C*rdoba for the 1993 mid-term elections, and to De la Sota's failure to regain a seat in Congress.",his,265,Angeloz,173,False,De la Sota,246,True,http://en.wikipedia.org/wiki/Jos%C3%A9_Manuel_de_la_Sota,1
3,train-4,"The current members of Crime have also performed in San Francisco under the band name ''Remote Viewers''. Strike has published two works of fiction in recent years: Ports of Hell, which is listed in the Rock and Roll Hall of Fame Library, and A Loud Humming Sound Came from Above. Rank has produced numerous films (under his real name, Henry Rosenthal) including the hit The Devil and Daniel Johnston.",his,321,Hell,174,False,Henry Rosenthal,336,True,http://en.wikipedia.org/wiki/Crime_(band),1
4,train-5,"Her Santa Fe Opera debut in 2005 was as Nuria in the revised edition of Golijov's Ainadamar. She sang on the subsequent Deutsche Grammophon recording of the opera. For his opera Doctor Atomic, Adams rewrote the role of Kitty Oppenheimer, originally a mezzo-soprano role, for soprano voice, and Rivera sang the rewritten part of Kitty Oppenheimer at Lyric Opera of Chicago, De Nederlandse Opera, and the Metropolitan Opera., all in 2007. She has since sung several parts and roles in John Adams' works, including the soprano part in El Ni*o, and the role of Kumudha in A Flowering Tree in the Peter Sellars production at the New Crowned Hope Festival in Vienna.",She,437,Kitty Oppenheimer,219,False,Rivera,294,True,http://en.wikipedia.org/wiki/Jessica_Rivera,1
5,train-6,"Sandra Collins is an American DJ. She got her start on the West Coast of the U.S. in Phoenix, Arizona and into residencies in Los Angeles, and eventually moved towards trance. She used American producers to give herself a unique sound. Collins performed for an estimated 80,000 people on the first night of Woodstock '99, and was the first female DJ featured in the Tranceport series of influential recordings. She recently has released two CD mixes under Paul Oakenfold's Perfecto label.",She,411,Collins,236,True,DJ,347,False,http://en.wikipedia.org/wiki/Sandra_Collins,0
6,train-7,"Reb Chaim Yaakov's wife is the sister of Rabbi Moishe Sternbuch, as is the wife of Rabbi Meshulam Dovid Soloveitchik, making the two Rabbis his uncles. Reb Asher's brother Rabbi Shlomo Arieli is the author of a critical edition of the novallae of Rabbi Akiva Eiger. Before his marriage, Rabbi Arieli studied in the Ponevezh Yeshiva headed by Rabbi Shmuel Rozovsky, and he later studied under his father-in-law in the Mirrer Yeshiva.",his,273,Reb Asher,152,False,Akiva Eiger,253,False,http://en.wikipedia.org/wiki/Asher_Arieli,2
7,train-8,"Slant Magazine's Sal Cinquemani viewed the album as formulaic and ''competently, often frustratingly more of the same from an artist who still seems capable of much more.'' Greg Kot of the Chicago Tribune perceived ''formula production and hack songwriting'', but complimented Pink's personality and its ''handful'' of worthy tracks. In his list for The Barnes & Noble Review, Robert Christgau named The Truth About Love the fourth best album of 2012.",his,337,Greg Kot,173,False,Robert Christgau,377,True,http://en.wikipedia.org/wiki/The_Truth_About_Love_(Pink_album),1
8,train-9,"Her father was an Englishman ''of rank and culture'' and her mother was a free woman of color, described as light-skinned. When Mary was six, her mother sent her to Alexandria (then part of the District of Columbia) to attend school. Living with her aunt Mary Paine, Kelsey studied for about ten years.",her,246,Mary Paine,255,False,Kelsey,267,True,http://en.wikipedia.org/wiki/Mary_S._Peake,1
9,train-10,"Shaftesbury's UK partners in the production of the series, British broadcaster UKTV and the international distributor ITV Studios Global Entertainment, were both interested in additional seasons. Christina Jennings approached Kirstine Stewart, executive vice-president of CBC's English services, about continuing the series, and she felt that ''a home at CBC made absolute sense''.",she,329,Christina Jennings,196,True,Kirstine Stewart,226,False,http://en.wikipedia.org/wiki/Murdoch_Mysteries,0


In [21]:
test_ds = GAPDataset(df_test, tokenizer, False)

In [27]:
features = torch.zeros((5, 10), dtype=torch.int64, device=device)
features

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], device='cuda:0')

In [21]:
def collate_batch(batch, truncate_len=400):
    """Batch preparation.

    1. Pad the sequences
    2. Transform the target.
    """    
    batch_features, batch_offsets, batch_labels  = zip(*batch)

    max_len = min(
        max((len(x) for x in batch_features)),
        truncate_len
    )
    
    # Features
    features = np.zeros((len(batch), max_len), dtype=np.int64)
    
    # Padding
    for i, row in enumerate(batch_features):
        features[i, :len(row)] = row
   
    features_tensor = torch.tensor(features, device=device)

    # Offsets
    offsets_tensor = torch.stack([
        torch.tensor(x, dtype=torch.int64, device=device) for x in batch_offsets
    ], dim=0) + 1 # Account for the [CLS] token
    
    # Labels
    if batch_labels[0] is None:
        return features_tensor, offsets_tensor, None
    
    labels_tensor = torch.tensor(batch_labels, dtype=torch.uint8, device=device)
    return features_tensor, offsets_tensor, labels_tensor

In [20]:
# def collate_batch(batch, truncate_len=400):
#     """Batch preparation.

#     1. Pad the sequences
#     2. Transform the target.
#     """    
#     batch_features, batch_offsets, batch_labels  = zip(*batch)

#     max_len = min(
#         max((len(x) for x in batch_features)),
#         truncate_len
#     )
    
#     # Features
#     features = np.zeros((len(batch), max_len), dtype=np.int64)
    
#     # Padding
#     for i, row in enumerate(batch_features):
#         features[i, :len(row)] = row
   
#     features_tensor = torch.tensor(features)

#     # Offsets
#     offsets_tensor = torch.stack([
#         torch.tensor(x, dtype=torch.int64) for x in batch_offsets
#     ], dim=0) + 1 # Account for the [CLS] token
    
#     # Labels
#     if batch_labels[0] is None:
#         return features_tensor, offsets_tensor, None
    
#     labels_tensor = torch.tensor(batch_labels, dtype=torch.uint8)
#     return features_tensor, offsets_tensor, labels_tensor

In [30]:
collate_batch(valid_ds)

(tensor([[  101,  2002,  4914,  ...,     0,     0,     0],
         [  101, 14559,  2025,  ...,     0,     0,     0],
         [  101,  2043,  2016,  ...,     0,     0,     0],
         ...,
         [  101,  2016, 17708,  ...,     0,     0,     0],
         [  101,  2006,  2258,  ...,     0,     0,     0],
         [  101,  8242,  7607,  ...,     0,     0,     0]], device='cuda:0'),
 tensor([[47, 52, 56, 57, 59],
         [29, 29, 40, 40, 46],
         [84, 86, 90, 90, 95],
         ...,
         [53, 53, 60, 60, 63],
         [64, 65, 77, 79, 88],
         [57, 59, 63, 63, 70]], device='cuda:0'),
 tensor([2, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0,
         0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0,
         1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 2, 0, 0, 1, 0, 0, 0, 1, 1,
         1, 2, 1, 0, 1, 1, 1, 0, 2, 1, 2, 0, 1, 0, 1, 1, 1, 0, 1, 0, 2, 0, 0, 1,
         2, 2, 0, 1, 1, 2, 1, 1, 0, 0, 0, 1, 2, 0, 2, 1, 0, 0, 0, 0, 0

In [22]:
class GAPModel(nn.Module):
    """The main model."""
    def __init__(self, bert_model: str):
        super().__init__()
      
        if bert_model in ("bert-base-uncased", "bert-base-cased"):
            self.bert_hidden_size = 768
        elif bert_model in ("bert-large-uncased", "bert-large-cased"):
            self.bert_hidden_size = 1024
        else:
            raise ValueError("Unsupported BERT model.")
        
        self.bert = BertModel.from_pretrained(bert_model).to(device, non_blocking=True)
        self.head = CorefHead(self.bert_hidden_size).to(device, non_blocking=True)
    
    def forward(self, x, offsets):
        bert_outputs =  self.bert(
            x, attention_mask=(x > 0).long(), 
            token_type_ids=None, output_hidden_states=True)
#         concat_bert = torch.cat((bert_outputs[-1],bert_outputs[-2],bert_outputs[-3]),dim=-1)
        
        last_layer = bert_outputs.last_hidden_state
        head_outputs = self.head(last_layer, offsets)
#         return concat_bert  
        return head_outputs

In [23]:
def retrieve_entities_and_pron_embeddings(bert_embeddings, entities_and_pron_offsets):
    embeddings_A = []
    embeddings_B = []
    embeddings_pron = []

    # Consider embeddings and offsets in each batch separately
    for embeddings, off in zip(bert_embeddings, entities_and_pron_offsets):
        # The offsets of mention A are the first and the second
        # in the 'off' tensor
        offsets_ent_A = range(off[0], off[1]+1) 
        # The offsets of mention B are the third and the fourth
        # in the 'off' tensor
        offsets_ent_B = range(off[2], off[3]+1)
        # The offset of the pronoun is the last in the 'off' tensor
        offset_pron = off[-1]

        # The embedding of a mention is the mean of
        # all the subtokens embeddings that represent it
        embeddings_A.append(embeddings[offsets_ent_A].mean(dim=0))
        embeddings_B.append(embeddings[offsets_ent_B].mean(dim=0))
        embeddings_pron.append(embeddings[offset_pron])

    # Merge outputs
    merged_entities_and_pron_embeddings = torch.cat([
        torch.stack(embeddings_A, dim=0),
        torch.stack(embeddings_B, dim=0),
        torch.stack(embeddings_pron, dim=0)
    ], dim=1)
    # print(torch.stack(outputs_A, dim=0))
    # torch.stack(outputs_B, dim=0)
    # print(torch.stack(outputs_pron, dim=0))
    
    # shape: batch_size x (embedding_dim * 3)
    return merged_entities_and_pron_embeddings

In [24]:
class CorefHead(nn.Module):
    def __init__(self, bert_hidden_size: int):
        super().__init__()
        self.bert_hidden_size = bert_hidden_size 
        self.head_hidden_size = 512

#         self.fc = nn.Sequential(
#             nn.Dropout(0.1),
#             nn.Linear(bert_hidden_size * 3, 512),           
#             nn.ReLU(),
#             nn.Linear(512, 3)
#         )
        self.fc = nn.Sequential(
#             nn.BatchNorm1d(bert_hidden_size * 3),  
#             nn.Dropout(0.5),      
#             nn.LeakyReLU(),
#             nn.Linear(bert_hidden_size * 3, self.head_hidden_size), 
#             nn.BatchNorm1d(self.head_hidden_size),
#             nn.Dropout(0.5),
#             nn.Linear(self.head_hidden_size, self.head_hidden_size),
#             nn.ReLU(),
#             nn.BatchNorm1d(self.head_hidden_size),
#             nn.Dropout(0.5),
            nn.Dropout(0.1),
            nn.Linear(bert_hidden_size * 3, self.head_hidden_size),
            nn.LeakyReLU(),
            nn.BatchNorm1d(self.head_hidden_size),
            nn.Linear(self.head_hidden_size, 3)
        )
                
    def forward(self, bert_outputs, offsets):
        assert bert_outputs.shape[2] == self.bert_hidden_size
        embeddings = retrieve_entities_and_pron_embeddings(bert_outputs,
                                                          offsets)
        
        return self.fc(embeddings)

In [25]:
def compute_metrics(predictions_s, samples):
    total = 0
    correct = 0
    for pred, label in zip(predictions_s, samples):
        gold_pron_offset = label["p_offset"]
        pred_pron_offset = pred[0][1] if len(pred[0]) > 0 else None
        gold_pron = label["pron"]
        pred_pron = pred[0][0] if len(pred[0]) > 0 else None
        gold_both_wrong = label["is_coref_A"] == "FALSE" and label["is_coref_B"] == "FALSE"
        pred_entity_offset = pred[1][1] if len(pred[1]) > 0 else None
        pred_entity = pred[1][0] if len(pred[1]) > 0 else None
        if gold_both_wrong:
            if pred_entity is None and gold_pron_offset == pred_pron_offset and gold_pron == pred_pron:
                correct += 1
            total += 1
        else:
            gold_entity_offset = (
                label["offset_A"] if label["is_coref_A"] == "TRUE" else label["offset_B"]
            )
            gold_entity = (
                label["entity_A"] if label["is_coref_A"] == "TRUE" else label["entity_B"]
            )
            if (
                gold_pron_offset == pred_pron_offset
                and gold_pron == pred_pron
                and gold_entity_offset == pred_entity_offset
                and gold_entity == pred_entity
            ):
                correct += 1
            total += 1
    print(f"# instances: {total}")
    acc = float(correct) / total
    print(f"# accuracy: {acc:.4f}")

In [82]:
model_name_or_path="bert-base-uncased"
model = GAPModel(model_name_or_path).to(device, non_blocking=True)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [37]:
model.head.fc[1].weight

Parameter containing:
tensor([[ 0.0045, -0.0120,  0.0158,  ..., -0.0187, -0.0100,  0.0104],
        [-0.0047, -0.0175, -0.0168,  ...,  0.0109, -0.0086, -0.0152],
        [-0.0086,  0.0104,  0.0057,  ..., -0.0009,  0.0120,  0.0015],
        ...,
        [-0.0086,  0.0204, -0.0032,  ..., -0.0065, -0.0132,  0.0075],
        [-0.0075, -0.0087,  0.0196,  ...,  0.0146, -0.0177, -0.0118],
        [ 0.0087, -0.0112, -0.0129,  ...,  0.0022, -0.0154,  0.0030]],
       device='cuda:0', requires_grad=True)

In [38]:
from torch.cuda.amp import GradScaler

In [199]:
class Trainer:
    
    def __init__(
        self,
        model: nn.Module,
        args: TrainingArguments,
        train_dataloader: DataLoader,
        valid_dataloader: DataLoader,
        criterion: torch.nn,
        optimizer: torch.optim.Optimizer,
        scheduler: torch.optim.lr_scheduler = None,
        
    ):
        
        self.model = model
        self.train_dataloader = train_dataloader
        self.valid_dataloader = valid_dataloader
        self.criterion = criterion
        self.optimizer = optimizer
        self.scheduler = scheduler
        
        if args is None:
            output_dir = "../../model/tmp_trainer"
            print(f"No 'TrainingArguments' passed, using 'output_dir={output_dir}'.")
            args = TrainingArguments(output_dir=output_dir)
        
        self.args = args
        
    def train(self):
        args = self.args
        train_dataloader = self.train_dataloader
        valid_dataloader = self.valid_dataloader
        
        train_losses = []
        train_acc_list = []
        valid_losses = []
        valid_acc_list = []
        
        epochs = args.num_train_epochs
        train_loss = 0.0
        train_acc, total_count = 0.0, 0.0
        
        scaler = GradScaler()
        self.model.train()
        for epoch in range(epochs):
            
            epoch_loss = 0.0
            
            for step, (features, offsets, labels) in enumerate(train_dataloader):
                # Empty gradients
                self.optimizer.zero_grad(set_to_none=True)
                
                # Forward
                predictions = self.model(features, offsets)
                
                
                
                loss = self.criterion(predictions, labels)
                train_acc += (predictions.argmax(1) == labels).sum().item()
                total_count += labels.shape[0]
                
#                 # Backward  
#                 loss.backward()
                # Backward pass without mixed precision
                # It's not recommended to use mixed precision for backward pass
                # Because we need more precise loss
                scaler.scale(loss).backward()
                
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), grad_clipping)
                
                # Update weights 
#                 self.optimizer.step()
                scaler.step(self.optimizer)
                scaler.update()
        
                
                epoch_loss += loss.tolist()

                if step % args.logging_steps == args.logging_steps - 1:
                    mid_loss = epoch_loss / (step + 1)
                    mid_acc = train_acc / total_count
#                     print('\t[E: {:2d} @ step {}] current avg loss = {:0.4f}'.format(epoch, step, mid_loss))
                    print(f'\t| step {step+1:3d}/{len(train_dataloader):d} | train_loss: {mid_loss:.3f} | ' \
                    f'train_acc: {mid_acc:.3f} |')
            
            avg_epoch_loss = epoch_loss / len(train_dataloader)
            train_loss += avg_epoch_loss
            train_losses.append(train_loss)
            train_acc_list.append(train_acc / total_count)
            
#             print('\t[E: {:2d}] train loss = {:0.4f}'.format(epoch, avg_epoch_loss))  # print train loss at the end of the epoch
            
    
            valid_loss, valid_acc = self.evaluate(valid_dataloader)
            valid_losses.append(valid_loss)
            valid_acc_list.append(valid_acc)
            
#             print('  [E: {:2d}] valid loss = {:0.4f}'.format(epoch, valid_loss))
            print('-' * 75)
            print(f'| epoch {epoch+1:3d}/{epochs:d} | train_loss: {avg_epoch_loss:.3f} | ' \
                    f'valid_loss: {valid_loss:.3f} | valid_acc: {valid_acc:.3f} |')
            print('-' * 75)
            
        avg_epoch_loss = train_loss / epochs
        histories = {
            "train_losses": train_losses,
            "train_acc": train_acc_list,
            "valid_losses": valid_losses,
            "valid_acc": valid_acc_list,

        }
#         print(histories)
        
        return #avg_epoch_loss, histories
            
    def evaluate(self, eval_dataloader):
        valid_loss = 0.0
        eval_acc, total_count = 0, 0
        
        self.model.eval()
        with torch.no_grad():
            for (features, offsets, labels) in eval_dataloader:
                
                predictions = self.model(features, offsets)
                loss = self.criterion(predictions, labels)
                valid_loss += loss.tolist()

                eval_acc += (predictions.argmax(1) == labels).sum().item()
                total_count += labels.shape[0]
        
        return valid_loss / len(eval_dataloader), eval_acc / total_count
        

In [167]:
import yaml
# yaml_file = "reproduce.yaml"
yaml_file = "./train.yaml"
# yaml_file = "predict.yaml"

# Read configuration file with all the necessary parameters
with open(yaml_file) as file:
    config = yaml.safe_load(file)
    
training_args = TrainingArguments(**config['training_args'])

# Make sure that the learning rate is read as a number and not as a string
training_args.learning_rate = float(training_args.learning_rate)
training_args.learning_rate

1e-05

In [187]:
criterion = torch.nn.CrossEntropyLoss().to(device=device, non_blocking=True)
optimizer = torch.optim.Adam(model.parameters(), lr=training_args.learning_rate)

batch_size = 4

train_dataloader = DataLoader(train_ds, batch_size=batch_size, 
                              collate_fn=collate_batch, shuffle=True)
valid_dataloader = DataLoader(valid_ds, batch_size=batch_size, 
                              collate_fn=collate_batch, shuffle=False)


In [200]:
trainer = Trainer(model, training_args, 
                  train_dataloader, valid_dataloader, 
                  criterion, optimizer)

In [201]:
trainer.train()
# trainer.save_model()

	| step   5/25 | train_loss: 1.068 |train_acc: 0.450
	| step  10/25 | train_loss: 1.105 |train_acc: 0.400
	| step  15/25 | train_loss: 1.156 |train_acc: 0.283
	| step  20/25 | train_loss: 1.145 |train_acc: 0.300
	| step  25/25 | train_loss: 1.098 |train_acc: 0.370
---------------------------------------------------------------------------
| epoch   1/2 | train_loss: 1.098 | valid_loss: 0.957 | valid_acc: 0.560
---------------------------------------------------------------------------
	| step   5/25 | train_loss: 1.139 |train_acc: 0.358
	| step  10/25 | train_loss: 1.091 |train_acc: 0.393
	| step  15/25 | train_loss: 1.065 |train_acc: 0.406
	| step  20/25 | train_loss: 1.072 |train_acc: 0.389
	| step  25/25 | train_loss: 1.049 |train_acc: 0.405
---------------------------------------------------------------------------
| epoch   2/2 | train_loss: 1.049 | valid_loss: 0.842 | valid_acc: 0.560
---------------------------------------------------------------------------


In [116]:
trainer.evaluate(valid_dataloader)

tensor(1.4222, device='cuda:0') 1.4222300052642822
tensor(0.7423, device='cuda:0') 0.7422657608985901
tensor(2.1204, device='cuda:0') 2.1204404830932617
tensor(2.1204, device='cuda:0') 2.120441198348999
tensor(0.7423, device='cuda:0') 0.7422659993171692
tensor(2.8095, device='cuda:0') 2.809528350830078
tensor(1.4314, device='cuda:0') 1.4313530921936035
tensor(1.4314, device='cuda:0') 1.4313533306121826
tensor(2.8095, device='cuda:0') 2.80952787399292
tensor(2.1204, device='cuda:0') 2.12044095993042
tensor(2.8095, device='cuda:0') 2.809528350830078
tensor(2.1204, device='cuda:0') 2.1204402446746826
tensor(2.1204, device='cuda:0') 2.1204416751861572
1 50 0.02


1.9077121019363403

In [68]:
torch.cuda.empty_cache()

In [69]:
print(torch.cuda.max_memory_allocated())

5529680384


In [198]:
# tokenizer(sent, padding='max_length', truncation=True,  max_length=360)

In [285]:
_,oaa,_ = list(zip(*train_ds))

In [322]:
count = 0

for elem in oaa:
    if elem[1] - elem[0] == 1 or elem[3] - elem[2] == 1:
        count += 1
#         print(elem[0], elem[1], elem)
count

1510

In [25]:
batch_size = 2

train_dataloader = DataLoader(train_ds, batch_size=batch_size, 
                              collate_fn=collate_batch, shuffle=False)
# valid_dataloader = DataLoader(valid_ds, batch_size=batch_size, 
#                               collate_fn=collate_batch, shuffle=False)

In [236]:
bert_model_name = "bert-base-uncased"

bert = BertModel.from_pretrained(bert_model_name).to(device, non_blocking=True)

loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at C:\Users\flori/.cache\huggingface\transformers\3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.19.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file https://huggingface.co/bert-base-uncased/resolve/main/pytorch_model.bin fr

In [50]:
# head = CorefHead(768).to(device)

In [237]:
for features, offsets, labels in train_dataloader:
    off = offsets
    output = bert(features, attention_mask=(features > 0).long(), 
                  token_type_ids=None, output_hidden_states=True, output_attentions=True)
    
#     res = head(output.hidden_states[-1], offsets)
    
    break

In [250]:
last = output.last_hidden_state
# last[0][42]


In [60]:
# for embeddings in last:
#     print(embeddings[42])
#     break

offsets

tensor([[42, 43, 45, 45, 62],
        [51, 51, 54, 55, 61]])

In [92]:
# of = offsets[:, range(0,2)]

batch_size = last.shape[0]
offsets_ent_A = offsets[:, range(0,2)]
offsets_ent_B = offsets[:, range(2,4)]

outputs_A = []
outputs_B = []

for batch_idx in range(batch_size):
    outputs_A.append(last[batch_idx, range(offsets_ent_A[batch_idx][0], 
                                offsets_ent_A[batch_idx][1] + 1)].mean(dim=0))
    outputs_B.append(last[batch_idx, range(offsets_ent_B[batch_idx][0], 
                                offsets_ent_B[batch_idx][1] + 1)].mean(dim=0))
#     print(out_B)

torch.stack(outputs_A, dim=0)
torch.stack(outputs_B, dim=0)

tensor([[ 0.4259,  0.0712,  1.0741,  ..., -0.2650,  0.0730,  0.7714],
        [ 0.1712,  0.2298, -0.3611,  ...,  0.5080, -0.7124,  0.5206]],
       grad_fn=<StackBackward0>)

In [137]:
import time

In [147]:
start = time.time()
def JIT_retrieve_entities_and_pron_embeddings(bert_embeddings, entities_and_pron_offsets):
    embeddings_A = []
    embeddings_B = []
    embeddings_pron = []

    # Consider embeddings and offsets in each batch separately
    for embeddings, off in zip(bert_embeddings, entities_and_pron_offsets):
        # The offsets of mention A are the first and the second
        # in the 'off' tensor
        offsets_ent_A = range(off[0], off[1]+1) 
        # The offsets of mention B are the third and the fourth
        # in the 'off' tensor
        offsets_ent_B = range(off[2], off[3]+1)
        # The offset of the pronoun is the last in the 'off' tensor
        offset_pron = off[-1]

        # The embedding of a mention is the mean of
        # all the subtokens embeddings that represent it
        
#         embeddings_A.append(embeddings[offsets_ent_A].mean(dim=0))
#         embeddings_B.append(embeddings[offsets_ent_B].mean(dim=0))
        embeddings_A.append(average_tensors(embeddings[offsets_ent_A]))
        embeddings_B.append(average_tensors(embeddings[offsets_ent_B]))
        embeddings_pron.append(embeddings[offset_pron])

    # Merge outputs
    merged_entities_and_pron_embeddings = torch.cat([
        torch.stack(embeddings_A, dim=0),
        torch.stack(embeddings_B, dim=0),
        torch.stack(embeddings_pron, dim=0)
    ], dim=1)
    # print(torch.stack(outputs_A, dim=0))
    # torch.stack(outputs_B, dim=0)
    # print(torch.stack(outputs_pron, dim=0))
    return merged_entities_and_pron_embeddings
end = time.time()
print("JIT", end - start)

JIT 0.0


In [136]:
retrieve_entities_and_pron_embeddings(last, offsets)

tensor([[ 0.6823, -0.3178,  0.3278,  ...,  0.2920,  0.7835,  0.1617],
        [ 0.7445, -0.5703,  0.8773,  ...,  0.7930,  0.3856, -0.4485]],
       grad_fn=<CatBackward0>)

In [134]:
@torch.jit.script
def average_tensors(tensor):
    return tensor.mean(dim=0)

In [103]:
pron_off = offsets[:, [4]]

last[:,pron_off]

tensor([[[[-0.6122, -1.0878, -0.1084,  ...,  0.2920,  0.7835,  0.1617]],

         [[ 0.4035, -0.6086,  0.5744,  ..., -0.3491,  0.4572, -1.3359]]],


        [[[-0.7060,  0.1587, -0.8657,  ...,  0.5993, -0.2363, -0.4840]],

         [[-0.0975, -0.2683, -0.6387,  ...,  0.7930,  0.3856, -0.4485]]]],
       grad_fn=<IndexBackward0>)

In [60]:
last[:, range(42, 44)]

tensor([[[ 5.3807e-01,  4.6052e-02,  5.1328e-01,  ..., -4.7629e-01,
           2.4409e-01,  4.8122e-01],
         [ 8.2649e-01, -6.8165e-01,  1.4226e-01,  ..., -3.3018e-01,
           3.0405e-01, -3.0243e-01]],

        [[ 6.7679e-01, -8.6487e-02, -4.3221e-01,  ...,  2.4297e-01,
           3.4243e-01, -2.5505e-05],
         [ 3.7130e-02, -3.2842e-01, -3.6175e-01,  ...,  3.9216e-01,
           4.6714e-01, -8.1941e-02]]], device='cuda:0',
       grad_fn=<IndexBackward0>)

In [106]:
# print(last[:, 42])
last[1, 61]

tensor([-9.7506e-02, -2.6827e-01, -6.3870e-01, -9.7911e-01,  1.6228e-01,
         1.3821e-01,  2.0999e-01,  1.4227e+00, -2.0495e-01, -6.8178e-01,
         9.7347e-02, -1.7352e-01, -4.7426e-02,  7.3415e-01, -5.3100e-01,
        -6.1785e-03,  5.9988e-01, -5.5689e-01,  2.7243e-02,  9.7765e-01,
         1.3741e-01, -1.2155e-01, -4.9975e-01,  6.1627e-01,  6.5771e-01,
         3.9494e-01, -8.6346e-02,  2.1224e-01, -4.6273e-01,  3.0996e-01,
         6.4127e-01, -2.3594e-01,  6.1242e-01, -5.5229e-01, -4.4161e-01,
        -9.5874e-01, -5.2585e-02,  4.6089e-01, -2.9261e-01,  5.5857e-01,
        -8.1286e-02, -4.7450e-01,  2.6060e-02, -1.5944e+00, -2.2374e-01,
         1.9711e-01,  1.7964e+00, -1.3266e-01, -9.0319e-01,  3.3096e-01,
         1.7975e-01, -9.3368e-02, -8.8890e-01,  1.9973e-01, -5.0889e-01,
        -3.6299e-01,  3.0422e-01,  7.3791e-01, -6.6398e-01, -1.7381e-01,
         7.2020e-01, -1.9225e-01,  4.7393e-03, -1.4993e-02,  1.0241e+00,
         3.8244e-01,  1.5110e+00,  6.5938e-02, -1.5

In [393]:
# shape (batch_size, num_heads, sequence_length, sequence_length)
print(len(output.attentions))
output.attentions[-1].shape

12


torch.Size([2, 12, 64, 64])

In [394]:
print(output.hidden_states[-1].shape)

o = output.hidden_states[-1]
print(o[0][0].shape)
print(o[0][off[0]])

# extracted_outputs = o.gather(
#             1, offsets.unsqueeze(2).expand(-1, -1, bert_outputs.size(2)) 
#         ).view(bert_outputs.size(0), -1)
# print(off.unsqueeze(2).expand(-1, -1, o.size(2)))
a = o.gather(1, off.unsqueeze(2).expand(-1, -1, o.size(2)))
print(a.shape)
print(a)
b = a.view(o.shape[0], -1)
print(b.shape)
print(b)
off[0][:4].view(-1, 2, 2)

torch.Size([2, 64, 768])
torch.Size([768])
tensor([[ 0.5381,  0.0461,  0.5133,  ..., -0.4763,  0.2441,  0.4812],
        [ 0.8265, -0.6816,  0.1423,  ..., -0.3302,  0.3041, -0.3024],
        [ 0.4259,  0.0712,  1.0741,  ..., -0.2650,  0.0730,  0.7715],
        [ 0.4259,  0.0712,  1.0741,  ..., -0.2650,  0.0730,  0.7715],
        [-0.6122, -1.0878, -0.1084,  ...,  0.2920,  0.7835,  0.1617]],
       device='cuda:0', grad_fn=<IndexBackward0>)
torch.Size([2, 5, 768])
tensor([[[ 0.5381,  0.0461,  0.5133,  ..., -0.4763,  0.2441,  0.4812],
         [ 0.8265, -0.6816,  0.1423,  ..., -0.3302,  0.3041, -0.3024],
         [ 0.4259,  0.0712,  1.0741,  ..., -0.2650,  0.0730,  0.7715],
         [ 0.4259,  0.0712,  1.0741,  ..., -0.2650,  0.0730,  0.7715],
         [-0.6122, -1.0878, -0.1084,  ...,  0.2920,  0.7835,  0.1617]],

        [[ 0.7445, -0.5703,  0.8773,  ...,  0.7073,  0.9091,  0.2327],
         [ 0.7445, -0.5703,  0.8773,  ...,  0.7073,  0.9091,  0.2327],
         [ 0.0447,  0.5941, -0.60

tensor([[[42, 43],
         [45, 45]]], device='cuda:0')

If the mention is represented as a span then `sum` the two spans to produce only one embedding.

In [404]:
# off_2 = off[:, :4].view(-1, 2, 2)
first_ent = off[:,:2].squeeze()
second_ent = off[:,2:4].squeeze()
# pron = off[:, ]
# pron

In [405]:
off

tensor([[42, 43, 45, 45, 62],
        [51, 51, 54, 55, 61]], device='cuda:0')

In [471]:
ofs = off[:, [0, 2, 4]].unsqueeze(2).expand(-1, -1, 768)
print(ofs)
fin = torch.gather(o, 1, ofs)
print(fin.shape)
fin.to(device)

tensor([[[42, 42, 42,  ..., 42, 42, 42],
         [45, 45, 45,  ..., 45, 45, 45],
         [62, 62, 62,  ..., 62, 62, 62]],

        [[51, 51, 51,  ..., 51, 51, 51],
         [54, 54, 54,  ..., 54, 54, 54],
         [61, 61, 61,  ..., 61, 61, 61]]], device='cuda:0')
torch.Size([2, 3, 768])


tensor([[[ 0.5381,  0.0461,  0.5133,  ..., -0.4763,  0.2441,  0.4812],
         [ 0.4259,  0.0712,  1.0741,  ..., -0.2650,  0.0730,  0.7715],
         [-0.6122, -1.0878, -0.1084,  ...,  0.2920,  0.7835,  0.1617]],

        [[ 0.7445, -0.5703,  0.8773,  ...,  0.7073,  0.9091,  0.2327],
         [ 0.0447,  0.5941, -0.6054,  ...,  0.9097, -0.6916, -0.2175],
         [-0.0975, -0.2683, -0.6387,  ...,  0.7930,  0.3856, -0.4485]]],
       device='cuda:0', grad_fn=<GatherBackward0>)

In [533]:
fin.view(o.shape[0], -1)

tensor([[ 0.5381,  0.0461,  0.5133,  ...,  0.2920,  0.7835,  0.1617],
        [ 0.7445, -0.5703,  0.8773,  ...,  0.7930,  0.3856, -0.4485]],
       device='cuda:0', grad_fn=<ViewBackward0>)

In [509]:
lin = nn.Linear(768, 3, device=device)
lin.weight

Parameter containing:
tensor([[-1.2767e-02,  1.0985e-02, -3.4086e-02,  ..., -9.8872e-04,
          2.1345e-02,  8.9290e-03],
        [ 2.4348e-02, -7.4422e-04,  1.8980e-02,  ...,  1.5269e-02,
         -2.6324e-02,  7.5837e-06],
        [-2.6595e-02,  1.9694e-03,  7.3221e-03,  ...,  1.2078e-02,
         -3.5289e-02,  4.1908e-04]], device='cuda:0', requires_grad=True)

In [511]:
outtt = lin(fin)


  


tensor([[[0.4154, 0.5315, 0.5313],
         [0.5333, 0.3944, 0.4611],
         [0.3473, 0.3948, 0.5761]],

        [[0.5846, 0.4685, 0.4687],
         [0.4667, 0.6056, 0.5389],
         [0.6527, 0.6052, 0.4239]]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>)

In [481]:
fin.squeeze()


tensor([[[ 0.5381,  0.0461,  0.5133,  ..., -0.4763,  0.2441,  0.4812],
         [ 0.4259,  0.0712,  1.0741,  ..., -0.2650,  0.0730,  0.7715],
         [-0.6122, -1.0878, -0.1084,  ...,  0.2920,  0.7835,  0.1617]],

        [[ 0.7445, -0.5703,  0.8773,  ...,  0.7073,  0.9091,  0.2327],
         [ 0.0447,  0.5941, -0.6054,  ...,  0.9097, -0.6916, -0.2175],
         [-0.0975, -0.2683, -0.6387,  ...,  0.7930,  0.3856, -0.4485]]],
       device='cuda:0', grad_fn=<SqueezeBackward0>)

In [503]:
emb = nn.Embedding(10, 3, device=device)
i = torch.LongTensor([[0,2,0,5], [0,2,0,5]]).to(device)
i.shape

torch.Size([2, 4])

In [505]:
em_out = emb(i)
em_out

tensor([[[ 0.9968, -0.8154,  1.0260],
         [-0.5402, -0.0711, -1.1686],
         [ 0.9968, -0.8154,  1.0260],
         [-0.9610, -1.3230,  0.3743]],

        [[ 0.9968, -0.8154,  1.0260],
         [-0.5402, -0.0711, -1.1686],
         [ 0.9968, -0.8154,  1.0260],
         [-0.9610, -1.3230,  0.3743]]], device='cuda:0',
       grad_fn=<EmbeddingBackward0>)

In [506]:
em_out.shape

torch.Size([2, 4, 3])

In [507]:
l = nn.Linear(3, 2, device=device)

l(em_out)

tensor([[[-1.1013,  0.3226],
         [ 0.4605, -0.3026],
         [-1.1013,  0.3226],
         [-0.3290,  0.0594]],

        [[-1.1013,  0.3226],
         [ 0.4605, -0.3026],
         [-1.1013,  0.3226],
         [-0.3290,  0.0594]]], device='cuda:0', grad_fn=<AddBackward0>)

In [433]:
torch.cat([fin, fin], dim=1).shape

torch.Size([2, 6, 768])

In [389]:
def unify_spans_embeddings(embeddings, single_dim_offsets):
    if single_dim_offsets[0] == single_dim_offsets[1]:
        ent_offset = single_dim_offsets[0].view(1, -1, 1)

    # Span of two tokens
    elif single_dim_offsets[1] - single_dim_offsets[0] == 1: 
        ent_offset = single_dim_offsets.view(1, -1, 1)
        
    else: # Span of multiple tokens
        ent_offset = torch.tensor(range(single_dim_offsets[0], 
                                        single_dim_offsets[1]+1),
                                  dtype=torch.int64,
                                  device=device).view(1, -1, 1)
    
    ent_offset_expand = ent_offset.expand(-1, -1, embeddings.shape[2])
    entity_embeddings = torch.gather(embeddings, 1, ent_offset_expand)
        
    # Sum the embeddings representing an entity (A or B)
    # to produce a single representation for it
    return entity_embeddings.sum(dim=1).unsqueeze(dim=0)

In [397]:
unify_spans_embeddings(o, first_ent)

RuntimeError: Boolean value of Tensor with more than one value is ambiguous

In [399]:
test = torch.tensor([1,1])
# test
t = torch.tensor(range(test[0], test[1]+1), device=device)
t

tensor([1], device='cuda:0')

In [380]:
first_ent.view(1,-1,1).dtype

torch.int64

In [313]:
torch.gather(o, 1, t.view(1,-1,1).expand(-1,-1,768))

tensor([[[-0.3262,  0.3077,  0.3330,  ...,  0.0246,  0.3267, -0.2383],
         [ 1.5212, -0.4738,  0.9732,  ...,  0.8894,  0.1102,  1.0896],
         [ 0.8939, -0.2292,  0.4933,  ...,  1.0340,  0.1831, -0.2677]]],
       device='cuda:0', grad_fn=<GatherBackward0>)

In [388]:
torch.gather(o, 1, t.view(1,-1,1).expand(-1,-1,768)).sum(dim=1)

tensor([[ 2.0889e+00, -3.9526e-01,  1.7995e+00, -2.0763e+00,  2.0231e+00,
          2.8712e+00,  1.3911e+00,  5.0796e-01, -9.6528e-01, -1.1719e+00,
          6.3579e-01, -5.0594e-01,  3.9600e-01, -5.8539e-01, -1.0251e+00,
          9.8997e-01,  1.2978e+00,  7.6494e-01, -9.2404e-01, -1.7566e+00,
          5.3220e-01, -4.5137e-01, -2.0958e+00,  3.4687e+00,  4.5470e-01,
          5.9785e-01,  3.2067e-02, -9.5322e-02,  1.6850e-01,  1.2676e-01,
          3.8936e+00, -1.8275e-02, -1.9699e-01, -4.9471e-02,  2.4774e-01,
         -1.8513e-01, -1.5461e+00, -1.6436e-01, -5.8126e-01,  5.8290e-01,
         -7.0645e-01, -1.8000e+00, -1.6877e+00,  2.1550e-01, -5.6485e-01,
          1.0550e-02,  7.7738e-01,  1.1309e-01,  1.8419e+00, -6.3968e-01,
         -3.6347e+00,  1.0722e+00,  9.5572e-01, -1.0473e+00,  1.3042e+00,
          3.1889e+00, -2.7895e+00, -1.7851e+00,  9.5010e-01,  9.7430e-01,
         -2.0797e-01,  9.7973e-01,  7.5529e-01, -2.1953e+00,  3.5696e-01,
         -5.5551e-01, -1.2940e+00,  9.

In [362]:
int(first_ent[0])

42

In [359]:
first_ent.view(1, -1, 1)

tensor([[[42],
         [43]]], device='cuda:0')

In [390]:
unify_spans_embeddings(o, t)

tensor([[[ 2.0889e+00, -3.9526e-01,  1.7995e+00, -2.0763e+00,  2.0231e+00,
           2.8712e+00,  1.3911e+00,  5.0796e-01, -9.6528e-01, -1.1719e+00,
           6.3579e-01, -5.0594e-01,  3.9600e-01, -5.8539e-01, -1.0251e+00,
           9.8997e-01,  1.2978e+00,  7.6494e-01, -9.2404e-01, -1.7566e+00,
           5.3220e-01, -4.5137e-01, -2.0958e+00,  3.4687e+00,  4.5470e-01,
           5.9785e-01,  3.2067e-02, -9.5322e-02,  1.6850e-01,  1.2676e-01,
           3.8936e+00, -1.8275e-02, -1.9699e-01, -4.9471e-02,  2.4774e-01,
          -1.8513e-01, -1.5461e+00, -1.6436e-01, -5.8126e-01,  5.8290e-01,
          -7.0645e-01, -1.8000e+00, -1.6877e+00,  2.1550e-01, -5.6485e-01,
           1.0550e-02,  7.7738e-01,  1.1309e-01,  1.8419e+00, -6.3968e-01,
          -3.6347e+00,  1.0722e+00,  9.5572e-01, -1.0473e+00,  1.3042e+00,
           3.1889e+00, -2.7895e+00, -1.7851e+00,  9.5010e-01,  9.7430e-01,
          -2.0797e-01,  9.7973e-01,  7.5529e-01, -2.1953e+00,  3.5696e-01,
          -5.5551e-01, -1

In [358]:
unify_spans(o, first_ent)

tensor([[[ 1.3646e+00, -6.3559e-01,  6.5554e-01, -8.3733e-01,  2.5619e-01,
          -1.5913e-01,  2.6387e+00, -3.5175e-01, -2.2441e-01,  2.8293e-01,
           3.9986e-02, -7.5838e-01,  1.0023e+00, -1.6775e-01, -5.7158e-01,
           1.1860e+00, -3.9145e-01,  7.9424e-01, -5.9099e-01,  4.9874e-01,
           9.0436e-01,  3.1083e-01, -1.6028e+00,  8.1725e-01, -4.8959e-03,
           8.5088e-01, -2.1822e-01,  9.7481e-01, -1.8579e-01,  1.5771e+00,
           1.2519e+00,  1.0207e-01,  1.2020e+00,  5.3160e-01,  3.6940e-01,
           7.1516e-01,  2.0164e-02, -2.0736e-01,  1.4634e-01, -4.7201e-01,
           4.1562e-01, -1.2816e+00,  3.5776e-01, -6.5965e-02, -4.6385e-01,
          -7.1219e-03,  1.8831e+00, -1.0559e+00,  5.5174e-01, -1.1139e+00,
          -1.6550e+00,  6.0078e-01, -3.7074e-01, -1.3973e+00,  9.2192e-01,
          -4.2870e-01, -6.6090e-01, -1.4654e+00, -8.8921e-01, -3.9149e-01,
           7.8399e-01,  5.8704e-02, -1.2684e+00, -6.8383e-01,  1.4816e-01,
           4.7611e-01,  1

In [178]:
# Single entity
if second_ent[0] == second_ent[1]:
    ent_offset = second_ent[0].view(-1, 1, 1)
#     print(ent_offset)
    ent_embedding = ent_offset.expand(-1,-1,768)
    selected = torch.gather(o, 1, ent_embedding)
    print(selected)

tensor([[[ 4.2591e-01,  7.1202e-02,  1.0741e+00, -5.9715e-01,  7.3422e-01,
           8.5603e-01,  1.0841e+00, -2.2846e-01,  7.1181e-01, -1.3976e-01,
           5.7261e-02, -2.8580e-01,  1.8605e-01,  2.5129e-01, -7.7597e-01,
           5.7853e-01, -4.9510e-01,  1.2580e-01,  1.4575e-02,  8.6050e-01,
           2.0907e-01, -2.0072e-01, -9.2557e-01,  4.1754e-01,  1.8603e-02,
           8.5870e-01,  5.6372e-01,  5.2747e-01,  5.0323e-01,  4.8772e-01,
           4.3810e-01,  1.6062e-01,  9.7859e-01,  7.7399e-02, -4.2207e-01,
           3.6537e-01, -9.9465e-02,  7.3076e-01, -2.8857e-01, -5.9149e-01,
           1.9069e-01, -1.3455e+00,  4.8455e-01, -8.1836e-02, -4.9453e-01,
           2.5838e-01, -1.4900e-01, -4.1348e-01, -4.8747e-03, -1.7667e-01,
          -1.3364e+00, -9.1852e-02,  2.3871e-01, -6.4757e-01,  1.0538e+00,
           3.5170e-01, -1.1585e+00, -1.6642e+00, -2.1156e-01, -1.0385e-01,
          -4.6887e-01,  7.8346e-01, -6.3073e-01, -8.2196e-01,  6.5401e-02,
           9.8621e-03,  5

In [217]:
f = o[:,42,:]
s = o[:,43,:]

In [232]:
@torch.jit.script
def sum_tensors(tensors: List[torch.Tensor]):
    return torch.stack(tensors).sum(dim=0)

In [212]:
fa = f[0][:2]
sa = s[0][:2]
print(fa)
print(sa)

tensor([0.5381, 0.0461], device='cuda:0', grad_fn=<SliceBackward0>)
tensor([ 0.8265, -0.6816], device='cuda:0', grad_fn=<SliceBackward0>)


In [216]:
torch.stack([fa, sa]).sum(dim=0).shape

torch.Size([2])

In [180]:
a = torch.tensor([1])
b = torch.tensor([2])
c = torch.tensor([3])
@torch.jit.script
def s(a,b):
    return torch.add(a, b)

In [196]:
torch.stack([a,b,c]).sum()

tensor(6)

In [248]:
of = off.unsqueeze(2).expand(-1,-1,768)
of.shape

torch.Size([1, 5, 768])

In [95]:
# print(o[:,42,:])
print(o[:,45,:])

tensor([[ 4.2591e-01,  7.1202e-02,  1.0741e+00, -5.9715e-01,  7.3422e-01,
          8.5603e-01,  1.0841e+00, -2.2846e-01,  7.1181e-01, -1.3976e-01,
          5.7261e-02, -2.8580e-01,  1.8605e-01,  2.5129e-01, -7.7597e-01,
          5.7853e-01, -4.9510e-01,  1.2580e-01,  1.4575e-02,  8.6050e-01,
          2.0907e-01, -2.0072e-01, -9.2557e-01,  4.1754e-01,  1.8603e-02,
          8.5870e-01,  5.6372e-01,  5.2747e-01,  5.0323e-01,  4.8772e-01,
          4.3810e-01,  1.6062e-01,  9.7859e-01,  7.7399e-02, -4.2207e-01,
          3.6537e-01, -9.9465e-02,  7.3076e-01, -2.8857e-01, -5.9149e-01,
          1.9069e-01, -1.3455e+00,  4.8455e-01, -8.1836e-02, -4.9453e-01,
          2.5838e-01, -1.4900e-01, -4.1348e-01, -4.8747e-03, -1.7667e-01,
         -1.3364e+00, -9.1852e-02,  2.3871e-01, -6.4757e-01,  1.0538e+00,
          3.5170e-01, -1.1585e+00, -1.6642e+00, -2.1156e-01, -1.0385e-01,
         -4.6887e-01,  7.8346e-01, -6.3073e-01, -8.2196e-01,  6.5401e-02,
          9.8621e-03,  5.8867e-01,  9.

In [94]:
# Select from output only the embeddings associated with the positions of the offsets
# When the starting and ending offsets are the same we will have a duplicate embedding
torch.gather(o, 1, of)

tensor([[[ 0.5381,  0.0461,  0.5133,  ..., -0.4763,  0.2441,  0.4812],
         [ 0.8265, -0.6816,  0.1423,  ..., -0.3302,  0.3041, -0.3024],
         [ 0.4259,  0.0712,  1.0741,  ..., -0.2650,  0.0730,  0.7715],
         [ 0.4259,  0.0712,  1.0741,  ..., -0.2650,  0.0730,  0.7715],
         [-0.6122, -1.0878, -0.1084,  ...,  0.2920,  0.7835,  0.1617]]],
       device='cuda:0', grad_fn=<GatherBackward0>)

In [49]:
ten = torch.tensor([[5, 7], [1, 3]])
a = torch.gather(ten, 0, torch.tensor([[0, 0], [0, 0]]))
print(a)

tensor([[5, 7],
        [5, 7]])


In [266]:
concat_bert = torch.cat((output[-1],output[-2]) ,dim=-1)
concat_bert

TypeError: expected Tensor as element 0 in argument 0, but got tuple

In [265]:
torch.cuda.empty_cache()

In [74]:
# embeddings = []
# for features, offsets, labels in train_dataloader:
#     embeddings.append(model(features, offsets))

In [75]:
# embeddings

In [212]:
train = collate_batch(train_ds)
train[0]

tensor([[  101, 11199, 10093,  ...,     0,     0,     0],
        [  101,  2002,  3473,  ...,     0,     0,     0],
        [  101,  2002,  2018,  ...,     0,     0,     0],
        ...,
        [  101,  2002,  2001,  ...,     0,     0,     0],
        [  101,  2798,  8480,  ...,     0,     0,     0],
        [  101,  2009, 14964,  ...,     0,     0,     0]], device='cuda:0')

In [91]:
# embeddings = model(train[0], train[1])

In [202]:
# model.bert.encoder.layer[11]

In [71]:
# model.bert.trainable()

In [None]:
for param in model.bert.parameters():
    print(param)

In [None]:
# import gc
# gc.collect()
# gc.get_count()

In [86]:
tokenizer(s, return_tensors="pt")

{'input_ids': tensor([[  101, 11199, 10093,  3877,  1011,  1011,  2209,  1996,  2610,  2961,
          6513,  1997,  4079,  1010,  8538,  1012, 14019,  2011,  4079,  1999,
          1996,  2345,  2792,  1997,  2186,  1015,  1010,  2044,  2002,  7771,
          2007,  8437,  1010,  1998,  2003,  2025,  2464,  2153,  1012, 18188,
          2726,  2209, 19431, 13737,  1010, 15595,  1005,  1055,  2767,  1998,
          2036,  1037,  2095,  2340, 11136,  1999,  4079,  1005,  1055,  2465,
          1012, 14019,  2014,  6898,  2206,  4079,  1005,  1055,  6040,  2044,
          2002,  2876,  1005,  1056,  2031,  3348,  2007,  2014,  2021,  2101,
         11323,  2023,  2001,  2349,  2000,  2032,  9105, 26076,  2125,  2014,
          2767, 15595,  1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0