In [None]:
import torch
import spacy
import transformers
import itertools

In [None]:
# Load the tokenizer and encoder for 'KB/bert-base-swedish-cased' (the encoder is
# switched to eval mode) and a spaCy pipeline from a local model directory.
tok = transformers.AutoTokenizer.from_pretrained('KB/bert-base-swedish-cased')
model = transformers.AutoModel.from_pretrained('KB/bert-base-swedish-cased').eval()
nlp = spacy.load('../data/sv_model_xpos/sv_model0/sv_model0-0.0.0/')

In [None]:
# Turn off gradient tracking for every weight of the encoder.
for parameter in model.parameters():
    parameter.requires_grad_(False)

In [None]:
# Sample Swedish passage (about Bob Dylan) used as input text by the cells below.
TXT="""Bob Dylan föddes som Robert Zimmerman i staden Duluth, Minnesota men strax innan han fyllde sex år och efter att hans far fått polio flyttade familjen till den närliggande staden Hibbing, Minnesota där han sedan växte upp. Familjen Zimmerman var judisk och deras förfäder hade utvandrat från Ryssland, Ukraina, Litauen och Turkiet. Morfar och mormor - Benjamin och Liba Edelstein (senare Stein och Stone) - var litauiska judar som emigrerade till USA 1902.
När Bob Dylan var åtta-nio år började han spela på familjens piano. Därefter lärde han sig att spela munspel och gitarr.[3] Mycket av hans ungdomstid gick åt till att lyssna på radio där han tog in stationer som sände blues, country och tidig rock'n'roll. Han började uppträda i mitten av 1950-talet och var medlem i ett flertal band under sin tid i high school.

1959 började han studera på universitetet i Minneapolis. I samma veva tog hans intresse för folkmusik fart. Det var också nu han började presentera sig som Bob Dylan. Var han fått namnet ifrån finns det flera historier om. Vissa menar att det är inspirerat av poeten Dylan Thomas. År 2004 skrev han själv om hur han valde namnet i sin bok Memoarer, första delen"""



In [None]:
def parse_sentence(sentence):
    """Tokenize ``sentence`` and group its word pieces into spans.

    The text is run through the module-level spaCy pipeline ``nlp``. Every
    non-whitespace token is split into word pieces with the module-level
    tokenizer ``tok`` (no special tokens added). Each noun chunk reported by
    spaCy becomes one span; every token outside a noun chunk becomes a span of
    its own. A CLS id is prepended and forms the first span.

    Args:
        sentence: Raw text to analyse.

    Returns:
        A tuple ``(input_ids, nouns, spans)``:
            input_ids: 1-D ``torch.LongTensor`` with the CLS id followed by all
                word-piece ids.
            nouns: list of bools, one per span; True where the span is a noun chunk.
            spans: list of half-open ``(start, stop)`` index pairs into ``input_ids``.
        For input without any non-whitespace token only the CLS span is returned.

    Raises:
        AssertionError: if the tokenizer yields no word pieces for a token.
    """
    doc = nlp(sentence)

    # Drop whitespace tokens. kept_before[i] is the number of kept tokens whose
    # doc index is < i, for every i in 0..len(doc). For a kept token this is its
    # position among the kept tokens; for a whitespace token or the
    # one-past-the-end index it is the position of the next kept token. This
    # lets chunk.start / chunk.end be translated even when they point at a
    # whitespace token or at the end of the document.
    tokens = []
    kept_before = []
    for token in doc:
        kept_before.append(len(tokens))
        if not token.is_space:
            tokens.append(token)
    kept_before.append(len(tokens))

    if not tokens:
        # Nothing to tokenize: return the CLS token alone.
        return torch.LongTensor([tok.cls_token_id]), [False], [(0, 1)]

    # Token.text is used instead of Token.string, which newer spaCy versions
    # no longer provide.
    wordpieces_for_token = tok(
            [token.text for token in tokens],
            add_special_tokens=False,
            padding=False,
            return_token_type_ids=False,
            return_attention_mask=False)['input_ids']

    # word_start[k] = offset of the first word piece of kept token k.
    input_ids = []
    word_start = []
    for wordpieces in wordpieces_for_token:
        assert len(wordpieces) > 0, "Empty token makes program sad"
        word_start.append(len(input_ids))
        input_ids += wordpieces

    nouns = []
    spans = []
    ptr = 0  # first kept token not yet assigned to a span

    for chunk in doc.noun_chunks:
        start = kept_before[chunk.start]
        end = kept_before[chunk.end]
        if end <= start:
            # Chunk contains no kept token; nothing to add.
            continue

        # Tokens between the previous chunk and this one are single-token spans.
        for i in range(ptr, start):
            nouns.append(False)
            spans.append(word_start[i])

        # The chunk itself is one span starting at its first word piece.
        nouns.append(True)
        spans.append(word_start[start])
        ptr = end

    # Tokens after the last chunk are single-token spans as well.
    for i in range(ptr, len(word_start)):
        nouns.append(False)
        spans.append(word_start[i])

    # Prepend the CLS token; all offsets shift by one.
    input_ids = torch.LongTensor([tok.cls_token_id, *input_ids])
    nouns = [False] + nouns
    spans = [0] + [start + 1 for start in spans]

    # Turn the list of start offsets into (start, stop) pairs.
    spans = list(zip(spans, spans[1:] + [len(input_ids)]))

    return input_ids, nouns, spans

# Parse the sample text into word-piece ids, noun-chunk flags and spans.
input_ids, nouns, spans = parse_sentence(TXT)


In [None]:
# Show the noun chunks spaCy reports for the sample text.
list(nlp(TXT).noun_chunks)

In [None]:
test = []
# Print the decoded word pieces of every span that is flagged as a noun chunk.
for noun, (start, stop) in zip(nouns, spans):
    if not noun:
        continue
    print(tok.decode(input_ids[start:stop]))
        


In [None]:
def get_attention(input_ids, spans):
    """Aggregate last-layer attention from word pieces to spans.

    Runs the module-level ``model`` on ``input_ids`` (with a batch dimension
    added), averages the last layer's attention over heads, and reduces it to
    one value per (row span, column span) pair: the attention summed over that
    block, divided by the number of rows in the row span.

    Args:
        input_ids: 1-D tensor of token ids.
        spans: sequence of half-open ``(start, stop)`` index pairs. Row 0 and
            column 0 of the result are special-cased, so the first span is
            expected to start at position 0.

    Returns:
        Tensor with one row and one column per span.
    """
    outputs = model(input_ids.unsqueeze(0), output_attentions=True)
    att = outputs['attentions'][-1].mean(1).squeeze(0)
    # Inclusive 2-D prefix sums: any block sum becomes four look-ups.
    csatt = att.cumsum(0).cumsum(1)

    # lo = last index before each span, hi = last index inside each span.
    lo, hi = (torch.LongTensor(edge) - 1 for edge in zip(*spans))

    rows_lo = csatt[lo, :]
    rows_hi = csatt[hi, :]
    ret = rows_lo[:, lo] - rows_lo[:, hi] - rows_hi[:, lo] + rows_hi[:, hi]

    # A span starting at 0 yields lo == -1, which indexes the last row/column
    # rather than "nothing"; row 0 and column 0 are therefore overwritten with
    # plain 1-D prefix-sum differences, and the corner with the raw value.
    ret[0, :] = csatt[0, hi] - csatt[0, lo]
    ret[:, 0] = csatt[hi, 0] - csatt[lo, 0]
    ret[0, 0] = att[0, 0]

    # Normalise each row by the number of word pieces in its span.
    ret /= (hi - lo)[:, None]

    return ret

# Span-level attention matrix for the parsed sample text.
att = get_attention(input_ids, spans)

In [None]:
# Sum the span-level attention along dim 1 (one total per row).
att.sum(1)

In [None]:
def print_wix(ix):
    """Return the decoded text of span number ``ix``.

    Reads the module-level ``spans``, ``input_ids`` and ``tok``.
    """
    begin, end = spans[ix]
    piece_ids = input_ids[begin:end]
    return tok.decode(piece_ids)

In [None]:
# Indices of the spans that are noun chunks (candidate heads/tails) and of the
# remaining spans (candidate relations).
noun_ixs = [ix for ix, noun in enumerate(nouns) if noun]
rel_ixs = [ix for ix, noun in enumerate(nouns) if not noun]
rixs = torch.LongTensor(rel_ixs)


# For every ordered pair of non-adjacent noun chunks, print the span lying
# strictly between them that maximises att[head, rel] + att[rel, tail].
for head, tail in itertools.product(noun_ixs, noun_ixs):
    if abs(head - tail) <= 1:
        continue

    lower, upper = sorted((head, tail))
    between = [ix for ix in rel_ixs if lower < ix < upper]
    if not between:
        # Only noun chunks lie between head and tail. torch.argmax raises on an
        # empty score tensor, so there is no relation to report for this pair.
        continue

    rixs = torch.LongTensor(between)
    scores = att[head, rixs] + att[rixs, tail]
    best = torch.argmax(scores)
    rel = rixs[best].item()
    score = scores[best].item()
    print('{:.3f}: {} -- {} -- {}'.format(score, print_wix(head), print_wix(rel), print_wix(tail)))

In [None]:
# Scratch copy of the breadth-first relation search and filtering step.
# NOTE(review): token2id, noun_chunks, tail_head_pairs, attn_graph,
# tokenid2word_mapping, Pool, bfs, global_initializer and filter_relation_sets
# are not defined before this cell in the cells shown; they must be in scope
# for the cell to run.
black_list_relation = set([ token2id[n]  for n in noun_chunks ])
all_relation_pairs = []
id2token = { value: key for key, value in token2id.items()}
with Pool(10) as pool:
    params = [  ( pair[0], pair[1], attn_graph, max(tokenid2word_mapping), black_list_relation, ) for pair in tail_head_pairs]
    for output in pool.imap_unordered(bfs, params):
        # The body of this loop was not indented in the pasted code.
        if len(output):
            all_relation_pairs += [ (o, id2token) for o in output ]

triplet_text = []
with Pool(10, global_initializer, (nlp,)) as pool:
    for triplet in pool.imap_unordered(filter_relation_sets, all_relation_pairs):
        if len(triplet) > 0:
            triplet_text.append(triplet)
# `return` is not valid outside a function; display the result instead.
triplet_text

In [None]:
# Sum the attention matrix along dim 1 (one total per row).
att.sum(1)

In [None]:
def get_attention(input_ids, doc, word_ends):
    """Aggregate last-layer attention into groups delimited by ``word_ends``.

    Runs the module-level ``model`` on ``input_ids`` (with a batch dimension
    added) and averages the last layer's attention over heads. Group ``i``
    (for ``i >= 1``) covers positions ``word_ends[i-1]:word_ends[i]``; index 0
    of the result refers to position 0 alone. Each entry is the attention
    summed over the (row group, column group) block and divided by the number
    of rows in the row group.

    Args:
        input_ids: 1-D tensor of token ids.
        doc: Not used by this function.
        word_ends: sequence of boundary offsets into ``input_ids`` — presumably
            ``word_ends[0]`` is the end of the first token; confirm with the caller.

    Returns:
        Tensor of shape ``(len(word_ends), len(word_ends))``.
    """
    outputs = model(input_ids.unsqueeze(0), output_attentions=True)
    att = outputs['attentions'][-1].mean(1).squeeze(0)

    N = len(word_ends)
    tmp = att.new_zeros((N, N))
    tmp[0, 0] = att[0, 0]

    # bounds[i - 1] holds the (start, end) offsets of group i.
    bounds = [(word_ends[i - 1], word_ends[i]) for i in range(1, N)]

    for i, (istart, iend) in enumerate(bounds, start=1):
        height = iend - istart
        rows = att[istart:iend]

        # Row 0 is a single position, so its sums are not divided.
        tmp[0, i] = att[0, istart:iend].sum()
        tmp[i, 0] = rows[:, 0].sum() / height

        for j, (jstart, jend) in enumerate(bounds, start=1):
            tmp[i, j] = rows[:, jstart:jend].sum() / height

    return tmp

# NOTE(review): `doc` and `word_ends` are not assigned at notebook level in the
# cells shown — confirm they exist before running this cell.
att = get_attention(input_ids, doc, word_ends)

In [None]:
# Display the attention matrix.
att

In [None]:
# Sum the attention matrix along dim 1 (one total per row).
att.sum(1)

In [None]:
# 2-D inclusive prefix sums of att (cumulative sum over dim 0, then over dim 1).
csatt = att.cumsum(0).cumsum(1)

In [None]:
# Example column indices for comparing a prefix-sum difference with a direct slice sum.
i = 23
j = 65
# Difference of the row-0 prefix sums at columns j and i.
csatt[0, j] - csatt[0, i] 

In [None]:
# Direct sum of row 0 over columns i..j inclusive.
# NOTE(review): csatt[0, j] - csatt[0, i] covers columns i+1..j only, so the two
# values differ by att[0, i] — confirm which range is intended.
att[0, i:j+1].sum()

In [None]:
# Scratch copy of the encoder / attention-graph step.
# NOTE(review): use_cuda, inputs, encoder, process_matrix, compress_attention,
# build_graph and tokenid2word_mapping are not defined at notebook level in the
# cells shown; they must be in scope for this cell to run.
with torch.no_grad():
    if use_cuda:
        # Move every input tensor to the GPU (loop body was not indented in the paste).
        for key in inputs.keys():
            inputs[key] = inputs[key].cuda()
    # Run the encoder whether or not CUDA is used, so `outputs` always exists.
    outputs = encoder(**inputs, output_attentions=True)
trim = True
# Use average of last layer attention : page 6, section 3.1.2
attention = process_matrix(outputs[2], avg_head=True, trim=trim, use_cuda=use_cuda)
merged_attention = compress_attention(attention, tokenid2word_mapping)
attn_graph = build_graph(merged_attention)

In [None]:
# Print each entry's first field next to the decoded ids of its [start, end) range.
# NOTE(review): `chunk_spans` is not defined in the cells shown.
for (x, start, end) in chunk_spans:
    print(x, tok.decode(input_ids[start:end]))

In [None]:
# Inspect the boundaries of `chunk`; only the last expression is displayed.
# NOTE(review): `chunk` is only bound as a loop variable inside functions in the
# cells shown, not at notebook level.
chunk.start
chunk.end

In [None]:
def parse_sentence(sentence, tok_params):
    """Draft: tokenize ``sentence`` and collect its noun chunks.

    Calls the module-level tokenizer ``tok`` with ``tok_params`` as keyword
    arguments, runs the module-level ``nlp`` on the sentence and gathers the
    text, start and end of every noun chunk. Nothing is returned yet; all
    collected values are local.

    Args:
        sentence: Text to analyse.
        tok_params: Mapping of keyword arguments forwarded to ``tok``.
    """
    huggingface_encoding = tok(sentence, **tok_params)
    doc = nlp(sentence)
    tokens = list(doc)
    chunk2id = {}

    # Gather (text, start, end) per chunk, then split into parallel lists.
    chunk_info = [(chunk.text, chunk.start, chunk.end) for chunk in doc.noun_chunks]
    noun_chunks = [text for text, _, _ in chunk_info]
    start_chunk = [start for _, start, _ in chunk_info]
    end_chunk = [end for _, _, end in chunk_info]

In [None]:
class nlp:
    """Placeholder object that stores a sentence and an empty noun-chunk list.

    NOTE(review): this rebinds the name ``nlp``, which other cells use for the
    spaCy pipeline.
    """

    def __init__(self, sentence):
        self.text = sentence
        # The original list literal was left unfinished; start with no chunks.
        self.noun_chunks = []
def parse_sentence(*args, **kwargs):
    """Stub: build the mapping for the module-level ``sentence``; return no triplets.

    The positional and keyword arguments are accepted but not used;
    ``sentence``, ``nlp`` and ``tokenizer`` are read from module scope.
    """
    mapping = create_mapping(sentence, return_pt=True, nlp=nlp, tokenizer=tokenizer)
    inputs, tokenid2word_mapping, token2id, noun_chunks = mapping
    return []

def create_mapping(sentence, return_pt, nlp, tokenizer):
    """Stub that returns four ``None`` placeholders.

    Args:
        sentence, return_pt, nlp, tokenizer: accepted but not used yet.

    Returns:
        ``(inputs, tokenid2word, token2id, noun_chunks)``, all ``None``.
    """
    inputs = None
    tokenid2word = None
    token2id = None
    noun_chunks = None
    # Return the local that is actually assigned; `tokenid2word_mapping` was undefined.
    return inputs, tokenid2word, token2id, noun_chunks

# Placeholders so the stubbed pipeline can be called.
tokenizer = None
encoder = None

sentence = 'Bob Dylan är en gubbe som har skrivit musik.'

# `sent` is not defined in the cells shown; pass the sample sentence itself.
triplets = parse_sentence(sentence, tokenizer, encoder, nlp)

In [None]:
def create_mapping(sentence, return_pt=True, nlp=nlp, tokenizer=tokenizer):
    """Stub: accepts the mapping arguments and returns ``None``.

    The defaults for ``nlp`` and ``tokenizer`` are taken from the module-level
    names at the time this cell is run.
    """
    return

def parse_sentence(sentence, tokenizer, encoder, nlp, use_cuda=True):
    """Extract candidate relation triplets between the noun chunks of ``sentence``.

    Steps: ``create_mapping`` builds the encoder inputs and noun-chunk ids; the
    encoder is run with attentions enabled; the attention is reduced to a graph
    (``process_matrix`` -> ``compress_attention`` -> ``build_graph``); ``bfs`` is
    run for every ordered pair of distinct noun chunks; each result is passed
    through ``filter_relation_sets``.

    NOTE(review): ``Pool``, ``process_matrix``, ``compress_attention``,
    ``build_graph``, ``bfs``, ``global_initializer`` and
    ``filter_relation_sets`` are not defined or imported in the cells shown;
    they must be in scope before this function is called.

    Args:
        sentence: Text to analyse.
        tokenizer: Tokenizer forwarded to ``create_mapping``; its string
            description decides whether ``trim`` is enabled.
        encoder: Callable invoked as ``encoder(**inputs, output_attentions=True)``;
            element 2 of its output is used as the attention.
        nlp: Pipeline forwarded to ``create_mapping`` and to the worker initializer.
        use_cuda: If true, every input tensor is moved with ``.cuda()`` first.

    Returns:
        List of the non-empty results of ``filter_relation_sets``.
    """
    tokenizer_name = str(tokenizer.__str__)

    inputs, tokenid2word_mapping, token2id, noun_chunks = create_mapping(
        sentence, return_pt=True, nlp=nlp, tokenizer=tokenizer)

    with torch.no_grad():
        if use_cuda:
            for key in inputs.keys():
                inputs[key] = inputs[key].cuda()
        outputs = encoder(**inputs, output_attentions=True)

    # trim is switched off when the tokenizer description mentions GPT2.
    trim = 'GPT2' not in tokenizer_name

    # Use average of last layer attention : page 6, section 3.1.2
    attention = process_matrix(outputs[2], avg_head=True, trim=trim, use_cuda=use_cuda)

    merged_attention = compress_attention(attention, tokenid2word_mapping)
    attn_graph = build_graph(merged_attention)

    # Every ordered pair of distinct noun chunks, as token ids.
    tail_head_pairs = [
        (token2id[head], token2id[tail])
        for head in noun_chunks
        for tail in noun_chunks
        if head != tail
    ]

    black_list_relation = {token2id[n] for n in noun_chunks}
    id2token = {value: key for key, value in token2id.items()}

    if not tail_head_pairs:
        # No pair to search: avoid starting worker pools for nothing.
        return []

    # Same for every pair, so compute it once.
    max_token_id = max(tokenid2word_mapping)

    all_relation_pairs = []
    with Pool(10) as pool:
        params = [
            (head_id, tail_id, attn_graph, max_token_id, black_list_relation)
            for head_id, tail_id in tail_head_pairs
        ]
        for output in pool.imap_unordered(bfs, params):
            if len(output):
                all_relation_pairs += [(o, id2token) for o in output]

    triplet_text = []
    with Pool(10, global_initializer, (nlp,)) as pool:
        for triplet in pool.imap_unordered(filter_relation_sets, all_relation_pairs):
            if len(triplet) > 0:
                triplet_text.append(triplet)
    return triplet_text

In [None]:
# Rebind `nlp` to the 'xx_ent_wiki_sm' model, replacing whatever `nlp` was before.
# NOTE(review): presumably this model ships without a dependency parser, in
# which case `noun_chunks` in the cells below cannot work — confirm.
nlp = spacy.load('xx_ent_wiki_sm')
#nlp = spacy.load('da_core_news_md')

In [None]:
# Run the current `nlp` on the sample sentence.
x = nlp(sentence)

In [None]:
# List the noun chunks of the parsed sample sentence.
list(x.noun_chunks)