In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import sys
import os
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn as nn
from ast import literal_eval
from urllib import parse
import random

sys.path.append('..')
from baselines import bm25
from baselines import fuzzy_match

In [2]:
def fix_title(title):
    """Turn a URL-style page title into its readable form.

    Percent-escapes are decoded first, then every underscore is replaced
    by a space.
    """
    decoded = parse.unquote(title)
    readable = decoded.replace('_', ' ')
    return readable

# Prepare ranking functions

In [3]:
def bm25_best_candidate(contexts, target_title, target_lead, mentions):
    """Return the highest-scoring context according to the BM25 baseline.

    The scores come from ``bm25.rank_contexts``; the two trailing positional
    flags (True, False) are forwarded unchanged.
    NOTE(review): the meaning of those flags is defined in ``baselines.bm25``
    and is not visible here - confirm before changing them.

    Returns:
        dict with keys 'score' and 'context'. When several contexts share the
        top score the earliest one wins; with no scores the result is
        {'score': -inf, 'context': ''}.
    """
    ranked = bm25.rank_contexts(contexts, target_title, target_lead, mentions, True, False)

    # Single pass keeping the running maximum; strict '>' keeps the first of any tie.
    top_score, top_context = float('-inf'), ''
    for position, value in enumerate(ranked):
        if not value > top_score:
            continue
        top_score, top_context = value, contexts[position]

    return {'score': top_score, 'context': top_context}

In [4]:
def string_match_best_candidate(contexts, target_mentions):
    """Return the highest-scoring context according to the fuzzy-match baseline.

    Scores are produced by ``fuzzy_match.rank_contexts``, called with the two
    positional flags (True, False) unchanged.
    NOTE(review): the semantics of those flags live in ``baselines.fuzzy_match``
    and cannot be confirmed from this notebook.

    Returns:
        dict with keys 'score' and 'context'. Ties resolve to the earliest
        context; an empty score list yields {'score': -inf, 'context': ''}.
    """
    match_scores = fuzzy_match.rank_contexts(contexts, target_mentions, True, False)

    winner = {'score': float('-inf'), 'context': ''}
    idx = 0
    for current in match_scores:
        # Only a strictly better score replaces the current winner.
        if current > winner['score']:
            winner['score'] = current
            winner['context'] = contexts[idx]
        idx += 1

    return winner

In [5]:
def model_best_candidate(model, prediction_head, tokenizer, contexts, source_sections, target_title, target_lead, mentions):
    """Score every candidate context with the encoder + head and return the best.

    Each candidate is encoded as a text pair:
    ``"<title> <mentions><sep><lead>"`` and ``"<section><sep><context>"``.
    The embedding at sequence position 0 of ``last_hidden_state`` is fed to
    ``prediction_head`` and the resulting scalar is the candidate's score.

    Args:
        model: encoder module, called as ``model(**tokens)``; its
            ``'last_hidden_state'`` output is used.
        prediction_head: module mapping that embedding to a single value.
        tokenizer: tokenizer exposing ``sep_token`` and callable on a list of
            text pairs with ``return_tensors='pt'``.
        contexts: candidate context strings.
        source_sections: section name per candidate, paired with ``contexts``
            via ``zip`` (extra items in the longer sequence are ignored).
        target_title: title of the target page.
        target_lead: lead text of the target page.
        mentions: mapping title -> mention string. It is only read; a missing
            title is treated as an empty mention string.

    Returns:
        dict with keys 'score' (float) and 'context' (str). Ties keep the
        earliest candidate; with no candidates the result is
        {'score': -inf, 'context': ''}.
    """
    best_context = {'score': float('-inf'), 'context': ''}

    # Read-only lookup: inserting a default into the caller's mapping would
    # change later `title in mentions` checks made elsewhere in the notebook.
    target_mentions = mentions[target_title] if target_title in mentions else ''

    # The target side of the pair is identical for every candidate.
    target_text = f"{target_title} {target_mentions}{tokenizer.sep_token}{target_lead}"

    # Put the inputs wherever the model lives instead of relying on a global
    # `device` variable defined in another cell.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    # Inference only: skip building the autograd graph for every candidate.
    with torch.no_grad():
        for context, source_section in zip(contexts, source_sections):
            candidate_text = f"{source_section}{tokenizer.sep_token}{context}"
            input_tokens = tokenizer([[target_text, candidate_text]], return_tensors='pt', padding='max_length', truncation=True, max_length=512).to(model_device)
            embeddings = model(**input_tokens)['last_hidden_state'][:, 0, :]
            score = prediction_head(embeddings).squeeze().item()
            if score > best_context['score']:
                best_context['score'] = score
                best_context['context'] = context

    return best_context

# Load model

In [6]:
# Use the GPU when one is available; otherwise fall back to CPU so the
# notebook still runs end to end on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Encoder, tokenizer and head weights all live under one checkpoint directory;
# change this single path to analyse a different checkpoint.
MODEL_DIR = '/dlabdata1/tsoares/models/roberta_full_multilingual-all'

model = AutoModel.from_pretrained(f'{MODEL_DIR}/model')
tokenizer = AutoTokenizer.from_pretrained(f'{MODEL_DIR}/tokenizer')

# Two-layer head: hidden_size -> hidden_size -> 1 (one score per input pair).
prediction_head = nn.Sequential(nn.Linear(model.config.hidden_size, model.config.hidden_size),
                                nn.ReLU(),
                                nn.Linear(model.config.hidden_size, 1))
# Weights are first loaded onto CPU and moved to `device` below.
prediction_head.load_state_dict(torch.load(f'{MODEL_DIR}/classification_head.pth', map_location='cpu'))

# Move to the target device and switch to evaluation mode.
model = model.to(device).eval()
prediction_head = prediction_head.to(device).eval()

# Load and prepare data

In [7]:
# Test examples; later cells read the columns 'source_title', 'target_title',
# 'target_lead', 'first_version', 'context', 'section' and 'negative_contexts'.
df = pd.read_parquet('test_data/en.parquet')
# Mention table; later cells read its 'target_title' and 'mention' columns.
mentions = pd.read_parquet('test_data/en_mention_map.parquet')

In [8]:
# Decode percent-escapes and replace underscores in the target titles.
# NOTE: this overwrites the column in place, so re-running the cell applies
# fix_title again to titles that were already decoded.
df['target_title'] = df['target_title'].apply(fix_title)

In [9]:
# Group every raw mention under its normalised target title.
mention_map_pre = mentions.to_dict('records')
mention_map = {}
for row in mention_map_pre:
    mention_map.setdefault(fix_title(row['target_title']), []).append(row['mention'])

# Collapse each title's mentions into one space-separated string of at most
# 10 unique, lower-cased mentions.
# A local, seeded RNG keeps the result identical across kernel restarts and
# leaves the global `random` state untouched.
mention_rng = random.Random(0)
for title in mention_map:
    # Sorting by (length, text) gives a deterministic order; the iteration
    # order of a set of strings varies between Python processes.
    unique_mentions = sorted({mention.lower() for mention in mention_map[title]},
                             key=lambda mention: (len(mention), mention))
    if len(unique_mentions) > 10:
        # Discard the shortest mentions (fewer than 3 characters) first, but
        # only while more than 10 mentions would remain.
        skip = 0
        while len(unique_mentions) - skip > 10 and len(unique_mentions[skip]) < 3:
            skip += 1
        unique_mentions = unique_mentions[skip:skip + 10]
        # Shuffle so the kept mentions are not ordered by length.
        mention_rng.shuffle(unique_mentions)
    mention_map[title] = ' '.join(unique_mentions)

# Analyze examples

The following are good examples:
 - a
 - a
 - a
 - a
 

## Examples of missing mention

In [31]:
# Pick one example to inspect. Alternative selectors, kept for quick switching:
# sample = df[df['missing_category'] == 'missing_sentence'].sample(1)
# sample = df[df['missing_category'].isna()].sample(1)
sample = df[df['source_title'] == 'FC_Hermannstadt'].sample(1)

target_title = sample['target_title'].values[0]
target_lead = sample['target_lead'].values[0]
first_version = sample['first_version'].values[0]

# The sample's own context/section come first, followed by each entry of its
# 'negative_contexts' column (stored as a Python-literal string).
contexts = [sample['context'].values[0]]
source_sections = [sample['section'].values[0]]
negative_contexts = literal_eval(sample['negative_contexts'].values[0])
for neg_context in negative_contexts:
    contexts += [neg_context['context']]
    source_sections += [neg_context['section']]

print('target_title', target_title)
print('target_lead', target_lead)
print('first_version', first_version)
print('contexts', contexts)
print('source_sections', source_sections)
print('number of candidates', len(contexts))

# NOTE(review): a title found in mention_map yields a space-joined string,
# while the fallback is a one-element list - confirm both baselines accept
# either form.
print('RUNNING BM25')
best_bm25 = bm25_best_candidate(contexts, target_title, target_lead, mention_map.get(target_title, [target_title]))
print('RESULT', best_bm25)

print('RUNNING FUZZY MATCH')
best_fuzzy_match = string_match_best_candidate(contexts, mention_map.get(target_title, [target_title]))
print('RESULT', best_fuzzy_match)
print('RUNNING MODEL')
best_model = model_best_candidate(model, prediction_head, tokenizer, contexts, source_sections, target_title, target_lead, mention_map)
print('RESULT', best_model)

target_title Karlo Letica
target_lead Karlo Letica is a Croatian professional footballer who plays as a goalkeeper for Swiss Super League club FC Lausanne-Sport.
first_version 1178541564
contexts ["Brazil\nJô Santos\nRomário Pires\n\nBulgaria\nPlamen Iliev\n\nCongo\nJuvhel Tsoumou\n\nCroatia\nGabriel Debeljuh\nCôte d'Ivoire\nOusmane Viera\n\nSwitzerland\nGoran Karanović", "Association football club in Sibiu\nNot to be confused with FC Sibiu.\nFootball club\nAsociația Fotbal Club Hermannstadt (German pronunciation: [ˈhɛʁmanʃtat]), commonly known as FC Hermannstadt, Hermannstadt or familiarly as Sibiu (Romanian pronunciation: [siˈbiw]), is a Romanian professional football club based in the city of Sibiu (German: Hermannstadt), Sibiu County, which currently competes in Liga I. The team was established in 2015 and introduced in the fourth division, with Hermannstadt being the equivalent of the city's name in the standard German language (i.e. Hochdeutsch). Roș-negrii achieved successive pr

RESULT {'score': 13.536664962768555, 'context': "Romário Pires\n \nBulgaria\n Plamen Iliev\n \nCongo\n Juvhel Tsoumou\n \nCroatia\n Gabriel Debeljuh\n \nCôte d'Ivoire\n Ousmane Viera\n \nSwitzerland\n Goran Karanović"}
