# Imports and Definitions

In [2]:
import pandas as pd
from glob import glob
from tqdm import tqdm
import os
import random
import urllib

In [3]:
root = "/scratch/tsoares/wikidumps/simplewiki-NS0-20231001/processed_data/"

In [4]:
def unencode_title(title):
    clean_title = urllib.parse.unquote(title).replace('_', ' ')
    return clean_title

# Load data

In [5]:
link_files = glob(os.path.join(root, "good_links*"))
page_files = glob(os.path.join(root, "good_pages*"))
link_files.sort()
page_files.sort()

In [6]:
print(link_files)
print(page_files)

['/scratch/tsoares/wikidumps/simplewiki-NS0-20231001/processed_data/good_links_0.parquet', '/scratch/tsoares/wikidumps/simplewiki-NS0-20231001/processed_data/good_links_1.parquet']
['/scratch/tsoares/wikidumps/simplewiki-NS0-20231001/processed_data/good_pages_0.parquet', '/scratch/tsoares/wikidumps/simplewiki-NS0-20231001/processed_data/good_pages_1.parquet']


In [7]:
dfs = []
for file in link_files:
    dfs.append(pd.read_parquet(file))
df_links = pd.concat(dfs)
df_links = df_links.sample(100_000).reset_index(drop=True)
df_links['source_title'] = df_links['source_title'].apply(unencode_title)
df_links['target_title'] = df_links['target_title'].apply(unencode_title)
df_links

Unnamed: 0,target_title,target_section,source_title,target_ID,target_QID,source_ID,source_QID,source_version,link_ID,mention,...,sentence_raw,sentence_start_index,sentence_end_index,source_page_length,link_section_depth,context,context_sentence_start_index,context_sentence_end_index,context_mention_start_index,context_mention_end_index
0,Golden Raspberry Awards,Lead,54 (movie),64981,Q40237,928355,Q1849161,https://simple.wikipedia.org/w/index.php?title...,mwIg,Razzie Awards,...,"\n<p id=""mwIQ"">The 1998 movie was nominated fo...",5356,5521,20232,2.0.0,"Critical reception On Rotten Tomatoes, this mo...",346.0,397.0,383.0,396.0
1,List of Nazi concentration camps,Lead,Auschwitz concentration camp,287995,Q27821,33310,Q7341,https://simple.wikipedia.org/w/index.php?title...,mwOg,about 150 concentration camps,...,"Starting in 1940, the Nazis built <a rel=""mw:...",9012,9219,265584,1.0.0,Background Extermination camps were different ...,72.0,160.0,106.0,135.0
2,United States,Lead,"Bangor, California",219587,Q30,745575,Q3458774,https://simple.wikipedia.org/w/index.php?title...,mwDw,United States,...,"<p id=""mwCg""><b id=""mwCw"">Bangor</b> is a <a r...",22296,22767,63670,0.0.0,Bangor is a census-designated place (CDP) in B...,0.0,85.0,71.0,84.0
3,Technology,Lead,Augmented reality,15232,Q11016,791092,Q254183,https://simple.wikipedia.org/w/index.php?title...,mwBQ,technology,...,") is a computer <a rel=""mw:WikiLink"" href=""./T...",1480,1726,3469,0.0.0,"Augmented reality (often known as AR, not to b...",95.0,179.0,109.0,119.0
4,List of Urdu-language poets,Lead,Mir Taqi Mir,278200,Q3307018,278198,Q1771132,https://simple.wikipedia.org/w/index.php?title...,mwCg,Urdu poet,...,"\n<p id=""mwBA""><b id=""mwBQ"">Mir Taqi Mir</b> (...",6621,7445,15038,0.0.0,Mir Taqi Mir (born 1723 in Agra - died Septemb...,71.0,145.0,81.0,90.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,Race (sociology),Lead,Johann Friedrich Blumenbach,200803,Q3254959,331650,Q58588,https://simple.wikipedia.org/w/index.php?title...,mwFA,races,...,"<p id=""mwEg"">Blumenbach divided the human <a r...",7925,8153,22516,1.0.0,Blumenbach's classification of races Blumenbac...,37.0,98.0,84.0,89.0
99996,Bas-Rhin,Lead,"Châtenois, Bas-Rhin",139778,Q12717,1012887,Q21440,https://simple.wikipedia.org/w/index.php?title...,mwCg,Bas-Rhin,...,"It is in <a rel=""mw:WikiLink"" href=""./Grand_E...",2401,2772,4408,0.0.0,Châtenois is a commune. It is in Grand Est in ...,24.0,90.0,50.0,58.0
99997,HCard,Lead,Google Search,97369,Q85327,264968,Q9366,https://simple.wikipedia.org/w/index.php?title...,mwgA,hCard,...,"<p id=""mwfw"">On 12 May 2009, Google announced ...",31116,31903,194697,2.0.0,Personalized Google homepages retained both bu...,571.0,762.0,635.0,640.0
99998,Organization,Lead,Spin-off,32644,Q43229,13495,Q1186399,https://simple.wikipedia.org/w/index.php?title...,mwCA,organisations,...,"It is often used for <a rel=""mw:WikiLink"" hre...",1507,2065,9325,0.0.0,A spin-off is something new that comes from so...,157.0,296.0,282.0,295.0


In [7]:
dfs = []
for file in page_files:
    dfs.append(pd.read_parquet(file, columns=['title', 'lead_paragraph']))
df_pages = pd.concat(dfs)
df_pages['title'] = df_pages['title'].apply(unencode_title)
df_pages

Unnamed: 0,title,lead_paragraph
0,1296,The year 1296 was a leap year which started on...
1,12 Rounds,12 Rounds is a British rock band that was star...
2,127 Hours,127 Hours is a 2010 American biographical dram...
3,12 Monkeys,12 Monkeys is a 1995 American science fiction ...
4,12 Angry Men,12 Angry Men is an American drama movie direct...
...,...,...
39247,Stjørdal,Stjørdal is a municipality in Trøndelag county...
39248,"Gol, Norway","Gol is a municipality in Viken county, Norway...."
39249,"Sel, Norway","Sel is a municipality in Innlandet county, Nor..."
39250,Stranda,Stranda is a municipality in Møre og Romsdal c...


In [8]:
df_links = df_links.to_dict(orient='records')
df_pages = df_pages.to_dict(orient='records')

In [9]:
mention_map = pd.read_parquet(os.path.join(root, "mention_map.parquet"))
mention_map = mention_map.to_dict(orient='records')
entity_map = {}
for row in mention_map:
    title = unencode_title(row['target_title'])
    mention = row['mention']    
    if title in entity_map:
        entity_map[title].add(mention)
    else:
        entity_map[title] = set([mention])
entity_map

{'1282': ['1282'],
 'March': ['March', 'march', 'marched'],
 'Dafydd ap Gruffydd': ['Dafydd ap Gruffydd'],
 'Prince of Wales': ['Prince of Wales',
  'Princess of Wales',
  'prince of Wales'],
 'Llywelyn the Last': ['Llywelyn the Last', 'Llywelyn ap Gruffyd'],
 'English language': ['English',
  'English language',
  'eng.',
  ' English language',
  'English-language',
  'English-speaking',
  'English Language',
  'standard English',
  'Modern English',
  'English-speaking countries',
  'Standard English',
  'English speaking',
  'Eng.',
  'English linguistics',
  'engl.',
  'English version',
  'the English language',
  'in English',
  'English language versions',
  'eng',
  'English languages',
  'english',
  'English:',
  'en',
  'English Department',
  'Anglophone',
  'Simple English Language'],
 'Wales': ['Wales',
  'Wales, London',
  'Welsh international',
  'Cymru',
  'Prydain',
  'Welshman'],
 'Edward I of England': ['Edward I of England',
  'Edward I',
  'King Edward I',
  'Edwa

# Create auxiliary data structures

In [10]:
source_to_all_targets = {}
target_to_all_sources = {}
for link in tqdm(df_links):
    source = link['source_title']
    target = link['target_title']
    source_section = link['source_section'].split('<sep>')[0]
    if source not in source_to_all_targets:
        source_to_all_targets[source] = []
    source_to_all_targets[source].append(target)
    if target not in target_to_all_sources:
        target_to_all_sources[target] = []
    target_to_all_sources[target].append(source)

100%|██████████| 100000/100000 [00:00<00:00, 330401.94it/s]


In [11]:
page_leads = {}
for page in tqdm(df_pages):
    title = page['title']
    lead = page['lead_paragraph']
    page_leads[title] = lead

100%|██████████| 237977/237977 [00:00<00:00, 1118573.51it/s]


# Set-up positive samples

In [12]:
positive_samples = []
for row in tqdm(df_links):
    sample = {}
    sample['source_title'] = row['source_title']
    sample['source_lead'] = page_leads[sample['source_title']]
    sample['target_title'] = row['target_title']
    sample['target_lead'] = page_leads[sample['target_title']]
    sample['link_context'] = row['context']
    sample['source_section'] = row['source_section'].split('<sep>')[0]
    sample['label'] = 1

    positive_samples.append(sample)

100%|██████████| 100000/100000 [00:00<00:00, 327312.03it/s]


# Set-up negative samples

## Define hyper-parameters

In [18]:
negative_strategies = {
    'easy_replace_source': True,
    'easy_replace_target': True,
    'hard_replace_source': True,
    'hard_replace_target': True,
    'replace_context': True
}
negative_samples_per_positive = 50

## Build negative samples from positive ones

In [19]:
strategies = [key for key in negative_strategies if negative_strategies[key]]
strategies

['easy_replace_source',
 'easy_replace_target',
 'hard_replace_source',
 'hard_replace_target',
 'replace_context']

In [26]:
negative_samples = []
for i in tqdm(range(len(positive_samples))):
    valid_strategies = strategies.copy()
    if len(source_to_all_targets[positive_samples[i]['source_title']]) == 1:
        valid_strategies.remove('hard_replace_target')
    if len(target_to_all_sources[positive_samples[i]['target_title']]) == 1:
        valid_strategies.remove('hard_replace_source')
    list_strategies = random.choices(valid_strategies, k=negative_samples_per_positive)
    new_samples = []
    print(positive_samples[i])
    for strategy in list_strategies:
        if strategy == 'easy_replace_source':
            new_source = random.choices(positive_samples, k=1)[0]['source_title']
            while new_source in target_to_all_sources[positive_samples[i]['target_title']]:
                new_source = random.choices(positive_samples, k=1)[0]['source_title']
            new_sample = positive_samples[i].copy()
            new_sample['source_title'] = new_source
            new_sample['source_lead'] = page_leads[new_source]
            new_sample['neg_type'] = 'easy_replace_source'
        elif strategy == 'easy_replace_target':
            new_target = random.choices(positive_samples, k=1)[0]['target_title']
            while new_target in source_to_all_targets[positive_samples[i]['source_title']]:
                new_target = random.choices(positive_samples, k=1)[0]['target_title']
            new_sample = positive_samples[i].copy()
            new_sample['target_title'] = new_target
            new_sample['target_lead'] = page_leads[new_target]
            new_sample['neg_type'] = 'easy_replace_target'
        elif strategy == 'hard_replace_source':
            new_source_section = random.choices(target_to_all_sources[positive_samples[i]['target_title']], k=1)[0]
            new_sample = positive_samples[i].copy()
            new_sample['source_title'] = new_source_section
            new_sample['source_lead'] = page_leads[new_source_section]
            new_sample['neg_type'] = 'hard_replace_source'
        elif strategy == 'hard_replace_target':
            safe_targets = []
            for target in source_to_all_targets[positive_samples[i]['source_title']]:
                found = False
                for mention in entity_map[target]:
                    if mention in positive_samples[i]['link_context']:
                        found = True
                        break
                if not found:
                    safe_targets.append(target)
            if len(safe_targets) == 0:
                new_target = random.choices(positive_samples, k=1)[0]['target_title']
                while new_target in source_to_all_targets[positive_samples[i]['source_title']]:
                    new_target = new_target = random.choices(positive_samples, k=1)[0]['target_title']
            else:
                new_target = random.choices(safe_targets, k=1)[0]
            new_sample = positive_samples[i].copy()
            new_sample['target_title'] = new_target
            new_sample['target_lead'] = page_leads[new_target]
            new_sample['neg_type'] = 'hard_replace_target'
        elif strategy == 'replace_context':
            new_sample = positive_samples[i].copy()
            new_context = random.choices(positive_samples, k=1)[0]['link_context']
            mention_words = []
            for mention in entity_map[new_sample['target_title']]:
                mention_words.append(mention)
            while True:
                found = False
                for mention in entity_map[new_sample['target_title']]:
                    if mention in new_context:
                        found = True
                        break
                if not found:
                    break
                new_context = random.choices(positive_samples, k=1)[0]['link_context']
            new_sample['link_context'] = new_context
            new_sample['neg_type'] = 'replace_context'
        new_sample['label'] = 0
        new_samples.append(new_sample)
    negative_samples.extend(new_samples)    

  0%|          | 15/100000 [00:00<24:08, 69.04it/s]

  0%|          | 16/100000 [00:00<22:56, 72.65it/s]

{'source_title': 'Marco Beltrami', 'source_lead': 'Marco Beltrami is an American composer and conductor of film and television scores.', 'target_title': 'Lalo Schifrin', 'target_lead': 'Boris Claudio "Lalo" Schifrin is an Argentine pianist, composer, arranger and conductor. He is best known for his movie and TV scores since the 1950s, including the theme from Mission: Impossible and Enter the Dragon. Schifrin has received five Grammy Awards and six Oscar nominations. Schifrin is also known for his works with Clint Eastwood from the late 1960s to the 1980s, particularly the Dirty Harry movies.', 'link_context': 'Influences Beltrami stated composers such as Jerry Goldsmith, Bruce Broughton, Paul Chihara, Henry Mancini, Alfred Newman, Mark Snow, Michael Kamen, Alex North, Buddy Baker, Ennio Morricone, Marty Paich, Lalo Schifrin, Angelo Badelamenti, Oliver Wallace, Mychael Danna, Randy Edelman, Frank Zappa, Marty Paich, Basil Poledouris, Joe Hisaishi, Lalo Schifrin, Leonard Bernstein, Fran




KeyError: '1934'

In [None]:
df = pd.DataFrame(positive_samples + negative_samples)
df = df.sample(frac=1).reset_index(drop=True)
df

In [None]:
train_df = df.sample(frac=0.8)
val_df = df.drop(train_df.index).sample(frac=0.5)
test_df = df.drop(train_df.index).drop(val_df.index)

In [21]:
for page in df_pages:
    if page['title'] == '1934':
        print(page['lead_paragraph'])

1934 (MCMXXXIV)
was a common year starting on Monday of the Gregorian calendar, the 1934th year of the Common Era (CE) and Anno Domini (AD) designations, the 934th year of the 2nd millennium, the 34th year of the 20th century, and the 5th year of the 1930s decade.


In [22]:
for link in df_links:
    if link['source_title'] == '1934' or link['target_title'] == '1934':
        print(link)

{'target_title': '1934', 'target_section': 'Lead', 'source_title': 'April 6', 'target_ID': '8436', 'target_QID': 'Q18714', 'source_ID': '9876', 'source_QID': 'Q2506', 'source_version': 'https://simple.wikipedia.org/w/index.php?title=April_6&oldid=8843984', 'link_ID': 'mwApM', 'mention': '1934', 'source_section': 'Births', 'link_start_index': 106396, 'link_end_index': 106463, 'sentence': '1934 - Mario Merola, Italian singer', 'sentence_raw': '<li id="mwApI"><a rel="mw:WikiLink" href="./1934" title="1934" id="mwApM">1934</a> - <a rel="mw:WikiLink" href="./Mario_Merola?action=edit&amp;redlink=1" title="Mario Merola" class="new" typeof="mw:LocalizedAttrs" data-mw-i18n=\'{"title":{"lang":"x-page","key":"red-link-title","params":["Mario Merola"]}}\' id="mwApQ">Mario Merola</a>, Italian singer</li>\n', 'sentence_start_index': 106381, 'sentence_end_index': 106736, 'source_page_length': 169462, 'link_section_depth': '2.0.0', 'context': '1928 – James D. Watson, American geneticist 1929 – André P

In [25]:
for mention in mention_map:
    if '1934' == mention['mention'] or '1934' == mention['target_title']:
        print(mention)

{'mention': '1934', 'target_title': '1933%E2%80%9334_Austrian_Cup'}
