# Imports and Definitions

In [1]:
import pandas as pd
from glob import glob
from tqdm import tqdm
import os
import random

In [2]:
root = "/scratch/tsoares/wikidumps/simplewiki-NS0-20231001/processed_data/"

# Load data

In [3]:
link_files = glob(os.path.join(root, "good_links*"))
page_files = glob(os.path.join(root, "good_pages*"))
link_files.sort()
page_files.sort()

In [4]:
print(link_files)
print(page_files)

['/scratch/tsoares/wikidumps/simplewiki-NS0-20231001/processed_data/good_links_0.parquet', '/scratch/tsoares/wikidumps/simplewiki-NS0-20231001/processed_data/good_links_1.parquet']
['/scratch/tsoares/wikidumps/simplewiki-NS0-20231001/processed_data/good_pages_0.parquet', '/scratch/tsoares/wikidumps/simplewiki-NS0-20231001/processed_data/good_pages_1.parquet']


In [5]:
dfs = []
for file in link_files:
    dfs.append(pd.read_parquet(file))
df_links = pd.concat(dfs)
df_links = df_links.sample(100_000).reset_index(drop=True)
df_links

Unnamed: 0,target_title,target_section,source_title,target_ID,target_QID,source_ID,source_QID,mention,source_section,link_start_index,link_end_index,sentence,sentence_raw,sentence_start_index,sentence_end_index,source_page_length,link_section_depth,context
0,Album,Lead,B%27Day_Anthology_Video_Album,18202,Q482994,74938,Q148931,album,Lead,1714,1783,"B'Day Anthology Video Album is a video album, ...","<p id=""mwAw""><i id=""mwBA""><b id=""mwBQ"">B'Day A...",1552,2114,32709,0.0.0,It was released alongside B'Day Deluxe Edition...
1,Dementia,Lead,Terry_Jones,19054,Q83030,7467,Q166159,dementia,Lead,8873,8951,\nIn September 2016 it was announced that Jone...,"\n<p id=""mwDQ"">In September 2016 it was announ...",8637,9053,26178,0.0.0,Terence Graham Parry Jones (1 February 1942 – ...
2,Motoo_Tatsuhara,Lead,Yasuo_Suzuki,245596,Q1817515,245595,Q614306,Motoo Tatsuhara,Biography,10694,10793,"At this club, he played many Japan national t...","At this club, he played many <a rel=""mw:WikiL...",10526,10893,33460,1.0.0,Biography Suzuki was born in Kanagawa Prefectu...
3,Prime_Minister_of_Bangladesh,Lead,Shah_Azizur_Rahman,345471,Q14565638,174598,Q3249683,Prime Minister of Bangladesh,Lead,2016,2154,He was the Prime Minister of Bangladesh.,"He was the <a rel=""mw:WikiLink"" href=""./Prime...",2004,2155,4527,0.0.0,Shah Azizur Rahman (Bengali: শাহ আজিজুর রহমান;...
4,Association_football,Lead,Kim_Do-keun,2062,Q2736,221409,Q166650,association football,Lead,10817,10931,He is best known as an association football ...,"He is best known as an <a rel=""mw:WikiLink"" ...",10792,10939,37430,0.0.0,Kim Do-Keun (born 2 March 1972) is a South Kor...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,Equator,Lead,Decimal_degrees,6792,Q23538,698880,Q1798227,equator,Precision,2645,2720,The radius of the semi-major axis (the distanc...,"<p id=""mwCw"">The radius of the <a rel=""mw:Wiki...",2363,3368,9189,1.0.0,Precision The number of decimal places require...
99996,Mumps,Lead,Salivary_gland,105992,Q176741,33151,Q134938,mumps,Medical notes,10673,10742,See mumps\n,"<dl id=""mwXQ""><dd id=""mwXg""><i id=""mwXw"">See <...",10628,10757,25464,3.0.0,Medical notes Ducts can get blocked. This woul...
99997,L%C3%A1szl%C3%B3_Marton_%28director%29,Lead,Deaths_in_September_2019,713097,Q463061,710658,Q66777856,László Marton,September,105699,105815,"László Marton, 76, Hungarian theatre director.","<li id=""mwAng""><a rel=""mw:WikiLink"" href=""./Lá...",105684,105848,352687,1.0.0,", 58, American singer-songwriter (""Walking the..."
99998,Book_of_Genesis,Lead,Creationism,11278,Q9184,14220,Q130352,Genesis,Islamic Creationism,19188,19283,Many scholars say that Genesis is a corrupted...,"Many scholars say that <a rel=""mw:WikiLink"" h...",19164,19324,58611,3.0.0,Islamic Creationism Islamic creationism is the...


In [6]:
dfs = []
for file in page_files:
    dfs.append(pd.read_parquet(file, columns=['title', 'lead_paragraph']))
df_pages = pd.concat(dfs)
df_pages

Unnamed: 0,title,lead_paragraph
0,1296,The year 1296 was a leap year which started on...
1,12_Rounds,12 Rounds is a British rock band that was star...
2,127_Hours,127 Hours is a 2010 American biographical dram...
3,12_Monkeys,12 Monkeys is a 1995 American science fiction ...
4,12_Angry_Men,12 Angry Men is an American drama movie direct...
...,...,...
39247,Stj%C3%B8rdal,Stjørdal is a municipality in Trøndelag county...
39248,Gol%2C_Norway,"Gol is a municipality in Viken county, Norway...."
39249,Sel%2C_Norway,"Sel is a municipality in Innlandet county, Nor..."
39250,Stranda,Stranda is a municipality in Møre og Romsdal c...


# Create auxiliary data structures

In [7]:
source_to_all_targets = {}
target_to_all_sources = {}
for i in tqdm(range(len(df_links))):
    source = df_links.iloc[i]['source_title']
    target = df_links.iloc[i]['target_title']
    source_section = df_links.iloc[i]['source_section'].split('<sep>')[0]
    if source not in source_to_all_targets:
        source_to_all_targets[source] = []
    source_to_all_targets[source].append({'target': target, 'section': source_section})
    if target not in target_to_all_sources:
        target_to_all_sources[target] = []
    target_to_all_sources[target].append(source)

100%|██████████| 100000/100000 [00:23<00:00, 4345.58it/s]


In [8]:
page_leads = {}
for i in tqdm(range(len(df_pages))):
    title = df_pages.iloc[i]['title']
    lead = df_pages.iloc[i]['lead_paragraph']
    page_leads[title] = lead

100%|██████████| 237977/237977 [00:16<00:00, 14791.52it/s]


# Set-up positive samples

In [18]:
# positive_samples = []
# for i in tqdm(range(len(df_links))):
#     sample = {}
#     sample['source_title'] = df_links.iloc[i]['source_title']
#     sample['source_lead'] = page_leads[sample['source_title']]
#     sample['target_title'] = df_links.iloc[i]['target_title']
#     sample['target_lead'] = page_leads[sample['target_title']]
#     sample['link_context'] = df_links.iloc[i]['context']
#     sample['source_section'] = df_links.iloc[i]['source_section'].split('<sep>')[0]
#     sample['label'] = 1

#     positive_samples.append(sample)
    
positive_samples = []
for i in tqdm(range(len(df_links))):
    row = df_links.iloc[i]
    sample = {}
    sample['source_title'] = row['source_title']
    sample['source_lead'] = page_leads[sample['source_title']]
    sample['target_title'] = row['target_title']
    sample['target_lead'] = page_leads[sample['target_title']]
    sample['link_context'] = row['context']
    sample['source_section'] = row['source_section'].split('<sep>')[0]
    sample['label'] = 1

    positive_samples.append(sample)

positive_samples = [{
    'source_title': row['source_title'],
    'source_lead': page_leads[row['source_title']],
    'target_title': row['target_title'],
    'target_lead': page_leads[row['target_title']],
    'link_context': row['context'],
    'source_section': row['source_section'].split('<sep>')[0],
    'label': 1
} for index, row in tqdm(df_links.iterrows(), total=len(df_links))]

  0%|          | 0/100000 [00:00<?, ?it/s]

100%|██████████| 100000/100000 [00:28<00:00, 3480.73it/s]
100%|██████████| 100000/100000 [00:08<00:00, 11139.24it/s]
100%|██████████| 100000/100000 [00:08<00:00, 11694.50it/s]


# Set-up negative samples

## Define hyper-parameters

In [11]:
negative_strategies = {
    'easy_replace_source': True,
    'easy_replace_target': True,
    'hard_replace_source': True,
    'hard_replace_target': True,
    'replace_context': True,
    'replace_section_source': True,
}
negative_samples_per_positive = 10

## Build negative samples from positive ones

In [12]:
strategies = [key for key in negative_strategies if negative_strategies[key]]
strategies

['easy_replace_source',
 'easy_replace_target',
 'hard_replace_source',
 'hard_replace_target',
 'replace_context',
 'replace_section_source']

In [14]:
negative_samples = []
for i in tqdm(range(len(positive_samples))):
    valid_strategies = strategies.copy()
    if len(source_to_all_targets[positive_samples[i]['source_title']]) == 1:
        valid_strategies.remove('hard_replace_target')
    if len(target_to_all_sources[positive_samples[i]['target_title']]) == 1:
        valid_strategies.remove('hard_replace_source')
    list_strategies = random.choices(valid_strategies, k=negative_samples_per_positive)
    new_samples = []
    for strategy in list_strategies:
        if strategy == 'easy_replace_source':
            new_source = random.choices(positive_samples, k=1)[0]['source_title']
            while new_source in target_to_all_sources[positive_samples[i]['target_title']]:
                new_source = random.choices(positive_samples, k=1)[0]['source_title']
            new_sample = positive_samples[i].copy()
            new_sample['source_title'] = new_source
            new_sample['source_lead'] = page_leads[new_source]
        elif strategy == 'easy_replace_target':
            new_target = random.choices(positive_samples, k=1)[0]['target_title']
            while new_target in source_to_all_targets[positive_samples[i]['source_title']]:
                new_target = random.choices(positive_samples, k=1)[0]['target_title']
            new_sample = positive_samples[i].copy()
            new_sample['target_title'] = new_target
            new_sample['target_lead'] = page_leads[new_target]
        elif strategy == 'hard_replace_source':
            new_source_section = random.choices(target_to_all_sources[positive_samples[i]['target_title']], k=1)[0]
            new_sample = positive_samples[i].copy()
            new_sample['source_title'] = new_source_section
            new_sample['source_lead'] = page_leads[new_source_section]
        elif strategy == 'hard_replace_target':
            safe_targets = [target['target'] for target in source_to_all_targets[positive_samples[i]['source_title']] if target['section'] != positive_samples[i]['source_section']]
            if len(safe_targets) == 0:
                new_target = random.choices(positive_samples, k=1)[0]['target_title']
                while new_target in source_to_all_targets[positive_samples[i]['source_title']]:
                    new_target = new_target = random.choices(positive_samples, k=1)[0]['target_title']
            else:
                new_target = random.choices(safe_targets, k=1)[0]
            new_sample = positive_samples[i].copy()
            new_sample['target_title'] = new_target
            new_sample['target_lead'] = page_leads[new_target]
        elif strategy == 'replace_context':
            new_sample = positive_samples[i].copy()
            new_context = random.choices(positive_samples, k=1)[0]['link_context']
            while new_context == positive_samples[i]['link_context']:
                new_context = random.choices(positive_samples, k=1)[0]['link_context']
            new_sample['link_context'] = new_context
        elif strategy == 'replace_section_source':
            new_sample = positive_samples[i].copy()
            new_source_section = random.choices(positive_samples, k=1)[0]['source_section']
            while new_source_section == positive_samples[i]['source_section']:
                new_source_section = random.choices(positive_samples, k=1)[0]['source_section']
            new_sample['source_section'] = new_source_section
        new_sample['label'] = 0
        new_samples.append(new_sample)
    negative_samples.extend(new_samples)    

  1%|▏         | 1267/100000 [00:00<00:07, 12637.02it/s]

100%|██████████| 100000/100000 [00:04<00:00, 21671.30it/s]


In [15]:
df = pd.DataFrame(positive_samples + negative_samples)
df = df.sample(frac=1).reset_index(drop=True)
df

Unnamed: 0,source_title,source_lead,target_title,target_lead,link_context,source_section,label
0,Neal_Cassady,Neal Cassady was an American writer. He was a ...,Denver,Denver is the capital and largest city in the ...,'s latest writings. Howl became a landmark wor...,Works,0
1,Alexander_Volkov_%28tennis%29,Alexander Vladimirovich Volkov was a Russian p...,Russian_language,Russian is a Slavic language. It is the main l...,Clothesline A clothesline happens when a wrest...,Lead,0
2,English_language,English is a language that started in Anglo-Sa...,Germanic_languages,The Germanic languages are a group of Indo-Eur...,a few words.The closest language to English th...,History,1
3,Matt_Bellamy,Matthew James Bellamy is an English singer. He...,Chuck_Bednarik,"Charles Philip ""Chuck"" Bednarik, or Concrete C...",UPI Comeback Player of the Year award (1962–19...,UPI Comeback Player of the Year award (1962–1969),0
4,Xanadu_%28Rush_song%29,Xanadu is a song by the Canadian progressive r...,Instrumental,An instrumental is music that has no words or ...,Xanadu is a song by the Canadian progressive r...,Early life,0
...,...,...,...,...,...,...,...
1099995,Emmy_Award,The Emmy Awards are United States television p...,New_Delhi,New Delhi is the capital of India. It is also ...,The Emmy Awards are United States television p...,Lead,0
1099996,Ethiopia%2C_Be_Happy,"Ethiopia, Be Happy or Ityoṗya hoy des Ybelish ...",Haile_Selassie_I,Haile Selassie I was the Emperor of Ethiopia f...,"Characteristics The dark, elliptically-shaped ...",Lead,0
1099997,September_4,September 4 is the 247th day of the year in th...,Elizabeth_II,Elizabeth II was Head of the Commonwealth and ...,"As of the 2010 census, the township had a popu...",Events,0
1099998,Egyptian_Arabic,The modern Egyptian language is a group of dia...,French_language,French is a Romance language that was first sp...,"It includes the city of Montreal, the rest of ...",Lead,0


In [16]:
train_df = df.sample(frac=0.8)
val_df = df.drop(train_df.index).sample(frac=0.5)
test_df = df.drop(train_df.index).drop(val_df.index)