# Imports and Definitions

In [None]:
import pandas as pd
import os
from glob import glob
from tqdm import tqdm
import random
import urllib
import re

In [None]:
# Directories of the two simplewiki dumps being compared:
# month_1 is the snapshot dated 20230901, month_2 the one dated 20231001.
month_1 = "/scratch/tsoares/wikidumps/simplewiki-NS0-20230901"
month_2 = "/scratch/tsoares/wikidumps/simplewiki-NS0-20231001"

In [None]:
# Fraction of the links assigned to each corruption strategy (the four sum to 1).
mask_span = 0.1      # remove the whole span around the mention
mask_mention = 0.2   # remove only the mention
mask_sentence = 0.3  # remove the sentence containing the mention
no_mask = 0.4        # keep the context as is

# Load Data

In [None]:
# Read every "links*" parquet file of the first snapshot and stack them into one frame
files_1 = glob(os.path.join(month_1, 'processed_data', "links*"))
dfs = [pd.read_parquet(shard) for shard in tqdm(files_1)]
df_1 = pd.concat(dfs)
df_1

In [None]:
# Read every "links*" parquet file of the second snapshot and stack them into one frame
files_2 = glob(os.path.join(month_2, 'processed_data', "links*"))
dfs = [pd.read_parquet(shard) for shard in tqdm(files_2)]
df_2 = pd.concat(dfs)
df_2

In [None]:
# Page-level data is taken from the second snapshot only
page_shards = glob(os.path.join(month_2, 'processed_data', "pages*"))
dfs = [pd.read_parquet(shard) for shard in tqdm(page_shards)]
df_pages = pd.concat(dfs)

# Convert data into better structure

In [None]:
# Turn each link frame into a list with one plain dict per row
df_links_1, df_links_2 = (
    frame.to_dict(orient='records') for frame in (df_1, df_2)
)

In [None]:
# Cast every "...index..." field to int, leaving NaN entries untouched
for row in tqdm(df_links_1):
    for key, value in row.items():
        if 'index' not in key:
            continue
        # NaN is the only value that is not equal to itself
        if value == value:
            row[key] = int(value)

In [None]:
# Cast every "...index..." field to int, leaving NaN entries untouched
for row in tqdm(df_links_2):
    for key, value in row.items():
        if 'index' not in key:
            continue
        # NaN is the only value that is not equal to itself
        if value == value:
            row[key] = int(value)

In [None]:
# old_data[source_title][target_title] -> list of link rows from the first snapshot
old_data = {}
for mod_link in tqdm(df_links_1):
    links_by_target = old_data.setdefault(mod_link['source_title'], {})
    links_by_target.setdefault(mod_link['target_title'], []).append(mod_link)

In [None]:
# new_data[source_title][target_title] -> list of link rows from the second snapshot
new_data = {}
for mod_link in tqdm(df_links_2):
    links_by_target = new_data.setdefault(mod_link['source_title'], {})
    links_by_target.setdefault(mod_link['target_title'], []).append(mod_link)

In [None]:
# Titles of pages that cannot be used: no HTML, no lead paragraph,
# or a lead paragraph with fewer than six whitespace-separated words.
html_missing = df_pages['HTML'].isna() | (df_pages['HTML'] == '')
lead_missing = df_pages['lead_paragraph'].isna() | (df_pages['lead_paragraph'] == '')
# Only real strings can be split: a missing lead may be NaN or pd.NA rather than
# None, and those have no .split(). Missing leads are already covered by no_lead.
lead_too_short = df_pages['lead_paragraph'].apply(
    lambda x: isinstance(x, str) and len(x.split()) < 6
)

no_html = set(df_pages[html_missing]['title'].tolist())
no_lead = set(df_pages[lead_missing]['title'].tolist())
short_lead = set(df_pages[lead_too_short]['title'].tolist())

# Find added links

In [None]:
# Compare the two snapshots and collect the links that exist only in the new one.
#   new_pages       - source pages absent from the old snapshot
#   new_page_links  - links found on those new pages (they are NOT added to new_links)
#   new_links       - links added to pages that already existed
#   no_id_found / no_id_not_found - ID-less links that could / could not be paired
#                                   with an ID-less old link
new_pages = 0
new_page_links = 0
new_links = []
no_id_found = 0
no_id_not_found = 0

for source_page in tqdm(new_data):
    if source_page not in old_data:
        new_pages += 1
        # new_data[source_page] maps target -> list of links, so count the links
        # themselves rather than the number of distinct targets
        new_page_links += sum(len(links) for links in new_data[source_page].values())
        continue
    for target_page in new_data[source_page]:
        current_links = new_data[source_page][target_page]
        if target_page not in old_data[source_page]:
            # The old page never linked to this target: every link is new
            new_links.extend(current_links)
            continue

        old_links = old_data[source_page][target_page]
        links_with_id = [cand for cand in current_links if cand['link_ID'] is not None]
        links_without_id = [cand for cand in current_links if cand['link_ID'] is None]

        # A link with an ID is new when no old link carries the same ID
        for mod_link in links_with_id:
            if not any(mod_link['link_ID'] == old_link['link_ID'] for old_link in old_links):
                new_links.append(mod_link)

        # Links without an ID are paired with an ID-less old link that has the same
        # mention and section; each old link can be consumed at most once
        used = set()
        for mod_link in links_without_id:
            for i, old_link in enumerate(old_links):
                if (
                    i not in used
                    and old_link['link_ID'] is None
                    and old_link['mention'] == mod_link['mention']
                    and old_link['source_section'] == mod_link['source_section']
                ):
                    used.add(i)
                    no_id_found += 1
                    break
            else:
                # The inner loop ended without a break: no old counterpart exists
                no_id_not_found += 1
                new_links.append(mod_link)

print(f"The new data has {new_pages} new pages and {new_page_links} links in these new pages")
print(f"There are {len(new_links)} new links in the new data")
no_id_total = no_id_found + no_id_not_found
if no_id_total:
    print(f"From the links without ID, {no_id_found} ({no_id_found / no_id_total * 100:.2f}%) were matched to old links, and {no_id_not_found} ({no_id_not_found / no_id_total * 100:.2f}%) were not matched")
else:
    # Avoid dividing by zero when every compared link carries an ID
    print("There were no links without ID to match against old links")

In [None]:
# Eyeball the context of the first ten added links, one blank line between them
for link in new_links[:10]:
    print(link['context'], end='\n\n')

# Clean new links

In [None]:
# Keep only the added links that are usable, then normalise their titles and context.
clean_links = []
for link in tqdm(new_links):
    # Both endpoints must be resolved
    if link['target_ID'] is None or link['source_QID'] is None or link['target_QID'] is None:
        continue
    # The source page needs a lead paragraph that is present and not too short
    if link['source_title'] in no_lead or link['source_title'] in short_lead:
        continue
    # The target page needs HTML and a lead paragraph that is present and not too short
    if (
        link['target_title'] in no_html
        or link['target_title'] in no_lead
        or link['target_title'] in short_lead
    ):
        continue
    # Drop self-links: compare against this link's own source page
    if link['target_title'] == link['source_title']:
        continue
    if link['context'] is None:
        continue

    # Normalise a copy so new_links keeps the raw titles; the membership tests above
    # rely on raw titles, so mutating in place would break a second run of this cell
    link = link.copy()
    link['source_title'] = urllib.parse.unquote(link['source_title']).replace('_', ' ')
    link['target_title'] = urllib.parse.unquote(link['target_title']).replace('_', ' ')
    # Drop blank lines from the context
    link['context'] = "\n".join(line for line in link['context'].split("\n") if line.strip() != '')
    clean_links.append(link)

if new_links:
    print(f"Out of the {len(new_links)} new links, {len(clean_links)} ({len(clean_links) / len(new_links) * 100:.2f}%) are valid")
else:
    # Avoid dividing by zero when no added links were found
    print("There are no new links to clean")

In [None]:
# Eyeball the context of the first ten cleaned links, one blank line between them
for link in clean_links[:10]:
    print(link['context'], end='\n\n')

# Apply corruption to new links

## Link triage

Let's find which links can handle each corruption. The corruptions are increasingly narrow (each one applies to fewer contexts than the previous one), in the following order: no mask > mask mention > mask sentence > mask span. This means that any context which can handle "mask sentence" can also handle "mask mention" and "no mask", but not necessarily "mask span". All contexts can handle "no mask" (no corruption).

In [None]:
def _triage(link):
    """Return the first strategy ('span', 'sentence', 'mention') the link qualifies for.

    Strategies are tried in that order; 'none' is returned when the link qualifies
    for no masking at all. A strategy needs non-whitespace text to remain once its
    region is cut out of the context.
    """
    context = link['context']

    # mask span: the span must also cover the whole sentence
    span_lo, span_hi = link['context_span_start_index'], link['context_span_end_index']
    if (context[:span_lo] + context[span_hi:]).strip() != '' and (
        span_lo <= link['context_sentence_start_index']
        and span_hi >= link['context_sentence_end_index']
    ):
        return 'span'

    # mask sentence: the sentence must extend past the end of the mention
    sent_lo, sent_hi = link['context_sentence_start_index'], link['context_sentence_end_index']
    if (context[:sent_lo] + context[sent_hi:]).strip() != '' and (
        sent_lo <= link['context_mention_start_index']
        and sent_hi > link['context_mention_end_index'] + 1
    ):
        return 'sentence'

    # mask mention
    ment_lo, ment_hi = link['context_mention_start_index'], link['context_mention_end_index']
    if (context[:ment_lo] + context[ment_hi:]).strip() != '':
        return 'mention'

    # no mask
    return 'none'


mask_span_links, mask_mention_links, mask_sentence_links, no_mask_links = [], [], [], []
_triage_buckets = {
    'span': mask_span_links,
    'sentence': mask_sentence_links,
    'mention': mask_mention_links,
    'none': no_mask_links,
}
for link in clean_links:
    _triage_buckets[_triage(link)].append(link)

print(f"Out of the {len(clean_links)} clean links, we got the following results:")
print(f"\t- Mask span: {len(mask_span_links)} ({len(mask_span_links) / len(clean_links) * 100:.2f}%)")
print(f"\t- Mask sentence: {len(mask_sentence_links)} ({len(mask_sentence_links) / len(clean_links) * 100:.2f}%)")
print(f"\t- Mask mention: {len(mask_mention_links)} ({len(mask_mention_links) / len(clean_links) * 100:.2f}%)")
print(f"\t- No mask: {len(no_mask_links)} ({len(no_mask_links) / len(clean_links) * 100:.2f}%)")

## Generate the corrupted contexts

In [None]:
def _corrupt(link, strategy, start_key=None, end_key=None, squeeze_newlines=False):
    """Return a copy of `link` whose context has been corrupted with `strategy`.

    The untouched context is kept under 'original_context'. When `start_key` and
    `end_key` are given, the text between those two indices is cut out. Runs of
    spaces are then collapsed, a space right after a newline is dropped, and the
    result is stripped. `squeeze_newlines` additionally collapses runs of newlines.
    """
    noisy = link.copy()
    text = noisy['context']
    noisy['original_context'] = text
    if start_key is not None:
        text = text[:int(noisy[start_key])] + text[int(noisy[end_key]):]
    text = re.sub(' +', ' ', text)
    text = re.sub('\n ', '\n', text)
    if squeeze_newlines:
        text = re.sub('\n+', '\n', text)
    noisy['context'] = text.strip()
    noisy['noise_strategy'] = strategy
    return noisy


# Number of links each masking strategy may take, as a share of all clean links
span_quota = int(len(clean_links) * mask_span)
sentence_quota = int(len(clean_links) * mask_sentence)
mention_quota = int(len(clean_links) * mask_mention)

final_links = []

# mask span: take a random sample up to the quota
random.shuffle(mask_span_links)
final_links.extend(
    _corrupt(link, 'mask_span', 'context_span_start_index', 'context_span_end_index', squeeze_newlines=True)
    for link in mask_span_links[:span_quota]
)

# mask sentence: span links beyond the quota are handed down to this pool
mask_sentence_links.extend(mask_span_links[span_quota:])
random.shuffle(mask_sentence_links)
final_links.extend(
    _corrupt(link, 'mask_sentence', 'context_sentence_start_index', 'context_sentence_end_index')
    for link in mask_sentence_links[:sentence_quota]
)

# mask mention: sentence links beyond the quota are handed down to this pool
mask_mention_links.extend(mask_sentence_links[sentence_quota:])
random.shuffle(mask_mention_links)
final_links.extend(
    _corrupt(link, 'mask_mention', 'context_mention_start_index', 'context_mention_end_index')
    for link in mask_mention_links[:mention_quota]
)

# no mask: everything left over is kept, with whitespace normalisation only
no_mask_links.extend(mask_mention_links[mention_quota:])
random.shuffle(no_mask_links)
final_links.extend(_corrupt(link, 'no_mask') for link in no_mask_links)

print('In the end, we have the following distribution:')
print(f"\t- Mask span: {len([link for link in final_links if link['noise_strategy'] == 'mask_span'])} ({len([link for link in final_links if link['noise_strategy'] == 'mask_span']) / len(final_links) * 100:.2f}%)")
print(f"\t- Mask sentence: {len([link for link in final_links if link['noise_strategy'] == 'mask_sentence'])} ({len([link for link in final_links if link['noise_strategy'] == 'mask_sentence']) / len(final_links) * 100:.2f}%)")
print(f"\t- Mask mention: {len([link for link in final_links if link['noise_strategy'] == 'mask_mention'])} ({len([link for link in final_links if link['noise_strategy'] == 'mask_mention']) / len(final_links) * 100:.2f}%)")
print(f"\t- No mask: {len([link for link in final_links if link['noise_strategy'] == 'no_mask'])} ({len([link for link in final_links if link['noise_strategy'] == 'no_mask']) / len(final_links) * 100:.2f}%)")

In [None]:
# Shuffle the final dataset and dump ten examples for manual inspection
random.shuffle(final_links)

# Fields printed as header, value, blank line
_spaced_fields = (
    ('MODIFIED CONTEXT', 'context'),
    ('ORIGINAL CONTEXT', 'original_context'),
    ('MENTION', 'mention'),
    ('NOISE STRATEGY', 'noise_strategy'),
)
# Fields printed as header, value, with no blank line in between
_compact_fields = (
    ('SOURCE TITLE', 'source_title'),
    ('CONTEXT MENTION START INDEX', 'context_mention_start_index'),
    ('CONTEXT MENTION END INDEX', 'context_mention_end_index'),
    ('CONTEXT SENTENCE START INDEX', 'context_sentence_start_index'),
    ('CONTEXT SENTENCE END INDEX', 'context_sentence_end_index'),
    ('CONTEXT SPAN START INDEX', 'context_span_start_index'),
    ('CONTEXT SPAN END INDEX', 'context_span_end_index'),
)

for link in final_links[:10]:
    for header, field in _spaced_fields:
        print(header)
        print(link[field])
        print()
    for header, field in _compact_fields:
        print(header)
        print(link[field])
    print('####################')

In [None]:
# Print every field of the first ten cleaned links, one "key value" pair per line
for link in clean_links[:10]:
    for key, value in link.items():
        print(key, value)
    print('####################')