In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import ConnectionPatch
from ast import literal_eval
import seaborn as sns
import seaborn.objects as so
sns.set_theme()

In [None]:
# Titles and lead paragraphs of the "good" pages from the 2023-09-01 and
# 2023-10-01 dumps, stacked in the order listed below.
df_pages = pd.concat(
    [
        pd.read_parquet(path, columns=['title', 'lead_paragraph'])
        for path in (
            '/scratch/tsoares/wikidumps/simplewiki-NS0-20230901/processed_data/good_pages_0.parquet',
            '/scratch/tsoares/wikidumps/simplewiki-NS0-20230901/processed_data/good_pages_1.parquet',
            '/scratch/tsoares/wikidumps/simplewiki-NS0-20231001/processed_data/good_pages_0.parquet',
            '/scratch/tsoares/wikidumps/simplewiki-NS0-20231001/processed_data/good_pages_1.parquet',
        )
    ],
    ignore_index=True,
)
# A title can appear in several files; only its first occurrence is kept.
df_pages = df_pages.drop_duplicates(subset=['title'], ignore_index=True)
# title -> lead paragraph lookup.
page_leads = dict(zip(df_pages['title'], df_pages['lead_paragraph']))
# df_pages ends up as a list of {'title': ..., 'lead_paragraph': ...} records.
df_pages = df_pages.to_dict(orient='records')

In [None]:
# Load the test links and tag every row with a constant language code.
df = pd.read_parquet(
    '/scratch/tsoares/wikidumps/simplewiki-NS0-20230901/eval/test_data.parquet'
).assign(language='simple')
df

In [None]:
# Report how many rows the loaded test set has.
print(f"There are {len(df)} links in the test set.")

In [None]:
# One boolean mask per reason a link is excluded from df_clean.
no_context = df['context'].eq('')
no_neg_contexts = df['negative_contexts'].eq('[]')  # the column stores lists as strings
missing_page = ~df['target_title'].isin(page_leads)  # target has no entry in page_leads
missing_section = df['missing_category'].eq('missing_section')

print('Cleaning links')
print(f'We started with {len(df)} links')
print(f'There are {no_context.sum()} links with no context')
print(f'There are {no_neg_contexts.sum()} links with no negative contexts')
print(f'There are {missing_page.sum()} links with missing pages')
print(f'There are {missing_section.sum()} links with missing sections')

# Keep the rows that trip none of the masks (a row may trip several).
df_clean = df[~(no_context | no_neg_contexts | missing_page | missing_section)]
print(f"After cleaning, there are {len(df_clean)} links")

In [None]:
# Print 10 randomly drawn links that have an empty context, one field per
# line ('negative_contexts' is skipped). Each draw is independent, so the
# same link can be printed more than once.
empty_context_links = df[df['context'] == '']  # filter once, not once per draw
if empty_context_links.empty:
    # sample(1) raises ValueError on an empty frame; this happens e.g. after
    # the plotting cell below has removed the empty-context rows from df.
    print('No links with an empty context to sample.')
else:
    for _ in range(10):
        sample = empty_context_links.sample(1).to_dict(orient='records')[0]
        for key in sample:
            if key != 'negative_contexts':
                print(key, sample[key])
        print('#############')

## Plots

In [None]:
# matplotlib stacked bar plot: for every language, the percentage of links
# that fall in each value of 'missing_category'
fig, ax = plt.subplots(figsize=(6, 5))
ax.set_title('Link Insertion Strategy Distribution')
ax.set_ylabel('Percentage of Links')
ax.set_xlabel('Language')
ax.set_ylim(0, 100)

# Links without a context are dropped from df for this plot and for every
# cell that runs after it. .copy() makes df an independent frame, so later
# column assignments on df do not raise SettingWithCopyWarning.
df = df[df['context'] != ''].copy()

# Languages are read AFTER filtering so the tick labels have the same order
# and length as the bars; one empty slot pads each side of the axis.
languages = df['language'].unique().tolist()
x_positions = np.arange(len(languages) + 2)
ax.set_xticks(x_positions)
ax.set_xticklabels([''] + languages + [''], rotation=0)


def _category_share(category):
    """Percentage of each language's links in `category`, padded with a 0 at both ends.

    `category` is a value of df['missing_category']; None selects the rows
    where that column is null (plotted as 'Text Present').
    """
    if category is None:
        in_category = df['missing_category'].isna()
    else:
        in_category = df['missing_category'] == category
    shares = [0]
    for language in languages:
        in_language = df['language'] == language
        shares.append((in_language & in_category).sum() / in_language.sum() * 100)
    shares.append(0)
    return np.array(shares)


present_text = _category_share(None)
missing_mention = _category_share('missing_mention')
missing_sentence = _category_share('missing_sentence')
missing_span = _category_share('missing_span')
missing_section = _category_share('missing_section')

# Stacking order (bottom to top) with the legend label of each segment.
segments = [
    (present_text, 'Text Present'),
    (missing_mention, 'Mention Missing'),
    (missing_sentence, 'Sentence Missing'),
    (missing_span, 'Span Missing'),
    (missing_section, 'Section Missing'),
]

# Draw the segments, remembering where each one starts for the labels below.
segment_bases = []
bottom = np.zeros(len(x_positions))
for shares, label in segments:
    ax.bar(x_positions, shares, label=label, bottom=bottom)
    segment_bases.append(bottom)
    bottom = bottom + shares

# flip order of legend so it reads top-to-bottom like the stack
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles[::-1], labels[::-1], loc='upper right', bbox_to_anchor=(1.3, 1.0))

# add labels inside each bar, centred vertically in its segment
for (shares, _), base in zip(segments, segment_bases):
    for x, share in enumerate(shares):
        if share > 0:
            ax.text(x, base[x] + share / 2, f'{share:.2f}%', color='white', ha='center', va='center', fontweight='bold')

In [None]:
# plot a histogram with the number of negative contexts per link
# (only links that have a context; the stored strings are parsed into lists)
negative_contexts = (
    df.loc[df['context'] != '', 'negative_contexts']
    .reset_index(drop=True)
    .apply(literal_eval)
    .apply(len)
)
hist_ax = negative_contexts.hist(bins=100)
# both axes are logarithmic
hist_ax.set_xscale('log')
hist_ax.set_yscale('log')
hist_ax.set_xlabel('Number of negative contexts')
hist_ax.set_ylabel('Number of links')
plt.show()

In [None]:
# Summary statistics of the number of negative contexts per link.
negative_contexts.describe()

# Analyze processing failures

In [None]:
# Import the submodule explicitly: a bare `import urllib` does not guarantee
# that `urllib.parse` is loaded, it only works if another package imported it.
import urllib.parse


def process_title(title):
    """Return `title` percent-decoded, with underscores replaced by spaces.

    Example: 'Albert_Einstein%27s_brain' -> "Albert Einstein's brain".
    """
    return urllib.parse.unquote(title).replace('_', ' ')

def update_targets(target_name, redirect_map):
    """Follow the redirect chain that starts at `target_name`.

    While the current name is a key of `redirect_map` it is replaced by the
    mapped value. At most 11 hops are taken, so a redirect cycle still
    terminates. Returns the last name reached.
    """
    for _ in range(11):
        if target_name not in redirect_map:
            break
        target_name = redirect_map[target_name]
    return target_name

In [None]:
# Links of the 2023-10-01 dump (df_1) and of the 2023-11-01 dump (df_2); each
# dump is stored in two parquet parts that are stacked under a fresh index.
df_1 = pd.concat(
    (pd.read_parquet(f'/scratch/tsoares/wikidumps/simplewiki-NS0-20231001/processed_data/good_links_{i}.parquet') for i in range(2)),
    ignore_index=True,
)
df_2 = pd.concat(
    (pd.read_parquet(f'/scratch/tsoares/wikidumps/simplewiki-NS0-20231101/processed_data/good_links_{i}.parquet') for i in range(2)),
    ignore_index=True,
)
# df_pages_1 = pd.concat([pd.read_parquet(f'/scratch/tsoares/wikidumps/simplewiki-NS0-20231001/processed_data/good_pages_{i}.parquet') for i in range(2)]).reset_index(drop=True)

In [None]:
# Redirect lookup built from the October and November dumps.
# NOTE(review): assumes each redirect_map.parquet is indexed by the redirecting
# title and stores the destination in 'redirect' -- TODO confirm the schema.
# The index is deliberately NOT reset: resetting it replaces the keys with
# 0..n-1, so the dict is keyed by integers and update_targets() can never
# match a title. When a key occurs in both months, to_dict() keeps the last
# one, i.e. the November entry.
redirect_map = pd.concat([pd.read_parquet(f'/scratch/tsoares/wikidumps/simplewiki-NS0-2023{month}01/processed_data/redirect_map.parquet') for month in [10, 11]])
redirect_map = redirect_map['redirect'].to_dict()

link_keys = ['source_title', 'target_title', 'source_ID', 'target_ID', 'source_version']


def _count_links(links):
    """Resolve redirects in `target_title`, then count the links per `link_keys` combination.

    Returns one row per combination with the number of links in 'count'.
    """
    resolved = links.assign(
        target_title=links['target_title'].apply(lambda title: update_targets(title, redirect_map))
    )
    # group the links by source and target and count the number of links
    return resolved.groupby(link_keys).size().reset_index(name='count')


df_1 = _count_links(df_1)
df_2 = _count_links(df_2)

# find all new links added in df_2. Consider two cases
# 1. The row is not present in df_1
# 2. The row is present in df_1 but the count is smaller in df_1
df_diff = df_2.merge(df_1, how='left', on=[
                    'source_title', 'target_title', 'source_ID', 'target_ID'], suffixes=('_2', '_1'))
is_new = df_diff['count_1'].isna() | (df_diff['count_2'] > df_diff['count_1'])
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of the merge result (SettingWithCopyWarning).
df_diff = df_diff[is_new].copy()
df_diff['count_1'] = df_diff['count_1'].fillna(0)
# source_version_1 is left as NaN for pairs that are absent from df_1; the
# '&oldid=0' placeholder below is intentionally disabled.
# df_diff['source_version_1'] = df_diff['source_version_1'].fillna('&oldid=0')
df_diff['count'] = df_diff['count_2'] - df_diff['count_1']  # number of added links per pair
# .copy() again so later cells can assign columns on df_diff without warnings.
df_diff = df_diff[['source_title', 'target_title', 'source_ID',
                'target_ID', 'source_version_1', 'source_version_2', 'count']].copy()


# Total number of added links. 'count' is a float column whenever df_diff
# contains pairs that were absent from df_1, so it is shown as an integer.
initial_size = df_diff['count'].sum()
print(f'Initially, there are {int(initial_size)} new candidate links, from {len(df_diff)} unique src-tgt pairs.')

In [None]:
# Inspect the candidate new links (df_diff).
df_diff

In [None]:
# Store source_ID as integers.
df_diff = df_diff.astype({'source_ID': int})

In [None]:
# compare links in df with links in df_diff
# find links with source title in df_diff and not in df

# Normalise the titles on both sides so they can be matched. assign() rebinds
# the names to new frames instead of writing into frames that may be slices.
df_diff = df_diff.assign(source_title=df_diff['source_title'].apply(process_title))
df = df.assign(source_title=df['source_title'].apply(process_title))

# Anti-join on source_title: indicator=True marks the rows without a partner
# in df, which stays correct even if df['source_ID'] itself contains nulls.
df_diff_2 = df_diff.merge(
    df[['source_title', 'source_ID']].drop_duplicates(),
    how='left', on='source_title', suffixes=('_diff', '_df'), indicator=True,
)
df_diff_2 = (
    df_diff_2.loc[
        df_diff_2['_merge'] == 'left_only',
        # source_ID_diff is df_diff's own source_ID; it is kept as the last column
        ['source_title', 'target_title', 'target_ID', 'source_version_1', 'source_version_2', 'count', 'source_ID_diff'],
    ]
    .astype({'source_ID_diff': int, 'count': int})
    .sort_values(by=['source_ID_diff', 'target_ID', 'source_version_1', 'source_version_2'])
    .reset_index(drop=True)
)
df_diff_2

In [None]:
# Candidate links whose (source, target) pair already existed in the first dump.
# Pairs that are new carry NaN in source_version_1 (the '&oldid=0' placeholder
# fill is disabled), and NaN != '&oldid=0' is True, so nulls must be excluded
# explicitly; the placeholder check is kept in case it is re-enabled.
df_diff[df_diff['source_version_1'].notna() & (df_diff['source_version_1'] != '&oldid=0')]

In [None]:
# compare links in df with links in df_diff
# find links with target title in df_diff and not in df
# A left merge against df[['target_title']] adds no column and drops no row,
# so it cannot tell matched from unmatched links; filter with isin() instead.
target_in_df = df_diff['target_title'].isin(df['target_title'])
df_diff_3 = (
    df_diff.loc[
        ~target_in_df,
        ['source_title', 'target_title', 'source_ID', 'source_version_1', 'source_version_2', 'count'],
    ]
    .astype({'count': int})
    .sort_values(by=['source_ID', 'source_version_1', 'source_version_2'])
    .reset_index(drop=True)
)
df_diff_3

In [None]:
# Rows of df_diff_3 whose (source, target) pair already existed in the first dump.
# New pairs carry NaN in source_version_1 (no '&oldid=0' placeholder is filled
# in), and NaN != '&oldid=0' is True, so nulls are excluded explicitly.
df_diff_3[df_diff_3['source_version_1'].notna() & (df_diff_3['source_version_1'] != '&oldid=0')]

In [None]:
# compare links in df with links in df_diff
# find links with target title and source title in df_diff and not in df

# Anti-join on (source_title, target_title): indicator=True marks the rows
# without a partner in df, which stays correct even if df['source_ID'] itself
# contains nulls.
df_diff_4 = df_diff.merge(
    df[['source_title', 'target_title', 'source_ID']].drop_duplicates(),
    how='left', on=['source_title', 'target_title'], suffixes=('_diff', '_df'), indicator=True,
)
df_diff_4 = (
    df_diff_4.loc[
        df_diff_4['_merge'] == 'left_only',
        # source_ID_diff is df_diff's own source_ID; it is kept as the last column
        ['source_title', 'target_title', 'target_ID', 'source_version_1', 'source_version_2', 'count', 'source_ID_diff'],
    ]
    .astype({'source_ID_diff': int, 'count': int})
    .sort_values(by=['source_ID_diff', 'target_ID', 'source_version_1', 'source_version_2'])
    .reset_index(drop=True)
)
df_diff_4

In [None]:
# Rows of df_diff_4 whose (source, target) pair already existed in the first dump.
# New pairs carry NaN in source_version_1 (no '&oldid=0' placeholder is filled
# in), and NaN != '&oldid=0' is True, so nulls are excluded explicitly.
df_diff_4[df_diff_4['source_version_1'].notna() & (df_diff_4['source_version_1'] != '&oldid=0')]

In [None]:
# Random peek at up to 10 candidate links; sample(10) raises ValueError when
# the frame has fewer than 10 rows, so the size is capped.
df_diff.sample(min(10, len(df_diff)))

In [None]:
# Inspect the test-set frame (df).
df

In [None]:
# Spot check: test-set links whose source page is 'Acanthopterygii'.
df[df['source_title'] == 'Acanthopterygii']

In [None]:
# Spot check: candidate new links (df_diff) whose source page is 'Acanthopterygii'.
df_diff[df_diff['source_title'] == 'Acanthopterygii']

In [None]:
# Load the link_versions table from the eval directory of the 2023-10-01 dump and display it.
df_versions = pd.read_parquet('/scratch/tsoares/wikidumps/simplewiki-NS0-20231001/eval/link_versions.parquet')
df_versions

In [None]:
# Spot check: rows of df_versions whose source page is 'Acanthopterygii'.
df_versions[df_versions['source_title'] == 'Acanthopterygii']

In [None]:
# Show df_diff ordered by source_ID, with a fresh 0..n-1 index.
# df_diff[df_diff['source_version_1']  != '&oldid=0'].sort_values(by=['source_ID']).reset_index(drop=True)
df_diff.sort_values('source_ID', ignore_index=True)