In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import ConnectionPatch
from ast import literal_eval
import seaborn as sns
import seaborn.objects as so
from glob import glob
from tqdm import tqdm
import os
import math
import pyarrow.parquet as pq

In [None]:
languages = {'af': {'full': 'Afrikaans'},
             'cs': {'full': 'Czech'},
             'cy': {'full': 'Welsh'},
             'en': {'full': 'English'},
             'fr': {'full': 'French'},
             'ga': {'full': 'Irish'},
             'gu': {'full': 'Gujarati'},
             'hi': {'full': 'Hindi'},
             'is': {'full': 'Icelandic'},
             'it': {'full': 'Italian'},
             'ja': {'full': 'Japanese'},
             'kk': {'full': 'Kazakh'},
             'kn': {'full': 'Kannada'},
             'ms': {'full': 'Malay'},
             'ps': {'full': 'Pashto'},
             'pt': {'full': 'Portuguese'},
             'simple': {'full': 'Simple English'},
             'sk': {'full': 'Slovak'},
             'sw': {'full': 'Swahili'},
             'ur': {'full': 'Urdu'},
             'uz': {'full': 'Uzbek'},
            }

In [None]:
for language in languages:
    languages[language]['total_dirty'] = 0
    languages[language]['missing_context'] = 0
    languages[language]['missing_negative_contexts'] = 0
    languages[language]['bad_target'] = 0
    languages[language]['missing_section'] = 0
    languages[language]['total_clean'] = 0
    languages[language]['total_clean_eval'] = 0 # also excluding the missing section because our model doesn't apply to that
    languages[language]['present_text'] = 0
    languages[language]['missing_mention'] = 0
    languages[language]['missing_sentence'] = 0
    languages[language]['missing_span'] = 0
    languages[language]['missing_section'] = 0
    languages[language]['candidates'] = 0

language_keys = list(languages.keys())
for language in (pbar := tqdm(language_keys)):
    if not os.path.exists(f'/dlabdata1/tsoares/wikidumps/{language}wiki-NS0-20231001/eval/test_data.parquet'):
        del languages[language]
        continue
    good_pages = []
    files = glob(f'/dlabdata1/tsoares/wikidumps/{language}wiki-NS0-20231001/processed_data/good_pages/*.parquet') + \
            glob(f'/dlabdata1/tsoares/wikidumps/{language}wiki-NS0-20231101/processed_data/good_pages/*.parquet')
    for i, file in enumerate(files):
        pbar.set_description(f'{language} - {i} / {len(files)}')
        df = pd.read_parquet(file, columns=['title'])
        good_pages.extend(list(df['title'].values))
    good_pages = set(good_pages)
    languages[language]['good_pages'] = good_pages
    # read test data in chunks
    # only read columns: context, negative_contexts, target_title, missing_category
    parquet_file = pq.ParquetFile(f'/dlabdata1/tsoares/wikidumps/{language}wiki-NS0-20231001/eval/test_data.parquet')
    num_rows = parquet_file.metadata.num_rows
    for batch in parquet_file.iter_batches(batch_size=10_000):
        pbar.set_description(f'{language} - {languages[language]["total_dirty"]} / {num_rows}')
        df = batch.to_pandas()
        
        languages[language]['total_dirty'] += len(df)
        
        no_context = df['context'] == ''
        languages[language]['missing_context'] += no_context.sum()
        
        no_neg_contexts = df['negative_contexts'] == '[]'
        languages[language]['missing_negative_contexts'] += no_neg_contexts.sum()
        
        bad_target = ~df['target_title'].isin(languages[language]['good_pages'])
        languages[language]['bad_target'] += bad_target.sum()

        df_clean = df[~no_context & ~no_neg_contexts & ~bad_target]
        languages[language]['total_clean'] += len(df_clean)
        languages[language]['total_clean_eval'] += len(df_clean[df_clean['missing_category'] != 'missing_section'])
        
        # find the value counts for column "missing category"
        # include NA values
        categories = df_clean['missing_category'].value_counts(dropna=False)
        languages[language]['present_text'] += categories[None] if None in categories.index else 0
        languages[language]['missing_mention'] += categories['missing_mention'] if 'missing_mention' in categories.index else 0
        languages[language]['missing_sentence'] += categories['missing_sentence'] if 'missing_sentence' in categories.index else 0
        languages[language]['missing_span'] += categories['missing_span'] if 'missing_span' in categories.index else 0
        languages[language]['missing_section'] += categories['missing_section'] if 'missing_section' in categories.index else 0
        languages[language]['candidates'] += df_clean['negative_contexts'].reset_index(drop=True).apply(lambda x: x.count("'context'") + 1).mean()

    languages[language]['present_text'] /= languages[language]['total_clean'] / 100
    languages[language]['missing_mention'] /= languages[language]['total_clean'] / 100
    languages[language]['missing_sentence'] /= languages[language]['total_clean'] / 100
    languages[language]['missing_span'] /= languages[language]['total_clean'] / 100
    languages[language]['missing_section'] /= languages[language]['total_clean'] / 100

## Plots

In [None]:
for lang in languages:
    if languages[lang]['total_clean'] < 1_000:
        languages[lang]['total_text'] = languages[lang]['total_clean']
    elif languages[lang]['total_clean'] < 1_000_000:
        languages[lang]['total_text'] = f'{round(languages[lang]["total_clean"] / 1_000):d}K'
    else:
        languages[lang]['total_text'] = f'{round(languages[lang]["total_clean"] / 1_000_000):d}M'

In [None]:
# matplotlib stacked bar plot
# create a 1x4 grid of subplots
fig, axs = plt.subplots(1, 4, figsize=(40, 15), width_ratios=[7, 7, 7, 3])

# set global title
fig.suptitle('Link Insertion Strategy Distribution (Oct 2023 $\\rightarrow$ Nov 2023)', fontsize=60, y=1.1, weight='bold')
# set y label for all subplots
for ax in axs:
    # ax.set_xlabel('Language', fontsize=20)
    ax.set_ylim(-5, 105)
    ax.tick_params(axis='y', which='major', labelsize=30)
    # set to bold
    ax.tick_params(axis='x', which='major', labelsize=30)
axs[0].set_ylabel('Percentage of Links', fontsize=50, weight='bold')
languages = {k: v for k, v in sorted(languages.items(), key=lambda item: item[1]['total_clean'], reverse=False)}
labels = [f"{language}\n({languages[language]['total_text']})" if language != 'en' else \
        f"{language}\n({languages[language]['total_text']}*)" for language in languages]
labels.append('Micro\nAverage')
labels.append('Macro\nAverage')
labels = [labels[0:7], labels[7:14], labels[14:21], [labels[21],labels[22]]]
for i, ax in enumerate(axs):
    ax.set_xticks(np.arange(len(labels[i])))
    ax.set_xticklabels(labels[i], rotation=0, fontsize=29, weight='bold')
    ax.set_yticks([])
axs[0].set_yticks(np.arange(0, 101, 25))
axs[0].set_yticklabels(np.arange(0, 101, 25), fontsize=30, weight='bold')

present_text = []
missing_mention = []
missing_sentence = []
missing_span = []
missing_section = []

total = {'present_text': 0,
         'missing_mention': 0,
         'missing_sentence': 0,
         'missing_span': 0,
         'missing_section': 0,
         'total': 0}
for language in languages:
    present_text.append(languages[language]['present_text'])
    missing_mention.append(languages[language]['missing_mention'])
    missing_sentence.append(languages[language]['missing_sentence'])
    missing_span.append(languages[language]['missing_span'])
    missing_section.append(languages[language]['missing_section'])
    total['present_text'] += languages[language]['present_text'] * languages[language]['total_clean'] / 100
    total['missing_mention'] += languages[language]['missing_mention'] * languages[language]['total_clean'] / 100
    total['missing_sentence'] += languages[language]['missing_sentence'] * languages[language]['total_clean'] / 100
    total['missing_span'] += languages[language]['missing_span'] * languages[language]['total_clean'] / 100
    total['missing_section'] += languages[language]['missing_section'] * languages[language]['total_clean'] / 100
    total['total'] += languages[language]['total_clean']

# append micro average results
present_text.append(total['present_text'] / total['total'] * 100)
missing_mention.append(total['missing_mention'] / total['total'] * 100)
missing_sentence.append(total['missing_sentence'] / total['total'] * 100)
missing_span.append(total['missing_span'] / total['total'] * 100)
missing_section.append(total['missing_section'] / total['total'] * 100)
# append macro average results
present_text.append(sum(present_text[:-1]) / len(present_text[:-1]))
missing_mention.append(sum(missing_mention[:-1]) / len(missing_mention[:-1]))
missing_sentence.append(sum(missing_sentence[:-1]) / len(missing_sentence[:-1]))
missing_span.append(sum(missing_span[:-1]) / len(missing_span[:-1]))
missing_section.append(sum(missing_section[:-1]) / len(missing_section[:-1]))

# split into 4 subplots
present_text = [present_text[0:7], present_text[7:14], present_text[14:21], [present_text[21], present_text[22]]]
missing_mention = [missing_mention[0:7], missing_mention[7:14], missing_mention[14:21], [missing_mention[21], missing_mention[22]]]
missing_sentence = [missing_sentence[0:7], missing_sentence[7:14], missing_sentence[14:21], [missing_sentence[21], missing_sentence[22]]]
missing_span = [missing_span[0:7], missing_span[7:14], missing_span[14:21], [missing_span[21], missing_span[22]]]
missing_section = [missing_section[0:7], missing_section[7:14], missing_section[14:21], [missing_section[21], missing_section[22]]]

bottom = [np.zeros(len(p)) for p in present_text]
present_text = [np.array(p) for p in present_text]
missing_mention = [np.array(p) for p in missing_mention]
missing_sentence = [np.array(p) for p in missing_sentence]
missing_span = [np.array(p) for p in missing_span]
missing_section = [np.array(p) for p in missing_section]

# use a colorblind friendly palette
for i, ax in enumerate(axs):
    ax.bar(np.arange(len(present_text[i])), present_text[i], label='Text Present', bottom=bottom[i], width=0.85, color=sns.color_palette('colorblind')[0])
    bottom[i] += present_text[i]
    ax.bar(np.arange(len(present_text[i])), missing_mention[i], label='Mention Missing', bottom=bottom[i], width=0.85, color=sns.color_palette('colorblind')[1])
    bottom[i] += missing_mention[i]
    ax.bar(np.arange(len(present_text[i])), missing_sentence[i], label='Sentence Missing', bottom=bottom[i], width=0.85, color=sns.color_palette('colorblind')[2])
    bottom[i] += missing_sentence[i]
    ax.bar(np.arange(len(present_text[i])), missing_span[i], label='Span Missing', bottom=bottom[i], width=0.85, color=sns.color_palette('colorblind')[3])
    bottom[i] += missing_span[i]
    ax.bar(np.arange(len(present_text[i])), missing_section[i], label='Section Missing', bottom=bottom[i], width=0.85, color=sns.color_palette('colorblind')[4])
    bottom[i] += missing_section[i]

# add legend to figure
# only show the legend for the first subplot
# make the legend horizontal and centered
# axs[0].legend(bbox_to_anchor=(2, 1.1), ncol=5, fontsize=20)
handles, labels = axs[0].get_legend_handles_labels()
fig.legend(handles=handles, labels=labels, loc='upper center', ncol=5, fontsize=45, bbox_to_anchor=(0.5, 1.05))
# reverse the order of the legend
# handles, labels = axs[0].get_legend_handles_labels()
# axs[0].legend(handles[::-1], labels[::-1],bbox_to_anchor=(1, 1.07), ncol=5, fontsize=18)


# add labels inside each bar
for row in range(len(axs)):
    for i, v in enumerate(present_text[row]):
        if v > 0:
            axs[row].text(i, v/2, f'{v:.0f}%', color='white', ha='center', va='center', fontweight='bold', fontsize=25)
    for i, v in enumerate(missing_mention[row]):
        if v > 0:
            axs[row].text(i, present_text[row][i]+v/2, f'{v:.0f}%', color='white', ha='center', va='center', fontweight='bold', fontsize=25)
    for i, v in enumerate(missing_sentence[row]):
        if v > 0:
            axs[row].text(i, present_text[row][i]+missing_mention[row][i]+v/2, f'{v:.0f}%', color='white', ha='center', va='center', fontweight='bold', fontsize=25)
    for i, v in enumerate(missing_span[row]):
        if v > 0:
            axs[row].text(i, present_text[row][i]+missing_mention[row][i]+missing_sentence[row][i]+v/2, f'{v:.0f}%', color='white', ha='center', va='center', fontweight='bold', fontsize=25)
    for i, v in enumerate(missing_section[row]):
        if v > 0:
            axs[row].text(i, present_text[row][i]+missing_mention[row][i]+missing_sentence[row][i]+missing_span[row][i]+v/2, f'{v:.0f}%', color='white', ha='center', va='center', fontweight='bold', fontsize=25)
plt.tight_layout()
plt.show()

In [None]:
# plot a bar chart of the number of negative contexts per language
fig, ax = plt.subplots(figsize=(35, 10))
ax.set_title('Number of Candidates per Language')
ax.set_ylabel('Number of Candidates')
ax.set_xlabel('Language')
# sort the languages by number of candidates
languages = {k: v for k, v in sorted(languages.items(), key=lambda item: item[1]['candidates'])}
ax.set_xticks(np.arange(len(languages)))
ax.set_xticklabels([languages[language]['full'] for language in languages], rotation=0)
ax.bar(np.arange(len(languages)), [languages[language]['candidates'] for language in languages])


# Full analysis

In [5]:
files = glob('/dlabdata1/tsoares/wikidumps/*wiki-NS0-20231001/eval/')
# find the list of all languages
languages = [file.split('/')[-3][:-17] for file in files]
languages.sort()
len(languages), languages

(110,
 ['af',
  'am',
  'an',
  'ar',
  'ary',
  'as',
  'ast',
  'az',
  'azb',
  'ba',
  'bar',
  'be',
  'bg',
  'br',
  'bs',
  'ca',
  'ce',
  'cs',
  'cv',
  'cy',
  'da',
  'de',
  'el',
  'en',
  'eo',
  'es',
  'et',
  'eu',
  'fa',
  'fi',
  'fr',
  'fy',
  'ga',
  'gd',
  'gl',
  'gu',
  'ha',
  'he',
  'hi',
  'hr',
  'hu',
  'hy',
  'id',
  'io',
  'is',
  'it',
  'ja',
  'jv',
  'ka',
  'kk',
  'km',
  'kn',
  'ko',
  'ku',
  'la',
  'lb',
  'lmo',
  'lo',
  'lt',
  'lv',
  'mg',
  'min',
  'mk',
  'mn',
  'mr',
  'ms',
  'nds-nl',
  'ne',
  'nl',
  'no',
  'oc',
  'om',
  'pa',
  'pl',
  'pms',
  'ps',
  'pt',
  'ro',
  'ru',
  'sa',
  'scn',
  'sco',
  'sd',
  'sh',
  'si',
  'simple',
  'sk',
  'sl',
  'so',
  'sq',
  'sr',
  'su',
  'sv',
  'sw',
  'ta',
  'te',
  'tg',
  'tl',
  'tr',
  'tt',
  'ug',
  'uk',
  'ur',
  'uz',
  'vi',
  'vo',
  'war',
  'xh',
  'yi',
  'zh'])

In [6]:
stats = {lang: {'text_present': 0, 'missing_mention': 0, 'missing_sentence': 0,
                'missing_span': 0, 'missing_section': 0} for lang in languages}

for language in (pbar := tqdm(languages)):
    if not os.path.exists(f'/dlabdata1/tsoares/wikidumps/{language}wiki-NS0-20231001/eval/test_data.parquet'):
        continue
    good_pages = []
    files = glob(f'/dlabdata1/tsoares/wikidumps/{language}wiki-NS0-20231001/processed_data/good_pages/*.parquet') + \
        glob(
            f'/dlabdata1/tsoares/wikidumps/{language}wiki-NS0-20231101/processed_data/good_pages/*.parquet')
    for i, file in enumerate(files):
        pbar.set_description(f'{language} - {i} / {len(files)}')
        try:
            df = pd.read_parquet(file, columns=['title'])
        except:
            pass
        good_pages.extend(list(df['title'].values))
    good_pages = set(good_pages)
    # read test data in chunks
    # only read columns: context, negative_contexts, target_title, missing_category
    parquet_file = pq.ParquetFile(
        f'/dlabdata1/tsoares/wikidumps/{language}wiki-NS0-20231001/eval/test_data.parquet')
    num_rows = parquet_file.metadata.num_rows
    total = 0
    for batch in parquet_file.iter_batches(batch_size=25_000):
        pbar.set_description(
            f'{language} - {total} / {num_rows}')
        df = batch.to_pandas()
        total += len(df)
        no_context = df['context'] == ''
        no_neg_contexts = df['negative_contexts'] == '[]'
        bad_target = ~df['target_title'].isin(good_pages)

        df_clean = df[~no_context & ~no_neg_contexts & ~bad_target]

        # find the value counts for column "missing category"
        # include NA values
        categories = df_clean['missing_category'].value_counts(dropna=False)
        stats[language]['text_present'] += categories[None] if None in categories.index else 0
        stats[language]['missing_mention'] += categories['missing_mention'] if 'missing_mention' in categories.index else 0
        stats[language]['missing_sentence'] += categories['missing_sentence'] if 'missing_sentence' in categories.index else 0
        stats[language]['missing_span'] += categories['missing_span'] if 'missing_span' in categories.index else 0
        stats[language]['missing_section'] += categories['missing_section'] if 'missing_section' in categories.index else 0

zh - 25000 / 45658: 100%|██████████| 110/110 [18:51<00:00, 10.28s/it]   


In [8]:
stats_df = pd.DataFrame(stats).T
stats_df.to_csv('stats.csv')

In [12]:
stats = [(k, v) for k, v in stats.items()]

AttributeError: 'list' object has no attribute 'items'

In [13]:
stats = sorted(stats, key = lambda x: sum(x[1].values()), reverse=True)

[('af', {'text_present': 695, 'missing_mention': 104, 'missing_sentence': 161, 'missing_span': 262, 'missing_section': 227}), ('am', {'text_present': 9, 'missing_mention': 3, 'missing_sentence': 0, 'missing_span': 0, 'missing_section': 0}), ('an', {'text_present': 8, 'missing_mention': 6, 'missing_sentence': 13, 'missing_span': 18, 'missing_section': 25}), ('ar', {'text_present': 7229, 'missing_mention': 2661, 'missing_sentence': 1670, 'missing_span': 3996, 'missing_section': 2304}), ('ary', {'text_present': 43, 'missing_mention': 29, 'missing_sentence': 14, 'missing_span': 32, 'missing_section': 11}), ('as', {'text_present': 214, 'missing_mention': 77, 'missing_sentence': 21, 'missing_span': 54, 'missing_section': 93}), ('ast', {'text_present': 14, 'missing_mention': 22, 'missing_sentence': 1, 'missing_span': 11, 'missing_section': 1}), ('az', {'text_present': 1982, 'missing_mention': 703, 'missing_sentence': 666, 'missing_span': 2586, 'missing_section': 376}), ('azb', {'text_present'

In [21]:
same_line = True
for stat in stats:
    if stat[0] == 'ru' or stat[0] == 'de':
        continue
    print(stat[0] + ' & ', end='')
    total_links = sum(stat[1].values())
    if total_links == 0:
        print('0 & 0 & 0 & 0 & 0 & 0', end='')
    else:
        for key in ['text_present', 'missing_mention', 'missing_sentence', 'missing_span', 'missing_section']:
            print(f'{stat[1][key]} ({stat[1][key] / total_links * 100:.1f} \\%) & ', end='')
        print(total_links, end='')
    if same_line:
        print(' & ', end='')
        same_line = False
    else:
        print(' \\\\')
        same_line = True

en & 41492 (23.9 \%) & 33679 (19.4 \%) & 32583 (18.7 \%) & 48325 (27.8 \%) & 17808 (10.2 \%) & 173887 & ja & 22607 (28.6 \%) & 21860 (27.7 \%) & 17935 (22.7 \%) & 12282 (15.5 \%) & 4374 (5.5 \%) & 79058 \\
es & 20113 (30.3 \%) & 19183 (28.9 \%) & 5899 (8.9 \%) & 13763 (20.7 \%) & 7419 (11.2 \%) & 66377 & fr & 19626 (30.4 \%) & 11290 (17.5 \%) & 10479 (16.2 \%) & 18293 (28.3 \%) & 4851 (7.5 \%) & 64539 \\
it & 12783 (28.0 \%) & 7754 (17.0 \%) & 7516 (16.5 \%) & 11158 (24.4 \%) & 6430 (14.1 \%) & 45641 & he & 13190 (35.9 \%) & 5837 (15.9 \%) & 5087 (13.8 \%) & 9014 (24.5 \%) & 3653 (9.9 \%) & 36781 \\
pl & 7488 (27.4 \%) & 3956 (14.5 \%) & 4455 (16.3 \%) & 7525 (27.6 \%) & 3873 (14.2 \%) & 27297 & zh & 7870 (31.2 \%) & 6582 (26.1 \%) & 4600 (18.2 \%) & 4641 (18.4 \%) & 1562 (6.2 \%) & 25255 \\
pt & 6677 (27.5 \%) & 3728 (15.4 \%) & 2477 (10.2 \%) & 7109 (29.3 \%) & 4279 (17.6 \%) & 24270 & nl & 6021 (25.4 \%) & 5130 (21.6 \%) & 4085 (17.2 \%) & 5465 (23.1 \%) & 2996 (12.6 \%) & 23697 \\


In [25]:
micro_avg = {'text_present': 0, 'missing_mention': 0, 'missing_sentence': 0, 'missing_span': 0, 'missing_section': 0, 'total': 0}
for stat in stats:
    for key in ['text_present', 'missing_mention', 'missing_sentence', 'missing_span', 'missing_section']:
        micro_avg[key] += stat[1][key]
    micro_avg['total'] += sum(stat[1].values())
for key in ['text_present', 'missing_mention', 'missing_sentence', 'missing_span', 'missing_section', 'total']:
    micro_avg[key] /= len(stats)
for key in micro_avg:
    print(key, micro_avg[key], micro_avg[key] / micro_avg['total'] * 100)

text_present 2364.9 29.653343850845864
missing_mention 1533.8 19.23222918450141
missing_sentence 1188.3545454545454 14.900708678201733
missing_span 1999.6727272727273 25.073780274420443
missing_section 888.4272727272727 11.139938012030544
total 7975.154545454546 100.0


In [26]:
macro_avg = {'text_present': 0, 'missing_mention': 0, 'missing_sentence': 0, 'missing_span': 0, 'missing_section': 0, 'total': 0}
for stat in stats:
    for key in ['text_present', 'missing_mention', 'missing_sentence', 'missing_span', 'missing_section']:
        if sum(stat[1].values()) == 0:
            macro_avg[key] += 0
        else:
            macro_avg[key] += stat[1][key] / sum(stat[1].values()) * 100
    macro_avg['total'] += 100
for key in ['text_present', 'missing_mention', 'missing_sentence', 'missing_span', 'missing_section', 'total']:
    macro_avg[key] /= len(stats)
for key in macro_avg:
    print(key, macro_avg[key])

ZeroDivisionError: division by zero