# Imports and Definitions

In [None]:
import pandas as pd
from glob import glob
from tqdm import tqdm
import os
tqdm.pandas()

In [None]:
# Base directory of the processed data; the page, link and redirect parquet
# files are globbed from beneath it.
root = '/dlabdata1/tsoares/wikidumps/enwiki-NS0-20231001/processed_data'
# Language code; when it is 'ja' the short-lead-paragraph checks in the
# analysis cells are skipped.
lang = 'en'

# Get data

In [None]:
# Build the glob patterns first, then resolve each one to a list of parquet files.
pages_pattern = os.path.join(root, "pages", "pages*.parquet")
links_pattern = os.path.join(root, "links", "links_*.parquet")
redirects_pattern = os.path.join(root, "redirect*.parquet")

page_files = glob(pages_pattern)
link_files = glob(links_pattern)
redirect_files = glob(redirects_pattern)

# Echo what was found so a missing or misnamed directory is noticed early.
for found_files in (page_files, link_files, redirect_files):
    print(found_files)

# Analyze Pages

In [None]:
def simplify_html(html):
    """Collapse an HTML value to a tiny placeholder.

    ``None`` stays ``None`` and the empty string stays empty; any other
    value is replaced by the single character ``'a'``, so only the
    presence or absence of HTML is retained.
    """
    if html is None:
        return None
    return '' if html == '' else 'a'
    
def split_text(x):
    """Count the space-separated pieces of ``x``, capped at 11.

    The text is split on single spaces with at most 10 splits, so the
    result never exceeds 11. ``None`` yields infinity, which keeps missing
    text from ever comparing as "short".
    """
    if x is not None:
        pieces = x.split(' ', maxsplit=10)
        return len(pieces)
    return float('inf')

# Titles of faulty pages, collected across all page files. They are reused
# when the links are analyzed to flag links whose source/target is faulty.
no_html_set = set()
no_lead_set = set()
short_lead_set = set()

# Running totals over all page files.
no_qid = 0
no_html = 0
no_lead = 0
short_lead = 0
good_pages = 0
total_pages = 0

# The short-lead criterion is not counted for 'ja'.
# NOTE(review): presumably because split_text counts space-separated words,
# which does not suit Japanese text -- confirm.
count_short_lead = lang not in ['ja']

for file in tqdm(page_files):
    df_pages = pd.read_parquet(file)
    if 'HTML' in df_pages.columns:
        # Simplify the HTML so the frame is not too big.
        df_pages['HTML'] = df_pages['HTML'].apply(simplify_html)

    # Compute every mask exactly once and reuse it for both the title sets
    # and the counters (the short-lead mask is a Python-level apply, so
    # evaluating it twice per file is costly).
    no_qid_mask = df_pages['QID'].isna()
    no_html_mask = df_pages['page_length'].isna()
    no_lead_mask = df_pages['lead_paragraph'].isna() | (df_pages['lead_paragraph'] == '')
    short_lead_mask = df_pages['lead_paragraph'].apply(lambda x: split_text(x) < 6)

    # Grow the sets in place instead of rebuilding them with union() on
    # every iteration. The short-lead titles are collected for every
    # language, even when they are not counted below.
    no_html_set.update(df_pages.loc[no_html_mask, 'title'])
    no_lead_set.update(df_pages.loc[no_lead_mask, 'title'])
    short_lead_set.update(df_pages.loc[short_lead_mask, 'title'])

    no_qid += no_qid_mask.sum()
    no_html += no_html_mask.sum()
    no_lead += no_lead_mask.sum()

    # A page is good when none of the applicable fault masks is set.
    good_mask = ~no_qid_mask & ~no_html_mask & ~no_lead_mask
    if count_short_lead:
        short_lead += short_lead_mask.sum()
        good_mask = good_mask & ~short_lead_mask
    good_pages += good_mask.sum()

    total_pages += len(df_pages)
# Display the last loaded page frame as a sample of the data.
df_pages

In [None]:
# Read every redirect parquet file and stack the pieces into a single frame.
dfs = [pd.read_parquet(file) for file in tqdm(redirect_files)]
df_redirects = pd.concat(dfs)
df_redirects

In [None]:
# NOTE(review): this cell loads the same redirect files as the cell directly
# above it and produces the same df_redirects -- it looks like an accidental
# duplicate; confirm whether it can be removed.
dfs = list(map(pd.read_parquet, tqdm(redirect_files)))
df_redirects = pd.concat(dfs)
df_redirects

What is the total number of pages?

In [None]:
total_pages

What is the total number of redirects?

In [None]:
len(df_redirects)

What is the number of pages with missing QID?

In [None]:
no_qid

What is the number of pages with missing HTML?

In [None]:
no_html

What is the number of pages without a lead paragraph?

In [None]:
no_lead

What is the number of pages whose lead paragraph has fewer than 6 words?

In [None]:
short_lead

What is the number of good pages if we exclude all the faulty ones?

In [None]:
good_pages

# Analyze Links

In [None]:
# Running totals over all link files: one counter per reason a link can be
# rejected, plus the overall and the "good" link counts.
total_links = 0
link_no_target = 0
link_no_context = 0
link_no_source_qid = 0
link_no_source_lead = 0
link_short_source_lead = 0
link_no_target_qid = 0
link_no_target_html = 0
link_no_target_lead = 0
link_short_target_lead = 0
link_same_target_source = 0
good_links = 0

# The short-lead checks are skipped entirely when lang is 'ja'.
skip_short_lead = lang in ['ja']

for file in tqdm(link_files):
    df_links = pd.read_parquet(file)
    source_titles = df_links['source_title']
    target_titles = df_links['target_title']

    # Fault masks that apply to every language.
    link_no_target_mask = df_links['target_ID'].isna()
    link_no_context_mask = df_links['context'].isna()
    link_no_source_qid_mask = df_links['source_QID'].isna()
    link_no_source_lead_mask = source_titles.isin(no_lead_set)
    link_no_target_qid_mask = df_links['target_QID'].isna()
    link_no_target_html_mask = target_titles.isin(no_html_set)
    link_no_target_lead_mask = target_titles.isin(no_lead_set)
    link_same_target_source_mask = source_titles == target_titles

    total_links += len(df_links)
    link_no_target += link_no_target_mask.sum()
    link_no_context += link_no_context_mask.sum()
    link_no_source_qid += link_no_source_qid_mask.sum()
    link_no_source_lead += link_no_source_lead_mask.sum()
    link_no_target_qid += link_no_target_qid_mask.sum()
    link_no_target_html += link_no_target_html_mask.sum()
    link_no_target_lead += link_no_target_lead_mask.sum()
    link_same_target_source += link_same_target_source_mask.sum()

    # Every mask in this list disqualifies a link.
    faulty_masks = [
        link_no_target_mask,
        link_no_context_mask,
        link_no_source_qid_mask,
        link_no_source_lead_mask,
        link_no_target_qid_mask,
        link_no_target_html_mask,
        link_no_target_lead_mask,
        link_same_target_source_mask,
    ]

    if not skip_short_lead:
        # Short-lead criteria for the source and the target page.
        link_short_source_lead_mask = source_titles.isin(short_lead_set)
        link_short_target_lead_mask = target_titles.isin(short_lead_set)
        link_short_source_lead += link_short_source_lead_mask.sum()
        link_short_target_lead += link_short_target_lead_mask.sum()
        faulty_masks.append(link_short_source_lead_mask)
        faulty_masks.append(link_short_target_lead_mask)

    # A link is good only when none of the applicable fault masks is set.
    good_mask = ~faulty_masks[0]
    for faulty in faulty_masks[1:]:
        good_mask = good_mask & ~faulty
    good_links += good_mask.sum()


How many total links do we have?

In [None]:
total_links

Of all the links we have, how many are missing their target?

In [None]:
link_no_target

What is the number of links where the context couldn't be parsed?

In [None]:
link_no_context

What is the number of links with source pages without QID?

In [None]:
link_no_source_qid

What is the number of links with source pages without lead paragraph?

In [None]:
link_no_source_lead

What is the number of links where the source lead paragraph is too short (fewer than 6 words)?

In [None]:
link_short_source_lead

What is the number of links with target pages without QID?

In [None]:
link_no_target_qid

What is the number of links with target pages without HTML?

In [None]:
link_no_target_html

What is the number of links with target pages without lead paragraph?

In [None]:
link_no_target_lead

What is the number of links where the target lead paragraph is too short (fewer than 6 words)?

In [None]:
link_short_target_lead

What is the number of links where the source is the same as the target?

In [None]:
link_same_target_source

What is the number of good links?

In [None]:
good_links