# Imports and Definitions

In [None]:
import pandas as pd
from glob import glob
from tqdm import tqdm
import os

In [None]:
# Directory that is globbed below for the pages*/links*/redirect* parquet files.
root = '/scratch/tsoares/wikidumps/simplewiki-NS0-20230801/processed_data'

# Get data

In [None]:
# Collect the parquet shards of each table by file-name prefix, then list them.
page_files, link_files, redirect_files = (
    glob(os.path.join(root, pattern))
    for pattern in ("pages*.parquet", "links*.parquet", "redirect*.parquet")
)
print(page_files, link_files, redirect_files, sep='\n')

In [None]:
def simplify_html(html):
    """Collapse an HTML payload to a tiny placeholder that keeps its status.

    Args:
        html: The raw HTML value of one page (a string, or a missing marker).

    Returns:
        ``None`` when the value is missing (``None``, NaN, ``pd.NA``, ...),
        ``''`` when it is the empty string, and ``'a'`` for anything else.
    """
    if html is None:
        return None
    if isinstance(html, str):
        return 'a' if html else ''
    # pandas may hand back NaN / pd.NA instead of None for missing cells;
    # those must stay "missing" so that later isna() counts remain correct.
    if pd.api.types.is_scalar(html) and pd.isna(html):
        return None
    return 'a'

def _load_simplified_pages(path):
    """Read one pages shard and shrink its HTML column with simplify_html."""
    shard = pd.read_parquet(path)
    # Replace the HTML payload with a tiny placeholder so the frame stays small.
    shard['HTML'] = shard['HTML'].apply(simplify_html)
    return shard


# Load every pages shard and stack them into a single frame.
dfs = [_load_simplified_pages(path) for path in tqdm(page_files)]
df_pages = pd.concat(dfs)
df_pages

In [None]:
# Load every links shard and stack them into a single frame.
dfs = [pd.read_parquet(path) for path in tqdm(link_files)]
df_links = pd.concat(dfs)
df_links

In [None]:
# Load every redirect shard and stack them into a single frame.
dfs = [pd.read_parquet(path) for path in tqdm(redirect_files)]
df_redirects = pd.concat(dfs)
df_redirects

# Analyze Pages (Nodes)

What is the total number of pages?

In [None]:
# Row count of the combined pages frame.
df_pages.shape[0]

What is the total number of redirects?

In [None]:
# Row count of the combined redirects frame.
df_redirects.shape[0]

What is the number of pages with missing QID?

In [None]:
# Count the pages whose QID is missing.
int(df_pages['QID'].isna().sum())

What is the number of pages with missing HTML?

In [None]:
# Count the pages whose HTML is missing or the empty string.
html_col = df_pages['HTML']
int((html_col.isna() | html_col.eq('')).sum())

What is the number of pages without a lead paragraph?

In [None]:
# Count the pages whose lead paragraph is missing or the empty string.
lead_col = df_pages['lead_paragraph']
int((lead_col.isna() | lead_col.eq('')).sum())

What is the number of good pages if we exclude all the faulty ones?

In [None]:
# A page is kept when its QID is present and both its HTML and its
# lead paragraph are present and non-empty.
good_page_mask = (
    df_pages['QID'].notna()
    & df_pages['HTML'].notna()
    & df_pages['lead_paragraph'].notna()
    & df_pages['HTML'].ne('')
    & df_pages['lead_paragraph'].ne('')
)
int(good_page_mask.sum())

Save good data

In [None]:
# Write a filtered copy of every pages shard next to the original, keeping only
# rows whose QID is present and whose HTML and lead_paragraph are present and
# non-empty.
for file in tqdm(page_files):
    df = pd.read_parquet(file)
    keep = (
        df['QID'].notna()
        & df['HTML'].notna()
        & df['lead_paragraph'].notna()
        & df['HTML'].ne('')
        & df['lead_paragraph'].ne('')
    )
    df = df[keep].reset_index(drop=True)
    # Rename only the file name, never the directory part: str.replace on the
    # full path would also rewrite any folder whose name contains 'pages'.
    folder, name = os.path.split(file)
    df.to_parquet(os.path.join(folder, name.replace('pages', 'good_pages')))

# Analyze Links

## Auxiliary Data Structures

In [None]:
# Titles of the pages whose HTML is missing or empty, as a set for fast lookup.
missing_html_mask = df_pages['HTML'].isna() | df_pages['HTML'].eq('')
no_html = set(df_pages.loc[missing_html_mask, 'title'].tolist())

In [None]:
# Titles of the pages whose lead paragraph is missing or empty, as a set for
# fast lookup.
missing_lead_mask = df_pages['lead_paragraph'].isna() | df_pages['lead_paragraph'].eq('')
no_lead = set(df_pages.loc[missing_lead_mask, 'title'].tolist())

## Results

How many total links do we have?

In [None]:
# Row count of the combined links frame.
df_links.shape[0]

Of all the links we have, how many are missing their target?

In [None]:
# Count the links whose target_ID is missing.
int(df_links['target_ID'].isna().sum())

What is the number of links where context couldn't be parsed?

In [None]:
# Count the links whose context value is missing.
int(df_links['context'].isna().sum())

What is the number of links with an empty context?

In [None]:
# Count the links whose context is the empty string.
int(df_links['context'].eq('').sum())

What is the number of links with source pages without QID?

In [None]:
# Count the links whose source_QID is missing.
int(df_links['source_QID'].isna().sum())

What is the number of links with source pages without lead paragraph?

In [None]:
# Count the links whose source title is in the no_lead set.
int(df_links['source_title'].isin(no_lead).sum())

What is the number of links with target pages without QID?

In [None]:
# Count the links whose target_QID is missing.
int(df_links['target_QID'].isna().sum())

What is the number of links with target pages without HTML?

In [None]:
# Count the links whose target title is in the no_html set.
int(df_links['target_title'].isin(no_html).sum())

What is the number of links with target pages without lead paragraph?

In [None]:
# Count the links whose target title is in the no_lead set.
int(df_links['target_title'].isin(no_lead).sum())

What is the number of good links?

In [None]:
# A link is kept when its target and both QIDs are present, neither endpoint
# is in the no_html / no_lead sets (as checked below), and its context is
# present and non-empty.
good_link_mask = (
    df_links['target_ID'].notna()
    & df_links['source_QID'].notna()
    & df_links['target_QID'].notna()
    & ~df_links['target_title'].isin(no_html)
    & ~df_links['target_title'].isin(no_lead)
    & ~df_links['source_title'].isin(no_lead)
    & df_links['context'].notna()
    & df_links['context'].ne('')
)
int(good_link_mask.sum())

Save good links

In [None]:
# Write a filtered copy of every links shard next to the original, keeping only
# rows with a target, both QIDs, a non-empty context, and endpoints that are
# not in the no_html / no_lead sets.
for file in tqdm(link_files):
    df = pd.read_parquet(file)
    keep = (
        df['target_ID'].notna()
        & df['source_QID'].notna()
        & df['target_QID'].notna()
        & ~df['target_title'].isin(no_html)
        & ~df['target_title'].isin(no_lead)
        & ~df['source_title'].isin(no_lead)
        & df['context'].notna()
        & df['context'].ne('')
    )
    df = df[keep].reset_index(drop=True)
    # Rename only the file name, never the directory part: str.replace on the
    # full path would also rewrite any folder whose name contains 'links'.
    folder, name = os.path.split(file)
    df.to_parquet(os.path.join(folder, name.replace('links', 'good_links')))

In [None]:
# Print ten randomly drawn links (one fresh draw per iteration) for a quick
# visual sanity check.
printed_fields = (
    ('SOURCE', 'source_title'),
    ('TARGET', 'target_title'),
    ('CONTEXT', 'context'),
    ('SENTENCE', 'sentence'),
    ('SECTION', 'source_section'),
)
for _ in range(10):
    row = df_links.sample()
    for label, column in printed_fields:
        print(label, row[column].item())
    print('###############')