# Imports and Definitions

In [None]:
import pandas as pd
from glob import glob
from tqdm import tqdm
import os
tqdm.pandas()

In [None]:
# Directory that the cells below search for pages*/links*/redirect* parquet files.
root = '/scratch/tsoares/wikidumps/simplewiki-NS0-20231001/processed_data'

# Get data

In [None]:
# Collect the parquet shards of each kind found directly under `root`.
page_files, link_files, redirect_files = (
    glob(os.path.join(root, pattern))
    for pattern in ("pages*.parquet", "links*.parquet", "redirect*.parquet")
)
# Echo the three file lists so the discovered shards can be checked by eye.
for found in (page_files, link_files, redirect_files):
    print(found)

In [None]:
def simplify_html(html):
    """Reduce an HTML value to a tiny marker that only records its presence.

    Returns None for None, '' for the empty string, and the one-character
    placeholder 'a' for anything else.
    """
    if html is None:
        return None
    # Everything that is not the empty string collapses to the placeholder.
    return '' if html == '' else 'a'

def _read_page_shard(path):
    """Read one pages shard and replace its HTML column with presence markers."""
    shard = pd.read_parquet(path)
    # Simplify the HTML so the combined frame is not too big.
    shard['HTML'] = shard['HTML'].apply(simplify_html)
    return shard


# Load every pages shard and stack them into a single frame.
dfs = [_read_page_shard(path) for path in tqdm(page_files)]
df_pages = pd.concat(dfs)
df_pages

In [None]:
# Read each links shard (with a progress bar) and stack them into one frame.
dfs = [pd.read_parquet(path) for path in tqdm(link_files)]
df_links = pd.concat(dfs)
df_links

In [None]:
# Read each redirects shard (with a progress bar) and stack them into one frame.
dfs = [pd.read_parquet(shard_path) for shard_path in tqdm(redirect_files)]
df_redirects = pd.concat(dfs)
df_redirects

# Analyze Pages (Nodes)

What is the total number of pages?

In [None]:
# Row count of the combined pages frame.
len(df_pages.index)

What is the total number of redirects?

In [None]:
# Row count of the combined redirects frame.
df_redirects.shape[0]

What is the number of pages with missing QID?

In [None]:
# Row mask: True for every page whose QID is missing.
no_qid = pd.isna(df_pages['QID'])
len(df_pages.loc[no_qid])

What is the number of pages with missing HTML?

In [None]:
# Row mask: a page lacks HTML when the value is missing or the empty string.
html_column = df_pages['HTML']
no_html = html_column.isna() | html_column.eq('')
len(df_pages.loc[no_html])

What is the number of pages without a lead paragraph?

In [None]:
# Row mask: a page lacks a lead paragraph unless the value is both present
# and different from the empty string.
lead_column = df_pages['lead_paragraph']
no_lead = ~(lead_column.notna() & lead_column.ne(''))
len(df_pages.loc[no_lead])

What is the number of pages whose lead paragraph has fewer than 6 words?

In [None]:
# number of pages where lead paragraph has less than 6 words
def split_text(x):
    """Count the words in ``x``, capped at 6.

    Words are runs of non-whitespace characters, so repeated spaces,
    leading/trailing whitespace, tabs and newlines do not inflate the count.
    The split stops after five separators because callers only need to know
    whether the text reaches 6 words.

    Args:
        x: The text to measure; any non-string value (None, NaN, ...) is
            treated as missing.

    Returns:
        An int between 0 and 6; 0 for missing or whitespace-only text.
    """
    if not isinstance(x, str):
        return 0
    # sep=None splits on any whitespace run and drops empty pieces, unlike
    # sep=' ' which yields an empty "word" for every extra space.
    return len(x.split(None, 5))
# Row mask: True where split_text reports fewer than 6 for the lead paragraph.
short_lead = df_pages['lead_paragraph'].apply(split_text) < 6
len(df_pages.loc[short_lead])

What is the number of good pages if we exclude all the faulty ones?

In [None]:
# A page is counted as good only when none of the four fault masks flags it.
any_page_fault = no_qid | no_html | no_lead | short_lead
len(df_pages[~any_page_fault])

Save good data

In [None]:
# for file in tqdm(page_files):
#     df = pd.read_parquet(file)
#     df = df[(~df['QID'].isna()) & (~df['HTML'].isna()) & (~df['lead_paragraph'].isna()) & (df['HTML'] != '') & (df['lead_paragraph'] != '') & (df['lead_paragraph'].apply(lambda x: split_text(x) >= 6))]
#     df = df.reset_index(drop=True)
#     df.to_parquet(file.replace('pages', 'good_pages'))

# Analyze Links

## Auxiliary Data Structures

In [None]:
# Titles of the pages whose HTML is missing or empty, collected into a set.
# NOTE: this rebinds `no_html`, which earlier in the notebook held a boolean
# row mask over df_pages.
html_missing = df_pages['HTML'].isna() | (df_pages['HTML'] == '')
no_html = set(df_pages.loc[html_missing, 'title'])

In [None]:
# Titles of the pages whose lead paragraph is missing or empty, as a set.
# NOTE: this rebinds `no_lead`, which earlier in the notebook held a boolean
# row mask over df_pages.
lead_missing = df_pages['lead_paragraph'].isna() | (df_pages['lead_paragraph'] == '')
no_lead = set(df_pages.loc[lead_missing, 'title'])

In [None]:
# Titles of the pages for which split_text reports fewer than 6 on the lead
# paragraph, as a set.
# NOTE: this rebinds `short_lead`, which earlier in the notebook held a
# boolean row mask over df_pages.
lead_too_short = df_pages['lead_paragraph'].apply(split_text) < 6
short_lead = set(df_pages.loc[lead_too_short, 'title'])

## Results

How many total links do we have?

In [None]:
# Row count of the combined links frame.
len(df_links.index)

Of all the links we have, how many are missing their target?

In [None]:
# Row mask: True for every link whose target_ID is missing.
no_target = pd.isna(df_links['target_ID'])
len(df_links.loc[no_target])

What is the number of links where the context couldn't be parsed?

In [None]:
# Row mask: True for every link whose context value is missing.
no_context = df_links['context'].isnull()
len(df_links.loc[no_context])

What is the number of links with source pages without QID?

In [None]:
# Row mask: True for every link whose source_QID is missing.
no_source_qid = pd.isna(df_links['source_QID'])
len(df_links.loc[no_source_qid])

What is the number of links with source pages without lead paragraph?

In [None]:
# Row mask: True where the link's source title is in the `no_lead` title set.
source_titles = df_links['source_title']
no_source_lead = source_titles.isin(no_lead)
len(df_links.loc[no_source_lead])

What is the number of links where the source lead paragraph is too short (fewer than 6 words)?

In [None]:
# Row mask: True where the link's source title is in the `short_lead` title set.
source_titles = df_links['source_title']
short_source_lead = source_titles.isin(short_lead)
len(df_links.loc[short_source_lead])

What is the number of links with target pages without QID?

In [None]:
# Row mask: True for every link whose target_QID is missing.
no_target_qid = df_links['target_QID'].isnull()
len(df_links.loc[no_target_qid])

What is the number of links with target pages without HTML?

In [None]:
# Row mask: True where the link's target title is in the `no_html` title set.
target_titles = df_links['target_title']
target_no_html = target_titles.isin(no_html)
len(df_links.loc[target_no_html])

What is the number of links with target pages without lead paragraph?

In [None]:
# Row mask: True where the link's target title is in the `no_lead` title set.
target_titles = df_links['target_title']
target_no_lead = target_titles.isin(no_lead)
len(df_links.loc[target_no_lead])

What is the number of links where the target lead paragraph is too short (fewer than 6 words)?

In [None]:
# Row mask: True where the link's target title is in the `short_lead` title set.
target_titles = df_links['target_title']
target_short_lead = target_titles.isin(short_lead)
len(df_links.loc[target_short_lead])

What is the number of links where the source is the same as the target?

In [None]:
# Row mask: True for links whose target title equals their own source title.
target_same_source = df_links['target_title'].eq(df_links['source_title'])
len(df_links.loc[target_same_source])

What is the number of good links?

In [None]:
# A link is counted as good only when none of the ten fault masks flags it.
any_link_fault = (
    no_target | no_context | no_source_qid | no_source_lead
    | short_source_lead | no_target_qid | target_no_html
    | target_no_lead | target_short_lead | target_same_source
)
len(df_links[~any_link_fault])

Save good links

In [None]:
# for file in tqdm(link_files):
#     df = pd.read_parquet(file)
#     df = df[(~df['target_ID'].isna()) & (~df['source_QID'].isna()) & (~df['target_QID'].isna()) & (~df['target_title'].isin(no_html)) & (~df['target_title'].isin(no_lead)) & (~df['source_title'].isin(no_lead)) & (~df['context'].isna()) & (~df['source_title'].isin(short_lead)) & (~df['target_title'].isin(short_lead)) & (df['source_title'] != df['target_title'])]
#     df = df.reset_index(drop=True)
#     df.to_parquet(file.replace('links', 'good_links'))

In [None]:
def _context_slice(context, start, end):
    """Return ``context[start:end]``, or None when any of the pieces is missing.

    df_links is sampled unfiltered, so a row may have no context or
    missing (NaN) span indices; slicing such a row would raise.
    """
    if not isinstance(context, str) or pd.isna(start) or pd.isna(end):
        return None
    return context[int(start):int(end)]


# (label, column) pairs printed verbatim for each sampled link, in order.
_SAMPLE_FIELDS = [
    ('SOURCE', 'source_title'),
    ('TARGET', 'target_title'),
    ('CONTEXT', 'context'),
    ('SENTENCE', 'sentence'),
    ('MENTION', 'mention'),
    ('CONTEXT SPAN START INDEX', 'context_span_start_index'),
    ('CONTEXT SPAN END INDEX', 'context_span_end_index'),
    ('CONTEXT SENTENCE START INDEX', 'context_sentence_start_index'),
    ('CONTEXT SENTENCE END INDEX', 'context_sentence_end_index'),
    ('CONTEXT MENTION START INDEX', 'context_mention_start_index'),
    ('CONTEXT MENTION END INDEX', 'context_mention_end_index'),
]

# Print 10 randomly drawn links for a manual sanity check of the extracted
# spans: the "TEST" lines re-slice the context with the stored indices so they
# can be compared by eye against the SENTENCE and MENTION values.
for _ in range(10):
    sample = df_links.sample()
    for label, column in _SAMPLE_FIELDS:
        print(label, sample[column].item())
    context = sample['context'].item()
    print('SENTENCE TEST', _context_slice(
        context,
        sample['context_sentence_start_index'].item(),
        sample['context_sentence_end_index'].item(),
    ))
    print('MENTION TEST', _context_slice(
        context,
        sample['context_mention_start_index'].item(),
        sample['context_mention_end_index'].item(),
    ))
    print('SECTION', sample['source_section'].item())
    print('###############')