# Imports and Definitions

In [25]:
import pandas as pd
from glob import glob
from tqdm import tqdm
import os

In [26]:
# Directory that holds the processed parquet files read below (pages*, links*,
# redirect*). Defaults to the original cluster location; set the
# WIKI_PROCESSED_DATA environment variable to point the notebook at another
# copy of the data without editing this cell.
root = os.environ.get(
    'WIKI_PROCESSED_DATA',
    '/scratch/tsoares/wikidumps/simplewiki-NS0-20231001/processed_data',
)

# Get data

In [27]:
def _raw_parquet_files(pattern):
    """Return the sorted input files under `root` that match `pattern`.

    The "Save good ..." cells of this notebook write `<name>_good.parquet`
    next to each input file. Those outputs also match `pages*.parquet` and
    `links*.parquet`, so on a second run they would be loaded as inputs
    (double-counting rows) and re-saved as `<name>_good_good.parquet`.
    They are therefore excluded here. Sorting makes the order deterministic,
    since glob gives no ordering guarantee.
    """
    matches = glob(os.path.join(root, pattern))
    return sorted(path for path in matches if not path.endswith('_good.parquet'))


page_files = _raw_parquet_files("pages*.parquet")
link_files = _raw_parquet_files("links*.parquet")
redirect_files = _raw_parquet_files("redirect*.parquet")
print(page_files)
print(link_files)
print(redirect_files)

['/scratch/tsoares/wikidumps/simplewiki-NS0-20231001/processed_data/pages_0.parquet']
['/scratch/tsoares/wikidumps/simplewiki-NS0-20231001/processed_data/links_0.parquet']
['/scratch/tsoares/wikidumps/simplewiki-NS0-20231001/processed_data/redirect_map.parquet']


In [28]:
def simplify_html(html):
    """Replace a page's HTML with a tiny placeholder, keeping "missing" and "empty" distinct.

    Args:
        html: The HTML cell value: a string, or a missing value
            (None, NaN or pd.NA).

    Returns:
        The input unchanged when it is missing, '' when it is the empty
        string, and the one-character marker 'a' for anything else.
    """
    # Missing values are passed through untouched so that later `.isna()`
    # checks still see them. Checking only `is None` would turn NaN into 'a'
    # and would raise on pd.NA, whose comparison result has no truth value.
    if html is None or pd.isna(html):
        return html
    if html == '':
        return ''
    return 'a'

# Load every pages file. The HTML column is passed through simplify_html so
# the combined frame does not carry the full page markup and stays small.
page_frames = [
    pd.read_parquet(shard_path).assign(
        HTML=lambda frame: frame['HTML'].apply(simplify_html)
    )
    for shard_path in page_files
]
df_pages = pd.concat(page_frames)
df_pages

Unnamed: 0,title,ID,version,HTML,page_length,language,lead_paragraph,QID
0,1280s,24221,https://simple.wikipedia.org/w/index.php?title...,a,7848.0,simple,,Q83352
1,1283,69408,https://simple.wikipedia.org/w/index.php?title...,a,6277.0,simple,1283 was a common year.,Q5543
2,1272,46219,https://simple.wikipedia.org/w/index.php?title...,a,13916.0,simple,,Q5524
3,1267,4652,https://simple.wikipedia.org/w/index.php?title...,a,13275.0,simple,,Q5519
4,1296,39604,https://simple.wikipedia.org/w/index.php?title...,a,5002.0,simple,The year 1296 was a leap year which started on...,Q5561
...,...,...,...,...,...,...,...,...
240242,109_%28number%29,1051195,,,,simple,,Q604950
240243,110_%28number%29,1051196,,,,simple,,Q715456
240244,Krste_Petkov_Misirkov,1051200,,,,simple,,
240245,Anna_Odobescu,1051218,,,,simple,,Q61947352


In [29]:
# Read each links file and stack them into a single frame.
link_frames = [pd.read_parquet(shard_path) for shard_path in link_files]
df_links = pd.concat(link_frames)
df_links

Unnamed: 0,target_title,target_section,source_title,target_ID,target_QID,source_ID,source_QID,mention,source_section,link_start_index,link_end_index,sentence,sentence_start_index,sentence_end_index,source_page_length,link_section_depth
0,1282,Lead,1280s,22372,Q5541,24221,Q83352,1282,Events,4763,4829,"1282 – March – Dafydd ap Gruffydd, brother to ...",4735,5643,7848,1.0.0
1,March,Lead,1280s,468,Q110,24221,Q83352,March,Events,4832,4901,"1282 – March – Dafydd ap Gruffydd, brother to ...",4735,5643,7848,1.0.0
2,Dafydd_ap_Gruffydd,Lead,1280s,813653,Q592561,24221,Q83352,Dafydd ap Gruffydd,Events,4904,5012,"1282 – March – Dafydd ap Gruffydd, brother to ...",4735,5643,7848,1.0.0
3,Prince_of_Wales,Lead,1280s,67672,Q180729,24221,Q83352,Prince of Wales,Events,5025,5124,"1282 – March – Dafydd ap Gruffydd, brother to ...",4735,5643,7848,1.0.0
4,Llywelyn_the_Last,Lead,1280s,885492,Q369651,24221,Q83352,Llywelyn the Last,Events,5125,5230,"1282 – March – Dafydd ap Gruffydd, brother to ...",4735,5643,7848,1.0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2758418,CM_Punk,Lead,Maxwell_Jacob_Friedman,134378,Q215447,1051087,Q61102628,CM Punk,Championships,32228,32303,CM Punk)[28]\n,32227,32655,108191,2.0.0
2758419,Ring_of_Honor,Lead,Maxwell_Jacob_Friedman,360980,Q588459,1051087,Q61102628,Ring of Honor,Championships,33171,33264,Ring of Honor\n,33144,33269,108191,2.0.0
2758420,Adam_Cole,Lead,Maxwell_Jacob_Friedman,864331,Q928045,1051087,Q61102628,Adam Cole,Championships,33645,33726,"ROH World Tag Team Championship (1 time, curre...",33269,34088,108191,2.0.0
2758421,The_New_York_Times,Lead,Maxwell_Jacob_Friedman,70967,Q9684,1051087,Q61102628,The New York Times,Championships,34580,34688,The New York Times\n,34553,34693,108191,2.0.0


In [30]:
# Read each redirect file and stack them into a single frame.
dfs = [pd.read_parquet(redirect_path) for redirect_path in redirect_files]
df_redirects = pd.concat(dfs)
df_redirects

Unnamed: 0_level_0,redirect
title,Unnamed: 1_level_1
Albigensian,Catharism
American_Units_Of_Measurement,United_States_customary_units
As,AS
Animalia,Animal
Bootstrap,Boot
...,...
Hol,HOL
Voss_%28village%29,Vossevangen
Grandmaster_Sexay,Brian_Christopher
Perfume_%28J-Pop%29,Perfume_%28Japanese_band%29


# Analyze Pages (Nodes)

What is the total number of pages?

In [31]:
# Number of rows in the combined pages frame.
df_pages.shape[0]

240247

What is the total number of redirects?

In [32]:
# Number of rows in the combined redirects frame.
df_redirects.shape[0]

93857

What is the number of pages with missing QID?

In [33]:
# Count the rows whose QID is null.
int(df_pages['QID'].isna().sum())

184

What is the number of pages with missing HTML?

In [34]:
# HTML counts as missing when the value is null or an empty string.
html_missing = df_pages['HTML'].isna() | df_pages['HTML'].eq('')
int(html_missing.sum())

726

What is the number of pages without a lead paragraph?

In [35]:
# A lead paragraph counts as missing when it is null or an empty string.
lead_missing = df_pages['lead_paragraph'].isna() | df_pages['lead_paragraph'].eq('')
int(lead_missing.sum())

1406

What is the number of good pages if we exclude all the faulty ones?

In [36]:
# A page is faulty if its QID, HTML or lead paragraph is null, or if either
# text field is an empty string. Good pages are everything else.
faulty_page = (
    df_pages['QID'].isna()
    | df_pages['HTML'].isna()
    | df_pages['lead_paragraph'].isna()
    | df_pages['HTML'].eq('')
    | df_pages['lead_paragraph'].eq('')
)
int((~faulty_page).sum())

238680

Save good pages

In [37]:
# Write a filtered copy of each pages file next to the original, with
# `_good` appended to the file name. Each file is read again from disk rather
# than reusing df_pages, so the saved copy contains whatever is stored in
# the file itself.
for shard_path in page_files:
    pages = pd.read_parquet(shard_path)
    # Same rule as the "good pages" count: drop rows with a null QID / HTML /
    # lead paragraph or an empty HTML / lead paragraph.
    faulty = (
        pages['QID'].isna()
        | pages['HTML'].isna()
        | pages['lead_paragraph'].isna()
        | pages['HTML'].eq('')
        | pages['lead_paragraph'].eq('')
    )
    pages[~faulty].to_parquet(shard_path.replace('.parquet', '_good.parquet'))

# Analyze Links

## Auxiliary Data Structures

In [38]:
# Titles of the pages whose HTML is null or empty, kept as a set so the
# `isin` lookups on the links frame are cheap.
html_absent = df_pages['HTML'].isna() | df_pages['HTML'].eq('')
no_html = set(df_pages.loc[html_absent, 'title'])

In [39]:
# Titles of the pages whose lead paragraph is null or empty, kept as a set so
# the `isin` lookups on the links frame are cheap.
lead_absent = df_pages['lead_paragraph'].isna() | df_pages['lead_paragraph'].eq('')
no_lead = set(df_pages.loc[lead_absent, 'title'])

## Results

How many total links do we have?

In [40]:
# Number of rows in the combined links frame.
df_links.shape[0]

2758423

Of all the links we have, how many couldn't be parsed (e.g., target page not in storage)?

In [41]:
# Links whose target_ID is null.
int(df_links['target_ID'].isna().sum())

636

What is the number of links with source pages without QID?

In [42]:
# Links whose source_QID is null.
int(df_links['source_QID'].isna().sum())

1633

What is the number of links with target pages without QID?

In [43]:
# Links whose target_QID is null.
int(df_links['target_QID'].isna().sum())

868

What is the number of links with target pages without HTML?

In [44]:
# Links whose target title is in the set of pages with no HTML.
int(df_links['target_title'].isin(no_html).sum())

3367

What is the number of links with target pages without a lead paragraph?

In [45]:
# Links whose target title is in the set of pages with no lead paragraph.
int(df_links['target_title'].isin(no_lead).sum())

40545

What is the number of links with source pages without a lead paragraph?

In [46]:
# Links whose source title is in the set of pages with no lead paragraph.
int(df_links['source_title'].isin(no_lead).sum())

15211

What is the number of good links?

In [47]:
# A link is bad if its target ID or either QID is null, if its target page
# has no HTML or no lead paragraph, or if its source page has no lead
# paragraph. Good links are everything else.
bad_link = (
    df_links['target_ID'].isna()
    | df_links['source_QID'].isna()
    | df_links['target_QID'].isna()
    | df_links['target_title'].isin(no_html)
    | df_links['target_title'].isin(no_lead)
    | df_links['source_title'].isin(no_lead)
)
int((~bad_link).sum())

2701217

Save good links

In [48]:
# Write a filtered copy of each links file next to the original, with
# `_good` appended to the file name.
for file in link_files:
    df = pd.read_parquet(file)
    # Same rule as the "good links" count, written as "drop anything bad".
    unresolved = df['target_ID'].isna() | df['source_QID'].isna() | df['target_QID'].isna()
    bad_target = df['target_title'].isin(no_html) | df['target_title'].isin(no_lead)
    bad_source = df['source_title'].isin(no_lead)
    df = df[~(unresolved | bad_target | bad_source)]
    df.to_parquet(file.replace('.parquet', '_good.parquet'))