# Imports and Definitions

In [3]:
import pandas as pd
from glob import glob
from tqdm import tqdm
import os

In [5]:
# Base directory: the input parquet files are read from here and the filtered
# "good" tables are written back here.
root = "/scratch/tsoares/wikidumps/simplewiki-NS0-20231001/processed_data"

# Get data

In [8]:
# Collect the parquet files for each table. glob() returns matches in
# arbitrary, filesystem-dependent order, so sort them to keep the row order of
# the concatenated frames reproducible across runs and machines.
page_files = sorted(glob(os.path.join(root, "pages*.parquet")))
link_files = sorted(glob(os.path.join(root, "links*.parquet")))
redirect_files = sorted(glob(os.path.join(root, "redirect*.parquet")))

In [24]:
# Read every pages file and stack the pieces into a single frame.
page_frames = [pd.read_parquet(path) for path in page_files]
df_pages = pd.concat(page_frames)
df_pages

Unnamed: 0,ID,title,language,QID,HTML,page_length,lead_paragraph,version
0,1,April,simple,Q118,"<!DOCTYPE html>\n<html prefix=""dc: http://purl...",99917.0,April (Apr.) is the fourth month of the year i...,https://simple.wikipedia.org/w/index.php?title...
1,2,August,simple,Q122,"<!DOCTYPE html>\n<html prefix=""dc: http://purl...",76710.0,August (Aug.) is the eighth month of the year ...,https://simple.wikipedia.org/w/index.php?title...
2,6,Art,simple,Q735,"<!DOCTYPE html>\n<html prefix=""dc: http://purl...",30897.0,Art is a creative activity and technical skill...,https://simple.wikipedia.org/w/index.php?title...
3,8,A,simple,Q9659,"<!DOCTYPE html>\n<html prefix=""dc: http://purl...",88163.0,A or a is the first letter of the English alph...,https://simple.wikipedia.org/w/index.php?title...
4,9,Air,simple,Q7391292,"<!DOCTYPE html>\n<html prefix=""dc: http://purl...",20548.0,Air is the Earth's atmosphere. Air is a mixtur...,https://simple.wikipedia.org/w/index.php?title...
...,...,...,...,...,...,...,...,...
240087,1051195,109_%28number%29,simple,Q604950,,,,
240088,1051196,110_%28number%29,simple,Q715456,,,,
240089,1051200,Krste_Petkov_Misirkov,simple,,,,,
240090,1051218,Anna_Odobescu,simple,Q61947352,,,,


In [25]:
# Read every links file and stack the pieces into a single frame.
link_frames = [pd.read_parquet(path) for path in link_files]
df_links = pd.concat(link_frames)
df_links

Unnamed: 0,target_title,target_section,source_title,target_ID,target_QID,source_ID,source_QID,mention,source_section,link_start_index,link_end_index,sentence,sentence_start_index,sentence_end_index,source_page_length,link_section_depth
0,Month,Lead,April,3641,Q5151,1,Q118,month,Lead,4613,4682,) is the fourth month of the year in the Julia...,4597,5126,99917,0.0.0
1,Year,Lead,April,944,Q577,1,Q118,year,Lead,4690,4756,) is the fourth month of the year in the Julia...,4597,5126,99917,0.0.0
2,Julian_calendar,Lead,April,12158,Q11184,1,Q118,Julian,Lead,4764,4854,) is the fourth month of the year in the Julia...,4597,5126,99917,0.0.0
3,Gregorian_calendar,Lead,April,12159,Q12138,1,Q118,Gregorian calendars,Lead,4859,4968,) is the fourth month of the year in the Julia...,4597,5126,99917,0.0.0
4,March,Lead,April,468,Q110,1,Q118,March,Lead,4988,5057,) is the fourth month of the year in the Julia...,4597,5126,99917,0.0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2758165,Tom_Petty,Lead,Yer_So_Bad,594236,Q311655,1051098,Q22043272,Tom Petty,Lead,12904,12985,"\n""Yer So Bad"" is a 1990 song by Tom Petty and...",12690,13325,34920,0.0.0
2758166,Anime,Lead,Angel_Beats%21,15044,Q1107,1051107,Q531552,anime,Lead,1573,1642,is an anime series.,1562,1650,33446,0.0.0
2758167,Visual_novel,Lead,Angel_Beats%21,393148,Q689445,1051107,Q531552,visual novel,Lead,3953,4043,[2][3] There was a visual novel of the anime r...,3282,4115,33446,0.0.0
2758168,Blu-ray_Disc,Lead,Angel_Beats%21,47945,Q47770,1051107,Q531552,Blu-ray,Lead,5389,5484,[5][6] The Blu-ray discs sold very well.,4726,5506,33446,0.0.0


In [26]:
# Read every redirect file and stack the pieces into a single frame.
redirect_frames = [pd.read_parquet(path) for path in redirect_files]
df_redirects = pd.concat(redirect_frames)
df_redirects

Unnamed: 0_level_0,redirect
title,Unnamed: 1_level_1
Albigensian,Catharism
American_Units_Of_Measurement,United_States_customary_units
As,AS
Animalia,Animal
Bootstrap,Boot
...,...
Hol,HOL
Voss_%28village%29,Vossevangen
Grandmaster_Sexay,Brian_Christopher
Perfume_%28J-Pop%29,Perfume_%28Japanese_band%29


# Analyze Pages (Nodes)

What is the total number of pages?

In [12]:
# Row count of the pages table.
df_pages.shape[0]

240092

What is the total number of redirects?

In [13]:
# Row count of the redirects table.
df_redirects.shape[0]

93857

What is the number of pages with missing QID?

In [14]:
# Count the rows whose QID is null; int() keeps the displayed value a plain integer.
int(df_pages['QID'].isna().sum())

96

What is the number of pages with missing HTML?

In [16]:
# HTML counts as missing when it is null or an empty string.
html_missing = df_pages['HTML'].isna() | df_pages['HTML'].eq('')
int(html_missing.sum())

726

What is the number of pages without a lead paragraph?

In [17]:
# The lead paragraph counts as missing when it is null or an empty string.
lead_missing = df_pages['lead_paragraph'].isna() | df_pages['lead_paragraph'].eq('')
int(lead_missing.sum())

1385

How many good pages remain if we exclude all the faulty ones (pages missing a QID, HTML, or a lead paragraph)?

In [19]:
# A page is "good" when QID, HTML and lead paragraph are all present
# (non-null, and non-empty for the two text columns).
has_qid = df_pages['QID'].notna()
has_html = df_pages['HTML'].notna() & df_pages['HTML'].ne('')
has_lead = df_pages['lead_paragraph'].notna() & df_pages['lead_paragraph'].ne('')
int((has_qid & has_html & has_lead).sum())

238622

Save good data

In [27]:
# Keep only pages that have a QID, non-empty HTML and a non-empty lead
# paragraph, then write them next to the input data.
good_pages_mask = (
    df_pages['QID'].notna()
    & df_pages['HTML'].notna()
    & df_pages['HTML'].ne('')
    & df_pages['lead_paragraph'].notna()
    & df_pages['lead_paragraph'].ne('')
)
good_pages_path = os.path.join(root, 'good_pages.parquet')
df_pages[good_pages_mask].to_parquet(good_pages_path)

# Analyze Links

## Auxiliary Data Structures

In [32]:
# Titles of pages whose HTML is null or an empty string, kept as a set for
# fast membership checks against link titles.
html_missing = df_pages['HTML'].isna() | df_pages['HTML'].eq('')
no_html = set(df_pages.loc[html_missing, 'title'])

In [33]:
# Titles of pages whose lead paragraph is null or an empty string, kept as a
# set for fast membership checks against link titles.
lead_missing = df_pages['lead_paragraph'].isna() | df_pages['lead_paragraph'].eq('')
no_lead = set(df_pages.loc[lead_missing, 'title'])

## Results

How many total links do we have?

In [20]:
# Row count of the links table.
df_links.shape[0]

2758170

Of all the links we have, how many couldn't be parsed (e.g., target page not in storage)?

In [21]:
# Links whose target_ID is null.
int(df_links['target_ID'].isna().sum())

620

What is the number of links with source pages without QID?

In [22]:
# Links whose source_QID is null.
int(df_links['source_QID'].isna().sum())

774

What is the number of links with target pages without QID?

In [34]:
# Links whose target_QID is null.
int(df_links['target_QID'].isna().sum())

859

What is the number of links with target pages without HTML?

In [35]:
# Links whose target title is in the no_html set.
int(df_links['target_title'].isin(no_html).sum())

3366

What is the number of links with target pages without lead paragraph?

In [36]:
# Links whose target title is in the no_lead set.
int(df_links['target_title'].isin(no_lead).sum())

40326

What is the number of links with source pages without lead paragraph?

In [37]:
# Links whose source title is in the no_lead set.
int(df_links['source_title'].isin(no_lead).sum())

15199

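What is the number of good links (target resolved, source and target both have a QID, target page has HTML and a lead paragraph, and source page has a lead paragraph)?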

In [38]:
# A link is "good" when its target was resolved, both endpoints have a QID,
# the target page has HTML and a lead paragraph, and the source page has a
# lead paragraph.
target_resolved = df_links['target_ID'].notna()
qids_present = df_links['source_QID'].notna() & df_links['target_QID'].notna()
# Excluding titles in either set is the same as excluding their union.
target_page_ok = ~df_links['target_title'].isin(no_html | no_lead)
source_page_ok = ~df_links['source_title'].isin(no_lead)
int((target_resolved & qids_present & target_page_ok & source_page_ok).sum())

2701844

Save good links

In [39]:
# Keep only links with a resolved target, QIDs on both endpoints, a target
# page that has HTML and a lead paragraph, and a source page that has a lead
# paragraph; then write them next to the input data.
good_links_mask = (
    df_links['target_ID'].notna()
    & df_links['source_QID'].notna()
    & df_links['target_QID'].notna()
    # Excluding titles in either set is the same as excluding their union.
    & ~df_links['target_title'].isin(no_html | no_lead)
    & ~df_links['source_title'].isin(no_lead)
)
good_links_path = os.path.join(root, 'good_links.parquet')
df_links[good_links_mask].to_parquet(good_links_path)