# Imports and Definitions

In [1]:
import pandas as pd
from glob import glob
from tqdm import tqdm
import os

In [2]:
root = '/scratch/tsoares/wikidumps/simplewiki-NS0-20231001/processed_data'

# Get data

In [3]:
page_files = glob(os.path.join(root, "pages*.parquet"))
link_files = glob(os.path.join(root, "links*.parquet"))
redirect_files = glob(os.path.join(root, "redirect*.parquet"))
print(page_files)
print(link_files)
print(redirect_files)

['/scratch/tsoares/wikidumps/simplewiki-NS0-20231001/processed_data/pages_0.parquet', '/scratch/tsoares/wikidumps/simplewiki-NS0-20231001/processed_data/pages_1.parquet']
['/scratch/tsoares/wikidumps/simplewiki-NS0-20231001/processed_data/links_1.parquet', '/scratch/tsoares/wikidumps/simplewiki-NS0-20231001/processed_data/links_0.parquet']
['/scratch/tsoares/wikidumps/simplewiki-NS0-20231001/processed_data/redirect_map.parquet']


In [4]:
def simplify_html(html):
    if html is None:
        return None
    if html == '':
        return ''
    return 'a'

dfs = []
for file in page_files:
    temp_df = pd.read_parquet(file)
    temp_df['HTML'] = temp_df['HTML'].apply(lambda x: simplify_html(x)) # simpify html so it is not too big
    dfs.append(temp_df)
df_pages = pd.concat(dfs)
df_pages

Unnamed: 0,title,ID,version,HTML,page_length,language,lead_paragraph,QID
0,1280s,24221,https://simple.wikipedia.org/w/index.php?title...,a,7848.0,simple,,Q83352
1,1283,69408,https://simple.wikipedia.org/w/index.php?title...,a,6277.0,simple,1283 was a common year.,Q5543
2,1272,46219,https://simple.wikipedia.org/w/index.php?title...,a,13916.0,simple,,Q5524
3,1267,4652,https://simple.wikipedia.org/w/index.php?title...,a,13275.0,simple,,Q5519
4,1296,39604,https://simple.wikipedia.org/w/index.php?title...,a,5002.0,simple,The year 1296 was a leap year which started on...,Q5561
...,...,...,...,...,...,...,...,...
40242,109_%28number%29,1051195,,,,simple,,Q604950
40243,110_%28number%29,1051196,,,,simple,,Q715456
40244,Krste_Petkov_Misirkov,1051200,,,,simple,,
40245,Anna_Odobescu,1051218,,,,simple,,Q61947352


In [5]:
dfs = []
for file in link_files:
    dfs.append(pd.read_parquet(file))
df_links = pd.concat(dfs)
df_links

Unnamed: 0,target_title,target_section,source_title,target_ID,target_QID,source_ID,source_QID,mention,source_section,link_start_index,link_end_index,sentence,sentence_raw,sentence_start_index,sentence_end_index,source_page_length,link_section_depth,context
0,Georgian_language,Lead,David_Khakhaleishvili,52487,Q8108,813041,Q155514,Georgian,Lead,1600,1875,David Khakhaleishvili (Georgian: დავით ხახალეი...,"<section data-mw-section-id=""0"" id=""mwAQ""><p i...",16,2464,9058,0.0.0,"He was born in Kutaisi, Georgia. He won the g..."
1,Judo,Lead,David_Khakhaleishvili,78553,Q11420,813041,Q155514,judoka,Lead,2233,2301,David Khakhaleishvili (Georgian: დავით ხახალეი...,"<section data-mw-section-id=""0"" id=""mwAQ""><p i...",16,2464,9058,0.0.0,"He was born in Kutaisi, Georgia. He won the g..."
2,Mixed_martial_arts,Lead,David_Khakhaleishvili,31326,Q114466,813041,Q155514,mixed martial artist,Lead,2303,2437,David Khakhaleishvili (Georgian: დავით ხახალეი...,"<section data-mw-section-id=""0"" id=""mwAQ""><p i...",16,2464,9058,0.0.0,"He was born in Kutaisi, Georgia. He won the g..."
3,Georgia_%28country%29,Lead,David_Khakhaleishvili,14986,Q230,813041,Q155514,Georgia,Lead,2709,2804,"He was born in Kutaisi, Georgia.","He was born in <a rel=""mw:WikiLink"" href=""./K...",2464,2805,9058,0.0.0,David Khakhaleishvili (Georgian: დავით ხახალეი...
4,1992_Summer_Olympics,Lead,David_Khakhaleishvili,47889,Q8488,813041,Q155514,1992 Summer Olympics,Lead,2835,2949,He won the gold medal at the 1992 Summer Olym...,"He won the gold medal at the <a rel=""mw:WikiL...",2805,2950,9058,0.0.0,David Khakhaleishvili (Georgian: დავით ხახალეი...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2045439,San_Diego_County,Lead,Escondido%2C_California,298599,Q108143,650261,Q372454,San Diego County,Lead,5259,5405,"Created in 1888, it is one of the oldest citi...","Created in 1888, it is one of the oldest citi...",5207,5406,53507,0.0.0,Escondido (/ˌɛskənˈdiːdoʊ/ ES-kən-DEE-doh) is ...
2045440,Ice_hockey,Lead,Milan_Kajkl,9451,Q41466,694013,Q1386108,ice hockey,Lead,9428,9512,"\nMilan Kajkl (May 14, 1950 – January 18, 2014...","\n<p id=""mwBA""><b id=""mwBQ"">Milan Kajkl</b> (M...",8997,9520,23054,0.0.0,(1950-05-14)\n1950-05-14\n(2014-01-18)\n \n \n...
2045441,Plze%C5%88,Lead,Milan_Kajkl,160752,Q43453,694013,Q1386108,Plzeň,Lead,9539,9608,"Kajkl was born in Plzeň, Czechoslovakia.","Kajkl was born in <a rel=""mw:WikiLink"" href=""...",9520,9707,23054,0.0.0,(1950-05-14)\n1950-05-14\n(2014-01-18)\n \n \n...
2045442,Czechoslovakia,Lead,Milan_Kajkl,22914,Q33946,694013,Q1386108,Czechoslovakia,Lead,9610,9706,"Kajkl was born in Plzeň, Czechoslovakia.","Kajkl was born in <a rel=""mw:WikiLink"" href=""...",9520,9707,23054,0.0.0,(1950-05-14)\n1950-05-14\n(2014-01-18)\n \n \n...


In [6]:
dfs = []
for file in redirect_files:
    dfs.append(pd.read_parquet(file))
df_redirects = pd.concat(dfs)
df_redirects

Unnamed: 0_level_0,redirect
title,Unnamed: 1_level_1
Albigensian,Catharism
American_Units_Of_Measurement,United_States_customary_units
As,AS
Animalia,Animal
Bootstrap,Boot
...,...
Hol,HOL
Voss_%28village%29,Vossevangen
Grandmaster_Sexay,Brian_Christopher
Perfume_%28J-Pop%29,Perfume_%28Japanese_band%29


# Analyze Pages (Nodes)

What is the total number of pages?

In [7]:
len(df_pages)

240247

What is the total number of redirects?

In [8]:
len(df_redirects)

93857

What is the number of pages with missing QID?

In [9]:
len(df_pages[df_pages['QID'].isna()])

184

What is the number of pages with missing HTML?

In [10]:
len(df_pages[(df_pages['HTML'].isna()) | (df_pages['HTML'] == '')])

726

What is the number of pages without a lead paragraph?

In [11]:
len(df_pages[(df_pages['lead_paragraph'].isna()) | (df_pages['lead_paragraph'] == '')])

1406

What is the number of good pages if we exclude all the faulty ones?

In [12]:
len(df_pages[(~df_pages['QID'].isna()) & (~df_pages['HTML'].isna()) & (~df_pages['lead_paragraph'].isna()) & (df_pages['HTML'] != '') & (df_pages['lead_paragraph'] != '')])

238680

Save good data

In [13]:
for file in page_files:
    df = pd.read_parquet(file)
    df = df[(~df['QID'].isna()) & (~df['HTML'].isna()) & (~df['lead_paragraph'].isna()) & (df['HTML'] != '') & (df['lead_paragraph'] != '')]
    df.to_parquet(file.replace('pages', 'good_pages'))

# Analyze Links

## Auxiliary Data Structures

In [14]:
# Pages with no html
no_html = df_pages[(df_pages['HTML'].isna()) | (df_pages['HTML'] == '')]['title'].tolist()
no_html = set(no_html)

In [15]:
# Pages with no lead paragraph
no_lead = df_pages[(df_pages['lead_paragraph'].isna()) | (df_pages['lead_paragraph'] == '')]['title'].tolist()
no_lead = set(no_lead)

## Results

How many total links do we have?

In [16]:
len(df_links)

2756089

Of all the links we have, how many couldn't be parsed (e.g., target page not in storage)?

In [17]:
len(df_links[df_links['target_ID'].isna()])

636

What is the number of links with source pages without QID?

In [18]:
len(df_links[df_links['source_QID'].isna()])

1633

What is the number of links with target pages without QID?

In [19]:
len(df_links[df_links['target_QID'].isna()])

868

What is the number of links with target pages without HTML?

In [20]:
len(df_links[df_links['target_title'].isin(no_html)])

3367

What is the number of links with target pages without lead paragraph?

In [21]:
len(df_links[df_links['target_title'].isin(no_lead)])

40545

What is the number of links with source pages without lead paragraph?

In [22]:
len(df_links[df_links['source_title'].isin(no_lead)])

15209

What is the number of links without context?

In [25]:
len(df_links[df_links['context'].isna()])

11726

What is the number of good links?

In [23]:
len(df_links[(~df_links['target_ID'].isna()) & (~df_links['source_QID'].isna()) & (~df_links['target_QID'].isna()) & (~df_links['target_title'].isin(no_html)) & (~df_links['target_title'].isin(no_lead)) & (~df_links['source_title'].isin(no_lead))])

2698885

Save good links

In [24]:
for file in link_files:
    df = pd.read_parquet(file)
    df = df[(~df['target_ID'].isna()) & (~df['source_QID'].isna()) & (~df['target_QID'].isna()) & (~df['target_title'].isin(no_html)) & (~df['target_title'].isin(no_lead)) & (~df['source_title'].isin(no_lead))]
    df.to_parquet(file.replace('links', 'good_links'))

In [29]:
for i in range(10):
    sample = df_links.sample()
    print('SOURCE', sample['source_title'].item())
    print('TARGET', sample['target_title'].item())
    print('CONTEXT', sample['context'].item())
    print('SENTENCE', sample['sentence'].item())
    print('###############')

SOURCE Garry_McCarthy
TARGET Chicago_Police_Department
CONTEXT Garry Francis McCarthy

 (1959-05-04) 
1959-05-04
 (age 64)
 
The Bronx, New York
Garry Francis McCarthy (born May 4, 1959) an American police officer. [1]McCarthy was born in The Bronx, New York.McCarthy joined the New York City Police Department in 1981 at age 22. He rose through the ranks and became Deputy Commissioner of Operations in 2000.In 2006, McCarthy left his position with the New York Police Department to take over the Police Department of Newark, New Jersey.McCarthy was hired by Mayor Rahm Emanuel to take over the Chicago Police Department shortly after Emanuel's election in early 2011.  On December 1, 2015, the mayor asked for McCarthy's resignation.  He said this was because  public trust in the leadership of the city's police department had been shaken and that McCarthy had become "an issue" and "a distraction." [2]On March 21, 2018, McCarthy announced his candidacy for Mayor of Chicago in the 2019 election.

In [30]:
df_pages[df_pages['title'] == 'Garry_McCarthy']['HTML'].item()

'a'

In [31]:
for page in page_files:
    df = pd.read_parquet(page)
    print(df[df['title'] == 'Garry_McCarthy']['HTML'].item())


<!DOCTYPE html>
<html prefix="dc: http://purl.org/dc/terms/ mw: http://mediawiki.org/rdf/" about="https://simple.wikipedia.org/wiki/Special:Redirect/revision/8939102"><head prefix="mwr: https://simple.wikipedia.org/wiki/Special:Redirect/"><meta property="mw:TimeUuid" content="f3660b30-22da-11ee-a86f-cfe260082712"/><meta charset="utf-8"/><meta property="mw:pageId" content="458479"/><meta property="mw:pageNamespace" content="0"/><link rel="dc:replaces" resource="mwr:revision/8041653"/><meta property="mw:revisionSHA1" content="ad85a981407b5db3a1125c615354976793882522"/><meta property="dc:modified" content="2023-07-15T06:43:46.000Z"/><meta property="mw:htmlVersion" content="2.8.0"/><meta property="mw:html:version" content="2.8.0"/><link rel="dc:isVersionOf" href="//simple.wikipedia.org/wiki/Garry_McCarthy"/><base href="//simple.wikipedia.org/wiki/"/><title>Garry McCarthy</title><meta property="mw:moduleStyles" content="ext.cite.style|ext.cite.styles"/><link rel="stylesheet" href="/w/load.p

ValueError: can only convert an array of size 1 to a Python scalar