In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell executes, so edits to the project sources are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from src.data_loader import *
import pandas as pd

## What articles do we have in the data?

In [3]:
# Instantiate the project's HTML parser (presumably provided by the star import
# from src.data_loader) and report how many article URLs it exposes.
# NOTE(review): how `article_URLs` is populated is not visible here — confirm in src.data_loader.
parser = htmlParser()
print(f'There are {len(parser.article_URLs)} valid articles')

There are 5232 valid articles


In [4]:
# Article names available as HTML files in the data.
df_html_articles = parser.article_names
# Article names used in the paths-and-graph data.
df_article_names = read_articles()

# Inner join on the shared "article" key keeps only names present in both sources.
intersect = pd.merge(
    left=df_html_articles,
    right=df_article_names,
    on="article",
    how='inner',
)
print(f'There are {len(df_article_names)} articles in the paths-and-graph data and {len(intersect)} of those are in the html articles (Oof).')

There are 4604 articles in the paths-and-graph data and 4604 of those are in the html articles (Oof).


In [5]:
# Boolean mask: True where an HTML article also occurs in the paths-and-graph data.
is_in_paths_and_graph = df_html_articles.isin(df_article_names)
# HTML articles that have no counterpart in the paths-and-graph data.
difference = df_html_articles[~is_in_paths_and_graph]
print(f'This means there are {len(difference)} articles more in the html data, such as "{difference.iloc[0]}", "{difference.iloc[5]}" or "{difference.iloc[10]}".')

This means there are 628 articles more in the html data, such as "2006 Autumn Newsletter Focus On Bethlehem", "About Child Sponsorship" or "Aids Africa".


## Parsing
Example of parsing a single article. The individual links are also stored in the parsed result, but the overview below only shows their counts.

In [6]:
# Parse one example article and print its overview.
# Index 266 is the article shown in the recorded output ("Amazon Rainforest");
# the original note names index 498 (Azerbaijan) as an alternative example.
example_url = parser.article_URLs[266]
parsed = parser.parse_html_article(example_url)
# Only show the overview when the parser returned a truthy result.
if parsed:
    parser.get_overview(parsed)

Page Overview
-------------
Title: Amazon Rainforest
Total Words: 1554
Total Links: 33

Abstract Overview
-----------------
Abstract Length (words): 156
Abstract Links: 14

Categories and Subcategories Overview
-------------------------------------
Category 1: Etymology
 - Words: 53
 - Links: 3

Category 2: Biodiversity
 - Words: 243
 - Links: 8

Category 3: Deforestation
 - Words: 261
 - Links: 1

Category 4: Carbon dynamics
 - Words: 155
 - Links: 4

Category 5: Conservation
 - Words: 251
 - Links: 1

Category 6: Response to climate change
 - Words: 410
 - Links: 1
   Subcategory 1: Impact of Amazon drought
   - Words: 162
   - Links: 0

Category 7: Video
 - Words: 14
 - Links: 0

Tables Overview
---------------


In [None]:
# Parse only the HTML articles that are actually used in the paths-and-graph data.
#
# A vectorised `isin` membership test selects the matching rows in one pass.
# The previous list comprehension evaluated `name in <ndarray>` for every row,
# i.e. it compared each HTML article against every paths-and-graph article.
is_in_paths_and_graph = df_html_articles.isin(df_article_names.values)
# Index labels of the matching articles, as a plain Python list.
indices_only_in_paths_and_graph = df_html_articles[is_in_paths_and_graph].index.tolist()
parser.parse_selection(indices_only_in_paths_and_graph)
# Alternative: parser.parse_all() # to parse everything

In [None]:
# The parser can persist its state to a pickle file ...
# NOTE(review): presumably this stores the parsed articles so that parsing need
# not be repeated; target path and contents are defined in src.data_loader — confirm.

parser.save_pickle()

In [7]:
# ... and loading: a fresh parser instance restores previously pickled results.
# NOTE(review): presumably backed by `pickle`, which can execute arbitrary code
# while loading — only load pickle files produced by this project.

parser2 = htmlParser()
parser2.load_pickle()

# Preview a single parsed article instead of dumping the whole dictionary:
# `parsed_articles` holds one large nested entry per article, and displaying
# all of them floods the notebook output.
dict(list(parser2.parsed_articles.items())[:1])

{'Áedán mac Gabráin': {'title': 'Áedán mac Gabráin',
  'total_words': 1841,
  'total_links': ['../../wp/d/D%25C3%25A1l_Riata.htm',
   '../../wp/m/Monarchy.htm',
   '../../wp/s/Scotland.htm',
   '../../wp/i/Ireland.htm',
   '../../wp/c/Columba.htm',
   '../../wp/i/Ireland.htm',
   '../../wp/g/Great_Britain.htm',
   '../../wp/o/Orkney.htm',
   '../../wp/i/Isle_of_Man.htm',
   '../../wp/b/Bede.htm',
   '../../wp/p/Picts.htm',
   '../../wp/w/Wales.htm'],
  'abstract_length': 169,
  'abstract_links': ['../../wp/d/D%25C3%25A1l_Riata.htm',
   '../../wp/m/Monarchy.htm',
   '../../wp/s/Scotland.htm',
   '../../wp/i/Ireland.htm',
   '../../wp/c/Columba.htm',
   '../../wp/i/Ireland.htm',
   '../../wp/g/Great_Britain.htm',
   '../../wp/o/Orkney.htm',
   '../../wp/i/Isle_of_Man.htm',
   '../../wp/b/Bede.htm'],
  'categories_data': [{'name': 'Background',
    'num_words': 312,
    'h2_links': [],
    'subcategories': []},
   {'name': 'Neighbours',
    'num_words': 225,
    'h2_links': ['../../wp/p/P

## Analysis

In [9]:
# Build the per-article statistics table from the reloaded parser. As the output
# below shows, the call also prints the articles that could not be parsed.
df_html_stats = parser2.get_df_html_stats()

Articles that could not be parsed:

Directdebit
Donation
Friend Directdebit
Sponsorship Directdebit
Wowpurchase


In [11]:
import ipywidgets as widgets
from IPython.display import display

# Dropdown choosing which column of df_html_stats the ranking is sorted by.
sort_column = widgets.Dropdown(
    options=df_html_stats.columns,
    value='total_words',  # initial selection
    description='Sort by:'
)

# Slider choosing how many rows to show at each end of the ranking (1 to 20, initially 5).
n_slider = widgets.IntSlider(
    value=5,
    min=1,
    max=20,
    step=1,
    description='Number (n):'
)

# Display function to update table based on widget values
def display_sorted(n, sort_by):
    """Show the first and last ``n`` rows of ``df_html_stats`` ranked by one column.

    Args:
        n: Number of rows to show at each end of the ranking.
        sort_by: Name of the ``df_html_stats`` column to sort by (descending).
    """
    # Rank once, highest value first; head/tail then give the two extremes.
    ranked = df_html_stats.sort_values(by=sort_by, ascending=False)

    print(f"\nTop {n} Articles by {sort_by}:")
    display(ranked.head(n))

    print(f"\nBottom {n} Articles by {sort_by}:")
    display(ranked.tail(n))

# Link widgets to display function: `interactive` calls display_sorted with the
# current slider and dropdown values. As the cell's last expression, the
# resulting widget container is rendered as the cell output.
widgets.interactive(display_sorted, n=n_slider, sort_by=sort_column)


interactive(children=(IntSlider(value=5, description='Number (n):', max=20, min=1), Dropdown(description='Sort…