In [52]:
import collections
from itertools import compress
import json    #Used to pretty-print dictionaries
import re

from bs4 import BeautifulSoup
import requests

In [53]:
# URL prefix shared by the template pages; a page name from STATE_LIST is
# appended to it below.
WIKI_BASE = "https://en.wikipedia.org/wiki/Template:"

# Page names (underscores instead of spaces) in alphabetical order. Georgia
# carries the "_(U.S._state)" suffix in its page name.
STATE_LIST = [
    "Alabama", "Alaska", "Arizona", "Arkansas",
    "California", "Colorado", "Connecticut",
    "Delaware",
    "Florida",
    "Georgia_(U.S._state)",
    "Hawaii",
    "Idaho", "Illinois", "Indiana", "Iowa",
    "Kansas", "Kentucky",
    "Louisiana",
    "Maine", "Maryland", "Massachusetts", "Michigan", "Minnesota",
    "Mississippi", "Missouri", "Montana",
    "Nebraska", "Nevada", "New_Hampshire", "New_Jersey", "New_Mexico",
    "New_York", "North_Carolina", "North_Dakota",
    "Ohio", "Oklahoma", "Oregon",
    "Pennsylvania",
    "Rhode_Island",
    "South_Carolina", "South_Dakota",
    "Tennessee", "Texas",
    "Utah",
    "Vermont", "Virginia",
    "Washington", "West_Virginia", "Wisconsin", "Wyoming",
]

# One template URL per entry of STATE_LIST, in the same order.
state_template_urls = [
    "".join((WIKI_BASE, state_name.strip())) for state_name in STATE_LIST
]

In [54]:
# The template is the first <div class="navbox"> inside the first
# <div class="mw-parser-output"> of the page.
def get_state_template(state_template_url, timeout=30):
    """Download a state template page and return its navbox element.

    Args:
        state_template_url: URL of the template page to fetch.
        timeout: Seconds to wait for the server, forwarded to
            ``requests.get``. Without a timeout a stalled connection would
            block the notebook indefinitely.

    Returns:
        The first ``<div class="navbox">`` found inside the first
        ``<div class="mw-parser-output">``, or ``None`` when that container
        holds no navbox div.

    Raises:
        requests.exceptions.HTTPError: The server answered with a 4xx/5xx
            status, so there is no template to parse.
        requests.exceptions.Timeout: The server did not answer in time.
        AttributeError: The page has no ``mw-parser-output`` container.
    """
    raw_page = requests.get(state_template_url, timeout=timeout)
    # Fail here, with the HTTP status in the message, instead of parsing an
    # error page and failing later on a missing element.
    raw_page.raise_for_status()
    soup = BeautifulSoup(raw_page.text, "lxml")
    parser_output = soup.find("div", attrs={'class': 'mw-parser-output'})
    return parser_output.find("div", attrs={'class': 'navbox'})

# Download each template once, keyed by the URL it was fetched from.
state_templates = dict(
    (template_url, get_state_template(template_url))
    for template_url in state_template_urls
)

In [57]:
def get_links(table_row):
    """Collect the text of every linked list item in a template's table row.

    When the row contains a nested table, the direct ``<tr>`` children of the
    first such table are processed recursively and their results are
    concatenated. Otherwise every ``<li>`` in the row is inspected:

    * items without an ``<a>`` element are skipped (nothing is linked);
    * items whose link target contains ``redlink`` are skipped, which drops
      links to pages that don't exist;
    * for the remaining items the full text of the ``<li>`` is collected.

    Args:
        table_row: A parsed ``<tr>`` element exposing ``find``/``find_all``.

    Returns:
        A list of strings, in document order. Both branches return a list so
        the result can be iterated more than once.
    """
    nested_table = table_row.find("table")
    if nested_table is not None:
        state_links = []
        for tr in nested_table.find_all("tr", recursive=False):
            state_links.extend(get_links(tr))
        return state_links

    link_names = []
    for item in table_row.find_all("li"):
        anchor = item.a
        if anchor is None:
            # Plain-text entry: indexing ``item.a['href']`` would raise here.
            continue
        # ``get`` tolerates an anchor without an href; such an anchor is not
        # marked as a redlink, so it is kept.
        if 'redlink' in anchor.get('href', ''):
            continue
        link_names.append(item.text)
    return link_names

# defaultdict(list) lets a group heading or link text seen for the first time
# be appended to directly, without a KeyError or explicit initialisation.
group_dict = collections.defaultdict(list)
link_dict = collections.defaultdict(list)
for template_url, navbox in state_templates.items():
    # Only <tr> elements that are direct children of the navbox's first table
    # are considered, and the first two of them are skipped.
    # NOTE(review): presumably the skipped rows are title/header rows, and
    # every remaining row is assumed to contain a <th> -- confirm against the
    # page markup.
    all_rows = navbox.find("table").find_all("tr", recursive=False)

    for row in all_rows[2:]:
        # The heading text may contain blank lines; collapse each to a space.
        heading = re.sub("\n\n", " ", row.find("th").text)
        group_dict[heading].append(template_url)

        # One entry per collected link, so a link text gathers one URL for
        # every time it was found.
        for link_text in get_links(row):
            link_dict[link_text].append(template_url)

# Uncomment to see how many entries each group heading collected:
#print({k:len(v) for k,v in group_dict.items()})

In [58]:
# Show, as indented JSON, every link text that was recorded at least 30 times
# together with the number of times it was recorded.
frequent_link_counts = {
    link_text: len(template_urls)
    for link_text, template_urls in link_dict.items()
    if len(template_urls) >= 30
}
print(json.dumps(frequent_link_counts, indent=3))

{
   "Geography": 42,
   "People": 35,
   "Crime": 44,
   "Culture": 47,
   "Demographics": 49,
   "Economy": 49,
   "Education": 49,
   "Politics": 44,
   "Washington": 30,
   "History": 42,
   "Government": 35,
   "Tourist attractions": 40
}
