docker run -d --name sparkbook -p 8881:8888 -v "$PWD":/home/jovyan/work jupyter/pyspark-notebook start.sh jupyter lab --LabApp.token=''

m5.8xlarge  128GiB  
r5.4xlarge  128GiB  
x1e.xlarge  122GiB  
x1.16xlarge 976GiB  
i3.4xlarge  122GiB

In [1]:
import requests
from bs4 import BeautifulSoup
import pyspark as ps
import numpy as np
import time

response_cache = {}

def http_get(url):
    if response := response_cache.get(url):
        return response
    else:
        response = requests.request(url=url, method="GET")
        counties_list_html = response.content
        response_cache[url] = counties_list_html
        
        return response_cache[url]

spark = (ps.sql.SparkSession
         .builder
         .master('local[8]')
         .appName('lecture')
         .getOrCreate())

sc = spark.sparkContext

In [125]:
counties_list_url = "https://en.wikipedia.org/wiki/List_of_United_States_counties_and_county_equivalents"

def wikipedia_export_get(title):
    xml = http_get("https://en.wikipedia.org/wiki/Special:Export/" + title)
    
    if b'<redirect title' in xml:
        return wikipedia_export_get(title=BeautifulSoup(xml).select('redirect')[0]['title'])
    else:
        return xml

def wikipedia_standard_url(title):
    return "https://en.wikipedia.org/wiki/" + title

def wikipedia_counties_titles():
    soup = BeautifulSoup(http_get(counties_list_url))
    print(soup.select('.wikitable.sortable caption big'))
    
    rows = soup.select('.wikitable.sortable tbody tr')
    anchors = sum([row.select('td a')[:1] for row in rows], [])

    urls = [a['href'][len('/wiki/'):] for a in anchors]
    
    return urls
    
def wikipedia_communities_subheadings(text):
    import re

    regex = r"(^==Communities==)(.+?)(^==[^=].+?[^=]==)"
    post_communities = re.findall(regex, text, re.MULTILINE + re.DOTALL)
    
    if len(post_communities) == 0:
        return []

    # print(post_communities[0])
    post_communities = post_communities[0]
    
    regex = r"===(.+)==="
    res = [result.strip() for result in re.findall(regex, post_communities[1])]

    return res

time1 = time.time()
wikipedia_counties_titles()
f'time.{time.time() - time1}'

[<big>The 3,243 counties and county equivalents of the United States<sup class="reference" id="cite_ref-14"><a href="#cite_note-14">[c]</a></sup></big>]


'time.2.3139584064483643'

[<big>The 3,243 counties and county equivalents of the United States<sup class="reference" id="cite_ref-14"><a href="#cite_note-14">[c]</a></sup></big>]


---

In [134]:
%%time

twenty = wikipedia_counties_titles()

counties_export_text = (sc.parallelize(twenty)
                          .map(wikipedia_export_get)
                          .map(lambda export: BeautifulSoup(export).select('text')[0].getText())
                          .cache())

[<big>The 3,243 counties and county equivalents of the United States<sup class="reference" id="cite_ref-14"><a href="#cite_note-14">[c]</a></sup></big>]
CPU times: user 2.07 s, sys: 17 ms, total: 2.09 s
Wall time: 2.15 s


In [74]:
ls

Untitled1.ipynb


In [24]:
wikipedia_standard_url('Anchorage,_Alaska')

'https://en.wikipedia.org/wiki/Anchorage,_Alaska'

Count empty subheadings

In [140]:
counties_export_text.count()

3245

In [144]:
subs = counties_export_text.collect()
sum([set() == set(wikipedia_communities_subheadings(sub)) for title, sub in zip(twenty, subs)])

586

In [142]:
counties_export_text.take(1000)
sorted([(wikipedia_standard_url(title), set(wikipedia_communities_subheadings(sub))) for title, sub in zip(twenty, subs)], key=lambda x: x[1])

[('https://en.wikipedia.org/wiki/Anchorage,_Alaska', set()),
 ('https://en.wikipedia.org/wiki/Bristol_Bay_Borough,_Alaska', set()),
 ('https://en.wikipedia.org/wiki/Haines_Borough,_Alaska', set()),
 ('https://en.wikipedia.org/wiki/Juneau,_Alaska', set()),
 ('https://en.wikipedia.org/wiki/Sitka,_Alaska', set()),
 ('https://en.wikipedia.org/wiki/Skagway,_Alaska', set()),
 ('https://en.wikipedia.org/wiki/Wrangell,_Alaska', set()),
 ('https://en.wikipedia.org/wiki/Yakutat,_Alaska', set()),
 ('https://en.wikipedia.org/wiki/Eastern_District,_American_Samoa', set()),
 ('https://en.wikipedia.org/wiki/Manu%27a_District,_American_Samoa', set()),
 ('https://en.wikipedia.org/wiki/Rose_Atoll', set()),
 ('https://en.wikipedia.org/wiki/Swains_Island', set()),
 ('https://en.wikipedia.org/wiki/Western_District,_American_Samoa', set()),
 ('https://en.wikipedia.org/wiki/Navajo_County,_Arizona', set()),
 ('https://en.wikipedia.org/wiki/Cleveland_County,_Arkansas', set()),
 ('https://en.wikipedia.org/wiki/

---
#### Newtypes

---

#### Investigations

In [101]:
def wikipedia_communities_subheadings(text):
    import re

    regex = r"(^==Communities==)(.+?)(^==[^=].+?[^=]==)"
    post_communities = re.findall(regex, text, re.MULTILINE + re.DOTALL)
    
    print(post_communities)
    if len(post_communities) == 0:
        return []

    # print(post_communities[0])
    post_communities = post_communities[0]
    
    regex = r"===(.+)==="
    res = [result.strip() for result in re.findall(regex, post_communities[1])]

    return res

title = 'Prince_of_Wales_%E2%80%93_Hyder_Census_Area,_Alaska'
print(counties_export_text.take(100)[twenty.index(title)])
wikipedia_communities_subheadings(counties_export_text.take(100)[twenty.index(title)])

#REDIRECT [[Prince of Wales–Hyder Census Area, Alaska]]
{{R from move}}
[]


[]

In [124]:
def wikipedia_export_get(title):
    xml = http_get("https://en.wikipedia.org/wiki/Special:Export/" + title)
    
    if b'<redirect title' in xml:
        return wikipedia_export_get(title=BeautifulSoup(xml).select('redirect')[0]['title'])
    else:
        return xml
    
print(BeautifulSoup(wikipedia_export_get(title)))

<mediawiki version="0.10" xml:lang="en" xmlns="http://www.mediawiki.org/xml/export-0.10/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemalocation="http://www.mediawiki.org/xml/export-0.10/ http://www.mediawiki.org/xml/export-0.10.xsd">
<siteinfo>
<sitename>Wikipedia</sitename>
<dbname>enwiki</dbname>
<base/>https://en.wikipedia.org/wiki/Main_Page
    <generator>MediaWiki 1.35.0-wmf.41</generator>
<case>first-letter</case>
<namespaces>
<namespace case="first-letter" key="-2">Media</namespace>
<namespace case="first-letter" key="-1">Special</namespace>
<namespace case="first-letter" key="0"></namespace>
<namespace case="first-letter" key="1">Talk</namespace>
<namespace case="first-letter" key="2">User</namespace>
<namespace case="first-letter" key="3">User talk</namespace>
<namespace case="first-letter" key="4">Wikipedia</namespace>
<namespace case="first-letter" key="5">Wikipedia talk</namespace>
<namespace case="first-letter" key="6">File</namespace>
<namespace case="f