docker run -d --name sparkbook -p 8881:8888 -v "$PWD":/home/jovyan/work jupyter/pyspark-notebook start.sh jupyter lab --LabApp.token=''

m5.8xlarge  128GiB  
r5.4xlarge  128GiB  
x1e.xlarge  122GiB  
x1.16xlarge 976GiB  
i3.4xlarge  122GiB

In [1]:
import requests
from bs4 import BeautifulSoup
import pyspark as ps
import numpy as np
import time

# URL -> response body (bytes). Module-level so repeated calls within the same
# Python process reuse earlier downloads.
response_cache = {}

def http_get(url):
    """Return the body of an HTTP GET for ``url``, memoised in ``response_cache``.

    Parameters
    ----------
    url : str
        Absolute URL to fetch.

    Returns
    -------
    bytes
        ``response.content`` of the request, taken from the cache when the URL
        has been fetched before.
    """
    # Membership test rather than truthiness: an empty body (b'') is a valid
    # cached value and must not trigger a second download.
    if url in response_cache:
        return response_cache[url]

    response = requests.request(url=url, method="GET")
    response_cache[url] = response.content

    return response_cache[url]

# Get (or reuse) a SparkSession with master 'local[8]', registered under the
# application name 'lecture'.
spark = (ps.sql.SparkSession
         .builder
         .master('local[8]')
         .appName('lecture')
         .getOrCreate())

# SparkContext handle; the cells below use it for the RDD API (sc.parallelize).
sc = spark.sparkContext

In [None]:
Holmes_County,_Ohio

In [416]:
# Wikipedia page listing all US counties; scraped by wikipedia_counties_titles().
counties_list_url = "https://en.wikipedia.org/wiki/List_of_United_States_counties_and_county_equivalents"

def wikipedia_export_get(title):
    """Download the ``Special:Export`` XML for a Wikipedia page, following redirects.

    Parameters
    ----------
    title : str
        Page title as it appears in a ``/wiki/<title>`` URL.

    Returns
    -------
    bytes
        Export XML of the page. If the export contains a ``<redirect title=...>``
        element, the redirect target is fetched instead (recursively).
    """
    xml = http_get("https://en.wikipedia.org/wiki/Special:Export/" + title)

    if b'<redirect title' not in xml:
        return xml

    # Pin the parser: without an explicit one BeautifulSoup picks whichever
    # parser library happens to be installed, so results (and a
    # GuessedAtParserWarning) vary between environments.
    redirect_target = BeautifulSoup(xml, "html.parser").select('redirect')[0]['title']
    return wikipedia_export_get(title=redirect_target)

def wikipedia_standard_url(title):
    """Build the regular article URL for a Wikipedia page title.

    The title is appended verbatim; no URL encoding is applied.
    """
    article_base = "https://en.wikipedia.org/wiki/"
    return article_base + title

# def wikipedia_standard_url_encode(title):
#     import urllib.parse
#     return "https://en.wikipedia.org/wiki/" + urllib.parse.quote(title)

def wikipedia_counties_titles():
    """Scrape the page titles of all counties from the Wikipedia county list.

    Fetches ``counties_list_url`` (through the ``http_get`` cache), prints the
    table caption as a sanity check, and collects the first link found in the
    ``td`` cells of each row of the sortable wikitable.

    Returns
    -------
    list of str
        One entry per table row that has a link: the link's ``href`` with the
        leading ``/wiki/`` removed (e.g. ``'Garvin_County,_Oklahoma'``).
    """
    # Pin the parser so the result does not depend on which optional parser
    # libraries are installed alongside BeautifulSoup.
    soup = BeautifulSoup(http_get(counties_list_url), "html.parser")
    print(soup.select('.wikitable.sortable caption big'))

    rows = soup.select('.wikitable.sortable tbody tr')

    # Assumes every href starts with '/wiki/'; that many characters are cut off.
    prefix_length = len('/wiki/')

    titles = []
    for row in rows:
        # Only the first anchor inside the row's <td> cells is used; rows
        # without one contribute nothing. Extending in place keeps this linear
        # in the number of rows (sum(lists, []) re-copies the list every step).
        first_anchor = row.select('td a')[:1]
        titles.extend(a['href'][prefix_length:] for a in first_anchor)

    return titles
    
def wikipedia_communities_subheadings(text):
    """List the level-3 headings inside the ``==Communities==`` section of wikitext.

    The section is the text between a ``==Communities==`` heading and the next
    level-2 heading. If there is no such heading, or no level-2 heading follows
    it, an empty list is returned.

    Parameters
    ----------
    text : str
        Raw wikitext of an article.

    Returns
    -------
    list of str
        Stripped text found between ``===`` markers in the section.
    """
    import re

    section = re.search(
        r"(^==\s?Communities\s?==)(.+?)(^==[^=].+?[^=]==)",
        text,
        re.MULTILINE | re.DOTALL,
    )
    if section is None:
        return []

    # Group 2 is the section body between the two level-2 headings.
    section_body = section.group(2)
    return [heading.strip() for heading in re.findall(r"===(.+)===", section_body)]

# Time one call of wikipedia_counties_titles(). http_get memoises responses, so
# re-running this cell in the same kernel skips the download.
time1 = time.time()
wikipedia_counties_titles()
# Last expression of the cell: shown as the cell output (elapsed seconds).
f'time.{time.time() - time1}'

[<big>The 3,243 counties and county equivalents of the United States<sup class="reference" id="cite_ref-14"><a href="#cite_note-14">[c]</a></sup></big>]


'time.2.3295090198516846'

In [476]:
def wikipedia_communities_extract(text):
    """Extract community links from the communities-like section of wikitext.

    Four level-2 headings are tried: "Communities", "Municipalities",
    "Cities and communities" and "Community". A section only counts when
    another level-2 heading follows it. The heading with the most matches is
    used, with ties going to the earlier heading in that order, and only its
    first match is analysed.

    Parameters
    ----------
    text : str
        Raw wikitext of an article.

    Returns
    -------
    list or tuple
        ``[]`` when none of the headings matched. Otherwise a 5-tuple:

        0. stripped level-3 subheadings inside the section,
        1. link targets of bulleted piped links ``* [[target|label]]``,
        2. set of labels that appear in exactly one of: bulleted piped links,
           bulleted piped links whose target contains a comma,
        3. targets of bulleted links without a pipe ``* [[target]]``,
        4. names of all the headings that matched.
    """
    import re

    section_flags = re.MULTILINE | re.DOTALL
    section_patterns = (
        ("Communities", r"(^==\s?Communities\s?==)(.+?)(^==[^=].+?[^=]==)"),
        ("Municipalities", r"(^==\s?Municipalities\s?==)(.+?)(^==[^=].+?[^=]==)"),
        ("Cities and communities", r"(^==\s?Cities and communities\s?==)(.+?)(^==[^=].+?[^=]==)"),
        ("Community", r"(^==\s?Community\s?==)(.+?)(^==[^=].+?[^=]==)"),
    )

    types = []
    best_matches = []
    for heading, pattern in section_patterns:
        matches = re.findall(pattern, text, section_flags)
        if not matches:
            continue
        types.append(heading)
        # Strictly greater: on a tie the earlier heading keeps priority.
        if len(matches) > len(best_matches):
            best_matches = matches

    if not best_matches:
        return []

    # Each match is (heading, body, next heading); only the body is analysed.
    section_body = best_matches[0][1]

    communities_subheadings = [
        heading.strip() for heading in re.findall(r"===(.+)===", section_body)
    ]

    # Bulleted links of the form "* [[Name, State|Label]]".
    comma_links = re.findall(r"\*\s*\[\[(.+?),(.+?)\|(.+?)\]\]", section_body)
    # Bulleted links of the form "* [[Target|Label]]".
    piped_links = re.findall(r"\*\s*\[\[(.+?)\|(.+?)\]\]", section_body)
    # Bulleted links without a pipe: "* [[Target]]".
    communities_untitled = re.findall(r"\*\s*\[\[([^|]+?)\]\]", section_body)

    comma_labels = {label for _, _, label in comma_links}
    piped_labels = {label for _, label in piped_links}
    # Labels present in one set but not the other (symmetric difference).
    difference = piped_labels ^ comma_labels

    link_targets = [target for target, _ in piped_links]

    return communities_subheadings, link_targets, difference, communities_untitled, types

In [478]:
# Spot-check the extractor on a single county article.
title = 'Garvin_County,_Oklahoma'
print(wikipedia_standard_url(title))
# Parser pinned so the <text> lookup does not depend on which optional parser
# libraries are installed alongside BeautifulSoup.
text = BeautifulSoup(wikipedia_export_get(title), "html.parser").select('text')[0].getText()
# print(text)
wikipedia_communities_extract(text)

https://en.wikipedia.org/wiki/Garvin_County,_Oklahoma


([],
 ['Antioch, Oklahoma',
  'Elmore City, Oklahoma',
  'Erin Springs, Oklahoma',
  'Fort Arbuckle (Oklahoma)',
  'Foster, Oklahoma',
  'Hennepin, Oklahoma',
  'Hoover, Oklahoma',
  'Katie, Oklahoma',
  'Lindsay, Oklahoma',
  'Maysville, Oklahoma',
  'Paoli, Oklahoma',
  'Pauls Valley, Oklahoma',
  'Pernell, Oklahoma',
  'Purdy, Oklahoma',
  'Stratford, Oklahoma',
  'Tussy, Oklahoma',
  'Wallville, Oklahoma',
  'Wynnewood, Oklahoma'],
 {'Fort Arbuckle'},
 [],
 ['Communities'])

---

### Init!

In [134]:
%%time

# NOTE: despite its name, `twenty` holds every title returned by
# wikipedia_counties_titles(), not a 20-item sample.
twenty = wikipedia_counties_titles()

# RDD of article wikitext, one element per county title. map/cache are lazy, so
# nothing is downloaded until an action runs on this RDD. The parser is pinned
# so every worker parses the export XML the same way.
counties_export_text = (sc.parallelize(twenty)
                          .map(wikipedia_export_get)
                          .map(lambda export: BeautifulSoup(export, "html.parser").select('text')[0].getText())
                          .cache())

[<big>The 3,243 counties and county equivalents of the United States<sup class="reference" id="cite_ref-14"><a href="#cite_note-14">[c]</a></sup></big>]
CPU times: user 2.07 s, sys: 17 ms, total: 2.09 s
Wall time: 2.15 s


In [480]:
# Per county: the list of piped link targets (element 1 of the extractor's
# result). Counties where no section matched come back as [] and are dropped.
communities_all = (counties_export_text.map(wikipedia_communities_extract)
                                       .filter(lambda x: x != [])
                                       .map(lambda x: x[1]))

# Scratch filters from an earlier version of this cell; `out` is not defined in
# the cells shown here.
# out = [o for o in out if o[0] == tuple() and ',_Indiana' in o[1]]
# out = [o for o in out if o[0] != tuple() and ',_Puerto_Rico' not in o[1]]
# out = [o for o in out if o[0] != tuple() and ',_Indiana' in o[1]]
# out = [o for o in out if o[0] != tuple() and ',_Virginia' in o[1]]
# out = [o for o in out if o[0] != tuple() and ',_Alaska' in o[1]]

# Union of all counties' link targets. Sorted so the list (and the file written
# from it) has the same order on every run; list(set) order depends on string
# hashing and can differ between kernel sessions.
communities_all_1 = sorted(communities_all.map(set).reduce(set.union))

In [482]:
# Persist one community title per line. The encoding is explicit because the
# scraped titles may contain non-ASCII characters and open() otherwise falls
# back to the locale's default encoding.
with open('communities_all.txt', 'w', encoding='utf-8') as f:
    f.write("\n".join(communities_all_1))

In [486]:
import html

from IPython.display import HTML

# Render 250 randomly drawn community titles (np.random.choice samples with
# replacement) as links. Titles come from scraped wikitext, so both the href
# and the link text are HTML-escaped before being interpolated into markup.
HTML('<br>'.join([
    f'<a href="{html.escape(wikipedia_standard_url(title), quote=True)}">{html.escape(title)}</a>'
    for title in np.random.choice(communities_all_1, 250)
]))

---
#### Newtypes

In [519]:
def test_1(x):
    with open('communities_all.txt') as f:
        return len(f.readlines()) + x

# Apply test_1 to 0..104 through Spark and collect the results; every element
# triggers a fresh read of communities_all.txt inside test_1.
sc.parallelize(range(105)).map(test_1).collect()

[71209,
 71210,
 71211,
 71212,
 71213,
 71214,
 71215,
 71216,
 71217,
 71218,
 71219,
 71220,
 71221,
 71222,
 71223,
 71224,
 71225,
 71226,
 71227,
 71228,
 71229,
 71230,
 71231,
 71232,
 71233,
 71234,
 71235,
 71236,
 71237,
 71238,
 71239,
 71240,
 71241,
 71242,
 71243,
 71244,
 71245,
 71246,
 71247,
 71248,
 71249,
 71250,
 71251,
 71252,
 71253,
 71254,
 71255,
 71256,
 71257,
 71258,
 71259,
 71260,
 71261,
 71262,
 71263,
 71264,
 71265,
 71266,
 71267,
 71268,
 71269,
 71270,
 71271,
 71272,
 71273,
 71274,
 71275,
 71276,
 71277,
 71278,
 71279,
 71280,
 71281,
 71282,
 71283,
 71284,
 71285,
 71286,
 71287,
 71288,
 71289,
 71290,
 71291,
 71292,
 71293,
 71294,
 71295,
 71296,
 71297,
 71298,
 71299,
 71300,
 71301,
 71302,
 71303,
 71304,
 71305,
 71306,
 71307,
 71308,
 71309,
 71310,
 71311,
 71312,
 71313]

---

#### Investigations

In [514]:
# Dump the parsed export XML of one randomly chosen community page. The parser
# is pinned so the printed tree does not depend on which optional parser
# libraries are installed alongside BeautifulSoup.
print(BeautifulSoup(wikipedia_export_get(np.random.choice(communities_all_1)), "html.parser"))

<mediawiki version="0.10" xml:lang="en" xmlns="http://www.mediawiki.org/xml/export-0.10/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemalocation="http://www.mediawiki.org/xml/export-0.10/ http://www.mediawiki.org/xml/export-0.10.xsd">
<siteinfo>
<sitename>Wikipedia</sitename>
<dbname>enwiki</dbname>
<base/>https://en.wikipedia.org/wiki/Main_Page
    <generator>MediaWiki 1.35.0-wmf.41</generator>
<case>first-letter</case>
<namespaces>
<namespace case="first-letter" key="-2">Media</namespace>
<namespace case="first-letter" key="-1">Special</namespace>
<namespace case="first-letter" key="0"></namespace>
<namespace case="first-letter" key="1">Talk</namespace>
<namespace case="first-letter" key="2">User</namespace>
<namespace case="first-letter" key="3">User talk</namespace>
<namespace case="first-letter" key="4">Wikipedia</namespace>
<namespace case="first-letter" key="5">Wikipedia talk</namespace>
<namespace case="first-letter" key="6">File</namespace>
<namespace case="f