docker run -d --name sparkbook -p 8881:8888 -v "$PWD":/home/jovyan/work jupyter/pyspark-notebook start.sh jupyter lab --LabApp.token=''

In [1]:
import requests
from bs4 import BeautifulSoup
import pyspark as ps
import numpy as np
import time

# In-memory cache of raw response bodies, keyed by the requested URL.
response_cache = {}

def http_get(url):
    """Fetch ``url`` with an HTTP GET and return the raw response body.

    Bodies are memoised in the module-level ``response_cache`` so that
    re-running a cell does not download the same page again.

    Parameters
    ----------
    url : str
        Address to fetch.

    Returns
    -------
    The ``content`` attribute of the ``requests`` response (or the value
    already stored in ``response_cache`` for this URL).

    Notes
    -----
    The HTTP status is not checked, so error pages are cached like any
    other body.
    """
    # Membership test, not truthiness: an empty body (b"") is a legitimate
    # cached value and must not trigger another download.
    if url in response_cache:
        return response_cache[url]

    response = requests.request(url=url, method="GET")
    response_cache[url] = response.content

    return response_cache[url]

# Create (or reuse) a Spark session on the 'local[8]' master under the
# application name 'lecture'.
spark = ps.sql.SparkSession.builder.master('local[8]').appName('lecture').getOrCreate()

# Handle on the session's SparkContext; later cells use it to build RDDs.
sc = spark.sparkContext

In [2]:
# Wikipedia page listing US counties and county equivalents; it is the page
# scraped by wikipedia_counties_titles().
counties_list_url = "https://en.wikipedia.org/wiki/List_of_United_States_counties_and_county_equivalents"

def wikipedia_export_get(title):
    """Return the ``Special:Export`` XML for the Wikipedia page ``title``.

    When the downloaded XML contains a ``<redirect title=...>`` marker, the
    function calls itself with the redirect's ``title`` attribute, so the
    value returned is the export of the page at the end of the redirect chain.
    """
    export_xml = http_get("https://en.wikipedia.org/wiki/Special:Export/" + title)

    # Guard clause: a page without a redirect marker is returned as-is.
    if b'<redirect title' not in export_xml:
        return export_xml

    redirect_tag = BeautifulSoup(export_xml).select('redirect')[0]
    return wikipedia_export_get(title=redirect_tag['title'])

def wikipedia_standard_url(title):
    """Return the article URL for ``title``; the title is appended verbatim (no URL quoting)."""
    article_root = "https://en.wikipedia.org/wiki/"
    return article_root + title

# def wikipedia_standard_url_encode(title):
#     import urllib.parse
#     return "https://en.wikipedia.org/wiki/" + urllib.parse.quote(title)

def wikipedia_counties_titles():
    """Scrape the county list page and return one link target per table row.

    For every ``<tr>`` of the sortable wikitable, the first ``<a>`` found in
    its ``<td>`` cells is taken and the first ``len('/wiki/')`` characters of
    its ``href`` are dropped. Rows without such a link contribute nothing.
    The table caption is printed as a quick sanity check.
    """
    page = BeautifulSoup(http_get(counties_list_url))
    print(page.select('.wikitable.sortable caption big'))

    prefix_length = len('/wiki/')
    titles = []
    for table_row in page.select('.wikitable.sortable tbody tr'):
        # Slicing with [:1] keeps at most the first link of the row.
        for link in table_row.select('td a')[:1]:
            titles.append(link['href'][prefix_length:])

    return titles
    
def wikipedia_communities_subheadings(text):
    """List the ``===...===`` headings inside the "Communities" section of wikitext.

    The section is recognised only when a ``==Communities==`` heading is
    followed, further down, by another level-2 heading; the text between the
    two is searched. Returns the stripped heading texts, or ``[]`` when no
    such section is found.
    """
    import re

    section_pattern = r"(^==\s?Communities\s?==)(.+?)(^==[^=].+?[^=]==)"
    section_matches = re.findall(section_pattern, text, re.MULTILINE + re.DOTALL)

    if not section_matches:
        return []

    # Each match is (opening heading, section body, next level-2 heading);
    # only the body of the first match is inspected.
    _, section_body, _ = section_matches[0]

    return [heading.strip() for heading in re.findall(r"===(.+)===", section_body)]

# Time one call to wikipedia_counties_titles() using time.time() before and after.
time1 = time.time()
wikipedia_counties_titles()
# Last expression of the cell, so the elapsed-time string is shown as its output.
f'time.{time.time() - time1}'

[<big>The 3,243 counties and county equivalents of the United States<sup class="reference" id="cite_ref-14"><a href="#cite_note-14">[c]</a></sup></big>]


'time.3.2653725147247314'

In [None]:
def wikipedia_communities_extract(text):
    """Extract community links from the communities-like section of wikitext.

    Level-2 sections titled "Communities", "Municipalities",
    "Cities and communities" and "Community" are searched for, in that order.
    A section is recognised only when another level-2 heading follows it.
    The heading with the most matches is used (ties go to the earlier
    heading), and only its first match is analysed.

    Returns
    -------
    ``[]`` when none of the sections is found, otherwise a 5-tuple:

    0. stripped ``===...===`` headings inside the section;
    1. targets of every piped bullet link ``* [[target|label]]``;
    2. set of labels present in exactly one of: piped bullet links whose
       target part contains a comma, and all piped bullet links;
    3. targets of bullet links without a pipe, ``* [[target]]``;
    4. names of the section headings that were found in ``text``.
    """
    import re

    section_flags = re.MULTILINE + re.DOTALL
    candidate_headings = ("Communities", "Municipalities", "Cities and communities", "Community")

    types = []
    best_matches = []
    for heading in candidate_headings:
        pattern = r"(^==\s?" + heading + r"\s?==)(.+?)(^==[^=].+?[^=]==)"
        found = re.findall(pattern, text, section_flags)
        if not found:
            continue
        types.append(heading)
        # Strictly greater: on a tie the earlier heading keeps precedence.
        if len(found) > len(best_matches):
            best_matches = found

    if not best_matches:
        return []

    # Matches are (opening heading, body, next heading); use the first body.
    section_body = best_matches[0][1]

    communities_subheadings = [h.strip() for h in re.findall(r"===(.+)===", section_body)]

    comma_piped = re.findall(r"\*\s*\[\[(.+?),(.+?)\|(.+?)\]\]", section_body)
    piped = re.findall(r"\*\s*\[\[(.+?)\|(.+?)\]\]", section_body)
    communities_untitled = re.findall(r"\*\s*\[\[([^|]+?)\]\]", section_body)

    comma_labels = {label for _, _, label in comma_piped}
    piped_labels = {label for _, label in piped}
    difference = piped_labels.symmetric_difference(comma_labels)

    targets = [target for target, _ in piped]
    return communities_subheadings, targets, difference, communities_untitled, types

In [None]:
# Spot check on a single county article: print its URL, take the text of the
# first <text> element of its export XML, and run the extractor on that text.
title = 'Garvin_County,_Oklahoma'
print(wikipedia_standard_url(title))
text = BeautifulSoup(wikipedia_export_get(title)).select('text')[0].getText()
# print(text)
wikipedia_communities_extract(text)

---

### Init!

In [None]:
# NOTE(review): `twenty` is first assigned in the cell that follows this one,
# so on a fresh kernel this cell raises NameError unless that cell has been
# run beforehand — consider moving this cell below it.
wikipedia_export_get(twenty[15])

In [None]:
%%time

# NOTE(review): despite its name, `twenty` holds every title returned by
# wikipedia_counties_titles(), not twenty of them.
twenty = wikipedia_counties_titles()

# For each title: download the export XML, then keep the text of the first
# <text> element found in it. The resulting RDD is marked with cache().
# NOTE(review): http_get's response_cache is presumably populated inside the
# Spark worker processes here rather than in this kernel — confirm.
counties_export_text = (sc.parallelize(twenty)
                          .map(wikipedia_export_get)
                          .map(lambda export: BeautifulSoup(export).select('text')[0].getText())
                          .cache())

In [None]:
# Per county: run the extractor, drop the counties for which it returned []
# (no recognised section), and keep element 1 of the result tuple, i.e. the
# targets of the piped bullet links.
communities_all = (counties_export_text.map(wikipedia_communities_extract)
                                       .filter(lambda x: x != [])
                                       .map(lambda x: x[1]))

# out = [o for o in out if o[0] == tuple() and ',_Indiana' in o[1]]
# out = [o for o in out if o[0] != tuple() and ',_Puerto_Rico' not in o[1]]
# out = [o for o in out if o[0] != tuple() and ',_Indiana' in o[1]]
# out = [o for o in out if o[0] != tuple() and ',_Virginia' in o[1]]
# out = [o for o in out if o[0] != tuple() and ',_Alaska' in o[1]]

# Turn each county's list into a set and union them all, giving one list of
# distinct link targets.
communities_all_1 = list(communities_all.map(set).reduce(set.union))

In [None]:
# Number of distinct link targets collected across all counties.
len(communities_all_1)

In [None]:
# Persist the collected link targets, one per line.
# The encoding is pinned to UTF-8: the titles are scraped from Wikipedia and
# may contain non-ASCII characters, while open()'s default text encoding
# depends on the platform's locale.
with open('communities_all.txt', 'w', encoding='utf-8') as f:
    f.write("\n".join(communities_all_1))

In [None]:
import html

from IPython.display import HTML

# Show 250 randomly drawn link targets (np.random.choice's default sampling,
# so repeats are possible) as clickable links for manual inspection.
# Titles come from scraped wikitext, so both the href and the link text are
# HTML-escaped before being interpolated into the markup.
HTML('<br>'.join([f'<a href="{html.escape(wikipedia_standard_url(title))}">{html.escape(title)}</a>' for title in np.random.choice(communities_all_1, 250)]))

---
#### Newtypes

In [1]:
# Names of the 50 US states in alphabetical order, written with spaces
# (not the underscore form used in Wikipedia page titles elsewhere in this
# notebook).
states = [
    'Alabama',
    'Alaska',
    'Arizona',
    'Arkansas',
    'California',
    'Colorado',
    'Connecticut',
    'Delaware',
    'Florida',
    'Georgia',
    'Hawaii',
    'Idaho',
    'Illinois',
    'Indiana',
    'Iowa',
    'Kansas',
    'Kentucky',
    'Louisiana',
    'Maine',
    'Maryland',
    'Massachusetts',
    'Michigan',
    'Minnesota',
    'Mississippi',
    'Missouri',
    'Montana',
    'Nebraska',
    'Nevada',
    'New Hampshire',
    'New Jersey',
    'New Mexico',
    'New York',
    'North Carolina',
    'North Dakota',
    'Ohio',
    'Oklahoma',
    'Oregon',
    'Pennsylvania',
    'Rhode Island',
    'South Carolina',
    'South Dakota',
    'Tennessee',
    'Texas',
    'Utah',
    'Vermont',
    'Virginia',
    'Washington',
    'West Virginia',
    'Wisconsin',
    'Wyoming']

---

#### Investigations

In [None]:
| align=right | {{dts|1960|12|21}}
|-
! scope="row" | [[Atascadero, California|Atascadero]]
| City
| [[San Luis Obispo County, California|San Luis Obispo]]
| align=right | 28,310
| {{convert|25.64|sqmi|km2|disp=table|sortable=on}}
| align=right | {{dts|1979|07|02}}
|-
! scope="row" | [[Atherton, California|Atherton]]
| Town
| [[San Mateo County, California|San Mateo]]
| align=right | 6,914
| {{convert|5.02|sqmi|km2|disp=table|sortable=on}}
| align=right | {{dts|1923|09|12}}
|-
! scope="row" | [[Atwater, California|Atwater]]
| City
| [[Merced County, California|Merced]]
| align=right | 28,168
| {{convert|6.09|sqmi|km2|disp=table|sortable=on}}
| align=right | {{dts|1922|08|16}}
|-
! scope="row" style="background:#bfb;"| [[Auburn, California|Auburn]]{{dagger|alt=County seat}}
| City
| [[Placer County, California|Placer]]

In [None]:
|-
|[[Allard, California|Allard]]||[[Kern County, California|Kern]]
|-
|[[Alma, California|Alma]]||[[Santa Clara County, California|Santa Clara]]
|-

---
### Type C

In [755]:

def wikitable_extractor(column, table_index=0):
    """Build an extractor for links in one column of a sortable wikitable.

    The returned callable takes a parsed page, selects the
    ``table_index``-th ``table.wikitable.sortable`` on it and, for every
    ``/wiki/`` link inside ``td:nth-child(column)`` cells of that table,
    returns the ``href`` with its leading ``/wiki/`` removed.
    """
    def extract(soup):
        table = soup.select('table.wikitable.sortable')[table_index]
        cell_links = table.select(f'td:nth-child({column}) a[href^="/wiki/"]')
        prefix_length = len('/wiki/')
        return [link['href'][prefix_length:] for link in cell_links]

    return extract

def wikilist_extractor(table_index=0):
    """Build an extractor for links in one ``<ul>`` list of a page.

    The returned callable takes a parsed page, selects its
    ``table_index``-th ``ul`` element (the parameter indexes lists, not
    tables) and returns, for every ``/wiki/`` link inside its ``li`` items,
    the ``href`` with its leading ``/wiki/`` removed.
    """
    def extract(soup):
        chosen_list = soup.select('ul')[table_index]
        item_links = chosen_list.select('li a[href^="/wiki/"]')
        return [link['href'][len('/wiki/'):] for link in item_links]

    return extract

# Scrape plan: one (Wikipedia page title, extractor) pair per source page.
# Each extractor is called with the parsed page and returns a list of link
# targets (hrefs with the leading '/wiki/' removed).
# Entries that are commented out are not scraped.
# NOTE(review): the column numbers, list indices and slice bounds below look
# hand-tuned to each page's layout at the time of scraping — re-verify them
# before relying on a fresh run.
parameters = [
    ( 'List_of_cities_and_towns_in_Alabama', wikitable_extractor(column=1)),
    (      'List_of_ghost_towns_in_Alabama', wikitable_extractor(column=1)),
    
    (            'List_of_cities_in_Alaska', wikitable_extractor(column=1)),
    
    ( 'List_of_cities_and_towns_in_Arizona', wikitable_extractor(column=1)),
    (      'List_of_ghost_towns_in_Arizona', wikitable_extractor(column=1)),
    
    ('List_of_cities_and_towns_in_Arkansas', wikitable_extractor(column=2)),
    (     'List_of_ghost_towns_in_Arkansas', wikitable_extractor(column=1)),
    
    ('List_of_cities_and_towns_in_California', lambda soup: [a['href'][len('/wiki/'):] for a in soup.select('table.wikitable.sortable th[scope="row"] a[href^="/wiki/"]')]),
    (     'List_of_ghost_towns_in_California', wikitable_extractor(column=1)),
    
        
    ('List_of_cities_and_towns_in_Colorado', wikitable_extractor(column=1)),
    (     'List_of_ghost_towns_in_Colorado', wikitable_extractor(column=1)),
    
    (       'List_of_cities_in_Connecticut', wikitable_extractor(column=1)),
    (        'List_of_towns_in_Connecticut', wikitable_extractor(column=2)),
    (  'List_of_ghost_towns_in_Connecticut', wikilist_extractor()),
    
    (  'List_of_municipalities_in_Delaware', wikitable_extractor(column=2)),
    (     'List_of_ghost_towns_in_Delaware', wikilist_extractor()),

    (   'List_of_municipalities_in_Florida', wikitable_extractor(column=2)),
    (      'List_of_ghost_towns_in_Florida', wikitable_extractor(column=1)),
    
    ('List_of_municipalities_in_Georgia_(U.S._state)', wikitable_extractor(table_index=1, column=1)),
    ('List_of_ghost_towns_in_Georgia_(U.S._state)', wikilist_extractor()),
    
    (            'List_of_places_in_Hawaii', wikitable_extractor(column=2)),
    (       'List_of_ghost_towns_in_Hawaii', lambda soup: sum([wikilist_extractor(table_index=i)(soup) for i in range(1, 6)], [])),
    
    (             'List_of_cities_in_Idaho', wikitable_extractor(column=2)),
    (        'List_of_ghost_towns_in_Idaho', wikilist_extractor()),
    
    (  'List_of_municipalities_in_Illinois', wikitable_extractor(column=1)),
    (     'List_of_ghost_towns_in_Illinois', wikilist_extractor()),
    
    (           'List_of_cities_in_Indiana', wikitable_extractor(column=2)),
    (            'List_of_towns_in_Indiana', wikitable_extractor(column=1)),
    (      'List_of_ghost_towns_in_Indiana', wikilist_extractor()),
    
    (              'List_of_cities_in_Iowa', lambda soup: [a['href'][len('/wiki/'):] for a in soup.select('table.wikitable.sortable th[scope="row"] a[href^="/wiki/"]')]),
    (         'List_of_ghost_towns_in_Iowa', wikilist_extractor()),
    
    (            'List_of_cities_in_Kansas', lambda soup: sum([wikilist_extractor(table_index=i)(soup) for i in range(5, 5+25)], [])), # alphabetical, missing 'X'
    (       'List_of_ghost_towns_in_Kansas', wikitable_extractor(column=1)),
    
    (          'List_of_cities_in_Kentucky', wikitable_extractor(column=1)),
    (     'List_of_ghost_towns_in_Kentucky', wikilist_extractor()),
    
    ( 'List_of_municipalities_in_Louisiana', wikitable_extractor(column=1)),
    (    'List_of_ghost_towns_in_Louisiana', lambda soup: [a['href'][len('/wiki/'):] for a in soup.select('div')[2].select('ul li a[href^="/wiki/"]')[:-5] if '_Louisiana' in a['href']]),
    
    (             'List_of_cities_in_Maine', wikitable_extractor(column=2)),
    (              'List_of_towns_in_Maine', lambda soup: [a['href'][len('/wiki/'):] for a in soup.select(f'table.Wikitable.sortable')[0].select(f'td:nth-child(1) a[href^="/wiki/"]')]),
    (        'List_of_ghost_towns_in_Maine', wikilist_extractor()),
    
    (  'List_of_municipalities_in_Maryland', wikitable_extractor(column=1)),
    (     'List_of_ghost_towns_in_Maryland', lambda soup: [a['href'][len('/wiki/'):] for a in soup.select('ul a[href^="/wiki/"]') if 'ounty' not in a['href']][:12]),
    
    ('List_of_municipalities_in_Massachusetts', wikitable_extractor(column=1)),
    ('List_of_ghost_towns_in_Massachusetts', lambda soup: [title for title in wikilist_extractor()(soup) if 'Quabbin' not in title]),
    
    (  'List_of_municipalities_in_Michigan', wikitable_extractor(column=1)),
    (     'List_of_ghost_towns_in_Michigan', wikilist_extractor(table_index=2)),
    
    (         'List_of_cities_in_Minnesota', wikitable_extractor(column=2)),
    (    'List_of_ghost_towns_in_Minnesota', wikilist_extractor()),
    
    ('List_of_municipalities_in_Mississippi', wikitable_extractor(column=1)),
    (   'List_of_ghost_towns_in_Mississippi', wikilist_extractor()),
    
    # (           'List_of_cities_in_Missouri', lambda soup: sum([wikilist_extractor(table_index=i)(soup) for i in range(2, 25)], []))
    (      'List_of_ghost_towns_in_Missouri', wikilist_extractor()),
    
    (  'List_of_cities_and_towns_in_Montana', wikitable_extractor(column=1)),
    (       'List_of_ghost_towns_in_Montana', lambda soup: [a['href'][len('/wiki/'):] for a in soup.select('ul a[href^="/wiki/"]:not([href*="County"])')[:73]]),
    
    (           'List_of_cities_in_Nebraska', lambda soup: sum([wikilist_extractor(table_index=i)(soup) for i in range(1, 1+23)], [])),
    (      'List_of_ghost_towns_in_Nebraska', wikilist_extractor()),
    
    (            'List_of_cities_in_Nevada', wikitable_extractor(column=1)),
    (       'List_of_ghost_towns_in_Nevada', wikitable_extractor(column=1)),
    
    ('List_of_cities_and_towns_in_New_Hampshire', wikitable_extractor(column=1)),
    ('List_of_ghost_towns_in_New_Hampshire', lambda soup: wikilist_extractor()(soup)[:5]),
    
    ('List_of_municipalities_in_New_Jersey', wikitable_extractor(column=2)),
    (  'Category:Ghost_towns_in_New_Jersey', lambda soup: [a['href'][len('/wiki/'):] for a in soup.select('li a')[:17]]),
    
    ('List_of_municipalities_in_New_Mexico', wikitable_extractor(column=1)),
    (   'List_of_ghost_towns_in_New_Mexico', wikitable_extractor(column=1)),
    
    (          'List_of_cities_in_New_York', wikitable_extractor(column=1)),
    ('Category:Ghost_towns_in_New_York_(state)', lambda soup: [a['href'][len('/wiki/'):] for a in soup.select('li a')[:10]]),
    
    ('List_of_municipalities_in_North_Carolina', lambda soup: [title for title in sum([wikilist_extractor(table_index=i)(soup) for i in range(2, 26)], []) if 'County' not in title]),
    ('List_of_ghost_towns_in_North_Carolina', wikilist_extractor()),
    
    # (       'List_of_cities_in_North_Dakota', wikitable_extractor(table_index=0, column=2)),
    (  'List_of_ghost_towns_in_North_Dakota', wikilist_extractor()),
    
    (               'List_of_cities_in_Ohio', wikitable_extractor(column=1)),
    (             'List_of_villages_in_Ohio', wikitable_extractor(column=1)),
    # (          'List_of_ghost_towns_in_Ohio', lambda soup: [title for title in wikilist_extractor()(soup) if '_County' not in title][1:]),
    
    ( 'List_of_cities_and_towns_in_Oklahoma', wikitable_extractor(table_index=1, column=2)),
    (      'List_of_ghost_towns_in_Oklahoma', wikitable_extractor(column=1)),
    
    (             'List_of_cities_in_Oregon', wikitable_extractor(column=2)),
    (        'List_of_ghost_towns_in_Oregon', lambda soup: [a['href'][len('/wiki/'):] for a in soup.select('table.wikitable.sortable th[scope="row"] a[href^="/wiki/"]')]),
    
    (       'List_of_cities_in_Pennsylvania', wikitable_extractor(column=1)),
    # (    'List_of_ghost_towns_in_Pennsylvania', '')

    ('List_of_municipalities_in_Rhode_Island', wikitable_extractor(column=1)),
    (  'Category:Ghost_towns_in_Rhode_Island', lambda soup: ['Hanton_City,_Rhode_Island']),
    
    ('List_of_cities_and_towns_in_South_Carolina', wikitable_extractor(column=1)),
    ( 'List_of_ghost_towns_in_South_Carolina', wikilist_extractor()),
    
    (        'List_of_cities_in_South_Dakota', wikitable_extractor(column=2)), # partial
    (         'List_of_towns_in_South_Dakota', wikitable_extractor(column=1)),
    (   'List_of_ghost_towns_in_South_Dakota', wikitable_extractor(column=1)),
    
    (   'List_of_municipalities_in_Tennessee', wikitable_extractor(column=2)),
    (      'List_of_ghost_towns_in_Tennessee', lambda soup: wikilist_extractor()(soup)[:-1]),
    
    (               'List_of_cities_in_Texas', lambda soup: [a['href'][len('/wiki/'):] for a in soup.select('table.wikitable.sortable th[scope="row"] a[href^="/wiki/"]')]),
    (                'List_of_towns_in_Texas', wikitable_extractor(column=1)), # partial
    (          'List_of_ghost_towns_in_Texas', lambda soup: [title for title in sum([wikilist_extractor(table_index=i)(soup) for i in range(1, 9)], [])]),
    
    (        'List_of_municipalities_in_Utah', wikitable_extractor(column=1)),
    (           'List_of_ghost_towns_in_Utah', lambda soup: [title for title in sum([wikilist_extractor(table_index=i)(soup) for i in range(2, 6)], [])]),
    
    (             'List_of_cities_in_Vermont', wikitable_extractor(column=1)),
    (              'List_of_towns_in_Vermont', wikitable_extractor(column=2)),
    (        'List_of_ghost_towns_in_Vermont', wikilist_extractor()),
    
    (             'List_of_towns_in_Virginia', wikitable_extractor(column=1)),
    (       'List_of_ghost_towns_in_Virginia', wikilist_extractor()),
    
    ('List_of_cities_and_towns_in_Washington', wikitable_extractor(column=1)),
    (     'List_of_ghost_towns_in_Washington', wikitable_extractor(column=1)),
    
    (       'List_of_cities_in_West_Virginia', wikitable_extractor(column=2)),
    (        'List_of_towns_in_West_Virginia', lambda soup: [title for title in sum([wikilist_extractor(table_index=i)(soup) for i in range(1, 24)], [])]),
    (  'List_of_ghost_towns_in_West_Virginia', wikilist_extractor()),
    
    (           'List_of_cities_in_Wisconsin', wikitable_extractor(column=1)),
    # ('List_of_municipalities_in_Wisconsin_by_population', ''),
    (            'List_of_towns_in_Wisconsin', wikitable_extractor(column=1)),
    (      'List_of_ghost_towns_in_Wisconsin', wikitable_extractor(column=1)),
    
    (     'List_of_municipalities_in_Wyoming', wikitable_extractor(column=1)),
    (        'List_of_ghost_towns_in_Wyoming', wikilist_extractor())
]
    

# Run every (page title, extractor) pair and pool the extracted titles.
# The list is reset here, so re-running the cell does not accumulate results.
state_communities_all = []
for title, extract_function in parameters:
    soup = BeautifulSoup(http_get(wikipedia_standard_url(title)))
    titles = extract_function(soup)
    state_communities_all += titles

    # Progress row: page, count, first and last title, two random samples.
    # titles[0], titles[-1] and np.random.choice() all raise on an empty
    # list, which would abort the whole run; report the empty result instead.
    if len(titles) > 0:
        print(f'{title:>36}', f'{len(titles):3}', f'{titles[0]:>25}', f'{titles[-1]:>25}', np.random.choice(titles, 2))
    else:
        print(f'{title:>36}', f'{len(titles):3}', '(no titles extracted)')

 List_of_cities_and_towns_in_Alabama 460        Abbeville,_Alabama             York,_Alabama ['Somerville,_Alabama' 'Gainesville,_Alabama']
      List_of_ghost_towns_in_Alabama  38      Aigleville_(Alabama)       Washington,_Alabama ['Battelle,_Alabama' 'St._Stephens,_Alabama']
            List_of_cities_in_Alaska 149              Adak,_Alaska          Wrangell,_Alaska ['Hughes,_Alaska' 'Point_Hope,_Alaska']
 List_of_cities_and_towns_in_Arizona  91  Apache_Junction,_Arizona             Yuma,_Arizona ['Wickenburg,_Arizona' 'Sierra_Vista,_Arizona']
      List_of_ghost_towns_in_Arizona 115          Adamana,_Arizona           Zeniff,_Arizona ['Salero,_Arizona' 'San_Rafael,_Arizona']
List_of_cities_and_towns_in_Arkansas 501     Little_Rock,_Arkansas         Gilbert,_Arkansas ['Perry,_Arkansas' 'Cabot,_Arkansas']
     List_of_ghost_towns_in_Arkansas  12 Arkansas_Post_National_Memorial           Sneed,_Arkansas ['Rush,_Arkansas' 'Chalk_Bluff,_Arkansas']
List_of_cities_and_towns_in_California 

In [760]:
# (total titles, distinct titles, distinct titles after replacing '_' with
# ' ' and lower-casing) — a quick duplicate check.
len(state_communities_all), len(set(state_communities_all)), len(set([title.replace('_', ' ').lower() for title in state_communities_all]))

(22485, 21983, 21982)

In [762]:
# Write the pooled titles, one per line, with '_' replaced by ' '.
# The list is written as-is, so any duplicate titles are kept.
# The encoding is pinned to UTF-8: the titles are scraped from Wikipedia and
# may contain non-ASCII characters, while open()'s default text encoding
# depends on the platform's locale.
with open('data/communities_all_2.txt', 'w', encoding='utf-8') as file:
    file.write('\n'.join([title.replace('_', ' ') for title in state_communities_all]))

In [None]:
# NOTE(review): `cities` is not assigned in any cell shown in this notebook,
# so this cell depends on leftover kernel state — confirm where it comes from.
with open('bin/California_cities.txt', 'w') as file:
    file.write('\n'.join(cities))

In [219]:
# NOTE(review): this writes the same `cities` variable as the cell that
# writes 'bin/California_cities.txt'; `cities` is not assigned in any cell
# shown in this notebook, so it presumably held a different list when this
# cell was run — confirm.
with open('bin/California_ghost_cities.txt', 'w') as file:
    file.write('\n'.join(cities))

In [221]:
# Load three previously saved title lists (one title per line).
# NOTE(review): 'bin/counties_communities_all.txt' is not written by any cell
# shown in this notebook — confirm how it was produced.
# NOTE(review): `communities_all` is rebound here to a plain list; an earlier
# cell uses the same name for a Spark RDD.
with open('bin/counties_communities_all.txt', 'r') as file:
    communities_all = file.read().split("\n")
    
with open('bin/California_cities.txt', 'r') as file:
    california_all = file.read().split("\n")
    
with open('bin/California_ghost_cities.txt', 'r') as file:
    california_spooky = file.read().split("\n")
    
# Summary: (line count, distinct count) for the county-derived list, then for
# each California list (line count, distinct count, number of distinct
# entries that do not appear in the county-derived list).
((len(communities_all), len(set(communities_all))),
 (len(california_all), len(set(california_all)), len(set(california_all).difference(communities_all))),
 (len(california_spooky), len(set(california_spooky)), len(set(california_spooky).difference(communities_all)))
)

((71240, 71240), (482, 482, 170), (349, 347, 305))