---
### Util.

In [None]:
import requests
from bs4 import BeautifulSoup
import pyspark as ps
import numpy as np
import time

# URL -> raw response body (bytes) for every URL fetched through http_get.
response_cache = {}

def http_get(url):
    """Fetch ``url`` with an HTTP GET and return the raw response body.

    Bodies are memoized in the module-level ``response_cache`` dict, keyed by
    URL, so a URL that is already present in the cache is not requested again.

    Parameters
    ----------
    url : str
        Address to request.

    Returns
    -------
    bytes
        ``response.content`` of the (possibly cached) response. The HTTP
        status code is not inspected, so error pages are returned and cached
        like any other body.
    """
    # Test membership rather than truthiness: an empty body (b"") is a valid
    # cached value and must not trigger another network round trip.
    if url not in response_cache:
        response = requests.request(url=url, method="GET")
        response_cache[url] = response.content

    return response_cache[url]

# Create (or reuse) a Spark session that runs locally with 8 worker threads
# under the application name 'lecture'.
spark = ps.sql.SparkSession.builder.master('local[8]').appName('lecture').getOrCreate()

# Handle to the low-level RDD API of the same session.
sc = spark.sparkContext

In [None]:
def wikipedia_export_get(title, max_redirects=10):
    """Download the Special:Export XML of a Wikipedia page, following redirects.

    Parameters
    ----------
    title : str
        Page title as it appears in the ``/wiki/<title>`` URL.
    max_redirects : int, optional
        Upper bound on the number of ``<redirect>`` hops that are followed
        before giving up (guards against redirect cycles).

    Returns
    -------
    bytes
        Export XML of the first page in the redirect chain that is not itself
        a redirect.

    Raises
    ------
    RuntimeError
        If more than ``max_redirects`` redirects are encountered.
    """
    for _ in range(max_redirects + 1):
        xml = http_get("https://en.wikipedia.org/wiki/Special:Export/" + title)

        if b'<redirect title' not in xml:
            return xml

        # Name the parser explicitly so the result does not depend on which
        # optional parser libraries happen to be installed (and to avoid the
        # "no parser was explicitly specified" warning on every call).
        title = BeautifulSoup(xml, "html.parser").select('redirect')[0]['title']

    raise RuntimeError(f"Too many redirects while exporting {title!r}")

def wikipedia_standard_url(title):
    """Return the regular (human-readable) article URL for ``title``.

    The title is appended verbatim to the ``/wiki/`` prefix; no escaping is
    applied.
    """
    return "".join(("https://en.wikipedia.org/wiki/", title))

---

### Init!

In [None]:
# Line count of the title list that is loaded further down with spark.read.text
# (presumably one page title per line — TODO confirm).
!wc -l data/communities_all.txt

##### Schema

In [None]:
import pyspark.sql.types as types

# Column layout used for both the stored parquet data and newly downloaded
# pages: a numeric id, the page title and the raw export XML (all required).
schema = types.StructType([
    types.StructField('id', types.LongType(), False),
    types.StructField('Name', types.StringType(), False),
    types.StructField('XML', types.StringType(), False),
])

# Smoke test: a single hand-made row (including a non-BMP character) must
# round-trip through Spark with this schema.
data_frame = spark.createDataFrame(
    [(hash("a"), "1😱2", b"c".decode())],
    schema=schema,
)

data_frame.toPandas()

##### Community Titles

In [None]:


# Each line of the text file becomes one Row with a single `value` column;
# unwrap it so the RDD holds plain title strings, and keep it in memory
# because it is reused by later cells.
cities_titles = (
    spark.read.text('data/communities_all.txt')
    .rdd
    .map(lambda line_row: line_row.value)
    .cache()
)

cities_titles.take(5)

###### Wikitext Sizes

##### Stored Data

In [None]:
# Load the previously saved pages. The schema is applied through
# DataFrameReader.schema(); passing it via .option('schema', ...) only sets an
# unrelated string option and leaves the schema to be inferred from the files.
data_frame_stored = spark.read.schema(schema).parquet("data/union13")

##### New Data

In [None]:
# Approximate number of not-yet-stored titles to download in this run.
load_new_count = 1400

# row[1] is the second column of the stored frame ('Name' if it follows `schema`).
current_titles = data_frame_stored.rdd.map(lambda row: row[1])
cities_remaining_titles = cities_titles.subtract(current_titles)

# Count the remaining titles once; every count() is a separate Spark job.
remaining_count = cities_remaining_titles.count()
print("current", current_titles.count())
print("total", cities_titles.count())
print("remaining", remaining_count)

# Guard against ZeroDivisionError when nothing is left to download, and cap
# the fraction at 1.0 when fewer than `load_new_count` titles remain.
sample_fraction = min(1.0, load_new_count / remaining_count) if remaining_count else 0.0
cities_remaining_titles = cities_remaining_titles.sample(fraction=sample_fraction, withReplacement=False).cache()
print("remaining sampled", cities_remaining_titles.count())

# Download the export XML for every sampled title (cached so the HTTP work is
# not repeated by later actions).
cities_exports = cities_remaining_titles.map(wikipedia_export_get).cache()
# Element layout: ((hash(title), title), export_xml_bytes)
cities_exports_keyed = cities_remaining_titles.map(hash).zip(cities_remaining_titles).zip(cities_exports)
# Keep only exports that contain at least one <text> element.
cities_proper_exports_keyed = cities_exports_keyed.filter(lambda keys_export: len(BeautifulSoup(keys_export[1]).select('text')) > 0)

# Flatten to (id, Name, XML) rows, decoding the XML bytes to str.
data_frame_new = spark.createDataFrame(cities_proper_exports_keyed.map(lambda keys_export: (keys_export[0][0], keys_export[0][1], keys_export[1].decode())), schema)

In [None]:
# Sanity check: display rows that occur in both the stored and the newly
# downloaded frame (ideally none, since only missing titles were requested).
data_frame_stored.intersect(data_frame_new).show()

##### Union

In [None]:
# Append the newly downloaded rows to the stored rows.
data_frame_union = data_frame_stored.union(data_frame_new)
# Optional check for duplicate rows (each count is a full Spark job):
# data_frame_union.count(), data_frame_union.distinct().count()

##### Write to File

In [None]:
%%time
# Drop exact duplicate rows and persist the result under a new directory.
# NOTE(review): no format or mode is given, so Spark's defaults apply
# (presumably parquet, failing if the path already exists — confirm).
data_frame_union.distinct().write.save(path="data/union16")

In [None]:
# From here on `data_frame_stored` refers to the combined data.
# NOTE(review): this rebinding makes the download cells non-idempotent when
# re-run without reloading the stored frame first.
data_frame_stored = data_frame_union

---
#### Newtypes

In [None]:
def wikipedia_infobox_key_values(text):
    """Extract ``| key = value`` template-parameter lines from wikitext.

    Every line that starts (after optional indentation) with ``|`` and
    contains ``=`` yields one pair. The key is the text between ``|`` and the
    first ``=``; the value is the rest of the line. Both are returned without
    surrounding whitespace, and the value is ``''`` when nothing follows the
    ``=``.

    Example input:

    {{Infobox settlement
    | name = Dooley
    | settlement_type = [[Ghost town]]
    | image_skyline = 
    | imagesize =
    | image_caption = 
    | pushpin_map = Montana#USA
    | pushpin_label_position = left
    | map_caption = Location of Dooley in Montana
    | coordinates_footnotes = <ref>{{cite gnis |id=770722 |name=Dooley}}</ref>
    | subdivision_type = [[List of sovereign states|Country]]
    | subdivision_name = United States
    | subdivision_type1 = [[U.S. state|State]]
    | subdivision_name1 = [[Montana]]
    | subdivision_type2 = [[List of counties in Montana|County]]
    | subdivision_name2 = [[Sheridan County, Montana|Sheridan]]
    | established_title = Established
    | established_date = 1913
    | named_for = 
    | extinct_title = Abandoned
    | extinct_date = 1957
    | elevation_ft = 2461
    | coordinates = {{coord|48|52|52|N|104|23|22|W|region:US-MT|display=inline,title}}
    }}

    Parameters
    ----------
    text : str
        Wikitext to scan.

    Returns
    -------
    list of (str, str)
        ``(key, value)`` pairs in order of appearance.
    """
    import re

    # Whitespace *except* newline: keeps every match on a single line, so an
    # empty value can never swallow the following line, and lets the lazy
    # value group shed trailing blanks.
    gap = r"[^\S\n]*"
    regex = rf"^{gap}\|{gap}(.+?){gap}={gap}(.*?){gap}$"

    # With exactly two groups, findall already returns (key, value) tuples.
    return re.findall(regex, text, re.MULTILINE)

def wikipedia_infobox_key_values_non_empty(text):
    """Return the ``(key, value)`` pairs found in ``text`` whose value is not ``''``.

    Pairs come from ``wikipedia_infobox_key_values`` and keep their order.
    """
    non_empty_pairs = []
    for key, value in wikipedia_infobox_key_values(text):
        if value != '':
            non_empty_pairs.append((key, value))
    return non_empty_pairs

# Quick check: text without any "| key = value" line yields an empty list.
wikipedia_infobox_key_values("abc")

In [None]:
# Wikitext of every stored page: parse the export XML and take the content of
# its first <text> element. Cached because the cells below scan it repeatedly.
# To work on a subset, insert after `.rdd`:
#     .sample(fraction=50/data_frame_stored.count(), withReplacement=False)
sample = data_frame_stored.rdd.map(
    lambda row: BeautifulSoup(row.XML).select('text')[0].getText()
).cache()

In [None]:
# Substrings whose (case-insensitive) document frequency is measured below.
search_terms = ['mining', 'ghost', 'awesome', 'death', 'taxes', 'high tax', 'low tax',
                'superhero', 'batman', ' men', ' women', 'spongebob', 'corporations']

# Variants of one word to see how much surrounding whitespace/punctuation matters.
search_terms += ['oil', ' oil', 'oil ', ' oil ', 'oil.', '. oil']

search_terms += ['gazette', 'journal', 'wall street', 'railroad', 'baron',
                 'polish', 'poland', 'french', 'france', 'film', 'movies', ' media ',
                 'technology', 'farm', 'farming', 'orchard', 'automotive', 'entertainment',
                 'immigrants', 'immigration', 'freelance', 'volcan',
                 'eruption', 'flood', 'tornado', 'hurricane', 'katrina', 'hurricane katrina']

# search_terms = map(str, range(1200, 2080))

# search term -> number of documents in `sample` containing it
dict_ = {}

# Every count() launches a Spark job, so compute each number exactly once
# instead of re-evaluating it inside the f-string.
sample_count = sample.count()
for search_term in search_terms:
    needle = search_term.lower()  # hoisted: do not lowercase the term once per document
    samples_filtered = sample.filter(lambda text: needle in text.lower())
    match_count = samples_filtered.count()
    dict_[search_term] = match_count
    print(f'{search_term}\t\t{match_count / sample_count:0.5f} {match_count}/{sample_count}')

In [None]:
import matplotlib.pyplot as plt

# NOTE(review): `date_frequencies` is not defined in any visible cell —
# presumably a {year-string: count} mapping from an earlier run; TODO confirm.
# Keys become the x positions (as ints), values the heights of the filled area.
xs = np.array([int(key) for key in date_frequencies])
ys = np.array(list(date_frequencies.values()))

fig, ax = plt.subplots(figsize=(15, 8))
ax.fill_between(xs, 0, ys)

---

In [None]:
# Union of all parameter keys that carry a non-empty value in at least one page.
samples_infobox = (
    sample
    .map(lambda text: {key for key, _ in wikipedia_infobox_key_values_non_empty(text)})
    .reduce(lambda left, right: left | right)
)

len(samples_infobox)

In [None]:
# How often each parameter key occurs with a non-empty value across all pages.
infobox_keys_counts = sample.flatMap(
    lambda text: [key for key, _ in wikipedia_infobox_key_values_non_empty(text)]
).countByValue()

# Keep keys seen more than 20 times and list them, most frequent first.
infobox_keys_counts = [
    (key, count) for key, count in infobox_keys_counts.items() if count > 20
]
sorted(infobox_keys_counts, key=lambda kv: kv[1], reverse=True)

In [None]:
# Rebuild the wikitext RDD (same definition as the earlier `sample` cell) and
# look at the non-empty template parameters of one page.
# To work on a subset, insert after `.rdd`:
#     .sample(fraction=50/data_frame_stored.count(), withReplacement=False)
sample = data_frame_stored.rdd.map(
    lambda row: BeautifulSoup(row.XML).select('text')[0].getText()
).cache()

text = sample.take(1)[0]

wikipedia_infobox_key_values_non_empty(text)

In [None]:
# NOTE(review): `xml` is not assigned at notebook level in any visible cell —
# presumably left over from an interactive wikipedia_export_get(...) call;
# this cell fails on a fresh kernel. TODO confirm / define `xml` first.
text = BeautifulSoup(xml).select('text')[0].getText()

# wikipedia_infobox_key_values(text)
wikipedia_infobox_key_values_non_empty(text)

---

#### Investigations

 #### Potential inquiries:
 
 Nearness to rivers/lakes/roads  
 Mining  
 Text quantity as rough measure of significance  
 Extract dates to estimate the approximate time period

In [None]:
# Load an earlier snapshot as an RDD of Rows (note: despite its name,
# `data_frame` is an RDD from here on). The schema is applied through
# DataFrameReader.schema(); .option('schema', ...) does not set a schema.
data_frame = spark.read.schema(schema).parquet("union9").rdd
data_frame.count()

In [None]:
# Export XML of every page that mentions "ghost" anywhere (case-insensitive).
spooky = (
    data_frame
    .filter(lambda row: 'ghost' in row.XML.lower())
    .map(lambda row: row.XML)
    .cache()
)

print(spooky.count())
# Print the wikitext of one arbitrary match: the 9th of up to 20 pages drawn
# from a random half of the matches.
print(BeautifulSoup(spooky.sample(fraction=0.5, withReplacement=False).take(20)[8]).select('text')[0].getText())

In [None]:
from IPython.display import HTML
import urllib


def wikipedia_file_url(file):
    """Return a URL that resolves to the media file named ``file``.

    The name is appended to Wikipedia's ``Special:FilePath`` page. Spaces are
    replaced by underscores and all other characters that are unsafe in a URL
    path (``?``, ``#``, ``%``, quotes, ...) are percent-encoded, so the result
    can be embedded directly in an HTML attribute.

    Parameters
    ----------
    file : str
        File name as used in wikitext, e.g. ``'Reno skyline.JPG'``.

    Returns
    -------
    str
        Absolute URL of the file.
    """
    from urllib.parse import quote

    return "https://en.wikipedia.org/wiki/Special:FilePath/" + quote(file.replace(' ', '_'))

# Candidate image names collected from infoboxes; only the LAST assignment
# takes effect — reorder the lines to preview a different image.
file = 'Bellfonte, Alabama- The Chimney of the Local Inn.JPG'
file = 'Reno skyline.JPG'
file = 'Cape dec29-07 (23).JPG'
file = 'Abandoned school in Toyah, Texas.jpg'
# Render the image inline by pointing an <img> tag at the file URL.
HTML(f'<img src="{wikipedia_file_url(file)}" />') # , wikipedia_file_url(file), 'https://upload.wikimedia.org/wikipedia/commons/8/81/Bellfonte%2C_Alabama-_The_Chimney_of_the_Local_Inn.JPG'

In [None]:
# Print the wikitext of one randomly sampled "ghost" page (a different page on
# each run; raises IndexError if the 5% sample happens to be empty).
print(BeautifulSoup(spooky.sample(fraction=0.05, withReplacement=False).take(1)[0]).select('text')[0].getText())

In [None]:
|settlement_type          = [[Ghost town]]
|settlement_type          = [[Ghost town|Ghost Town]]
|coordinates              = {{coord|34|42|40|N|85|56|43|W|region:US-AL|display=inline,title}}
|coordinates   = {{coord|31|15|21|N|91|36|30|W|region:US-MS|display=inline,title}}


In [None]:
[[Category:Former populated places in Minnesota]]
[[Category:Former populated places in Rock County, Minnesota]]
[[Category:Ghost towns in Alabama]]
[[Category:Ghost towns in West Texas]]
[[Category:Ghost towns in Nye County, Nevada]]
[[Category:Ghost towns in Nevada]]
[[Category:Former populated places in Adams County, Mississippi]]
[[Category:Former populated places in Mississippi]]
[[Category:Former populated places in Oregon]]
[[Category:Destroyed cities]]
{{US-ghost-town-stub}}

In [None]:
'''Bellefonte''' is a [[ghost town]]

In [None]:
==Demographics==
{{US Census population
|1830= 1414
|1840= 1743
|1850= 1761
|1860= 1942
|1870= 1704
|1880= 1767
|1890= 1728
|1900= 1548
|1910= 1644
|1920= 1518
|1930= 1518
|1940= 1700
|1950= 1778
|1960= 2145
|1970= 2505
|1980= 2792
|1990= 2667
|2000= 2692
|2010= 2775
|estyear=2016
|estimate=2783
|estref=<ref name="USCensusEst2016">{{cite web|url=https://www.census.gov/programs-surveys/popest/data/tables.2016.html|title=Population and Housing Unit Estimates|accessdate=June 9, 2017}}</ref>
|footnote=U.S. Decennial Census<ref name="DecennialCensus">{{cite web|url=https://www.census.gov/programs-surveys/decennial-census.html|title=Census of Population and Housing|publisher=Census.gov|accessdate=June 4, 2015}}</ref>
}}

In [None]:
&lt;!-- Population------------------&gt;
| population_total        = 320
| population_as_of        = 2010
| population_density_km2  = auto
|population_metro         = 

&lt;!-- Population --&gt;
|population_as_of         = 
|population_footnotes     =
|population_total         = 
|population_density_km2   = 
|population_density_sq_mi =

&lt;!-- postal codes, area code --&gt;
| geocode                 = 
| iso_code                = 