In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
import feather
from tqdm import tqdm
import os

In [2]:
def read_html(url, timeout=30):
    """Download ``url`` and parse the response body with the ``lxml`` parser.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Seconds to wait for the server, passed straight to ``requests.get``.
        Without a timeout a stalled connection would block the notebook
        indefinitely.

    Returns
    -------
    BeautifulSoup
        Document tree built from the response text. The HTTP status is not
        checked, so an error page is parsed like any other page.

    Raises
    ------
    requests.exceptions.RequestException
        On connection problems or when ``timeout`` elapses.
    """
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.text, "lxml")

def get_language_links(url):
    """Return every ``<li>`` inside the ``div`` with id ``p-lang`` on the page at ``url``.

    The page is fetched and parsed with ``read_html``. If the page has no
    such ``div`` the lookup yields ``None`` and the subsequent ``find_all``
    call raises ``AttributeError``.
    """
    page = read_html(url)
    language_box = page.find("div", attrs={"id": "p-lang"})
    list_items = language_box.find_all("li")
    return list_items

def get_link_elements(state_name, language_link):
    """Extract the fields of one interlanguage ``<li>`` entry.

    Parameters
    ----------
    state_name : str
        Name passed through unchanged as the first element of the result.
    language_link : bs4.element.Tag
        List item whose first ``<a>`` child carries a ``title`` of the form
        ``"<local page name> – <language name>"`` and a ``lang`` attribute.

    Returns
    -------
    tuple
        ``(state_name, language_name, language_code, language_state_name,
        featured)`` where ``featured`` is True when the list item has the
        ``badge-featuredarticle`` class.

    Raises
    ------
    ValueError
        If the anchor title does not contain the `` – `` separator.
    """
    anchor = language_link.a
    title = anchor["title"]
    # The first group is greedy, so when the separator occurs more than once
    # only the text after the last one is taken as the language name.
    link_title = re.search(r"^(.*) – (.*)$", title)
    if link_title is None:
        raise ValueError("Unexpected interlanguage link title: " + repr(title))
    language_state_name, language_name = link_title.groups()
    language_code = anchor["lang"]
    # .get avoids a KeyError for list items that carry no class attribute.
    featured = "badge-featuredarticle" in language_link.get("class", [])

    return (state_name, language_name, language_code, language_state_name, featured)

def current_page_size(page_url):
    """Return the text of the first ``history-size`` span on a history page.

    Parameters
    ----------
    page_url : str
        URL of the page to download; the result is searched for a ``ul`` with
        id ``pagehistory``.

    Returns
    -------
    str
        The span's text, or the sentinel ``"Failed"`` when the page could not
        be downloaded or parsed by ``read_html``.

    Raises
    ------
    AttributeError
        If the downloaded page has no ``ul#pagehistory`` element or that list
        has no ``history-size`` span.
    """
    try:
        page_html = read_html(page_url)
    except Exception:
        # Deliberately not a bare ``except``: KeyboardInterrupt and SystemExit
        # must propagate so a caller that retries on "Failed" can still be
        # interrupted from the notebook.
        return "Failed"
    history_list = page_html.find("ul", attrs={'id': 'pagehistory'})
    history_size = history_list.find("span", attrs={'class': 'history-size'})
    return history_size.text

In [3]:
def get_lang_df(state_name, max_attempts=50):
    """Scrape the interlanguage links of a state's English Wikipedia page.

    For every language listed in the page's ``p-lang`` box, the history page
    of the corresponding article is fetched and its reported size recorded.
    The resulting frame is written to ``./state_dfs/<state_name>.feather``
    and returned.

    Parameters
    ----------
    state_name : str
        Page title appended to ``https://en.wikipedia.org/wiki/``.
    max_attempts : int or None, optional
        Maximum number of passes over the rows whose size is still
        ``"Failed"``. ``None`` retries without limit. Rows that still have
        no size afterwards end up with a missing ``PageSize``.

    Returns
    -------
    pandas.DataFrame
        Columns ``StateName``, ``Language``, ``LanguageCode``,
        ``LanguageStateName``, ``Featured``, ``EditPage`` and ``PageSize``.
    """
    output_dir = "./state_dfs/"
    # Create the target folder up front so that minutes of scraping are not
    # lost to a missing directory when the frame is finally written.
    os.makedirs(output_dir, exist_ok=True)

    print("Now reading Wikipedia page for " + state_name)
    wiki_page = read_html("https://en.wikipedia.org/wiki/" + state_name)

    print("Processing page HTML...")
    language_links = wiki_page.find("div", attrs = {'id':'p-lang'})
    language_links = language_links.find_all("li")
    language_info = [get_link_elements(state_name, ll) for ll in language_links]

    # Passing the column names to the constructor also works when no links
    # were found; assigning ``.columns`` afterwards would raise in that case.
    language_df = pd.DataFrame(
        language_info,
        columns=["StateName", "Language", "LanguageCode", "LanguageStateName", "Featured"],
    )

    language_df = language_df.assign(EditPage = "https://" + language_df.LanguageCode +
                                     ".wikipedia.org/w/index.php?title=" + language_df.LanguageStateName +
                                     "&action=history")

    # Every row starts as "Failed", the sentinel current_page_size returns
    # when a download did not succeed, so the first pass fetches all rows.
    language_df["PageSize"] = "Failed"
    failed_count = int((language_df.PageSize == "Failed").sum())

    print("Getting foreign language page sizes...")
    attempts = 0
    while failed_count > 0 and (max_attempts is None or attempts < max_attempts):
        print(str(failed_count) + " values unaccounted for...")
        # progress_apply only exists once tqdm.pandas() has been called;
        # fall back to the plain apply so the function works on its own.
        apply_rows = getattr(language_df, "progress_apply", language_df.apply)
        language_df.PageSize = apply_rows(
            lambda row: current_page_size(row.EditPage) if row.PageSize == "Failed" else row.PageSize,
            axis = 1)
        failed_count = int((language_df.PageSize == "Failed").sum())
        attempts += 1
    if failed_count > 0:
        print("Giving up on " + str(failed_count) + " pages after " + str(attempts) + " attempts")
    print("Done with foreign pages")

    # Keep only the digits of the size text; rows without any digit
    # (including leftover "Failed" sentinels) become missing values.
    language_df.PageSize = language_df.PageSize.apply(lambda ps: re.sub("[^0-9]","", ps))
    language_df.PageSize = language_df.PageSize.apply(lambda ps: int(ps) if ps != '' else None)

    feather.write_dataframe(language_df, output_dir + state_name + ".feather")

    return(language_df)

In [4]:
# Read the page titles to scrape, one per line. Blank lines are dropped,
# otherwise an empty name would trigger a request for the bare /wiki/ URL.
with open("state_list.txt", "r") as f:
    STATE_NAMES = [line.strip() for line in f if line.strip()]

# Registers DataFrame.progress_apply, which get_lang_df uses for its progress bar.
tqdm.pandas(desc="Progress Bar")

# Only states without a cached feather file are scraped; states that already
# have one are skipped and do not appear in state_dfs.
state_dfs = [get_lang_df(sn) for sn in STATE_NAMES if not os.path.isfile("./state_dfs/" + sn + ".feather")]

Now reading Wikipedia page for West_Virginia


Progress Bar:   0%|          | 0/165 [00:00<?, ?it/s]

Processing page HTML...
Getting foreign language page sizes...
165 values unaccounted for...


Progress Bar: 100%|██████████| 165/165 [03:12<00:00,  1.01it/s]
Progress Bar:   0%|          | 0/165 [00:00<?, ?it/s]

58 values unaccounted for...


Progress Bar: 100%|██████████| 165/165 [00:57<00:00,  2.85it/s]


Done with foreign pages
Now reading Wikipedia page for Wisconsin


Progress Bar:   0%|          | 0/169 [00:00<?, ?it/s]

Processing page HTML...
Getting foreign language page sizes...
169 values unaccounted for...


Progress Bar: 100%|██████████| 169/169 [02:28<00:00,  1.07s/it]
Progress Bar:   0%|          | 0/169 [00:00<?, ?it/s]

33 values unaccounted for...


Progress Bar: 100%|██████████| 169/169 [00:42<00:00,  4.02it/s]
Progress Bar: 100%|██████████| 169/169 [00:00<00:00, 18322.37it/s]
Progress Bar: 100%|██████████| 169/169 [00:00<00:00, 17697.04it/s]
Progress Bar: 100%|██████████| 169/169 [00:00<00:00, 17597.31it/s]
Progress Bar: 100%|██████████| 169/169 [00:00<00:00, 18242.67it/s]
Progress Bar: 100%|██████████| 169/169 [00:00<00:00, 17519.89it/s]
Progress Bar: 100%|██████████| 169/169 [00:00<00:00, 14704.95it/s]
Progress Bar: 100%|██████████| 169/169 [00:00<00:00, 15273.05it/s]
Progress Bar: 100%|██████████| 169/169 [00:00<00:00, 15351.11it/s]
Progress Bar: 100%|██████████| 169/169 [00:00<00:00, 16979.36it/s]
Progress Bar: 100%|██████████| 169/169 [00:00<00:00, 18780.63it/s]
Progress Bar:   0%|          | 0/169 [00:00<?, ?it/s]

2 values unaccounted for...
2 values unaccounted for...
2 values unaccounted for...
2 values unaccounted for...
2 values unaccounted for...
2 values unaccounted for...
2 values unaccounted for...
2 values unaccounted for...
2 values unaccounted for...
2 values unaccounted for...
2 values unaccounted for...


Progress Bar: 100%|██████████| 169/169 [00:08<00:00, 20.10it/s]


Done with foreign pages
Now reading Wikipedia page for Wyoming


Progress Bar:   0%|          | 0/165 [00:00<?, ?it/s]

Processing page HTML...
Getting foreign language page sizes...
165 values unaccounted for...


Progress Bar: 100%|██████████| 165/165 [02:56<00:00,  1.01it/s]

Done with foreign pages



