
# Get country borders vertex graph

In [1]:

%pprint
import sys

# Insert at 1, 0 is the script path (or '' in REPL)
sys.path.insert(1, '../py')

from storage import Storage
from stats_scraping_utils import StatsScrapingUtilities
import wikipedia

# Shared helpers for the rest of the notebook: `s` is used below to load and
# store pickled objects, and `ssu` is used to fetch page soup / page tables.
s = Storage()
ssu = StatsScrapingUtilities(s=s)

Pretty printing has been turned OFF




In [2]:

# NOTE(review): this cell hard-codes an API credential in the notebook. The
# same APP_ID value is also embedded literally in WOLFRAM_URL further down,
# and secrets_dict itself is not referenced by any of the cells shown here.
# Consider loading the id from an environment variable or an untracked file
# and rotating this key, since it is now part of the notebook's history.
secrets_dict = {
    'APP_NAME': 'dbabbitt',
    'APP_ID': 'QEWW7L-48KXHUQQA4',
    'USAGE TYPE': 'Personal/Non-commercial Only'
}

In [3]:

import urllib
import xml.etree.ElementTree as et
import time

# Load the stored 'eurasia_df' object; later cells iterate over its
# `wikipedia_title` column to decide which countries to look up.
eurasia_df = s.load_object('eurasia_df')

In [4]:

from urllib.error import HTTPError
%run ../load_magic/dataframes.py
%run ../load_magic/lists.py

def _get_heading_texts(tables_url, tag_name):
    """Return the stripped text of every `tag_name` element on a page.

    Shared implementation behind get_h2_tags and get_h3_tags, which were
    previously two copies of the same loop differing only in the tag name.

    Parameters
    ----------
    tables_url : str
        URL of the page to fetch with `ssu.get_page_soup`.
    tag_name : str
        Name of the HTML tag to collect (e.g. 'h2' or 'h3').

    Returns
    -------
    list of str
        The stripped heading texts in document order, excluding the
        'More in World Facts' footer heading.
    """
    page_soup = ssu.get_page_soup(tables_url)

    return [
        heading_text
        for heading_text in (
            tag.text.strip()
            for tag in page_soup.find_all(tag_name)
            if tag is not None
        )
        # This heading is site navigation, not a country name
        if heading_text != 'More in World Facts'
    ]


def get_h2_tags(tables_url):
    """Return the stripped text of the page's <h2> headings.

    The 'More in World Facts' heading is excluded. See _get_heading_texts
    for the parameter and return details.
    """
    return _get_heading_texts(tables_url, 'h2')


def get_h3_tags(tables_url):
    """Return the stripped text of the page's <h3> headings.

    The 'More in World Facts' heading is excluded. See _get_heading_texts
    for the parameter and return details.
    """
    return _get_heading_texts(tables_url, 'h3')

In [5]:

import requests

# Base query URL; callers append the URL-quoted question after `input=`.
# NOTE(review): the appid is embedded literally in this string (it is the same
# value as secrets_dict['APP_ID']), so it is printed whenever the full URL is
# printed and is stored with the notebook.
WOLFRAM_URL = 'https://api.wolframalpha.com/v1/query.jsp?appid=QEWW7L-48KXHUQQA4&scantimeout=.5&formattimeout=2.5&input='
# Leading words tried, in order, when building the worldatlas article URL.
PREFIXES_LIST = ['what', 'which']
def get_status_code(prefix_str, worldatlas_title, timeout=10):
    """Return the HTTP status code of a worldatlas border-countries article.

    Parameters
    ----------
    prefix_str : str
        Leading word of the article slug (e.g. 'what' or 'which').
    worldatlas_title : str
        Country slug used at the end of the article URL.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        `requests.get` can block indefinitely on an unresponsive server,
        which would stall the whole scraping loop.

    Returns
    -------
    int
        The status code of the GET response.

    Raises
    ------
    requests.exceptions.RequestException
        Propagated from `requests.get` on connection problems or timeout.
    """
    tables_url = f'https://www.worldatlas.com/articles/{prefix_str}-countries-border-{worldatlas_title}.html'
    response = requests.get(tables_url, timeout=timeout)

    return response.status_code

In [6]:

def get_border_countries(wikipedia_title, verbose=True, wolfram_delay_seconds=100):
    """Look up the countries that border `wikipedia_title`.

    The worldatlas article is tried first: when one of the PREFIXES_LIST URLs
    answers with status 200, the second column of the page's first table is
    returned. If reading the tables raises ValueError, the page's <h2> (then
    <h3>) headings are collected instead. Unless the table lookup returned,
    the Wolfram query is then sent and, when it reports success, its answer is
    returned.

    Parameters
    ----------
    wikipedia_title : str
        Country name; lower-cased and hyphenated for the worldatlas URL and
        used verbatim in the Wolfram question.
    verbose : bool, optional
        When True, print the URLs used, fallback results and error details.
    wolfram_delay_seconds : float, optional
        Pause before the Wolfram request. Defaults to the previously
        hard-coded 100 seconds.

    Returns
    -------
    list
        Border country names; empty when nothing could be found.
    """
    # A bare `import urllib` does not guarantee that these submodules are
    # loaded, so import them explicitly rather than relying on another
    # library having imported them first.
    import urllib.error
    import urllib.parse
    import urllib.request

    border_countries_list = []
    worldatlas_title = wikipedia_title.lower().replace(' ', '-')
    prefix_str = next((x for x in PREFIXES_LIST if get_status_code(x, worldatlas_title) == 200), None)
    if prefix_str:
        tables_url = f'https://www.worldatlas.com/articles/{prefix_str}-countries-border-{worldatlas_title}.html'
        try:
            page_tables_list = ssu.get_page_tables(tables_url)
            border_countries_list = page_tables_list[0].iloc[:, 1].tolist()
            if verbose:
                print(tables_url)

            return border_countries_list
        except ValueError:
            # presumably raised when the page has no parseable table -- fall
            # back to the headings. NOTE(review): this branch does not return,
            # so a successful Wolfram answer below replaces the headings.
            border_countries_list = get_h2_tags(tables_url)
            if not border_countries_list:
                border_countries_list = get_h3_tags(tables_url)
            if verbose:
                print(border_countries_list)
        except Exception as e:
            if verbose:
                error_prefix = f'Trying to get the border countries for {wikipedia_title} using {tables_url} gets this'
                print(f'{error_prefix} {e.__class__} error: {str(e).strip()}')

    url = WOLFRAM_URL + urllib.parse.quote(f'countries bordering {wikipedia_title}')
    time.sleep(wolfram_delay_seconds)
    if verbose:
        print(url)
    try:
        with urllib.request.urlopen(url) as response:
            xml_binary = response.read()
        root = et.fromstring(xml_binary.decode('utf-8'))
        if root.attrib.get('success') == 'true':
            # The answer is a ' | '-separated list at a fixed position in the
            # response; the last entry carries a trailing ' (...' note.
            wolfram_countries_list = root[1][0][3].text.split(' | ')
            wolfram_countries_list[-1] = wolfram_countries_list[-1].split(' (')[0]

            return wolfram_countries_list
    except (urllib.error.URLError, et.ParseError, IndexError, AttributeError) as e:
        # A network failure or an unexpected response layout should not abort
        # the caller's loop; keep whatever was gathered from worldatlas.
        if verbose:
            print(f'Trying to get the border countries for {wikipedia_title} using {url} gets this {e.__class__} error: {str(e).strip()}')

    return border_countries_list

In [None]:

# Collect one (country, neighbour) tuple for every border returned by the lookup
edge_list = []
for wikipedia_title in eurasia_df.wikipedia_title:
    border_countries_list = get_border_countries(wikipedia_title, verbose=True)
    print(border_countries_list)
    edge_list.extend(
        (wikipedia_title, border_country) for border_country in border_countries_list
    )

In [28]:

import networkx as nx
import dwave_networkx as dnx
from dwave_networkx.algorithms.coloring import vertex_color
import dimod
%run ../load_magic/storage.py

# Re-create the storage helper and load the stored border edge list.
s = Storage()
edge_list = s.load_object('wikipedia_country_border_edge_list')
# G = nx.from_edgelist(edge_list, create_using=nx.Graph())
# node_dict = nx.betweenness_centrality(G, k=None, normalized=False, weight=None, endpoints=False, seed=None)
# node_dict
# NOTE(review): colors_list is never used below; the colouring call passes
# colors=7 directly.
colors_list = range(1, 32)
# Colour the graph built from the first i edges, for i = 1..31, printing each result.
for i in range(1, 32):
    try:
        G = nx.from_edgelist(edge_list[:i], create_using=nx.Graph())
        # NOTE(review): the recorded output stops after i=2 with a
        # KeyboardInterrupt -- presumably the exact solver becomes too slow
        # as nodes are added; confirm before re-running.
        color_dict = vertex_color(G, colors=7, sampler=dimod.ExactSolver())
        print(f'{i}: {color_dict}')
    except ValueError as e:
        # Report which prefix length failed, then re-raise so the loop stops.
        print(f'{i}: {str(e).strip()}')
        raise

1: {'Montenegro': 4, 'Serbia': 5}
2: {'Croatia': 0, 'Montenegro': 3, 'Serbia': 5}


KeyboardInterrupt: 

In [39]:

import re
import wikipedia
%run ../load_magic/storage.py

# Build a border edge list by scanning each country's Wikipedia summary for
# sentences that mention 'border' and contain another known country title.
s = Storage()
eurasia_df = s.load_object('eurasia_df')
rows_list = []

# Escape every title so regex metacharacters in a title (parentheses, dots)
# are matched literally, and try longer titles first so a short title cannot
# pre-empt a longer one that starts with it. With exactly one capturing group
# findall returns plain strings, so no tuple indexing is needed below.
titles_list = sorted(eurasia_df.wikipedia_title.tolist(), key=len, reverse=True)
countries_str = '|'.join(re.escape(title) for title in titles_list)
country_regex = re.compile(rf'\b({countries_str})\b')

# Raw string: '\.' in a normal string literal is an invalid escape sequence
sentence_regex = re.compile(r'\.+ *')

edge_list = []
for wikipedia_title in eurasia_df.wikipedia_title:

    # Gather every sentence-like fragment mentioning 'border' from the
    # 1- to 10-sentence summaries
    summary_set = set()
    for sentence_count in range(1, 11):
        summary_str = wikipedia.summary(wikipedia_title, auto_suggest=False, sentences=sentence_count)
        if 'border' in summary_str:
            summary_set.update(
                sent for sent in sentence_regex.split(summary_str) if 'border' in sent
            )

    # Collect the country titles named in those fragments
    border_countries_set = set()
    for sent in summary_set:
        border_countries_set.update(country_regex.findall(sent))

    # A country's own name in its border sentences is not a neighbour; a
    # self-edge would also make the graph impossible to colour
    border_countries_set.discard(wikipedia_title)

    # Sort so the stored edge order is reproducible (a later cell slices edge_list[:i])
    for border_country in sorted(border_countries_set):
        row_dict = {}
        row_dict['wikipedia_title'] = wikipedia_title
        row_dict['border_country'] = border_country
        rows_list.append(row_dict)
        edge_tuple = (wikipedia_title, border_country)
        edge_list.append(edge_tuple)
summary_str_df = pd.DataFrame(rows_list)
s.store_objects(wikipedia_country_border_summaries_df=summary_str_df, wikipedia_country_border_edge_list=edge_list)

Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\StatsByCountry\saves\pickle\wikipedia_country_border_summaries_df.pickle
Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\StatsByCountry\saves\pickle\wikipedia_country_border_edge_list.pickle


In [28]:

import networkx as nx
import dwave_networkx as dnx
from dwave_networkx.algorithms.coloring import vertex_color
import dimod
%run ../load_magic/storage.py

# NOTE(review): this cell repeats, line for line, the earlier cell that has
# the same execution count (In [28]); one of the two can probably be removed.
# Re-create the storage helper and load the stored border edge list.
s = Storage()
edge_list = s.load_object('wikipedia_country_border_edge_list')
# G = nx.from_edgelist(edge_list, create_using=nx.Graph())
# node_dict = nx.betweenness_centrality(G, k=None, normalized=False, weight=None, endpoints=False, seed=None)
# node_dict
# NOTE(review): colors_list is never used below; the colouring call passes
# colors=7 directly.
colors_list = range(1, 32)
# Colour the graph built from the first i edges, for i = 1..31, printing each result.
for i in range(1, 32):
    try:
        G = nx.from_edgelist(edge_list[:i], create_using=nx.Graph())
        # NOTE(review): the recorded output stops after i=2 with a
        # KeyboardInterrupt -- presumably the exact solver becomes too slow
        # as nodes are added; confirm before re-running.
        color_dict = vertex_color(G, colors=7, sampler=dimod.ExactSolver())
        print(f'{i}: {color_dict}')
    except ValueError as e:
        # Report which prefix length failed, then re-raise so the loop stops.
        print(f'{i}: {str(e).strip()}')
        raise

1: {'Montenegro': 4, 'Serbia': 5}
2: {'Croatia': 0, 'Montenegro': 3, 'Serbia': 5}


KeyboardInterrupt: 

In [37]:

# wikipedia_title = 'Slovenia'
# Inspect the country with the most rows in summary_str_df: print its full
# Wikipedia summary, any stored border sentences, and its border countries.
# The frame built earlier in this notebook only has the 'wikipedia_title' and
# 'border_country' columns, so 'summary_str' is used only when it is present
# (otherwise this cell raises KeyError on a fresh run).
has_summary_column = 'summary_str' in summary_str_df.columns
count_column = 'summary_str' if has_summary_column else 'border_country'
wikipedia_title = summary_str_df.groupby(['wikipedia_title']).count().sort_values([count_column], ascending=False).head(1).index[0]
mask_series = (summary_str_df.wikipedia_title == wikipedia_title)
print(wikipedia.summary(wikipedia_title, auto_suggest=False))
print()
if has_summary_column:
    for summary_str in summary_str_df[mask_series].summary_str:
        print(summary_str)
    print()
for border_country in summary_str_df[mask_series].border_country:
    print(border_country)

India, officially the Republic of India (Hindi: Bhārat Gaṇarājya), is a country in South Asia. It is the seventh-largest country by area, the second-most populous country, and the most populous democracy in the world. Bounded by the Indian Ocean on the south, the Arabian Sea on the southwest, and the Bay of Bengal on the southeast, it shares land borders with Pakistan to the west; China, Nepal, and Bhutan to the north; and Bangladesh and Myanmar to the east. In the Indian Ocean, India is in the vicinity of Sri Lanka and the Maldives; its Andaman and Nicobar Islands share a maritime border with Thailand, Myanmar and Indonesia.
Modern humans arrived on the Indian subcontinent from Africa no later than 55,000 years ago.
Their long occupation, initially in varying forms of isolation as hunter-gatherers, has made the region highly diverse, second only to Africa in human genetic diversity. Settled life emerged on the subcontinent in the western margins of the Indus river basin 9,000 years ag