# Wiki Scraper

#### Importing all the required packages

In [1]:
import requests, time, re, json, string
from bs4 import BeautifulSoup

from multiprocessing.pool import ThreadPool
from requests.packages.urllib3.exceptions import *
from socket import gaierror
from googletrans import Translator
translator = Translator()

import csv

#### Setting up the functions to be used

In [4]:
"""
The stem url for api calls. manipulating this variable
could allow for calls referencing pages written in languages other than english.
"""
# NOTE(review): this endpoint is plain http, whereas the page fetches in this
# file use https -- confirm whether that is intentional.
API_URL = 'http://en.wikipedia.org/w/api.php'

def get_page(title):
    """
    Download the English Wikipedia article named `title` and parse it.

    The request is attempted at most twice, with a one-second pause after a
    first failure, because transient network errors occasionally occur.

    Returns a BeautifulSoup document (lxml parser), or None when `title` is
    None or both attempts raise a requests exception.
    """
    if title is None:
        return None

    address = 'https://en.wikipedia.org/wiki/' + title
    for attempt in (1, 2):
        try:
            response = requests.get(address)
        except requests.exceptions.RequestException:
            if attempt == 2:
                # second failure in a row: give up on this page
                return None
            time.sleep(1)
        else:
            return BeautifulSoup(response.content, 'lxml')
    
def get_pages(titles, num_workers=20):
    """
    Use a thread pool to download multiple pages for faster batch processing.
    Can't use the standard wikimedia api multi-title feature as it increases
    the likelihood useful data won't be returned.

    titles:      list of article titles handed one by one to get_page.
    num_workers: upper bound on the number of threads; never more threads
                 than titles are started.

    Returns the list of successfully fetched pages (failed downloads are
    dropped), or None when `titles` is not a non-empty list.
    """
    if not isinstance(titles, list) or not titles:
        return None

    pool = ThreadPool(min(num_workers, len(titles)))
    try:
        pages = pool.map(get_page, titles)
    finally:
        # always release the worker threads, even if a download raised
        pool.close()
        pool.join()
    return [page for page in pages if page]
    
def get_url(url):
    """
    Download an arbitrary `url` and parse the response body.

    The request is attempted at most twice, with a one-second pause after a
    first failure.

    Returns a BeautifulSoup document (lxml parser), or None when `url` is
    None or both attempts raise a requests exception.
    """
    if url is None:
        return None

    for attempt in (1, 2):
        try:
            response = requests.get(url)
        except requests.exceptions.RequestException:
            if attempt == 2:
                # second failure in a row: give up on this url
                return None
            time.sleep(1)
        else:
            return BeautifulSoup(response.content, 'lxml')
    
def get_urls(urls, num_workers=20):
    """
    Use a thread pool to download multiple urls for faster batch processing.

    urls:        list of urls handed one by one to get_url.
    num_workers: upper bound on the number of threads; never more threads
                 than urls are started.

    Returns the list of successfully fetched pages (failed downloads are
    dropped), or None when `urls` is not a non-empty list.
    """
    if not isinstance(urls, list) or not urls:
        return None

    pool = ThreadPool(min(num_workers, len(urls)))
    try:
        pages = pool.map(get_url, urls)
    finally:
        # always release the worker threads, even if a download raised
        pool.close()
        pool.join()
    return [page for page in pages if page]
    
def search(name):
    """
    Return the article titles found by the Wikipedia search for `name`.

    Thin public wrapper around _search_query; the result is passed through
    unchanged.
    """
    return _search_query(name)
    
def _search_query(name):
    """
    the underlying query that powers the search function.

    Runs a full-text search for `name` against the English Wikipedia search
    page and scrapes the result headings.

    Returns the list of article titles in result order. If the response
    status is not 200 the same request is retried once after half a second;
    an empty list is returned when the retry fails as well.
    """
    url = 'https://en.wikipedia.org/w/index.php'

    params = {
        'sort': 'relevance',
        'search': name,
        'profile': 'advanced',
        'fulltext': 1
    }

    for attempt in range(2):
        r = requests.get(url, params=params)
        if r.status_code == 200:
            page = BeautifulSoup(r.content, 'lxml')
            headings = page.find_all('div', {'class': 'mw-search-result-heading'})
            links = (heading.find('a') for heading in headings)
            # skip headings without a titled link instead of raising
            return [a['title'] for a in links if a is not None and a.has_attr('title')]
        if attempt == 0:
            time.sleep(0.5)

    # both attempts failed: report "no results" rather than raising
    return []


# # Get Country Data
# iso, wiki title mapping

# In[2]:


# Download the ISO 3166-1 alpha-3 article at import time and keep only the
# first <div class="plainlist"> it contains.
iso_page = requests.get('https://en.wikipedia.org/wiki/ISO_3166-1_alpha-3').content
iso_page = BeautifulSoup(iso_page, 'lxml')
iso_page = iso_page.find('div', {'class': 'plainlist'})

# Two-way lookup tables: ISO code -> article title and article title -> ISO code.
iso_title = {}
title_iso = {}

# For every list item, the text of its first <span> is taken as the code and
# the title attribute of its first <a> as the article title.
for i in iso_page.find_all('li'):
    tmp = {}
    iso = i.find('span').text
    title = i.find('a')['title']
    iso_title[iso] = title
    title_iso[title] = iso
    
# Adding in missing ISOs on wiki
iso_title['XKX'] = 'Kosovo'
title_iso['Kosovo'] = 'XKX'
    
# # Process Infobox

# In[3]:

def clean(x):
    """
    Normalise a piece of infobox text.

    Steps, in order: trim; turn non-breaking spaces into spaces; drop
    everything between the first '[' and the last ']' and between the first
    '(' and the last ')'; drop a '.' followed by non-space, non-digit
    characters up to the next whitespace or the end of the text; remove
    '- ' and bullet characters; turn newlines into ', '; collapse runs of
    whitespace into a single space.

    x: the raw text (str).
    Returns the cleaned text (str).
    """
    x = x.strip()
    x = x.replace('\xa0', ' ')
    # raw strings: the patterns contain backslash escapes meant for `re`
    x = re.sub(r'\[.*\]', '', x)
    x = re.sub(r'\(.*\)', '', x)
    x = re.sub(r'\.[^\s\d]+?(\s|$)', '', x)
    x = x.replace('- ', '').replace('•', '').replace(' \n', ', ').replace('\n', ', ').strip()
    x = re.sub(r'\s+', ' ', x)
    return x

# In[4]:

def fix_newlines(infobox):
    """
    Rewrite parts of `infobox` in place so that its text splits cleanly on
    newlines: <br> tags become newlines, images and image links are blanked
    out, and every list item is replaced by its text followed by a newline.
    """
    def _replace_all(tags, replacement):
        for tag in tags:
            tag.replace_with(replacement)

    # line breaks first, so that list-item text below already contains them
    _replace_all(infobox.find_all("br"), "\n")
    _replace_all(infobox.find_all("img"), "")
    _replace_all(infobox.find_all('a', {'class': 'image'}), "")

    for item in infobox.find_all("li"):
        item.replace_with(item.text + '\n')
        
def get_top_region(infobox, data):
    """
    Record the region shown in a <span class="region"> of the infobox
    (Australian cities store their region there).

    When such a span exists, data['region_aus'] is set to a one-element list
    holding its text, and data['region_aus_links'] to a list holding the
    href of the span's first link (empty when there is no link with an href).

    Returns `data`, which is also modified in place.
    """
    region_span = infobox.find('span', {'class': 'region'})
    if region_span:
        data['region_aus'] = [region_span.text]

        hrefs = []
        anchor = region_span.find('a')
        if anchor and anchor.has_attr('href'):
            hrefs.append(anchor['href'])
        data['region_aus_links'] = hrefs
    return data

def get_cats(infobox, data):
    """
    Copy the text of the infobox's <div class="category">, if present, into
    data['category_label'].

    Returns `data`, which is also modified in place.
    """
    category_div = infobox.find('div', {'class': 'category'})
    if not category_div:
        return data

    data['category_label'] = category_div.text
    return data

def get_infobox(page):
    """
    Flatten the rows of a page's infobox table into a dict.

    The first <table class="infobox"> of `page` is normalised in place with
    fix_newlines, the dict is seeded by get_top_region and get_cats, and then
    every <tr> that has a <th> and a non-blank <td> contributes:

      data[key]            list of cleaned value strings (split on newlines)
      data[key + '_year']  a four-digit year string taken from the key, the
                           value or the current section header, when found
      data[key + '_links'] hrefs of the links in the value cell, excluding
                           '#...' anchors, when there are any

    Keys are lower-cased, cleaned, prefixed with the current section header
    (when one is set and differs from the key) and have spaces replaced by
    underscores. After the loop data['area'][0], if present, is rewritten
    towards a km2 figure.

    page: parsed document exposing BeautifulSoup-style find / find_all.
    Returns the dict described above, or {} when the page has no infobox.
    """
    
    data = {}
    
    infobox = page.find('table', {'class': 'infobox'})
    if not infobox:
        return {}
    
    fix_newlines(infobox)
    data = get_top_region(infobox, data)
    data = get_cats(infobox, data)

    # state carried from row to row: the current section header text, the year
    # attached to that section, and whether this row just opened a section.
    section = ''
    section_year = None
    sect_change = False
    
    # iterate through each table row.
    for i in infobox.find_all('tr'):
        key = i.find('th')
        value = i.find('td')
        if key:
            
            # mergedtoprow defines sections. This is useful to keep track of years
            # that scope over the entire section, and labels.
            # Rows without any class attribute are treated as section starts too.
            if not i.has_attr('class') or (i.has_attr('class') and (i['class'][0] == 'mergedtoprow')):
                sect_change = True
                section = ''
                section_year = None
                
                # these are the only things that reliably have sections that we're interested in.
                if any(i in key.text.lower() for i in ['gdp', 'population', 'area', 
                                                       'government', 'resident', 'time']):
                    
                    # 'largest' avoid issues with New York City, which has a section
                    # starting with 'largest_borough_by_area'
                    if 'largest' not in key.text.lower():
                        section = key.text.lower()
                        
                        # years are usually in parentheses (2019)
                        # three patterns are tried in turn; the first that matches wins.
                        section_year = re.search('\((.*\s+)?(\d\d\d\d)(\s+.*)?\)', section)
                        if section_year: section_year = section_year.group(2)
                        else:
                            section_year = re.search('\((.*)(-+|\/+)?(\d\d\d\d)(\s+.*)?\)', section)
                            if section_year: section_year = section_year.group(3)
                            else:
                                section_year = re.search('\((.*\s+)?(\d\d\d\d)(-+|\/+)?(.*)?\)', section)
                                if section_year: section_year = section_year.group(2)
            
            # only rows whose value cell contains something other than
            # spaces / non-breaking spaces are recorded.
            if value and value.text.replace(' ', '').replace('\xa0', '') != '':
                                
                # get links in values
                value_links = value.find_all('a')
                if len(value_links) >= 1: 
                    value_links = [v for v in value_links if v.has_attr('href')]
                    value_links = [v['href'] for v in value_links]
                    # remove citations:
                    value_links = [v for v in value_links if not v.startswith('#')]
                else:
                    value_links = None

                # from here on key and value are plain strings, not tags.
                key, value = key.text.lower(), value.text
                
                # we don't care about rank, and it messes
                # up actors like NY state
                if sect_change and 'rank' in value.lower():
                    continue 
                    
                # Fixes Hangzhou GDP
                # a section-opening row whose value holds a year (18xx-20xx) sets the
                # section year; if the value has no letters at all the row is skipped.
                sc = re.search('((20|19|18)\d\d)', value)
                fl = re.search("[A-Za-z]", value)
                if sect_change and sc:
                    section_year = sc.group(1)
                    if not fl: continue
                
                # get any year values in the key or value text.
                # these will be associated with this key.
                key_year = re.search('\((.*\s+)?(\d\d\d\d)(\s+.*)?\)', key)
                # NOTE(review): `wh` is assigned but not read anywhere in this function.
                wh = 'key'
                if not key_year:
                    key_year = re.search('\((.*\s+)?(\d\d\d\d)(\s+.*)?\)', value)
                    wh = 'value'
                if key_year:
                    key_year = key_year.group(2)
                

                # if we have both a key year and a section year,
                # use the key year. 
                if key_year: year = key_year
                else: year = section_year
                
                # clean up the text.
                section = clean(section)
                key = clean(key)
                #selecting km2 instead of sq mi
                # for 'area' rows the value is split on '(' and the last piece that
                # mentions km2 / km² replaces the value.
                if section == 'area' and "(" in value:
                    temp = value.split("(")
                    for val in range(len(temp)):
                        if "km2" in temp[val] or "km²" in temp[val]:
                            if type(value) == str:
                                value = [v for v in temp[val].strip(")").split('\n')]
                    #if area_units is something else
                    # note: when no piece matched, `value` is still a str here and
                    # value[0] is its first character.
                    if 'km2' not in value[0] and "km²" not in value[0]:
                        value = [temp[0]]
                else:
                    value = [v for v in value.split('\n')]
                value = [clean(v) for v in value]
                #if multiple area types in value
                # NOTE(review): plain substring test -- "or" also matches inside
                # longer words; confirm this is intended.
                if "or" in value[0]:
                    temp = value[0].split("or")
                    for v in range(len(temp)):
                        if "km2" in temp[v] or "km²" in temp[v]:
                            value = [temp[v].strip(" ")]
                
                # add section to key, replace spaces.
                if section and section != key:
                    key = section + ' ' + key
                key = key.replace(' ', '_')
                key = key.replace('\xa0', '_')

                # add all this info to data dict.
                data[key] = value
                if year:
                    data[key + '_year'] = year
                if value_links: 
                    data[key + '_links'] = value_links
        # the section-start flag only applies to the row that set it.
        sect_change = False
    # post-process the plain 'area' entry towards a km2 figure.
    if 'area' in data:
        # several ';'-separated figures: keep the first one that mentions km2.
        if ";" in data['area'][0]:
            if "km2" in data['area'][0]:
                areas = data['area'][0].split(";")
                chosen = [zz for zz in areas if 'km2' in zz]
                area = chosen[0].replace('\xa0',' ')
                area = area.strip(" ")
                data['area'][0] = area
        # NOTE(review): substring test, true for any text containing "ha";
        # the leading number is divided by 100 and labelled km2, and a value
        # that cannot be parsed gets " flag" appended instead.
        if "ha" in data['area'][0]:
            try:
                area = float(data['area'][0].split(" ")[0]) / 100
                data['area'][0] = str(area) + " km2"
            except:
                data['area'][0] += " flag"
        # NOTE(review): unlike the branch above, float() is not guarded here, so
        # a leading token that is not a plain number raises ValueError.
        if "sq mi" in data['area'][0]:
            area = float(data['area'][0].split(" ")[0]) * 2.58999
            data['area'][0] = str(area) + " km2"
    
    return data

# In[6]:

def infer_type(categories, title, iso):
    """
    Guess what kind of entity a page describes from its category names.

    categories: iterable of category strings (compared case-insensitively).
    title:      page title, used only for a few hard-coded overrides.
    iso:        ISO alpha-3 code of the page's country (or None); decides
                whether 'district' / 'county' words count towards City.

    Returns one of:
      * (label, score) with label in 'City', 'Company', 'Region', 'Country',
        'University' -- ties are broken in that order;
      * the bare string 'Disambiguation';
      * None for lists, people ('births'), temples, or when nothing scores.
    """
    lowered = [category.lower() for category in categories]

    # hard-coded overrides on whole category names / titles
    if 'communities of belgium' in lowered:
        return ('Region', 10)

    forced_city_categories = ('somerset west and taunton', 'alfoz (lugo)', 'araquil')
    forced_city_titles = ('Sabce', 'Island of Mozambique')
    if title in forced_city_titles or any(c in lowered for c in forced_city_categories):
        return ('City', 10)

    # everything below works on the individual space-separated words
    words = ' '.join(lowered).split(' ')
    if 'births' in words or 'temples' in words:
        return None

    strong_city_words = ('villages', 'cities', 'towns')
    city_words = strong_city_words + (
        'city', 'municipality', 'municipalities', 'town', 'capital', 'capitals',
        'populated', 'partidos', 'geography', 'places', 'comarcas', 'communes',
        'wards', 'ward', 'puebla', 'department', 'areas')
    region_words = (
        'arrondissements', 'arrondissement', 'province', 'provinces', 'canton',
        'entity', 'region', 'regions', 'nuts', 'county', 'counties', 'states',
        'districts', 'state', 'territory', 'territories', 'emirate', 'geography',
        'voivodeship', 'gmina', 'canterbury,', 'island', 'district')
    country_words = ('country', 'countries', 'states', 'state', 'nations')
    university_words = ('universities', 'university', 'college', 'colleges', 'educational')

    district_is_city_isos = ('MUS', 'ZAF', 'TZA', 'ZMB', 'CHN', 'KOR', 'PER', 'TUR')
    county_is_city_isos = ('KOR', 'USA')

    # insertion order doubles as the tie-break priority at the end
    scores = {'City': 0, 'Company': 0, 'Region': 0, 'Country': 0, 'University': 0}

    for word in words:
        if word in ('list', 'lists'):
            return None

        if word in city_words:
            scores['City'] += 2 if word in strong_city_words else 1

        if word in region_words:
            if word in ('districts', 'district') and iso in district_is_city_isos:
                scores['City'] += 1
            elif word in ('county', 'counties') and iso in county_is_city_isos:
                scores['City'] += 1
            else:
                scores['Region'] += 1

        if word in country_words:
            scores['Country'] += 1

        if word == 'companies':
            scores['Company'] += 1

        if word in university_words:
            scores['University'] += 1

        if word == 'disambiguation':
            return 'Disambiguation'

    best = max(scores.values())
    if best == 0:
        return None
    for label, score in scores.items():
        if score == best:
            return (label, best)
    
# In[7]:


def clean_numbers(x):
    """
    Parse a number with optional unit and magnitude word out of free text.

    Thousands separators (',') are dropped, known currency / unit markers are
    removed and remembered (the last marker found wins), the magnitude words
    'thousand', 'million', 'billion', 'trillion' become a multiplier, and the
    remaining digits and dots are parsed as a float.

    x: the text to parse (str).

    Returns a (value, units) tuple:
      * (float, unit-or-None) on success,
      * (None, None) when no digits are left,
      * (digit_string, "flag") when the digits contain more than one '.'.
    """
    x = x.replace(',', '')
    units = None

    # (marker, reported unit) pairs, checked in this order; markers that are
    # prefixes of others (e.g. 'km' vs 'km2') must come after the longer one.
    markers_before_ha = (
        ('€', 'EUR'), ('¥', 'RMB'), ('CNY', 'RMB'), ('US$', 'USD'), ('$', 'USD'),
        ('/km2', '/km2'), ('km2', 'km2'), ('km²', 'km2'), ('acres', 'acres'),
    )
    markers_after_ha = (
        ('km', 'km'), ('/sq mi', '/sq mi'), ('/sqmi', '/sq mi'),
        ('sq mi', 'sq mi'),
        # 'sqmi' is stripped as written and reported like 'sq mi'
        ('sqmi', 'sq mi'),
        ('miles', 'mi'),
    )

    for marker, unit in markers_before_ha:
        if marker in x:
            units = unit
            x = x.replace(marker, '')

    # hectares: only a stand-alone 'ha', so the 'ha' inside 'thousand' is
    # neither reported as a unit nor cut out of the magnitude word.
    hectare = r'(?<![A-Za-z])ha(?![A-Za-z])'
    if re.search(hectare, x):
        units = 'ha'
        x = re.sub(hectare, '', x)

    for marker, unit in markers_after_ha:
        if marker in x:
            units = unit
            x = x.replace(marker, '')

    # 'mi' / 'm' must not be confused with the 'mi' in 'million'
    if 'mi' in x and 'million' not in x:
        units = 'mi'
        x = x.replace('mi', '')
    if 'm' in x and 'mi' not in x and 'million' not in x:
        units = 'm'
        x = x.replace('m', '')
    if 'ft' in x:
        units = 'ft'
        x = x.replace('ft', '')

    factor = 1
    if 'thousand' in x: factor = 1000
    if 'million' in x: factor = 1000000
    if 'billion' in x: factor = 1000000000
    if 'trillion' in x: factor = 1000000000000
    x = x.replace('thousand', '').replace('million', '').replace('billion', '').replace('trillion', '')

    # keep only what can be part of a decimal number
    x = ''.join(c for c in x if c in '1234567890.')
    x = x.strip('.')
    if x == '': return None, None
    if x.count(".") > 1: return x, "flag"
    return float(x) * factor, units


#### In[8]:


from collections import Counter

def process_page(page):
    """
    Turn a parsed Wikipedia article into a dict of the facts the scraper uses.

    page: parsed document exposing BeautifulSoup-style find / find_all.

    Returns None for pages whose title starts with 'File:'. Otherwise a dict
    with at least 'title', 'wiki_iso' (ISO alpha-3 code or None), 'infobox'
    (see get_infobox), 'categories' and 'wiki_type' (label from infer_type,
    'Disambiguation' or None), plus, when they can be derived:
    'lat' / 'lng', 'area' / 'area_units' / 'area_year', 'population'
    (list of [label, value] pairs) / 'population_year',
    'gdp' / 'gdp_units' / 'gdp_year' and 'elevation'.
    """
    data = {}
    
    data['title']   = page.find('h1', {'class': 'firstHeading'}).text
    if data['title'].startswith('File:'): return None
    ## print(data['title'])
    
    # get coords in decimal degrees
    latlon = page.find('span', {'class': 'geo'})
    if latlon and "," not in str(latlon.text): 
        data['lat'] = float(latlon.text.split(";")[0])
        data['lng'] = float(latlon.text.split(";")[1])
    
    
    # attempt to get a wiki_iso. 
    # first choice: the first infobox link whose title is a known country article.
    data['wiki_iso'] = None
    infobox = page.find('table', {'class': 'infobox'})
    if infobox:
        for i in infobox.find_all('a'):
            if i.has_attr('title'):
                title = i['title']
                if title in title_iso:
                    data['wiki_iso'] = title_iso[title]
                    break
                    
    
    # Fixing for some places
    if "Hong Kong" in data['title']: data['wiki_iso'] = 'HKG'
    if "Zimbabwe" in data['title']: data['wiki_iso'] = 'ZWE'
    if "Waterloo (village), New York" in data['title']: data['wiki_iso'] = 'USA'
    
        
    # second choice: a country link in the first class-less paragraph.
    if not data['wiki_iso']:
        ps = [p for p in page.find_all('p') if not p.has_attr('class')]
        if len(ps) >= 1:
            for i in ps[0].find_all('a'):
                if i.has_attr('title'):
                    title = i['title']
                    if title in title_iso:
                        data['wiki_iso'] = title_iso[title]
                        break
    
    data['infobox'] = get_infobox(page)
    
    
    # third choice: the infobox 'country' row.
    if not data['wiki_iso']:
        if 'country' in data['infobox']:
            if data['infobox']['country'][0] in title_iso:
                data['wiki_iso'] = title_iso[data['infobox']['country'][0]]
    
    # These 3 introduce a lot of noise. Indiana -> India, for example. 
    # necessary for some actors though. :(
    
    if not data['wiki_iso']:
        ps = [p for p in page.find_all('p') if not p.has_attr('class')]
        if len(ps) >= 1:
            for k, v in title_iso.items():
                if k in ps[0].text:
                    data['wiki_iso'] = v
                    break
    
    if not data['wiki_iso']:
        if infobox:
            for k, v in title_iso.items():
                if k in infobox.text:
                    data['wiki_iso'] = v
                    break
    
    if not data['wiki_iso']:
        for k, v in title_iso.items():
            if k in data['title']:
                data['wiki_iso'] = v
                break
    
    data['categories'] = []
    cat = page.find('div', {'id': 'mw-normal-catlinks'})
    if cat:
        data['categories'] = [i.text for i in cat.find_all('li')]
    if 'category_label' in data['infobox']:
        data['categories'].append(data['infobox']['category_label'])  
    data['wiki_type'] = infer_type(data['categories'], data['title'], data['wiki_iso'])
    
        
    # infer_type returns (label, score) tuples; only the label is kept.
    if type(data['wiki_type']) == tuple : data['wiki_type'] = data['wiki_type'][0]
    

    
    # area: the first infobox key that looks like an area figure.
    for k,v in data['infobox'].items():
        if 'area' in k and 'links' not in k and 'year' not in k and 'code' not in k \
            and 'coordinates' not in k and k != 'areas' and 'rank' not in k:
            data['area'] = data['infobox'][k]        
            data['area'], data['area_units'] = clean_numbers(data['area'][0])
            if k+'_year' in data['infobox']:
                data['area_year'] = data['infobox'][k+'_year']
            break  
            
    data['population'] = []

    # population: every infobox key that looks like a head count.
    for k,v in data['infobox'].items():
        if 'population' in k and 'links' not in k and 'year' not in k and 'municipality' not in k:
            if 'rank' not in k and 'density' not in k and 'percent' not in k and 'ethnic' not in k \
                and 'change' not in k and 'demonym' not in k and '_by_' not in k and 'median' not in k \
                and 'household' not in k and v[0] != '' and 'largest' not in k and 'for' not in v[0] \
                and 'language' not in k and not k.endswith('population_') and 'gender' not in k \
                and 'pop_2011–2016' not in k: 
                    if 'estimate' in k and k+'_year' in data['infobox'].keys():
                        yy = data['infobox'][k+'_year']
                        data['population'].append([yy+'_'+k,data['infobox'][k]])
                    else:
                        data['population'].append([k,data['infobox'][k]])
                    for pop in data['population']:
                        # Only raw infobox values (lists of strings) still need
                        # parsing. Entries handled on an earlier pass are floats,
                        # None or flagged strings and must be left alone -- in
                        # particular None cannot be indexed.
                        if isinstance(pop[1], list) and pop[1] and pop[1][0] != "N/A":
                            pop[1], _ = clean_numbers(pop[1][0])
                    
    if data['population']:
        k = data['population'][0][0]
        if k+'_year' in data['infobox'].keys():
            data['population_year'] = data['infobox'][k+'_year']
        # shorten the labels ('population_metro' -> 'metro', ...)
        for tem in data['population']:
            tem[0] = tem[0].replace(',','')
            tem[0] = tem[0].replace('population_', '')
            tem[0] = tem[0].replace('greater_toronto_area','metro')
            tem[0] = tem[0].replace('†_city_proper._', '')
            if "(" in tem[0]: tem[0] = tem[0].split("_")[-1]
            if 'population_year' not in data:
                # labels such as '2019_estimate' carry the year as their prefix
                if 'census' in tem[0]:
                    data['population_year'] = tem[0].split("_")[0]
                elif 'estimate' in tem[0]:
                    data['population_year'] = tem[0].split("_")[0]
        #data['population'][0][0] = 'population'
    else:
        del data['population']
    
    # gdp: the first infobox key that looks like a GDP figure.
    for k,v in data['infobox'].items():
        if 'gdp' in k and 'links' not in k and 'year' not in k and 'hdi' not in k:
            data['gdp'] = data['infobox'][k]
            # when both currencies are given, keep the USD figure
            if 'USD' in data['gdp'][0] and 'CNY' in data['gdp'][0]:
                data['gdp'] = ["USD" + data['gdp'][0].split("USD")[1]]
            data['gdp'], data['gdp_units'] = clean_numbers(data['gdp'][0])
            if k+'_year' in data['infobox']:
                data['gdp_year'] = data['infobox'][k+'_year']
            break
                
    for k,v in data['infobox'].items():
        if 'elevation' in k and 'links' not in k and 'year' not in k:
            data['elevation'] = data['infobox'][k][0]
            break
        
    
    return data


 
# In[9]:


def get_region_hierarchy(data):
    """
    Collect the administrative-region rows of data['infobox'].

    Every infobox key that contains one of the region words (and none of the
    ignored fragments) contributes one [key, value] pair per usable value;
    scanning a key's values stops at the first one that contains a digit or
    a comma or starts with "list".

    Returns the list of [key, value] pairs in infobox order, reversed when
    the last pair is a country / sovereign state.
    """
    region_words = ['region', 'state', 'province', 'county', 'emirate', 'gmina', 'island',
                    'district', 'raion', 'prefecture', 'territory', 'canton',
                    'governorate', 'administrative_region', 'mkhare', 'metropolitan_city',
                    'oblast', 'autonomous_region', 'commune', 'entity', 'sovereign_state',
                    'municipality', 'provinces', 'marz', 'republic', 'cercle',
                    'autonomous_community', 'provincekind', 'governorates', 'comarca',
                    'locale', 'subdivision', 'voblast', 'voivodeship', 'parish',
                    'constituent_country', 'ceremonial_county', 'arrondissement', 'admin', 'country']

    # keys containing any of these are never region rows
    ignored_fragments = ('_links', '_year', 'postcode', 'area_', 'population_', 'gdp_',
                         'congressional_', 'government_', 'highest_', '_electorate', 'nuts_',
                         'historical_', '_bird', '_fish', '_flower', '_stone', 'animal',
                         'language', 'historic_', "_specialized", "judicial_", "romanisation")

    hierarchy = []
    for key, values in data['infobox'].items():
        if any(fragment in key for fragment in ignored_fragments):
            continue
        if key.startswith("sub") and key.endswith('s'):
            continue
        if not any(word in key for word in region_words):
            continue

        for entry in values:
            looks_wrong = (any(ch.isdigit() for ch in entry)
                           or entry.lower().startswith("list")
                           or "," in entry)
            if looks_wrong:
                break
            if entry not in ('', 'n'):
                hierarchy.append([key, entry])

    if hierarchy:
        last_key = hierarchy[-1][0]
        if 'country' in last_key or 'sovereign_state' in last_key:
            return hierarchy[::-1]

    return hierarchy

# In[10]:


def handle_disambig(name, iso, entity_type, disambig, debug=False):
    """
    Resolve a disambiguation page to the entry matching `entity_type` and `iso`.

    Each list item of the disambiguation page is followed through its first
    titled link; the first linked page whose inferred type and ISO code both
    match is returned.

    name:        the search name (unused here; kept for the call signature).
    iso:         ISO alpha-3 code the result must have.
    entity_type: wiki_type the result must have (e.g. 'City').
    disambig:    processed disambiguation page; only its 'title' is read.
    debug:       print progress information.

    Returns the processed page dict with 'region_hierarchy' added, the
    'infobox' removed and 'categories' joined into one '; '-separated string,
    or {} when nothing matches or the pages cannot be downloaded.
    """
    page = get_page(disambig['title'])
    if debug: print('Disambiguating')
    if page is None:
        # download failed: nothing to disambiguate
        return {}

    listing = page.find('div', {'class': 'mw-parser-output'})
    if not listing:
        return {}

    for item in listing.find_all('li'):
        link = item.find('a')
        if not (link and link.has_attr('title')):
            continue

        candidate_page = get_page(link['title'])
        if candidate_page is None:
            # linked page could not be fetched; try the next entry
            continue
        data = process_page(candidate_page)
        if not data:
            # process_page returns None for 'File:' pages
            continue

        if debug: print(data['title'], data['wiki_type'], data['wiki_iso'])
        if data['wiki_type'] == entity_type and data['wiki_iso'] == iso:
            data['region_hierarchy'] = get_region_hierarchy(data)
            if data['infobox'] == {} or data['infobox'] == None:
                if debug: print('no infobox')

            del data['infobox']
            data['categories'] = '; '.join(data['categories'])
            return data
    return {}


# In[11]:


def get_data(name, iso, entity_type, check_country=True, debug=False, check_disambig=True):
    """
    Look up `name` on Wikipedia and return the best matching page's data.

    name:           the place / actor name to search for.
    iso:            ISO alpha-3 code the result must have; a falsy value
                    disables the country filter.
    entity_type:    wiki_type the result must have (e.g. 'City').
    check_country:  when nothing matches, retry once with the country name
                    appended to the search.
    debug:          print progress information.
    check_disambig: when nothing matches, try to resolve a disambiguation
                    page found among the results.

    Returns the processed page dict of the first search result that matches
    both filters, with 'region_hierarchy' added, 'infobox' removed and
    'categories' joined into one '; '-separated string; {} when nothing is
    found.
    """
    for fragment, replacement in (('!', ''),
                                  ('Aggregazione ', ''),
                                  ('Intercommunality of ', ''),
                                  ('Ilha de', "Island of")):
        name = name.replace(fragment, replacement)

    # None for an unknown / missing iso instead of a KeyError
    country = iso_title.get(iso)

    if "," in name or 'State' in name or (country and country in name) or name in ['Tlokwe']:
        if iso == 'JPN':
            if "Town" in name: name = name.replace("Town", '')
            if "City" in name: name = name.split(" City")[0]
        titles = search(name)
    else:
        if name.startswith('Nasu'): name = name.replace("-", '').title()
        titles = search(name + ", " + country) if country else search(name)

    if not check_country and debug: print(titles)
    pages = get_pages(titles)
    if not pages:
        if debug: print('pages null')
        return {}

    # process_page returns None for 'File:' pages; those can never match
    pages = [processed for processed in map(process_page, pages) if processed]

    if debug:
        for p in pages: print(p)

    if not pages:
        return {}

    disambig = next((p for p in pages if p['wiki_type'] == 'Disambiguation'), None)

    # filter iso, then entity_type
    candidates = [p for p in pages if p.get('wiki_iso') == iso] if iso else pages
    candidates = [p for p in candidates if p['wiki_type'] == entity_type]

    if not candidates:
        # Single fallback path: a disambiguation page first, then one retry
        # with the country name appended.
        name = name.lower().replace('dimos', 'municipality')
        if disambig and check_disambig:
            resolved = handle_disambig(name, iso, entity_type, disambig, debug=debug)
            if resolved:
                return resolved
        if check_country and country:
            if debug: print('\nchecking country')
            name = name.replace('d\'', '')
            return get_data(name + ", " + country, iso, entity_type,
                            check_country=False, debug=debug,
                            check_disambig=check_disambig)
        return {}

    # search results are ordered by relevance: take the first match
    data = candidates[0]
    data['region_hierarchy'] = get_region_hierarchy(data)

    if data['infobox'] == {} or data['infobox'] == None:
        if debug: print('no infobox')

    del data['infobox']
    data['categories'] = '; '.join(data['categories'])

    return data


In [5]:
# Testing cell
# Runs the full lookup for one known city; get_data searches Wikipedia and
# downloads the result pages, so this needs network access.

get_data("Ann Arbor", "USA", "City")

{'title': 'Ann Arbor, Michigan',
 'lat': 42.28139,
 'lng': -83.74833,
 'wiki_iso': 'USA',
 'categories': 'Ann Arbor, Michigan; Populated places established in 1824; Academic enclaves; County seats in Michigan; Cities in Washtenaw County, Michigan; Metro Detroit; 1824 establishments in Michigan Territory; University towns in the United States; City',
 'wiki_type': 'City',
 'area': 74.56,
 'area_units': 'km2',
 'population': [['city', 113934.0],
  ['2019_estimate', 119980.0],
  ['urban', 306022.0],
  ['metro', 344791.0]],
 'population_year': '2010',
 'elevation': '840 ft',
 'region_hierarchy': [['country', 'United States'],
  ['state', 'Michigan'],
  ['county', 'Washtenaw']]}

#### Importing the Region Dictionary

In [6]:
# Build reg_dict from reg_dict.csv: second column -> third column of every
# data row. newline='' is what the csv module expects of files it reads.
with open('reg_dict.csv', newline='') as x:
    reg_dict = {}
    reader = csv.reader(x)
    # skip the header row; the default avoids StopIteration on an empty file
    skip = next(reader, None)
    for row in reader:
        reg_dict[row[1]] = row[2]

# Past Usages of Scraper



## Adding _Latitude & Longitude, Admin_1_ and _Population Information_ from Wikipedia and Flagging Rows with Possible Errors

In [7]:
# Interactive cell: reads a CSV with 'name', 'iso' and 'entity_type' columns,
# adds the wiki_* / admin_1 columns from get_data, optionally flags rows whose
# Wikipedia title shares no word with the input name, and optionally exports
# the result next to the input file as <name>_updated.csv.
filename = input("What is your file name? ") #latlngscrape.csv

with open(filename, encoding = 'utf-8-sig', newline='') as x:
    reader = csv.DictReader(x)
    con = [dict(rows) for rows in reader]

# indexes of the rows for which the lookup raised
errors = []

for a, x in enumerate(con):
    if a % 6 == 0: print((str(round(a/len(con) * 100,2)) + "% completed"))
    name = x['name']
    iso = x['iso']
    et = x['entity_type']
    x['wiki_name'] = 'NA'
    x['wiki_lat'] = 'NA'
    x['wiki_lng'] = 'NA'
    x['wiki_pop'] = 'NA'
    x['wiki_pop_year'] = 'NA'
    x['wiki_state'] = 'NA'
    x['admin_1'] = 'NA'
    try:
        info = get_data(name, iso, et)
        if info:   
            if 'lat' in info:
                x['wiki_lat'] = info['lat']
                x['wiki_lng'] = info['lng']
            x['wiki_name'] = info['title']
            if 'population' in info:
                x['wiki_pop'] = info['population']
                if 'population_year' in info: x['wiki_pop_year'] = info['population_year']
            if info['region_hierarchy']:
                x['admin_1'] = info['region_hierarchy'][-1]
                for reg in info['region_hierarchy']:
                    if reg[0] == 'state':
                        x['wiki_state'] = reg[1]
            
    except Exception:
        # best effort: remember the row and carry on, but let Ctrl-C /
        # SystemExit through so a long run can still be interrupted.
        errors.append(a)
        print(errors)

print()
d1 = input("Would you like to flag possible rows with errors? (Y/N) ")
if d1.lower() == 'y':
    for x in con:
        # a row is fine (flag 0) when the wiki title shares a word with the name
        name = x['name'].replace(",", " ").replace("San", "").split()
        wname = x['wiki_name'].replace(",", " ").split()
        x['flag'] = 0 if any(word in name for word in wname) else 1
    
    a = sum(row['flag'] for row in con)
                
    print("%s rows have been flagged!" % a) 
    print()

d2 = input("Would you like to export the file? (Y/N) " )

if d2.lower() == "y":
    # str.strip(".csv") would strip the *characters* . c s v from both ends
    # (e.g. "cities.csv" -> "itie"); remove only a trailing ".csv" suffix.
    base = filename[:-len(".csv")] if filename.lower().endswith(".csv") else filename
    with open(base + "_updated.csv", 'w', newline='', encoding = 'utf-8-sig') as csvfile:
        spamwriter = csv.writer(csvfile, delimiter=',')
        if con:
            header = list(con[0].keys())
            spamwriter.writerow(header)
            spamwriter.writerows(list(row.values()) for row in con)
    print()
    print("File has been updated!")
    print("Rows with errors: " + str(errors))
    
else:
    print()
    print("Rows with errors: " + str(errors))

What is your file name? Current_ICLEI_Members_list_scrape.csv
0.0% completed
1.86% completed
3.73% completed
5.59% completed
7.45% completed
9.32% completed
11.18% completed
13.04% completed
14.91% completed
16.77% completed
18.63% completed
20.5% completed
22.36% completed
24.22% completed
26.09% completed
27.95% completed
29.81% completed
31.68% completed
33.54% completed
35.4% completed
37.27% completed
39.13% completed
40.99% completed
42.86% completed
44.72% completed
46.58% completed
48.45% completed
50.31% completed
52.17% completed
54.04% completed
55.9% completed
57.76% completed
59.63% completed
61.49% completed
63.35% completed
65.22% completed
67.08% completed
68.94% completed
70.81% completed
72.67% completed
74.53% completed
76.4% completed
78.26% completed
80.12% completed
81.99% completed
83.85% completed
85.71% completed
87.58% completed
89.44% completed
91.3% completed
93.17% completed
95.03% completed
96.89% completed
98.76% completed

Would you like to flag possible

## Adding _Latitude & Longitude Information_ from Wikipedia and Flagging Rows with Possible Errors

In [None]:
filename = input("What is your file name? ") #latlngscrape.csv

# Load the whole CSV as a list of plain dicts (one per row).
with open(filename, encoding = 'utf-8-sig') as csv_in:
    con = [dict(rows) for rows in csv.DictReader(csv_in)]

# 0-based positions (in file order) of the rows whose lookup raised.
errors = []

for a, row in enumerate(con):
    if a % 6 == 0: print((str(round(a/len(con) * 100,2)) + "% completed"))
    name = row['name']
    iso = row['iso']
    et = row['entity_type']
    # Defaults first, so every row has the same columns in the same order.
    row['wiki_name'] = 'NA'
    row['wiki_lat'] = 'NA'
    row['wiki_lng'] = 'NA'
    try:
        info = get_data(name, iso, et)
        if info:
            if 'lat' in info:
                row['wiki_lat'] = info['lat']
                row['wiki_lng'] = info['lng']
            row['wiki_name'] = info['title']
    except Exception:
        # Best effort: remember the failing row and carry on. `Exception`
        # (not a bare except) so an interrupt can still stop a long run.
        errors.append(a)
        print(errors)

print()
d1 = input("Would you like to flag possible rows with errors? (Y/N) ")
if d1.lower() == 'y':
    for row in con:
        # Flag (1) unless some word of the Wikipedia title also appears as a
        # word of the source name ("San" removed from the source name).
        name_words = row['name'].replace(",", " ").replace("San", "").split()
        wiki_words = row['wiki_name'].replace(",", " ").split()
        row['flag'] = 0 if any(word in name_words for word in wiki_words) else 1

    a = sum(row['flag'] for row in con)

    print("%s rows have been flagged!" % a)
    print()

d2 = input("Would you like to export the file? (Y/N) " )

if d2.lower() == "y":
    # str.strip(".csv") removes any of the characters . c s v from BOTH ends
    # (e.g. "scores.csv" -> "ore"); drop only a trailing ".csv" suffix instead.
    base = filename[:-len(".csv")] if filename.endswith(".csv") else filename
    with open(base + "_updated.csv", 'w', newline='', encoding = 'utf-8-sig') as csvfile:
        spamwriter = csv.writer(csvfile, delimiter=',')
        spamwriter.writerow(list(con[0].keys()))
        spamwriter.writerows(list(row.values()) for row in con)
    print()
    print("File has been updated!")
    print("Rows with errors: " + str(errors))

else:
    print()
    print("Rows with errors: " + str(errors))

## Adding _Population, Latitude & Longitude Information_ from Wikipedia and Flagging Rows with Possible Errors

In [None]:
filename = input("What is your file name? ") #errorneous_population_year.csv

# Load the whole CSV as a list of plain dicts (one per row).
with open(filename, encoding = 'utf-8-sig') as csv_in:
    con = [dict(rows) for rows in csv.DictReader(csv_in)]

# 0-based positions (in file order) of the rows whose lookup raised.
errors = []

for a, row in enumerate(con):
    if a % 5 == 0: print((str(round(a/len(con) * 100,2)) + "% completed"))
    name = row['name']
    iso = row['iso']
    et = row['entity_type']
    # Defaults first, so every row has the same columns in the same order.
    row['wiki_name'] = 'NA'
    row['wiki_lat'] = 'NA'
    row['wiki_lng'] = 'NA'
    row['wiki_pop'] = 'NA'
    row['wiki_pop_year'] = 'NA'
    try:
        info = get_data(name, iso, et)
        if info:
            if 'lat' in info:
                row['wiki_lat'] = info['lat']
                row['wiki_lng'] = info['lng']
            row['wiki_name'] = info['title']
            if 'population' in info:
                row['wiki_pop'] = info['population']
                if 'population_year' in info: row['wiki_pop_year'] = info['population_year']
    except Exception:
        # Best effort: remember the failing row and carry on. `Exception`
        # (not a bare except) so an interrupt can still stop a long run.
        errors.append(a)
        print(errors)

print()
d1 = input("Would you like to flag possible rows with errors? (Y/N) ")
if d1.lower() == 'y':
    for row in con:
        # Flag (1) unless some word of the Wikipedia title also appears as a
        # word of the source name ("San" removed from the source name).
        name_words = row['name'].replace(",", " ").replace("San", "").split()
        wiki_words = row['wiki_name'].replace(",", " ").split()
        row['flag'] = 0 if any(word in name_words for word in wiki_words) else 1

    # Second pass: a name match is still flagged when the file's own
    # coordinates and Wikipedia's differ by more than 0.1 degrees on either axis.
    for row in con:
        if row['flag'] == 0 and row['lat'] != 'NA' and row['wiki_lat'] != 'NA':
            if abs(float(row['lat'])-float(row['wiki_lat'])) > 0.1 \
                or abs(float(row['lng'])-float(row['wiki_lng'])) > 0.1:
                    row['flag'] = 1

    a = sum(row['flag'] for row in con)

    print("%s rows have been flagged!" % a)
    print()

d2 = input("Would you like to export the file? (Y/N) " )

if d2.lower() == "y":
    # str.strip(".csv") removes any of the characters . c s v from BOTH ends
    # (e.g. "scores.csv" -> "ore"); drop only a trailing ".csv" suffix instead.
    base = filename[:-len(".csv")] if filename.endswith(".csv") else filename
    with open(base + "_updated.csv", 'w', newline='', encoding = 'utf-8-sig') as csvfile:
        spamwriter = csv.writer(csvfile, delimiter=',')
        spamwriter.writerow(list(con[0].keys()))
        spamwriter.writerows(list(row.values()) for row in con)
    print()
    print("File has been updated!")
    print("Rows with errors: " + str(errors))

else:
    print()
    print("Rows with errors: " + str(errors))

## Adding _Population, Latitude & Longitude Information_ from Wikipedia and Flagging Rows with Possible Errors

In [None]:
filename = input("What is your file name? ") #population_scraping.csv

# Load the whole CSV as a list of plain dicts (one per row).
with open(filename, encoding = 'utf-8-sig') as csv_in:
    con = [dict(rows) for rows in csv.DictReader(csv_in)]

# 0-based positions (in file order) of the rows whose lookup raised.
errors = []

for a, row in enumerate(con):
    if a % 8 == 0: print((str(round(a/len(con) * 100,2)) + "% completed"))
    name = row['name']
    iso = row['iso']
    et = row['entity_type']
    # Defaults first, so every row has the same columns in the same order.
    row['wiki_name'] = 'NA'
    row['wiki_lat'] = 'NA'
    row['wiki_lng'] = 'NA'
    row['wiki_pop'] = 'NA'
    row['wiki_pop_year'] = 'NA'
    try:
        info = get_data(name, iso, et)
        if info:
            if 'lat' in info:
                row['wiki_lat'] = info['lat']
                row['wiki_lng'] = info['lng']
            row['wiki_name'] = info['title']
            if 'population' in info:
                row['wiki_pop'] = info['population']
                if 'population_year' in info: row['wiki_pop_year'] = info['population_year']
    except Exception:
        # Best effort: remember the failing row and carry on. `Exception`
        # (not a bare except) so an interrupt can still stop a long run.
        errors.append(a)
        print(errors)

print()
d1 = input("Would you like to flag possible rows with errors? (Y/N) ")
if d1.lower() == 'y':
    for row in con:
        # Flag (1) unless some word of the Wikipedia title also appears as a
        # word of the source name ("San" removed from the source name).
        name_words = row['name'].replace(",", " ").replace("San", "").split()
        wiki_words = row['wiki_name'].replace(",", " ").split()
        row['flag'] = 0 if any(word in name_words for word in wiki_words) else 1

    # Second pass: a name match is still flagged when the file's own
    # coordinates and Wikipedia's differ by more than 0.1 degrees on either axis.
    for row in con:
        if row['flag'] == 0 and row['lat'] != 'NA' and row['wiki_lat'] != 'NA':
            if abs(float(row['lat'])-float(row['wiki_lat'])) > 0.1 \
                or abs(float(row['lng'])-float(row['wiki_lng'])) > 0.1:
                    row['flag'] = 1

    a = sum(row['flag'] for row in con)

    print("%s rows have been flagged!" % a)
    print()

d2 = input("Would you like to export the file? (Y/N) " )

if d2.lower() == "y":
    # str.strip(".csv") removes any of the characters . c s v from BOTH ends
    # (e.g. "scores.csv" -> "ore"); drop only a trailing ".csv" suffix instead.
    base = filename[:-len(".csv")] if filename.endswith(".csv") else filename
    with open(base + "_updated.csv", 'w', newline='', encoding = 'utf-8-sig') as csvfile:
        spamwriter = csv.writer(csvfile, delimiter=',')
        spamwriter.writerow(list(con[0].keys()))
        spamwriter.writerows(list(row.values()) for row in con)
    print()
    print("File has been updated!")
    print("Rows with errors: " + str(errors))

else:
    print()
    print("Rows with errors: " + str(errors))

## Adding _Region Information_ from Wikipedia

In [None]:
filename = input("What is your file name? ") #subnational_contextuals_database_August2020.csv

# Load the whole CSV as a list of plain dicts (one per row).
with open(filename, encoding = 'utf-8-sig') as csv_in:
    con = [dict(rows) for rows in csv.DictReader(csv_in)]

# 0-based positions (in file order) of the rows whose lookup raised.
errors = []

for a, row in enumerate(con):
    if a % 14 == 0: print((str(round(a/len(con) * 100,2)) + "% completed"))
    name = row['name']
    iso = row['iso']
    et = row['entity_type']
    # Defaults first, so every row has the same columns in the same order.
    row['wiki_name'] = 'NA'
    row['region_hierarchy'] = 'NA'
    row['region_name'] = 'NA'
    row['admin_1'] = 'NA'
    try:
        info = get_data(name, iso, et)
        if info:
            if 'lat' in info:
                lat = info['lat']
                lng = info['lng']
                # Reject the match (leave the row at 'NA') when Wikipedia's
                # coordinates are more than 0.5 degrees away from the file's.
                if float(row['lat'])-0.5 > float(lat) or float(lat) > float(row['lat'])+0.5 \
                        or float(row['lng'])-0.5 > float(lng) or float(lng) > float(row['lng'])+0.5:
                    continue
            row['wiki_name'] = info['title']
            if info['region_hierarchy']:
                row['region_hierarchy'] = info['region_hierarchy']
                row['admin_1'] = info['region_hierarchy'][-1]
                if iso in reg_dict:
                    # Keep the hierarchy entry whose first element occurs in
                    # this country's reg_dict value (the last such entry wins).
                    for reg in info['region_hierarchy']:
                        if reg[0] in reg_dict[iso].lower():
                            row['region_name'] = reg
                else:
                    row['region_name'] = row['admin_1']
    except Exception:
        # Best effort: remember the failing row and carry on. `Exception`
        # (not a bare except) so an interrupt can still stop a long run.
        errors.append(a)
        print(errors)

print()
decision = input("Would you like to export the file? (Y/N) " )

if decision.lower() == "y":
    # str.strip(".csv") removes any of the characters . c s v from BOTH ends
    # (this cell's example file would lose its leading "s"); drop only a
    # trailing ".csv" suffix instead.
    base = filename[:-len(".csv")] if filename.endswith(".csv") else filename
    with open(base + "_updated.csv", 'w', newline='', encoding = 'utf-8-sig') as csvfile:
        spamwriter = csv.writer(csvfile, delimiter=',')
        spamwriter.writerow(list(con[0].keys()))
        spamwriter.writerows(list(row.values()) for row in con)
    print()
    print("File has been updated!")
    print("Rows with errors: " + str(errors))

else:
    print()
    print("Rows with errors: " + str(errors))

## Updating _Area, Population, GDP_ NA Values & Adding _Latitude, Longitude, Elevation and Region Information_ from Wikipedia

In [None]:
filename = input("What is your file name? ") #contextuals_database_10June20_zy.csv

# Load the whole CSV as a list of plain dicts (one per row).
with open(filename, encoding = 'utf-8-sig') as csv_in:
    con = [dict(rows) for rows in csv.DictReader(csv_in)]

# 0-based positions (in file order) of the rows whose lookup raised.
errors = []

for a, row in enumerate(con):
    if a % 14 == 0: print((str(round(a/len(con) * 100,2)) + "% completed"))
    name = row['name']
    iso = row['iso']
    et = row['entity_type']
    # Defaults first, so every row has the same new columns in the same order.
    row['wiki_name'] = "NA"
    row['wiki_lat'] = "NA"
    row['wiki_lng'] = "NA"
    row['elevation'] = "NA"
    row['region_hierarchy'] = 'NA'
    row['region_name'] = 'NA'
    row['admin_1'] = 'NA'
    try:
        info = get_data(name, iso, et)
        if info:
            # Existing area / population / gdp values are only filled in when
            # the file has "NA". The companion unit/year keys are optional, so
            # read them with .get(): indexing a missing one used to raise and
            # throw away every remaining field of the row.
            if row['area'] == "NA":
                if 'area' in info:
                    row['area'] = info['area']
                    if info.get('area_units'): row['area_units'] = info['area_units']
            if row['population'] == "NA":
                if 'population' in info:
                    row['population'] = info['population']
                    if info.get('population_year'): row['population_year'] = info['population_year']
            if row['gdp'] == "NA":
                if 'gdp' in info:
                    row['gdp'] = info['gdp']
                    # NOTE(review): process_page_2 in this file stores the unit
                    # under 'gdp_units'; presumably get_data's pages do too, so
                    # accept either spelling - confirm against process_page.
                    gdp_unit = info.get('gdp_unit') or info.get('gdp_units')
                    if gdp_unit: row['gdp_unit'] = gdp_unit
                    if info.get('gdp_year'): row['gdp_year'] = info['gdp_year']
            row['wiki_name'] = info['title']
            if "lat" in info:
                row['wiki_lat'] = info['lat']
                row['wiki_lng'] = info['lng']
            if 'elevation' in info:
                row['elevation'] = info['elevation']
            if info['region_hierarchy']:
                row['region_hierarchy'] = info['region_hierarchy']
                row['admin_1'] = info['region_hierarchy'][-1]
                if iso in reg_dict:
                    # Keep the hierarchy entry whose first element occurs in
                    # this country's reg_dict value (the last such entry wins).
                    for reg in info['region_hierarchy']:
                        if reg[0] in reg_dict[iso].lower():
                            row['region_name'] = reg
                else:
                    row['region_name'] = row['admin_1']
    except Exception:
        # Best effort: remember the failing row and carry on. `Exception`
        # (not a bare except) so an interrupt can still stop a long run.
        errors.append(a)
        print(errors)


decision = input("Would you like to export the file? (Y/N) " )

if decision.lower() == "y":
    # str.strip(".csv") removes any of the characters . c s v from BOTH ends
    # (this cell's example file would lose its leading "c"); drop only a
    # trailing ".csv" suffix instead.
    base = filename[:-len(".csv")] if filename.endswith(".csv") else filename
    with open(base + "_updated.csv", 'w', newline='', encoding = 'utf-8-sig') as csvfile:
        spamwriter = csv.writer(csvfile, delimiter=',')
        spamwriter.writerow(list(con[0].keys()))
        spamwriter.writerows(list(row.values()) for row in con)
    print("File has been updated!")
    print("Rows with errors: " + str(errors))

else:
    print("Rows with errors: " + str(errors))

## Scraper for other Languages (Work In Progress)
#### Includes: Spanish, Ukrainian, Hungarian

In [None]:
def get_page_2(title, iso):
    """
    Download one article from the Wikipedia edition that matches `iso`.

    title: article title appended to '/wiki/'; None yields None.
    iso:   "ESP", "UKR" or "HUN" (mapped to the es / uk / hu editions).

    Returns the page parsed with BeautifulSoup, or None when the title is
    None, the iso is not supported, or the request fails twice.

    Side effect: stores the language code in the module-level `lan`, which
    get_infobox_2 and process_page_2 read.
    """
    global lan

    if title is None:
        return None

    # An unsupported iso used to fall through with whatever `lan` the previous
    # call left behind (or a NameError on the first call); refuse it instead.
    wiki_lang = {"ESP": 'es', "UKR": 'uk', "HUN": 'hu'}.get(iso)
    if wiki_lang is None:
        return None
    lan = wiki_lang

    url = 'https://' + lan + '.wikipedia.org/wiki/' + title
    # retry page processing as sometimes network errors
    # can occur. Most of the time an exception will not be thrown.
    for attempt in range(2):
        try:
            page = requests.get(url)
        except requests.exceptions.RequestException:
            if attempt == 1:
                return None
            time.sleep(1)
        else:
            return BeautifulSoup(page.content, 'lxml')
    return None
    
def get_pages_2(titles, iso):
    """
    Fetch every title in `titles` one after another with get_page_2 and
    return the pages that came back truthy. Anything other than a non-empty
    list yields None.
    """
    if type(titles) is not list or len(titles) == 0:
        return None

    fetched = (get_page_2(title, iso) for title in titles)
    return [pg for pg in fetched if pg]
    
def search_2(name, iso):
    """
    Search the Wikipedia edition selected by `iso` for `name` and return
    whatever _search_query_2 returns (a list of matching page titles, or
    None for an unsupported iso).
    """
    return _search_query_2(name, iso)
    
def _search_query_2(name, iso):
    """
    the underlying api query that powers the search function.
    """
    if iso == "ESP": lan = 'es'
    elif iso == "UKR": lan = 'uk'
    elif iso == "HUN": lan = 'hu'
    else: return None
    
    url = 'https://' + lan + '.wikipedia.org/w/index.php'
    API_URL = 'https://' + lan + '.wikipedia.org/w/api.php'
    
    params = {
        'sort': 'relevance',
        'search': name,
        'profile': 'advanced',
        'fulltext': 1
    }

    r = requests.get(url, params=params)
    if r.status_code == 200:
        page = BeautifulSoup(r.content, 'lxml')
        titles = page.find_all('div', {'class': 'mw-search-result-heading'})
        titles = [t.find('a')['title'] for t in titles]
        return titles
    else:
        time.sleep(0.5)
        r = requests.get(API_URL, params=params)
        return titles

def get_infobox_2(page):
    """
    Scrape the first `table.infobox` of a parsed page into a dict and
    translate its keys and values with the module-level `translator`.

    page: parsed page object exposing `.find` (as returned by get_page_2).

    Returns {} when the page has no infobox. Otherwise returns a dict that
    maps each translated key to a list of translated value strings; besides
    the row keys this includes the '<key>_year' and '<key>_links' entries
    built below, plus whatever get_top_region / get_cats (defined outside
    this block) put into `data`.

    Reads the module-level `lan` as the source language for value
    translation.
    """
    
    data = {}
    
    infobox = page.find('table', {'class': 'infobox'})
    if not infobox:
        return {}

    
    fix_newlines(infobox)
    data = get_top_region(infobox, data)
    data = get_cats(infobox, data)
    
    # Running state while walking the rows: the current section label, the
    # year attached to that section, and whether this row opened a new section.
    section = ''
    section_year = None
    sect_change = False
    
    # iterate through each table row.
    for i in infobox.find_all('tr'):
        key = i.find('th')
        value = i.find('td')
        if key:
            
            # mergedtoprow defines sections. This is useful to keep track of years
            # that scope over the entire section, and labels.
            if not i.has_attr('class') or (i.has_attr('class') and (i['class'][0] == 'mergedtoprow')):
                sect_change = True
                section = ''
                section_year = None
                
                # these are the only things that reliably have sections that we're interested in.
                if any(i in key.text.lower() for i in ['gdp', 'population', 'area', 'surface',
                                                       'government', 'resident', 'time']):
                    
                    # 'largest' avoid issues with New York City, which has a section
                    # starting with 'largest_borough_by_area'
                    if 'largest' not in key.text.lower():
                        section = key.text.lower()
                        
                        # years are usually in parentheses (2019)
                        # Three patterns are tried in turn; the first that
                        # matches supplies the four-digit year.
                        section_year = re.search('\((.*\s+)?(\d\d\d\d)(\s+.*)?\)', section)
                        if section_year: section_year = section_year.group(2)
                        else:
                            section_year = re.search('\((.*)(-+|\/+)?(\d\d\d\d)(\s+.*)?\)', section)
                            if section_year: section_year = section_year.group(3)
                            else:
                                section_year = re.search('\((.*\s+)?(\d\d\d\d)(-+|\/+)?(.*)?\)', section)
                                if section_year: section_year = section_year.group(2)
            
            # Only rows whose value cell has visible text are recorded.
            if value and value.text.replace(' ', '').replace('\xa0', '') != '':
                                
                # get links in values
                value_links = value.find_all('a')
                if len(value_links) >= 1: 
                    value_links = [v for v in value_links if v.has_attr('href')]
                    value_links = [v['href'] for v in value_links]
                    # remove citations:
                    value_links = [v for v in value_links if not v.startswith('#')]
                else:
                    value_links = None

                # From here on `key` and `value` are plain strings, not tags.
                key, value = key.text.lower(), value.text
                
                # we don't care about rank, and it messes
                # up actors like NY state
                if sect_change and 'rank' in value.lower():
                    continue 
                    
                # Fixes Hangzhou GDP
                # A year in a section-opening row's value becomes the section
                # year; if that value has no ASCII letters the row is skipped.
                sc = re.search('((20|19|18)\d\d)', value)
                fl = re.search("[A-Za-z]", value)
                if sect_change and sc:
                    section_year = sc.group(1)
                    if not fl: continue
                
                # get any year values in the key or value text.
                # these will be associated with this key.
                key_year = re.search('\((.*\s+)?(\d\d\d\d)(\s+.*)?\)', key)
                wh = 'key'
                if not key_year:
                    key_year = re.search('\((.*\s+)?(\d\d\d\d)(\s+.*)?\)', value)
                    wh = 'value'
                if key_year:
                    key_year = [key_year.group(2)]
                

                # if we have both a key year and a section year,
                # use the key year. 
                # NOTE(review): key_year is a one-element list while
                # section_year is a plain string, so `year` has two shapes.
                if key_year: year = key_year
                else: year = section_year
                
                # clean up the text.
                section = clean(section)
                key = clean(key)
                #selecting km2 instead of sq mi
                # NOTE(review): `and` binds tighter than `or`, so this reads
                # as: section == 'area' OR (section == 'surface' AND "(" in
                # value). Presumably the "(" test was meant for both - confirm.
                if section == 'area' or section == 'surface' and "(" in value:
                    temp = value.split("(")
                    for val in range(len(temp)):
                        if "km2" in temp[val] or "km²" in temp[val]:
                            # `value` becomes a list on the first hit, so only
                            # the first km2 chunk is taken.
                            if type(value) == str:
                                value = [v for v in temp[val].strip(")").split('\n')]
                    #if area_units is something else
                    if 'km2' not in value[0] and "km²" not in value[0]:
                        value = [temp[0]]
                else:
                    value = [v for v in value.split('\n')]
                value = [clean(v) for v in value]
                #if multiple area types in value
                if "or" in value[0]:
                    temp = value[0].split("or")
                    for v in range(len(temp)):
                        if "km2" in temp[v] or "km²" in temp[v]:
                            value = [temp[v].strip(" ")]
                
                # add section to key, replace spaces.
                if section and section != key:
                    key = section + ' ' + key
                key = key.replace(' ', '_')
                key = key.replace('\xa0', '_')

                # add all this info to data dict.
                data[key] = value
                if year:
                    data[key + '_year'] = year
                if value_links: 
                    data[key + '_links'] = value_links
        sect_change = False
    # Normalise the first 'area' value towards km2.
    if 'area' in data:
        if ";" in data['area'][0]:
            if "km2" in data['area'][0]:
                areas = data['area'][0].split(";")
                chosen = [zz for zz in areas if 'km2' in zz]
                area = chosen[0].replace('\xa0',' ')
                area = area.strip(" ")
                data['area'][0] = area
        if "ha" in data['area'][0]:
            # Hectares to km2 (divide by 100); an unparseable number gets a
            # " flag" suffix instead of raising.
            try:
                area = float(data['area'][0].split(" ")[0]) / 100
                data['area'][0] = str(area) + " km2"
            except:
                data['area'][0] += " flag"
        if "sq mi" in data['area'][0]:
            # Square miles to km2. Unlike the hectare branch, a number that
            # fails to parse raises here.
            area = float(data['area'][0].split(" ")[0]) * 2.58999
            data['area'][0] = str(area) + " km2"
    
    
    # Translate all keys in one call, then each value string individually.
    tmp = list(data.keys())
    translations = translator.translate(tmp)
    
    t_data = {}
    
    for t in translations:
        temp = []
        # NOTE(review): when data[t.origin] is a plain string (a '_year' entry
        # that came from section_year) this loop walks it character by
        # character - confirm that is intended.
        for y in data[t.origin]:
            # Try the page language first; on any error retry without an
            # explicit source language.
            try: z = translator.translate(y, src = lan)
            except: z = translator.translate(y)
            if z.text: temp.append(z.text) 
        t.text = t.text.replace('com._autónoma', 'autonomous_community')
        t_data[t.text] =  temp
        
    return t_data

# In[6]:

def process_page_2(page):
    """
    Turn one parsed non-English Wikipedia page into a flat data dict.

    page: parsed page object exposing `.find` / `.find_all`.

    Returns None for pages whose title starts with 'File:'. Otherwise
    returns a dict with 'title', 'wiki_iso', 'infobox' (from get_infobox_2),
    'categories' (translated), 'wiki_type', and - only when found - 'lat',
    'lng', 'area', 'area_units', 'area_year', 'population',
    'population_year', 'gdp', 'gdp_units', 'gdp_year' and 'elevation'.

    Reads the module-level `lan`, `title_iso` and `translator`.
    """
    data = {}

    data['title'] = page.find('h1', {'class': 'firstHeading'}).text
    if data['title'].startswith('File:'): return None
    ## print(data['title'])

    # get coords in decimal degrees
    latlon = page.find('span', {'class': 'geo'})
    if latlon:
        if lan:
            data['lat'] = float(latlon.text.split(",")[0].strip())
            data['lng'] = float(latlon.text.split(",")[1].strip())
        else:
            if "," not in str(latlon.text):
                data['lat'] = float(latlon.text.split(";")[0])
                data['lng'] = float(latlon.text.split(";")[1])


    # attempt to get a wiki_iso.
    # The steps below go from most to least reliable; each one only runs
    # while wiki_iso is still unset.
    data['wiki_iso'] = None
    infobox = page.find('table', {'class': 'infobox'})
    if infobox:
        for i in infobox.find_all('a'):
            if i.has_attr('title'):
                title = i['title']
                if title in title_iso:
                    data['wiki_iso'] = title_iso[title]
                    break


    if not data['wiki_iso']:
        ps = [p for p in page.find_all('p') if not p.has_attr('class')]
        if len(ps) >= 1:
            for i in ps[0].find_all('a'):
                if i.has_attr('title'):
                    title = i['title']
                    if title in title_iso:
                        data['wiki_iso'] = title_iso[title]
                        break

    data['infobox'] = get_infobox_2(page)


    if not data['wiki_iso']:
        if 'country' in data['infobox']:
            if data['infobox']['country'][0] in title_iso:
                data['wiki_iso'] = title_iso[data['infobox']['country'][0]]

    # These 3 introduce a lot of noise. Indiana -> India, for example.
    # necessary for some actors though. :(

    if not data['wiki_iso']:
        ps = [p for p in page.find_all('p') if not p.has_attr('class')]
        if len(ps) >= 1:
            for k, v in title_iso.items():
                if k in ps[0].text:
                    data['wiki_iso'] = v
                    break

    if not data['wiki_iso']:
        if infobox:
            for k, v in title_iso.items():
                if k in infobox.text:
                    data['wiki_iso'] = v
                    break

    if not data['wiki_iso']:
        for k, v in title_iso.items():
            if k in data['title']:
                data['wiki_iso'] = v
                break

    data['categories'] = []
    cat = page.find('div', {'id': 'mw-normal-catlinks'})
    if cat:
        data['categories'] = [i.text for i in cat.find_all('li')]
    if 'category_label' in data['infobox']:
        data['categories'].append(data['infobox']['category_label'])

    # Categories are translated before being handed to infer_type.
    tmp = []

    for y in data['categories']:
        z = translator.translate(y)
        tmp.append(z.text)

    data['categories'] = tmp

    data['wiki_type'] = infer_type(data['categories'])
    if type(data['wiki_type']) == tuple : data['wiki_type'] = data['wiki_type'][0]


    for k,v in data['infobox'].items():
        # The parentheses matter: without them `and` bound tighter than `or`,
        # so any key containing 'area' (area_code, area_links, area_year, ...)
        # skipped every exclusion and could be picked as the area.
        if ('area' in k or 'surface' in k) and 'links' not in k and 'year' not in k and 'code' not in k \
            and 'coordinates' not in k and k != 'areas' and 'rank' not in k:
                if not v: continue
                data['area'] = data['infobox'][k]
                data['area'], data['area_units'] = clean_numbers(data['area'][0])
                if k+'_year' in data['infobox']:
                    data['area_year'] = data['infobox'][k+'_year']
                break

    data['population'] = []

    for k,v in data['infobox'].items():
        if 'population' in k and 'links' not in k and 'year' not in k and 'municipality' not in k and v:
            if 'rank' not in k and 'density' not in k and 'percent' not in k and 'ethnic' not in k \
                and 'change' not in k and 'demonym' not in k and '_by_' not in k and 'median' not in k \
                and 'household' not in k and v[0] != '' and 'largest' not in k and 'for' not in v[0] \
                and 'language' not in k and not k.endswith('population_') and 'gender' not in k \
                and 'pop_2011–2016' not in k:
                    if 'estimate' in k and k+'_year' in data['infobox'].keys():
                        yy = data['infobox'][k+'_year']
                        data['population'].append([yy+'_'+k,data['infobox'][k]])
                    else:
                        data['population'].append([k,data['infobox'][k]])
                    # Clean the entries collected so far; ones already turned
                    # into a float (or emptied) on an earlier pass are skipped.
                    # (The old `type(...) != 'NoneType'` compared a type with a
                    # string and was always true, so it has been dropped.)
                    for pop in data['population']:
                        if not isinstance(pop[1], float) and pop[1]:
                            pop[1], _ = clean_numbers(pop[1][0])

    if data['population']:
        k = data['population'][0][0]
        if k+'_year' in data['infobox'].keys():
            data['population_year'] = data['infobox'][k+'_year']
        for tem in data['population']:
            tem[0] = tem[0].replace(',','')
            tem[0] = tem[0].replace('population_', '')
            tem[0] = tem[0].replace('greater_toronto_area','metro')
            tem[0] = tem[0].replace('†_city_proper._', '')
            if "(" in tem[0]: tem[0] = tem[0].split("_")[-1]
            if 'population_year' not in data:
                if 'census' in tem[0]:
                    data['population_year'] = tem[0].split("_")[0]
                elif 'estimate' in tem[0]:
                    data['population_year'] = tem[0].split("_")[0]
        #data['population'][0][0] = 'population'
    else:
        del data['population']

    for k,v in data['infobox'].items():
        # `and v`: translated value lists can be empty; skip those instead of
        # raising IndexError on [0].
        if 'gdp' in k and 'links' not in k and 'year' not in k and 'hdi' not in k and v:
            data['gdp'] = data['infobox'][k]
            if 'USD' in data['gdp'][0] and 'CNY' in data['gdp'][0]:
                data['gdp'] = ["USD" + data['gdp'][0].split("USD")[1]]
            data['gdp'], data['gdp_units'] = clean_numbers(data['gdp'][0])
            if k+'_year' in data['infobox']:
                data['gdp_year'] = data['infobox'][k+'_year']
            break

    for k,v in data['infobox'].items():
        if 'elevation' in k and 'links' not in k and 'year' not in k and v:
            data['elevation'] = data['infobox'][k][0]
            break

    # data['title'] = translator.translate(data['title']).text
    return data


 
# In[9]:

def get_data_2(name, iso, entity_type, check_country=True, debug=False, check_disambig=True):
    """
    Look up `name` on the Wikipedia edition for `iso` and return the data of
    the best matching page.

    name:           actor name to search for ('!' and 'Aggregazione ' are removed).
    iso:            country code; also selects the Wikipedia language.
    entity_type:    only pages whose inferred wiki_type equals this are kept.
    check_country:  only affects debug output (titles are printed when it is
                    False and debug is True).
    debug:          print intermediate results.
    check_disambig: accepted for signature compatibility; not used here.

    Returns the chosen page's data dict (with 'region_hierarchy' added,
    'infobox' removed and 'categories' joined with '; '), or {} when nothing
    matches.
    """
    name = name.replace('!', '')
    name = name.replace('Aggregazione ', '')
    titles = search_2(name, iso)
    if not check_country and debug: print(titles)
    pages = get_pages_2(titles, iso)
    if not pages:
        if debug: print('pages null')
        return {}

    # process_page_2 returns None for 'File:' pages; drop those here so the
    # filters below do not fail with a TypeError on None.
    pages = [p for p in (process_page_2(page) for page in pages) if p]

    if debug:
        for p in pages: print(p)

    # filter iso
    iso_pages = [i for i in pages if 'wiki_iso' in i and i['wiki_iso'] == iso]
    if len(iso_pages) == 0:
        return {}

    # filter entity_type
    et_pages = [i for i in iso_pages if i['wiki_type'] == entity_type]
    if len(et_pages) == 0:
        return {}

    # Pick the candidate with the highest infer_type score; the first
    # candidate wins ties and is also used when no score exceeds 0.
    max_score = 0
    rank = 0
    for page_num, candidate in enumerate(et_pages):
        score = infer_type(candidate['categories'])[1]
        if score > max_score:
            max_score = score
            rank = page_num
    data = et_pages[rank]
    data['region_hierarchy'] = get_region_hierarchy(data)

    if data['infobox'] == {} or data['infobox'] == None:
        if debug: print('no infobox')

    del data['infobox']
    data['categories'] = '; '.join(data['categories'])

    if 'population_year' in data:
        year = data['population_year']
        # process_page_2 stores either a list of strings or a plain string.
        # Only unwrap lists: [0] on a string used to keep just its first
        # character ("2011" -> "2").
        if isinstance(year, (list, tuple)):
            if year:
                data['population_year'] = year[0]
            else:
                # Nothing usable; drop the key rather than raise IndexError.
                del data['population_year']

    return data

In [None]:
# Testing Cell
# Smoke test of the non-English pipeline with one "ESP" / "City" entry.
# NOTE: running this calls get_data_2, which issues live HTTP requests to
# Wikipedia and uses the module-level translator.

get_data_2("Abertura", "ESP", "City")