__Subject:__      Inferring City Population Size from City Data

__Date:__         07/20/2018

__Name:__         Edmund D. Chitwood

***

__Notebook Summary:__<br> 
<br>The following Notebook contains functions that 
-  scrape web pages and parse them with Beautiful Soup, 
-  iterate over lists of the Wikipedia pages of cities with populations of 100,000 or more, 
-  parse sections within those Wikipedia pages to extract city data from them, 
-  add that data into city dictionaries within lists, 
-  add those lists of dicts to Pandas DataFrames and 
-  pickle the combined DataFrames. 

***

In [1]:
from __future__ import print_function, division
import requests
from bs4 import BeautifulSoup
from lxml import etree
import re
import pandas as pd
import unicodedata
import time
import pickle

# Scrape

In [2]:
# Get response object from URL and turn text element into Beautiful Soup.
# Return soup.

def web_page_to_soup(url, timeout=30):
    """Download *url* and parse the response body with Beautiful Soup.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would block the whole scrape.

    Returns:
        A BeautifulSoup object built with the lxml parser, or None when the
        request fails or the server answers with a status code other than 200.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Report network failures the same way as a bad status code so that
        # callers only have to deal with one failure signal (None).
        return None

    # Make sure the request returned an object with the correct status code.
    if response is None or response.status_code != 200:
        return None

    return BeautifulSoup(response.text, "lxml")

In [3]:
# Use class fn org to target city name.
# Return name of city.

def city_name_from_soup(soup):
    """Return the text of the element with class 'fn org', or 'No Name'.

    The lookup result is None when a page has no such element
    (e.g. https://en.wikipedia.org/wiki/Adelaide); reading .text from it
    raises AttributeError, which is turned into the 'No Name' placeholder.
    """
    try:
        return soup.find(class_='fn org').text
    except AttributeError:
        return 'No Name'

# Parse

In [4]:
# Use class geo to target latitude and longitude. 
# Return city latitude and longitude. 

def lat_lon_from_soup(soup):
    """Return (latitude, longitude) parsed from the element with class 'geo'.

    The element text is expected to look like '43.365; -8.410'. When the
    element is missing, or its text does not hold two ';'-separated numbers,
    both values are returned as NaN instead of raising.

    Returns:
        A (lat, lon) tuple of floats.
    """
    try:
        parts = soup.find(class_='geo').text.split(';')
        return float(parts[0]), float(parts[1])
    except (AttributeError, IndexError, ValueError):
        # AttributeError: no 'geo' element (find() returned None).
        # IndexError / ValueError: text is not two ';'-separated numbers.
        return float('nan'), float('nan')

In [5]:
# Use class infobox geography vcard to target city infobox.
# Return area of city in square miles.

def area_from_soup(soup):
    """Return the city area read from the 'infobox geography vcard' element.

    Primary strategy: take the first infobox text node containing 'sq', keep
    the part before 'sq mi' and read the first number in it. If the enclosing
    row text contains '/sq' (a "per square unit" figure rather than an area),
    NaN is returned.

    Fallback strategy (used when the primary one raises): collect the first
    number of every infobox text node containing 'sq' and return the smallest.

    Returns:
        The area as a float (intended to be square miles), or NaN when no
        usable figure is found.
    """
    number_pattern = r"[-+]?\d*\,\d+\.\d+|\d*\.\d+|\d*\,\d+|\d+"
    infobox_attrs = {'class': 'infobox geography vcard'}

    try:
        sq_text = soup.find(attrs=infobox_attrs).find(text=re.compile('sq'))

        # Normalise (e.g. non-breaking spaces become plain spaces) and drop
        # non-ASCII characters. Decode back to text explicitly: calling str()
        # on bytes yields the "b'...'" repr on Python 3, whose escape
        # sequences can leak stray digits into the number search below.
        ascii_text = (unicodedata.normalize('NFKD', sq_text)
                                 .encode('ascii', 'ignore')
                                 .decode('ascii'))
        before_sq_mi = ascii_text.split('sq mi')[0]

        # Use regex and replace to deal with decimals and commas.
        first_number = re.findall(number_pattern, before_sq_mi)[0]
        area_sq_mi = float(first_number.replace(',', ''))

        # A '/sq' in the enclosing row means the figure is a per-area value.
        if '/sq' in str(sq_text.parent.parent.text):
            area_sq_mi = float('nan')

    # Handles figures that are out of order, use unusual syntax, are listed
    # in sq km only, etc. In those cases try getting the area a different way.
    except (TypeError, IndexError, AttributeError):
        try:
            sq_texts = soup.find(attrs=infobox_attrs).findAll(text=re.compile('sq'))
            candidates = []
            for text in sq_texts:
                numbers = re.findall(number_pattern, text)
                if numbers:
                    candidates.append(float(numbers[0].replace(',', '')))

            # sorted(...)[0] raises IndexError when nothing was collected.
            area_sq_mi = sorted(candidates)[0]
        except (TypeError, IndexError, AttributeError):
            area_sq_mi = float('nan')

    return area_sq_mi

In [6]:
# Use class infobox geography vcard to target city infobox.
# Return population of city.

def population_from_soup(soup):
    """Return the population read from the 'infobox geography vcard' element.

    The row holding the first text node matching 'Population' is the header
    row. The value cell is chosen as follows:

    * If the first cell of the row after the header itself contains a
      'Population' text node, that cell is used.
    * Otherwise the header row's own cell is preferred, falling back to the
      next row's cell; if the chosen text contains 'sq', the cell two rows
      below the header is used instead.

    Anything after the first '[' or '(' is dropped and the remaining digits
    are joined into an int.

    Returns:
        The population as an int, or NaN when the rows or cells are missing
        (e.g. https://en.wikipedia.org/wiki/Adelaide) or hold no digits.
    """
    infobox_attrs = {'class': 'infobox geography vcard'}
    label_pattern = re.compile('Population')

    try:
        header_row = (soup.find(attrs=infobox_attrs)
                          .find(text=label_pattern)
                          .parent
                          .parent)
        value_cell = header_row.findNextSibling().find('td')
        nested_label = value_cell.find(text=label_pattern)

        # Some infoboxes list population figures in several contexts
        # (e.g. municipality, metro region); pick the cell accordingly.
        if nested_label is None:
            same_row_cell = header_row.find('td')
            if same_row_cell is not None:
                value_cell = same_row_cell

            if 'sq' in str(value_cell.text):
                value_cell = (header_row.findNextSibling()
                                        .findNextSibling()
                                        .find('td'))

        raw_text = str(value_cell.text).split('[')[0].split('(')[0]
        digits = ''.join(char for char in raw_text if char.isdigit())
        population = int(digits)

    # AttributeError: a lookup returned None. ValueError: no digits found.
    except (AttributeError, ValueError):
        population = float('nan')

    return population

In [7]:
# Use class infobox geography vcard to target city infobox.
# Return elevation above sea level in feet.

def _larger_infobox_figure(soup, label):
    """Return the larger of the two figures in the infobox row named *label*.

    The row's cell text is split at the first '('; the last number before it
    and the last number after it are compared.

    Raises:
        AttributeError: the infobox, the row or its cell is missing.
        IndexError: the cell does not hold a number on both sides of '('.
    """
    # Regex and replace deal with decimals and thousands commas.
    number_pattern = r"[-+]?\d*\,\d+\.\d+|\d*\.\d+|\d*\,\d+|\d+"
    cell_text = (soup.find(attrs={'class': 'infobox geography vcard'})
                     .find(text=re.compile(label))
                     .parent
                     .parent
                     .find('td')
                     .text)
    parts = cell_text.split('(')
    before_paren = re.findall(number_pattern, parts[0])
    after_paren = re.findall(number_pattern, parts[1])
    first = float(before_paren[-1].replace(',', ''))
    second = float(after_paren[-1].replace(',', ''))

    # The larger figure is taken to be the one in feet rather than meters.
    return max(first, second)


def elevation_from_soup(soup):
    """Return the city's elevation, intended to be in feet.

    Sources are tried in order:

    1. the infobox row matching 'Elevation',
    2. the infobox row matching 'Highest',
    3. the first header cell of the 'wikitable collapsible' table, where the
       word preceding the last word containing 'feet' is used (as on the
       page for A Coruna).

    Returns:
        The elevation as a float, or NaN when no source yields a number.
    """
    for label in ('Elevation', 'Highest'):
        try:
            return _larger_infobox_figure(soup, label)
        except (AttributeError, IndexError):
            continue

    try:
        header_text = (soup.find(attrs={'class': 'wikitable collapsible'})
                           .find('th')
                           .text)
        words = header_text.split(' ')
        feet_word = None
        for word in words:
            if 'feet' in word:
                feet_word = word

        if feet_word is None:
            return float('nan')

        preceding_word = words[words.index(feet_word) - 1]
        # Strip the opening parenthesis and thousands commas before
        # converting; anything still non-numeric is reported as NaN.
        return float(preceding_word.replace('(', '').replace(',', ''))
    except (AttributeError, IndexError, ValueError):
        return float('nan')

In [8]:
# Use class 'wikitable collapsible' or 'wikitable collapsible collapsed' to target the city climate table.
# Return the climate table as a climate object. It holds the data points targeted by the next five functions.
# Some city pages (e.g. Amadora) have no climate table at all.

def climate_from_soup(soup):
    """Return the climate table element of a city page, or None.

    Looks for an element whose class is 'wikitable collapsible' and, if none
    is found, for 'wikitable collapsible collapsed' (the table is collapsed
    on pages such as Shenzhen).

    Args:
        soup: Parsed city page, or None when the page could not be fetched.

    Returns:
        The matching element, or None when *soup* is None or the page has no
        climate table.
    """
    # A failed download yields soup=None; pass that on as "no climate table"
    # instead of raising, like the other *_from_soup functions do.
    if soup is None:
        return None

    climate = soup.find(attrs={'class': 'wikitable collapsible'})
    if climate is None:
        climate = soup.find(attrs={'class': 'wikitable collapsible collapsed'})

    return climate

In [9]:
# Use find to target temp in the climate object.
# Return temp in fahrenheit.

def average_high_temp_from_climate(climate):
    """Return the annual average high temperature from a climate table.

    The last cell of the row matching 'Average high' is read (annual values
    are listed in the last column). Its text is split at '(' and the first
    number on each side is parsed; the larger one is returned.

    Args:
        climate: Climate table element, or None when the page has none.

    Returns:
        The larger of the two figures as a float, or NaN when the row or
        either figure is missing.
    """
    # The sign applies to integers and decimals alike.
    number_pattern = r"[-+]?(?:\d*\.\d+|\d+)"
    try:
        cells = (climate.find(text=re.compile('Average high'))
                        .parent
                        .parent
                        .findAll('td'))

        # Treat the Unicode minus sign (U+2212) like an ASCII '-' so that
        # negative temperatures keep their sign.
        annual_text = cells[-1].text.replace(u'\u2212', '-')
        parts = annual_text.split('(')
        first = float(re.findall(number_pattern, parts[0])[0])
        second = float(re.findall(number_pattern, parts[1])[0])

        # Assumes the cell lists one temperature in both Celsius and
        # Fahrenheit; the Fahrenheit figure is the larger one above -40.
        temp_float = max(first, second)

    except (AttributeError, IndexError):
        temp_float = float('nan')

    return temp_float

In [10]:
# Use find to target temp in the climate object.
# Return temp in fahrenheit.

def average_low_temp_from_climate(climate):
    """Return the annual average low temperature from a climate table.

    The last cell of the row matching 'Average low' is read (annual values
    are listed in the last column). Its text is split at '(' and the first
    number on each side is parsed; the larger one is returned.

    Args:
        climate: Climate table element, or None when the page has none.

    Returns:
        The larger of the two figures as a float, or NaN when the row or
        either figure is missing.
    """
    # The sign applies to integers and decimals alike.
    number_pattern = r"[-+]?(?:\d*\.\d+|\d+)"
    try:
        cells = (climate.find(text=re.compile('Average low'))
                        .parent
                        .parent
                        .findAll('td'))

        # Treat the Unicode minus sign (U+2212) like an ASCII '-' so that
        # negative temperatures keep their sign.
        annual_text = cells[-1].text.replace(u'\u2212', '-')
        parts = annual_text.split('(')
        first = float(re.findall(number_pattern, parts[0])[0])
        second = float(re.findall(number_pattern, parts[1])[0])

        # Assumes the cell lists one temperature in both Celsius and
        # Fahrenheit; the Fahrenheit figure is the larger one above -40.
        temp_float = max(first, second)

    except (AttributeError, IndexError):
        temp_float = float('nan')

    return temp_float

In [11]:
# Use find to target average precipitation days in the climate object.
# Return annual rainy days.

def annual_precipitation_days_from_climate(climate):
    """Return the annual number of precipitation days from a climate table.

    Some tables label the row 'Average rainy days', others 'Average
    precipitation days'; the labels are tried in that order. The last cell
    of the matching row is read (annual values are listed in the last
    column).

    Args:
        climate: Climate table element, or None when the page has none.

    Returns:
        The day count as a float, or NaN when neither row yields a number.
    """
    for label in ('Average rainy days', 'Average precipitation days'):
        try:
            cells = (climate.find(text=re.compile(label))
                            .parent
                            .parent
                            .findAll('td'))
            return float(cells[-1].text.strip('\n'))
        except (AttributeError, IndexError, ValueError):
            # AttributeError: row not present. IndexError: row has no cells.
            # ValueError: the cell text is not a number. Try the next label.
            continue

    return float('nan')

In [12]:
# Use find to target average precipitation inches in the climate object.
# Return annual precipitation in inches.

def annual_precipitation_inches_from_climate(climate):
    """Return the annual precipitation figure from a climate table.

    The cells are taken from the element three levels above the first text
    node matching 'precipitation'; if that lookup fails, from the element two
    levels above the text node matching 'Average rainfall'. The last cell's
    text is split on spaces and the first number of each of the first two
    pieces is parsed; the smaller one is returned (taken to be inches).

    Args:
        climate: Climate table element, or None when the page has none.

    Returns:
        The smaller of the two figures as a float, or NaN when no row is
        found or the cell does not hold two space-separated numbers.
    """
    number_pattern = r"[-+]?\d*\,\d+\.\d+|\d*\.\d+|\d*\,\d+|\d+"

    # (label to search for, number of .parent hops up to the cell container)
    lookups = (('precipitation', 3), ('Average rainfall', 2))
    for label, hops in lookups:
        try:
            node = climate.find(text=re.compile(label))
            for _ in range(hops):
                node = node.parent
            cells = node.findAll('td')
        except (AttributeError, IndexError):
            continue
        break
    else:
        return float('nan')

    try:
        # Annual values are listed in the last column of the climate table.
        pieces = cells[-1].text.split(' ')

        # Regex and replace deal with decimals and thousands commas.
        first_match = re.findall(number_pattern, pieces[0])
        second_match = re.findall(number_pattern, pieces[1])
        first = float(first_match[0].replace(',', ''))
        second = float(second_match[0].replace(',', ''))

        # The smaller of the two figures is the one in inches.
        return min(first, second)
    except (AttributeError, IndexError):
        return float('nan')

In [13]:
# Use find to target annual sunshine hours in the climate object.
# Return annual sunshine hours.

def annual_sunshine_hours_from_climate(climate):
    """Return the annual sunshine hours from a climate table.

    The last cell of the row matching 'Mean monthly' is read (annual values
    are listed in the last column) and parsed after removing commas.

    Args:
        climate: Climate table element, or None when the page has none.

    Returns:
        The figure as a float, or NaN when the row is missing, has no cells,
        or its last cell is not a number.
    """
    try:
        cells = (climate.find(text=re.compile('Mean monthly'))
                        .parent
                        .parent
                        .findAll('td'))
        return float(cells[-1].text.replace(',', ''))
    except (AttributeError, IndexError, ValueError):
        # ValueError covers cells holding placeholders instead of numbers.
        return float('nan')

# Generate City URLs

In [14]:
# Use a string of letters (e.g. 'ABCDEFGHI') to generate URLs to the lists of towns and cities with 100,000 or more inhabitants.
# Return the URLs to the lists of towns and cities with 100,000 or more inhabitants, one per beginning letter.
# Function takes uppercase ASCII letters.

def urls_of_cities_begin_letter(letters):
    """Return one Wikipedia city-list URL per character in *letters*.

    Each URL is the fixed list-page prefix followed by the character, in the
    order the characters appear in *letters*.
    """
    prefix = 'https://en.wikipedia.org/wiki/List_of_towns_and_cities_with_100,000_or_more_inhabitants/cityname:_'
    return [prefix + letter for letter in letters]

In [15]:
# Use URL to Wikipedia page for all cities with populations over 100,000, beginning with a certain letter.
# Return list of URLS for each city with populations over 100,000, beginning with a certain letter.

def city_urls_from_wikipedia(page_of_cities_beginning_with_letter):
    """Return absolute URLs of the city pages linked from a city-list page.

    The list page is fetched, its first 'wikitable sortable' table is
    located, and every second link in that table (starting with the first)
    is turned into an absolute en.wikipedia.org URL.
    """
    listing_soup = web_page_to_soup(page_of_cities_beginning_with_letter)

    # Cities and the URLs to their pages are listed in a table on the page.
    listing_table = listing_soup.find('table', {'class': 'wikitable sortable'})
    anchors = listing_table.findAll('a', href=True)

    # Step by 2 to skip the links to the cities' countries.
    return ['https://en.wikipedia.org' + anchor['href'] for anchor in anchors[::2]]

# Create List of Dictionaries with Wikipedia Data for each City

In [16]:
## Use above functions to get Wiki data for each city with population of 100,000 or more.
## Return list of dictionaries with data for each city.

def city_info_from_city_urls(city_urls_list):
    """Return one dict of scraped data per URL in *city_urls_list*.

    Each page is fetched and passed to the parsing functions defined above;
    the results are stored under fixed keys ('City Name', 'URL', 'Latitude',
    'Longitude', 'Area', 'Elevation', 'Population', the two temperature keys,
    the two precipitation keys and 'Annual Sunshine Hours').
    """
    city_records = []
    for city_url in city_urls_list:
        soup = web_page_to_soup(city_url)
        city_name = city_name_from_soup(soup)
        lat, lon = lat_lon_from_soup(soup)
        area = area_from_soup(soup)
        elevation = elevation_from_soup(soup)
        population = population_from_soup(soup)

        # The five climate figures are all read from the same table element.
        climate = climate_from_soup(soup)
        city_records.append({
            'City Name': city_name,
            'URL': city_url,
            'Latitude': lat,
            'Longitude': lon,
            'Area': area,
            'Elevation': elevation,
            'Population': population,
            'Average Low Temp': average_low_temp_from_climate(climate),
            'Average High Temp': average_high_temp_from_climate(climate),
            'Annual Precipitation Days': annual_precipitation_days_from_climate(climate),
            'Annual Precipitation Inches': annual_precipitation_inches_from_climate(climate),
            'Annual Sunshine Hours': annual_sunshine_hours_from_climate(climate),
        })

    return city_records
        
        

# Generate DataFrames

In [17]:
# Use the above three functions to create a list of dicts for cities beginning with certain letters 
# and add them to a DataFrame. Return a DataFrame with data for each city beginning with the input letters.
# Function takes uppercase ASCII letters.

def cities_dataframe(letters):
    """Scrape all cities starting with *letters* into a single DataFrame.

    For every letter the list page URL is printed (to track progress), the
    city page URLs are collected, each city is scraped into a dict, and the
    dicts become one DataFrame per letter. The function sleeps 60 seconds
    after each letter and finally concatenates the per-letter frames.
    """
    frames_per_letter = []

    for landing_page_url in urls_of_cities_begin_letter(letters):
        # Print URL to track progress.
        print(landing_page_url)

        city_urls = city_urls_from_wikipedia(landing_page_url)
        city_records = city_info_from_city_urls(city_urls)
        frames_per_letter.append(pd.DataFrame(city_records))

        # Pause after each letter to avoid rate limiting issues.
        time.sleep(60)

    return pd.concat(frames_per_letter)

In [18]:
# Get the city DataFrame for cities whose names begin with A through I.
# The alphabet is scraped in several partial runs to mitigate work loss
# and to deal with rate limiting.
all_cities_df_a_i = cities_dataframe('ABCDEFGHI')

https://en.wikipedia.org/wiki/List_of_towns_and_cities_with_100,000_or_more_inhabitants/cityname:_A
https://en.wikipedia.org/wiki/List_of_towns_and_cities_with_100,000_or_more_inhabitants/cityname:_B
https://en.wikipedia.org/wiki/List_of_towns_and_cities_with_100,000_or_more_inhabitants/cityname:_C
https://en.wikipedia.org/wiki/List_of_towns_and_cities_with_100,000_or_more_inhabitants/cityname:_D
https://en.wikipedia.org/wiki/List_of_towns_and_cities_with_100,000_or_more_inhabitants/cityname:_E
https://en.wikipedia.org/wiki/List_of_towns_and_cities_with_100,000_or_more_inhabitants/cityname:_F
https://en.wikipedia.org/wiki/List_of_towns_and_cities_with_100,000_or_more_inhabitants/cityname:_G
https://en.wikipedia.org/wiki/List_of_towns_and_cities_with_100,000_or_more_inhabitants/cityname:_H
https://en.wikipedia.org/wiki/List_of_towns_and_cities_with_100,000_or_more_inhabitants/cityname:_I


In [19]:
# Get the city DataFrame for cities whose names begin with J through R.
all_cities_df_j_r = cities_dataframe('JKLMNOPQR')

In [20]:
# Get the city DataFrame for cities whose names begin with S through Z.
all_cities_df_s_z = cities_dataframe('STUVWXYZ')

# Inspect DataFrames before Pickling

In [25]:
# Show the (rows, columns) dimensions of the A-I DataFrame.
all_cities_df_a_i.shape

(1313, 12)

In [26]:
# Show the first rows of the A-I DataFrame.
all_cities_df_a_i.head()

Unnamed: 0,Annual Precipitation Days,Annual Precipitation Inches,Annual Sunshine Hours,Area,Average High Temp,Average Low Temp,City Name,Elevation,Latitude,Longitude,Population,URL
0,130.0,39.92,2010.0,14.61,64.0,53.2,A Coruña,190.0,43.365,-8.41,246056.0,https://en.wikipedia.org/wiki/A_Coru%C3%B1a
1,,32.945,1616.5,62.1,57.6,44.6,Aachen,873.0,50.783,6.083,244951.0,https://en.wikipedia.org/wiki/Aachen
2,169.3,23.626,,54.0,52.0,39.6,Aalborg,16.0,57.05,9.917,112194.0,https://en.wikipedia.org/wiki/Aalborg
3,123.0,28.43,1506.0,35.0,52.5,38.8,Aarhus,344.0,56.15,10.217,2.0,https://en.wikipedia.org/wiki/Aarhus
4,,,,28.0,,,Aba,673.0,5.117,7.367,534265.0,"https://en.wikipedia.org/wiki/Aba,_Nigeria"


In [27]:
# Count missing values per column: total rows minus the number of
# non-null entries that count() reports for each column.
count_nan = len(all_cities_df_a_i) - all_cities_df_a_i.count()
print(count_nan)

Annual Precipitation Days      721
Annual Precipitation Inches    512
Annual Sunshine Hours          841
Area                           182
Average High Temp              493
Average Low Temp               493
City Name                        0
Elevation                      391
Latitude                         7
Longitude                        7
Population                      48
URL                              0
dtype: int64


# Pickle DataFrames

In [28]:
# Write each partial DataFrame to its own pickle file in the working directory.
all_cities_df_a_i.to_pickle('all_cities_pickle_a_i.pkl')
all_cities_df_j_r.to_pickle('all_cities_pickle_j_r.pkl')
all_cities_df_s_z.to_pickle('all_cities_pickle_s_z.pkl')