In [95]:
import json
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [81]:
# Remote sources scraped by this notebook.
# The Wikipedia URL is a MediaWiki API "parse" request; the response is JSON
# whose ["parse"]["text"] entry holds the article HTML.
MUSEUM_WIKI_URL = (
    "https://en.wikipedia.org/w/api.php"
    "?formatversion=2"
    "&action=parse"
    "&page=List_of_most_visited_museums"
    "&prop=text"
    "&format=json"
)
# Plain HTML page whose first table lists cities with their populations.
CITIES_POPULATION_URL = "http://worldpopulationreview.com/world-cities/"

In [82]:
# Download the museum article through the MediaWiki API and extract its HTML.
# requests applies no timeout by default, so set one explicitly to keep the
# cell from hanging forever on a stalled connection.
wiki_response = requests.get(url=MUSEUM_WIKI_URL, timeout=30)
# Fail here on a 4xx/5xx response instead of on a JSON/KeyError further down.
wiki_response.raise_for_status()
museums_page = json.loads(wiki_response.text)
# The rendered article HTML lives under parse -> text in the API response.
museums_data_html = museums_page["parse"]["text"]

In [83]:
# Turn the first table of the article into a list of rows, one list per museum.
# Each <td> becomes one value: link cells are kept as text, the others are
# converted to int.
soup = BeautifulSoup(museums_data_html, 'html.parser')
table = soup.find('table')
if table is None:
    # Clear failure instead of an AttributeError on None in the loop below.
    raise ValueError("No <table> element found in the museums page HTML")
museums_rows = []

# find_all is the current Beautiful Soup 4 spelling; findAll is a legacy alias.
for row in table.find_all('tr'):
    museum_row = []
    for cell in row.find_all('td'):
        link = cell.find('a')  # first anchor of the cell, looked up only once
        if link and link.get('title'):
            # Text cell whose first link carries a title attribute.
            link_classes = link.get('class')
            if link_classes and 'new' in link_classes:
                # Special case: link with CSS class "new" followed by a "[zh]"
                # marker that must not end up in the stored text.
                museum_row.append(cell.text.replace('\xa0[zh]', '').strip())
            elif cell.find(id='cite_ref-16'):
                # Special case: cell containing footnote cite_ref-16, rendered as "[c]".
                museum_row.append(cell.text.replace('[c]', '').strip())
            else:
                museum_row.append(cell.text.strip())
        elif link:
            # Link without a title (presumably a citation marker after a year):
            # keep only the first four characters as the number.
            museum_row.append(int(cell.text.strip()[:4]))
        else:
            # Plain numeric cell with thousands separators, e.g. "9,600,000".
            museum_row.append(int(cell.text.strip().replace(',', '')))
    # Rows without any <td> (e.g. the header row) produce nothing and are skipped.
    if museum_row:
        museums_rows.append(museum_row)

In [84]:
# Download the city population page and collect [city, population] pairs
# from the body of its first table.
# Explicit timeout: requests would otherwise wait indefinitely.
population_response = requests.get(url=CITIES_POPULATION_URL, timeout=30)
# Stop on HTTP errors rather than trying to parse an error page.
population_response.raise_for_status()
population_page = population_response.text
soup = BeautifulSoup(population_page, 'html.parser')
population_table = soup.find('table')
if population_table is not None:
    population_table = population_table.find('tbody')
if population_table is None:
    # Clear failure instead of an AttributeError on None.
    raise ValueError("No <table> with a <tbody> found on the city population page")

city_populations = []
# Iterate over the <tr> children only. Looping over the <tbody> tag itself also
# yields any text nodes (whitespace) between rows, and those are not tags that
# can be searched for <td> cells.
for row in population_table.find_all('tr', recursive=False):
    info = [col.text.strip() for col in row.find_all('td')]
    if not info:
        # Row without <td> cells (e.g. header-only); nothing to record.
        continue
    # Second cell is the city name, fourth cell the population with commas.
    city_populations.append([info[1], int(info[3].replace(',', ''))])

In [85]:
# Cities that appear in the museum table (second field of every museum row).
museum_cities = {museum[1] for museum in museums_rows}

# Cities for which a population was scraped (first field of every entry).
population_cities = {entry[0] for entry in city_populations}

# Museum cities that still lack a population figure.
print(museum_cities - population_cities)

{'Taichung', 'New York City', 'Vatican City', 'Saint Petersburg', 'Taipei', 'Oświęcim', 'Suzhou', 'Washington, D.C.', "Xi'an"}


In [86]:
# Hand-entered populations for the museum cities that the scraped population
# table did not contain (the set printed by the previous cell).
missing_city_populations = [
    ['Taichung', 2816667],
    ['New York City', 8398748],
    ['Vatican City', 825],
    ['Saint Petersburg', 5351935],
    ['Taipei', 2646204],
    ['Oświęcim', 39057],
    ['Suzhou', 10721700],
    ['Washington, D.C.', 702455],
    ["Xi'an", 12000600],
]

# Only add cities that are not present yet, so running this cell more than once
# does not insert duplicate rows into city_populations.
known_cities = {entry[0] for entry in city_populations}
city_populations.extend(
    list(entry)  # copy so the reference table above is never aliased
    for entry in missing_city_populations
    if entry[0] not in known_cities
)

In [87]:
# Database URLs handed to SQLAlchemy's create_engine; museums and cities are
# written to two separate SQLite files.
CITY_DB_PATH = "sqlite:///city.sqlite"
MUSEUM_DB_PATH = "sqlite:///museum.sqlite"

In [103]:
# Persist the scraped data: one SQLite database per dataset.
from sqlalchemy import create_engine

city_engine = create_engine(CITY_DB_PATH, echo=False)
museum_engine = create_engine(MUSEUM_DB_PATH, echo=False)

# Name the columns at construction time; this also yields a correctly labelled
# empty frame when no rows were scraped, instead of a length-mismatch error.
cities_df = pd.DataFrame(city_populations, columns=['city', 'population'])
museum_df = pd.DataFrame(
    museums_rows,
    columns=['museum_name', 'city', 'nb_visitors', 'reported_year'],
)

# to_sql defaults to if_exists='fail', which raises ValueError as soon as the
# table already exists, i.e. on every run after the first because the .sqlite
# files persist on disk. Replace the table so the notebook can be re-run.
cities_df.to_sql('cities', con=city_engine, if_exists='replace')
museum_df.to_sql('museums', con=museum_engine, if_exists='replace')