### Wikipedia city scraping


In [2]:
# Imported before the `def` because the return annotation is evaluated when the
# function is defined; relying on a later cell for the name breaks Run All.
import pandas as pd


def scrape_wiki_cities(timeout: float = 30.0) -> pd.DataFrame:
    """Scrape Wikipedia's list of EU cities by population into a DataFrame.

    Downloads the page, selects the rows of every ``table.wikitable`` on it,
    and builds one DataFrame row per table row after the first.

    Column names come from the ``th`` cells of the first row, stripped,
    lower-cased and with spaces replaced by underscores. The first header
    cell and the last two are dropped, and the same ``[1:-2]`` slice is
    applied to the ``td`` cells of every data row. All values are returned
    as stripped strings; no type conversion happens here.

    Args:
        timeout: Seconds to wait for Wikipedia before giving up, so the cell
            cannot hang forever on a stalled connection.

    Returns:
        A DataFrame of strings with one row per scraped table row.

    Raises:
        RuntimeError: If the HTTP status is not 200, or if the page contains
            no matching table rows.
    """
    from bs4 import BeautifulSoup
    import requests

    # get html code
    doc_url = 'https://en.wikipedia.org/wiki/List_of_cities_in_the_European_Union_by_population_within_city_limits'
    response = requests.get(doc_url, timeout=timeout)
    if response.status_code != 200:
        raise RuntimeError(f'wikipedia returned code {response.status_code} for url = {doc_url}')
    soup = BeautifulSoup(response.content, 'html.parser')
    rows = soup.select('table.wikitable > tbody > tr')
    if not rows:
        # Fail with a readable message instead of an IndexError on rows[0].
        raise RuntimeError(f'no wikitable rows found at url = {doc_url}')

    # prettify the names and take only selected ones
    header = [h.text.strip().replace(' ', '_').lower() for h in rows[0].select('th')][1:-2]
    cities = [[cell.text.strip() for cell in city.select('td')[1:-2]] for city in rows[1:]]

    return pd.DataFrame(data=cities, columns=header)

# Run the scraper and keep the resulting table in `cities` for the cells below.
cities = scrape_wiki_cities()

In [3]:
from pandas import DataFrame
def cleanup_cities(df : DataFrame) -> None:
    """Convert the scraped string columns of ``df`` to proper dtypes, in place.

    ``officialpopulation`` has its thousands separators (``,``) removed and is
    cast to integer; ``date`` is parsed with ``pd.to_datetime``.

    The columns are replaced by plain column assignment rather than
    ``df.loc[:, col] = ...``: on pandas >= 2.0 the ``.loc`` form writes into
    the existing object-dtype column and the new dtype is lost.

    Args:
        df: Frame with ``officialpopulation`` and ``date`` columns. It is
            modified in place.

    Returns:
        None.
    """
    import pandas as pd
    # Cast to str first so that re-running the cell on an already-converted
    # (integer) column does not fail on the `.str` accessor.
    population_text = df['officialpopulation'].astype(str)
    df['officialpopulation'] = population_text.str.replace(',', '', regex=False).astype(int)
    df['date'] = pd.to_datetime(df['date'])

# Convert the column types in place, then print per-column dtypes and non-null counts.
cleanup_cities(cities)
cities.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94 entries, 0 to 93
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   city                94 non-null     object        
 1   member_state        94 non-null     object        
 2   officialpopulation  94 non-null     int32         
 3   date                94 non-null     datetime64[ns]
dtypes: datetime64[ns](1), int32(1), object(2)
memory usage: 2.7+ KB


In [4]:
# Summary statistics (count, mean, std, quartiles) for the population column.
cities.describe()

Unnamed: 0,officialpopulation
count,94.0
mean,736750.1
std,633466.2
min,300018.0
25%,353314.2
50%,543663.0
75%,744136.5
max,3664088.0


In [6]:
# Display the full `city` column as an array to eyeball the scraped names.
cities.city.array


<PandasArray>
[           'Berlin',            'Madrid',              'Rome',
         'Bucharest',             'Paris',            'Vienna',
           'Hamburg',            'Warsaw',          'Budapest',
         'Barcelona',            'Munich',             'Milan',
             'Sofia',            'Prague',           'Cologne',
         'Stockholm',            'Naples',         'Amsterdam',
         'Marseille',             'Turin',          'Valencia',
            'Kraków',            'Zagreb', 'Frankfurt am Main',
           'Seville',          'Zaragoza',              'Łódź',
            'Athens',          'Helsinki',         'Rotterdam',
           'Wrocław',        'Copenhagen',           'Palermo',
         'Stuttgart',        'Düsseldorf',              'Riga',
           'Leipzig',          'Dortmund',        'Gothenburg',
             'Essen',            'Málaga',            'Bremen',
             'Genoa',           'Vilnius',           'Dresden',
            'Dublin',     