This notebook scrapes and cleans data from IQ Air. It will take 3-5 minutes to run and produces the dataset iq_air.csv

Data Source: https://www.iqair.com/world-most-polluted-cities?continent=&country=&state=&page=1&perPage=50&cities=

In [1]:
import requests as req
from bs4 import BeautifulSoup as bs
import pandas as pd
import numpy as np

Dictionary of per-continent settings, keyed by continent name. Each value is a two-item list: the first item is the continent ID that IQ Air uses in the URL's `continent=` query parameter, and the second is the number of result pages available for that continent (at 50 cities per page). All code after this cell is driven by this dictionary, so if the URL IDs or the amount of data change, they only need to be updated here and the rest of the script will adapt automatically.

In [2]:
# Per-continent scrape settings: continent name -> [continent ID used in the URL, number of pages].
cont_info = {
    name: [url_id, n_pages]
    for name, url_id, n_pages in (
        ('Africa', '59af92713e70001c1bd78e4e', 2),
        ('Asia', '59af92b13e70001c1bd78e53', 31),
        ('Europe', '59af92ac3e70001c1bd78e52', 24),
        ('North America', '59af928f3e70001c1bd78e4f', 35),
        ('Oceania', '59af92e43e70001c1bd78e54', 3),
        ('South America', '59af929e3e70001c1bd78e50', 2),
    )
}

Code for scraping data from IQ Air (takes 3-5 minutes).

In [3]:
# Scrape every continent listed in cont_info and stack the results into df_all.
# For each continent this loop will
#     (1) pull all pages of data for that continent,
#     (2) add a column denoting which continent the data belongs to, and
#     (3) queue the continent-specific dataframe for one final concatenation.
# Frames are collected in lists and combined once with pd.concat: DataFrame.append
# was removed in pandas 2.0, and growing a frame inside a loop re-copies every
# previously collected row on each iteration.
cont_frames = []
for key, val in cont_info.items():
    cont_name = key
    cont_url = val[0]
    cont_pgs = val[1]
    # one dataframe per scraped page of this continent
    page_frames = []
    for pg_num in range(1, cont_pgs + 1):
        # getting full page html
        url = f'https://www.iqair.com/world-most-polluted-cities?continent={cont_url}&country=&state=&page={pg_num}&perPage=50&cities='
        # the timeout keeps a stalled connection from hanging the notebook forever
        page = req.get(url, timeout=30)
        # fail loudly on HTTP errors instead of trying to parse an error page
        page.raise_for_status()
        soup = bs(page.content, 'html.parser')
        # find data table
        tbl = soup.find('table')
        if tbl is None:
            raise ValueError(f'No data table found at {url}')
        # extract column names from the table header
        cols = [str(col.text.strip()) for col in tbl.find_all('th')]
        # extract row data into a dictionary keyed by the first cell (the rank)
        row_dat = {}
        for row in tbl.find_all('tr'):
            dat_list = [d.text.strip() for d in row.find_all('td')]
            # rows without <td> cells (e.g. the header row) carry no data
            if len(dat_list) == 0:
                continue
            row_dat[int(dat_list[0])] = dat_list[1:]
        # convert dictionary to dataframe; the rank becomes the index
        df = pd.DataFrame.from_dict(row_dat, orient='index')
        # rename columns using scraped table header (first header belongs to the index)
        df.columns = cols[1:]
        page_frames.append(df)
    # a continent configured with zero pages contributes no rows
    if not page_frames:
        continue
    df_cont = pd.concat(page_frames)
    # add column with continent name to dataframe
    df_cont['continent'] = cont_name
    cont_frames.append(df_cont)

# pd.concat raises on an empty list, so fall back to an empty frame in that case
df_all = pd.concat(cont_frames) if cont_frames else pd.DataFrame()

df_all

Unnamed: 0,City,2019 AVG,JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC,2018 AVG,2017 AVG,continent
1,"Hartbeespoort, South Africa",60,42,65.6,68.5,57.5,112.1,41.1,31.4,59.9,61,76.3,21.4,-,-,-,Africa
2,"Bloemfontein, South Africa",42.3,12.6,11.2,-,42.8,62,102.1,72.8,49.7,33.8,28.9,20.1,16.4,-,-,Africa
3,"Springs, South Africa",39.1,14.2,13.9,12.2,28.9,66.7,103.4,86.8,50,36.3,25.6,11.5,12.3,-,-,Africa
4,"Vanderbijlpark, South Africa",34.7,-,-,-,25.6,53.9,59.3,56.8,42,25.9,20.9,14.8,12.9,-,-,Africa
5,"Sebokeng, South Africa",32.7,20.8,29,31.4,19.1,44.7,56.1,44.6,38.9,29.7,29,19.7,23.5,-,-,Africa
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80,"Tutamandahostel, Ecuador",6.7,-,-,-,7.1,7,6.4,6.7,6.1,7.6,7.6,5.8,6.6,-,-,South America
81,"Calama, Chile",6.4,6.9,5,5.7,7.5,5.1,7.1,6.3,6.7,6.7,6.7,6.3,6.8,13.9,9.8,South America
82,"Puerto Baquerizo Moreno, Ecuador",5.8,-,8.7,12.7,7.4,4.3,-,3.6,3.3,3.4,2.8,2.3,3.8,-,-,South America
83,"Punta Arenas, Chile",4.7,3.4,4.1,4.2,4.2,5.5,5.9,5.5,5.1,4.6,4.8,4.6,4.1,4.5,5.5,South America


Cleaning Scraped data

In [4]:
# The scraped index holds each city's rank within its continent, so move it into
# a regular column (called 'index' until the rename below) and renumber the rows.
df_all.reset_index(inplace=True)

# measurement columns, named as they appear after renaming and lowercasing
float_cols = ['avg_2019',
              'jan', 'feb', 'mar', 'apr', 'may', 'jun',
              'jul', 'aug', 'sep', 'oct', 'nov', 'dec',
              'avg_2018', 'avg_2017']

df_all = (
    df_all
    # give the rank and yearly-average columns identifier-friendly names
    .rename(columns={'index': 'cont_rank',
                     '2019 AVG': 'avg_2019',
                     '2018 AVG': 'avg_2018',
                     '2017 AVG': 'avg_2017'})
    # lowercase every remaining column name
    .rename(columns=str.lower)
    # the site marks missing measurements with a hyphen
    .replace('-', np.nan)
    # measurements were scraped as text; convert them to floats
    .astype({col: 'float' for col in float_cols})
)

In [5]:
# extract country name
def cln_country(city):
    """Return the country part of a scraped 'City, Country' label.

    The label is split at the first ', ' and everything after it is returned,
    so a country name that itself contains ', ' is kept intact.

    Parameters
    ----------
    city : str
        Combined label such as 'Springs, South Africa'.

    Returns
    -------
    str
        The text after the first ', ', or an empty string when the label
        contains no ', ' separator (previously such a label came back with
        its first character chopped off).
    """
    _, _, country = city.partition(', ')
    return country

# derive the country while the 'city' column still holds the full "City, Country" label
df_all['country'] = df_all['city'].apply(cln_country)

# extract city name
def cln_city(city):
    """Return the city part of a scraped 'City, Country' label.

    Parameters
    ----------
    city : str
        Combined label such as 'Springs, South Africa'.

    Returns
    -------
    str
        The text before the first ', '. A label without a ', ' separator is
        returned unchanged (previously its last character was chopped off).
    """
    name, _, _ = city.partition(', ')
    return name

# overwrite the combined "City, Country" label with the bare city name
df_all['city'] = df_all['city'].apply(cln_city)

df_all

Unnamed: 0,cont_rank,city,country,continent,avg_2019,jan,feb,mar,apr,may,jun,jul,aug,sep,oct,nov,dec,avg_2018,avg_2017
0,1,Hartbeespoort,South Africa,Africa,60.0,42.0,65.6,68.5,57.5,112.1,41.1,31.4,59.9,61.0,76.3,21.4,,,
1,2,Bloemfontein,South Africa,Africa,42.3,12.6,11.2,,42.8,62.0,102.1,72.8,49.7,33.8,28.9,20.1,16.4,,
2,3,Springs,South Africa,Africa,39.1,14.2,13.9,12.2,28.9,66.7,103.4,86.8,50.0,36.3,25.6,11.5,12.3,,
3,4,Vanderbijlpark,South Africa,Africa,34.7,,,,25.6,53.9,59.3,56.8,42.0,25.9,20.9,14.8,12.9,,
4,5,Sebokeng,South Africa,Africa,32.7,20.8,29.0,31.4,19.1,44.7,56.1,44.6,38.9,29.7,29.0,19.7,23.5,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4674,80,Tutamandahostel,Ecuador,South America,6.7,,,,7.1,7.0,6.4,6.7,6.1,7.6,7.6,5.8,6.6,,
4675,81,Calama,Chile,South America,6.4,6.9,5.0,5.7,7.5,5.1,7.1,6.3,6.7,6.7,6.7,6.3,6.8,13.9,9.8
4676,82,Puerto Baquerizo Moreno,Ecuador,South America,5.8,,8.7,12.7,7.4,4.3,,3.6,3.3,3.4,2.8,2.3,3.8,,
4677,83,Punta Arenas,Chile,South America,4.7,3.4,4.1,4.2,4.2,5.5,5.9,5.5,5.1,4.6,4.8,4.6,4.1,4.5,5.5


Add iso3 country codes as match keys. Kosovo = KSV for the purposes of this project, even though it is technically not assigned an iso3 code currently.

In [15]:
# Load the country -> iso3 lookup table.
# NOTE: because open() is called without an explicit encoding, `enc` is Python's
# default text encoding for this platform; it is not detected from the file's
# contents. The handle is opened in a `with` block so that it is closed again
# rather than leaked.
with open('iso3.csv') as iso3_file:
    enc = iso3_file.encoding
iso3 = pd.read_csv('iso3.csv', encoding=enc)
iso3

Unnamed: 0,country,iso3
0,Aruba,ABW
1,Afghanistan,AFG
2,Angola,AGO
3,Anguilla,AIA
4,Åland Islands,ALA
...,...,...
256,Taiwan,TWN
257,U.S. Virgin Islands,VIR
258,USA,USA
259,Vietnam,VNM


In [18]:
# Attach an iso3 code to every city row. validate="many_to_one" makes pandas raise
# a MergeError if a country name appears more than once in the lookup table, which
# would otherwise silently duplicate city rows in the left join.
df_all = pd.merge(df_all, iso3, how="left", on="country", validate="many_to_one")
# any rows listed here have a country name with no match in the lookup table
print(df_all[df_all['iso3'].isnull()])

Empty DataFrame
Columns: [cont_rank, city, country, continent, avg_2019, jan, feb, mar, apr, may, jun, jul, aug, sep, oct, nov, dec, avg_2018, avg_2017, iso3]
Index: []


The result is empty, so there are no mismatched values: every country name in the scraped data matched an iso3 code.

In [19]:
# put the identifying columns first, then the 2019 average, the monthly values
# and finally the averages of the two earlier years
id_cols = ['cont_rank', 'city', 'country', 'iso3', 'continent']
month_cols = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
              'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
col_order = [*id_cols, 'avg_2019', *month_cols, 'avg_2018', 'avg_2017']
df_all = df_all[col_order]
df_all

Unnamed: 0,cont_rank,city,country,iso3,continent,avg_2019,jan,feb,mar,apr,may,jun,jul,aug,sep,oct,nov,dec,avg_2018,avg_2017
0,1,Hartbeespoort,South Africa,ZAF,Africa,60.0,42.0,65.6,68.5,57.5,112.1,41.1,31.4,59.9,61.0,76.3,21.4,,,
1,2,Bloemfontein,South Africa,ZAF,Africa,42.3,12.6,11.2,,42.8,62.0,102.1,72.8,49.7,33.8,28.9,20.1,16.4,,
2,3,Springs,South Africa,ZAF,Africa,39.1,14.2,13.9,12.2,28.9,66.7,103.4,86.8,50.0,36.3,25.6,11.5,12.3,,
3,4,Vanderbijlpark,South Africa,ZAF,Africa,34.7,,,,25.6,53.9,59.3,56.8,42.0,25.9,20.9,14.8,12.9,,
4,5,Sebokeng,South Africa,ZAF,Africa,32.7,20.8,29.0,31.4,19.1,44.7,56.1,44.6,38.9,29.7,29.0,19.7,23.5,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4674,80,Tutamandahostel,Ecuador,ECU,South America,6.7,,,,7.1,7.0,6.4,6.7,6.1,7.6,7.6,5.8,6.6,,
4675,81,Calama,Chile,CHL,South America,6.4,6.9,5.0,5.7,7.5,5.1,7.1,6.3,6.7,6.7,6.7,6.3,6.8,13.9,9.8
4676,82,Puerto Baquerizo Moreno,Ecuador,ECU,South America,5.8,,8.7,12.7,7.4,4.3,,3.6,3.3,3.4,2.8,2.3,3.8,,
4677,83,Punta Arenas,Chile,CHL,South America,4.7,3.4,4.1,4.2,4.2,5.5,5.9,5.5,5.1,4.6,4.8,4.6,4.1,4.5,5.5


In [20]:
# save the cleaned dataset; index=False keeps the row index out of the file
out_path = 'iq_air.csv'
df_all.to_csv(out_path, index=False)