In [1]:
import pandas as pd

# Load and prepare data

In [2]:
def load_data():
    '''
    A function for loading csv data into dataframe df.
    '''

    #Location of csv file
    csv_file = '../raw_data/air_pollution_data.csv'

    #Loading csv file into df dataframe
    df = pd.read_csv(csv_file)

    return df

def clean_data(df):
    '''
    A function to clean raw data:
    - Dropping unuseful columns
    - Dropping rows with year = NA
    - Dropping rows where pm10_concentration AND pm25_concentration AND no2_concentration are NA
    '''

    #Dropping columns: web_link, reference, iso3, who_ms, population_source, version, pm10_tempcov, pm25_tempcov, no2_tempcov
    df.drop(columns=['web_link',
                     'reference',
                     'iso3',
                     'who_ms',
                     'population_source',
                     'version',
                     'pm10_tempcov',
                     'pm25_tempcov',
                     'no2_tempcov'],
            inplace=True)

    #Dropping rows where year is NA (3 rows for India)
    df.dropna(subset=['year'], inplace=True)

    #Dropping rows where pm10_concentration AND pm25_concentration AND no2_concentration are NA
    df.dropna(how='all', subset=['pm10_concentration', 'pm25_concentration', 'no2_concentration'], inplace=True)

    return df


In [3]:
df = load_data()
df = clean_data(df)

In [4]:
def classify_concentrations(df):
    '''
    A function that classifies the concentrations of NO2, PM10, and PM2.5 into categories based on the European Air Quality Index (AQI) classification.
    '''
    # Define classification limits
    no2_limits = [0, 40, 90, 120, 230, 340, 1000]
    pm10_limits = [0, 10, 20, 25, 50, 75, 800]
    pm25_limits = [0, 20, 40, 50, 100, 150, 1200]

    # Classify PM10 concentrations
    df['pm10_class'] = pd.cut(df['pm10_concentration'], bins=pm10_limits, labels=[1, 2, 3, 4, 5, 6])

    # Classify PM2.5 concentrations
    df['pm25_class'] = pd.cut(df['pm25_concentration'], bins=pm25_limits, labels=[1, 2, 3, 4, 5, 6])

    # Classify NO2 concentrations
    df['no2_class'] = pd.cut(df['no2_concentration'], bins=no2_limits, labels=[1, 2, 3, 4, 5, 6])

    # Drop the original concentration columns
    df = df.drop(columns=['no2_concentration', 'pm10_concentration', 'pm25_concentration'])

    # Determine the target class as the maximum of the three pollutant classes
    df['target_class'] = df[['no2_class', 'pm10_class', 'pm25_class']].max(axis=1)
    return df


In [5]:
df = classify_concentrations(df)

In [6]:
df.head()

Unnamed: 0,who_region,country_name,city,year,type_of_stations,population,latitude,longitude,pm10_class,pm25_class,no2_class,target_class
0,4_Eur,Spain,A Coruna/ESP,2013.0,"Urban, Urban, Suburban",246146.0,43.3679,-8.418571,3,1,1,3
1,4_Eur,Spain,A Coruna/ESP,2014.0,"Urban, Urban, Suburban",247604.0,43.368033,-8.418233,4,1,1,4
2,4_Eur,Spain,A Coruna/ESP,2015.0,"Urban, Urban, Suburban, Suburban",247604.0,43.370375,-8.4229,4,1,1,4
3,4_Eur,Spain,A Coruna/ESP,2016.0,"Urban, Urban, Suburban, Suburban",247604.0,43.370375,-8.4229,3,1,1,3
4,4_Eur,Spain,A Coruna/ESP,2017.0,"Urban, Urban, Suburban, Suburban",247604.0,43.370375,-8.4229,4,1,1,4
