In [20]:
import enum
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

### Load dataset

In [21]:
# Location of the annual county-level AQI CSV.
# The original machine-specific path is kept as the default so existing runs
# behave exactly as before; set the AQI_DATASET environment variable to point
# at the file when running the notebook on another machine.
DATASET = os.environ.get(
    "AQI_DATASET",
    r"C:\Users\91930\Documents\GITHUB\ArtOfAI\dataset\annual_aqi_by_county_2023.csv",
)

In [31]:
# Read the CSV at DATASET into the working DataFrame used by the rest of the notebook.
aqi = pd.read_csv(DATASET)

In [32]:
# Preview the first rows to check that the file loaded with the expected columns.
aqi.head()

Unnamed: 0,State,County,Year,Days with AQI,Good Days,Moderate Days,Unhealthy for Sensitive Groups Days,Unhealthy Days,Very Unhealthy Days,Hazardous Days,Max AQI,90th Percentile AQI,Median AQI,Days CO,Days NO2,Days Ozone,Days PM2.5,Days PM10
0,Alabama,Baldwin,2023,170,143,27,0,0,0,0,90,54,40,0,0,84,86,0
1,Alabama,Clay,2023,155,109,46,0,0,0,0,83,61,40,0,0,0,155,0
2,Alabama,DeKalb,2023,212,155,55,2,0,0,0,133,63,43,0,0,141,71,0
3,Alabama,Elmore,2023,118,102,16,0,0,0,0,90,54,40,0,0,118,0,0
4,Alabama,Etowah,2023,181,126,55,0,0,0,0,100,64,43,0,0,74,107,0


### How to create new columns derived from existing columns

In [24]:
# Look at the frame again before deriving new columns from the existing ones.
aqi.head()

Unnamed: 0,State,County,Year,Days with AQI,Good Days,Moderate Days,Unhealthy for Sensitive Groups Days,Unhealthy Days,Very Unhealthy Days,Hazardous Days,Max AQI,90th Percentile AQI,Median AQI,Days CO,Days NO2,Days Ozone,Days PM2.5,Days PM10
0,Alabama,Baldwin,2023,170,143,27,0,0,0,0,90,54,40,0,0,84,86,0
1,Alabama,Clay,2023,155,109,46,0,0,0,0,83,61,40,0,0,0,155,0
2,Alabama,DeKalb,2023,212,155,55,2,0,0,0,133,63,43,0,0,141,71,0
3,Alabama,Elmore,2023,118,102,16,0,0,0,0,90,54,40,0,0,118,0,0
4,Alabama,Etowah,2023,181,126,55,0,0,0,0,100,64,43,0,0,74,107,0


In [25]:
@enum.unique
class AQIEnum(enum.Enum):
    """Integer codes for the AQI categories that ``check_aqi`` hands out.

    The codes increase with the AQI range they stand for, from 0 (lowest
    range) to 5 (highest range).
    """

    GOOD = 0    # assigned to values from 0 up to 50
    YELLOW = 1  # above 50, up to 100
    ORANGE = 2  # above 100, up to 150
    RED = 3     # above 150, up to 200
    PURPLE = 4  # above 200, up to 300
    MAROON = 5  # everything above 300

In [26]:
def check_aqi(val: int) -> int:
    """Map an AQI value to the integer code of its ``AQIEnum`` category.

    Args:
        val: The AQI value to classify; must not be negative.

    Returns:
        The ``.value`` of the matching ``AQIEnum`` member:
        0-50 -> GOOD (0), >50-100 -> YELLOW (1), >100-150 -> ORANGE (2),
        >150-200 -> RED (3), >200-300 -> PURPLE (4), above 300 -> MAROON (5).

    Raises:
        ValueError: If ``val`` is negative.
    """
    if val < 0:
        raise ValueError("AQI Value should not be negative")

    # Upper bounds are inclusive and tested in ascending order, so every
    # branch only has to check its own upper limit: the lower limit is
    # already guaranteed by the branches that did not match before it.
    if val <= 50:
        return AQIEnum.GOOD.value
    if val <= 100:
        return AQIEnum.YELLOW.value
    if val <= 150:
        return AQIEnum.ORANGE.value
    if val <= 200:
        return AQIEnum.RED.value
    if val <= 300:
        return AQIEnum.PURPLE.value
    # Anything that matched none of the ranges above ends up here.
    return AQIEnum.MAROON.value

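We treat *Days with AQI* as the AQI value to classify, and we are going to add another column to the DataFrame *aqi* that holds the corresponding AQI category in short (numeric) form. Note that in the preview above *Days with AQI* equals the sum of the per-category day counts (e.g. 143 + 27 = 170), so it appears to be a count of days rather than an AQI reading; it is used here only to demonstrate how to derive a new column.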

In [35]:
# Classify every value of "Days with AQI" with check_aqi and store the
# resulting category codes in a new "AQI SHORT" column.
# NOTE(review): in the previewed rows "Days with AQI" equals the sum of the
# per-category day counts (e.g. 143 + 27 = 170), so it looks like a count of
# days rather than an AQI reading — confirm this is the intended input column
# (e.g. versus "Median AQI" or "Max AQI").
aqi['AQI SHORT'] = [check_aqi(x) for x in aqi["Days with AQI"]]

To create a new column, use the `[]` brackets with the new column name at the left side of the assignment.

#### Note:

Column calculations in pandas are normally done **element-wise**. In our calculation, however, we have used an explicit loop, because each AQI value has to be mapped to its short-hand category code.

In [36]:
# Preview the frame again; the new "AQI SHORT" column should appear as the last column.
aqi.head()

Unnamed: 0,State,County,Year,Days with AQI,Good Days,Moderate Days,Unhealthy for Sensitive Groups Days,Unhealthy Days,Very Unhealthy Days,Hazardous Days,Max AQI,90th Percentile AQI,Median AQI,Days CO,Days NO2,Days Ozone,Days PM2.5,Days PM10,AQI SHORT
0,Alabama,Baldwin,2023,170,143,27,0,0,0,0,90,54,40,0,0,84,86,0,3
1,Alabama,Clay,2023,155,109,46,0,0,0,0,83,61,40,0,0,0,155,0,3
2,Alabama,DeKalb,2023,212,155,55,2,0,0,0,133,63,43,0,0,141,71,0,4
3,Alabama,Elmore,2023,118,102,16,0,0,0,0,90,54,40,0,0,118,0,0,2
4,Alabama,Etowah,2023,181,126,55,0,0,0,0,100,64,43,0,0,74,107,0,3


### I want to compute the ratio of the values of two columns and save the result in a new column

We are going to compute the ratio of good days to moderate days.

In [39]:
# Ratio of good days to moderate days for each row, stored as a new column.
# NOTE(review): a county with 0 moderate days would presumably yield inf/NaN
# here rather than raising — confirm whether such rows need handling.
aqi['Good vs Moderate'] = (
    aqi['Good Days'] / aqi['Moderate Days']
)
# Preview the newly created column.
aqi['Good vs Moderate'].head()

0    5.296296
1    2.369565
2    2.818182
3    6.375000
4    2.290909
Name: Good vs Moderate, dtype: float64

The calculation is done element-wise, so the `/` is applied *for the values in each row*.

Also other mathematical operators (`+`, `-`, `*`, `/`, ...) or logical operators (`<`, `>`, and `==`, ...) work element-wise.

### I want to rename a data column

In [42]:
# Rename one column via a {old_name: new_name} mapping. `rename` is called
# without `inplace`, and its result is bound to the new name `aqi_renamed`.
# NOTE(review): the new label suggests a per-day AQI value, while the preview
# rows suggest this column is a count of days — confirm the label is accurate.
aqi_renamed = aqi.rename(
    columns={
        "Days with AQI": "AQI Per Day"
    }

)

In [45]:
# Confirm the rename: the "Days with AQI" header should now read "AQI Per Day".
aqi_renamed.head()

Unnamed: 0,State,County,Year,AQI Per Day,Good Days,Moderate Days,Unhealthy for Sensitive Groups Days,Unhealthy Days,Very Unhealthy Days,Hazardous Days,Max AQI,90th Percentile AQI,Median AQI,Days CO,Days NO2,Days Ozone,Days PM2.5,Days PM10,AQI SHORT,Good vs Moderate
0,Alabama,Baldwin,2023,170,143,27,0,0,0,0,90,54,40,0,0,84,86,0,3,5.296296
1,Alabama,Clay,2023,155,109,46,0,0,0,0,83,61,40,0,0,0,155,0,3,2.369565
2,Alabama,DeKalb,2023,212,155,55,2,0,0,0,133,63,43,0,0,141,71,0,4,2.818182
3,Alabama,Elmore,2023,118,102,16,0,0,0,0,90,54,40,0,0,118,0,0,2,6.375
4,Alabama,Etowah,2023,181,126,55,0,0,0,0,100,64,43,0,0,74,107,0,3,2.290909


The mapping is not restricted to fixed names; it can also be a mapping function. For example, converting the column names to lowercase letters can be done by passing a function:

In [46]:
# Pass a callable instead of a dict as the `columns` mapping; here `str.lower`
# produces the lower-cased labels.
aqi_column_name_lower = aqi_renamed.rename(columns=str.lower)
# Show the resulting column labels to verify the change.
aqi_column_name_lower.columns

Index(['state', 'county', 'year', 'aqi per day', 'good days', 'moderate days',
       'unhealthy for sensitive groups days', 'unhealthy days',
       'very unhealthy days', 'hazardous days', 'max aqi',
       '90th percentile aqi', 'median aqi', 'days co', 'days no2',
       'days ozone', 'days pm2.5', 'days pm10', 'aqi short',
       'good vs moderate'],
      dtype='object')