In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# exploration questions:
# are there any months in the year that seem to be higher AQI? lower AQI?
# are there any gases that change in level frequently? are there any that seem to change in tandem with others?

## AQI Basics

    Daily AQI color     Level of concern           Value of index     Desc. of air quality

    Green               Good                       0 to 50            Air quality is satisfactory, and air pollution poses
                                                                      little or no risk.

    Yellow              Moderate                   51 to 100          Air quality is acceptable. However, there may be a
                                                                      risk for some people, particularly those who are
                                                                      unusually sensitive to air pollution.

    Orange              Unhealthy for sensitive    101 to 150         Members of sensitive groups may experience health
                                                                      effects. The general public is less likely to be
                                                                      affected.

    Red                 Unhealthy                  151 to 200         Some members of the general public may experience
                                                                      health effects; members of sensitive groups may
                                                                      experience more serious health effects.

    Purple              Very unhealthy             201 to 300         Health alert: The risk of health effects is increased
                                                                      for everyone.

    Maroon              Hazardous                  301+               Health warning of emergency conditions: everyone
                                                                      is more likely to be affected.
        

In [3]:
air_quality = pd.read_csv('data/aqi_data.csv')
air_quality.columns

Index(['Id', 'Mounths', 'PM10 in æg/m3', 'SO2 in æg/m3', 'NOx  in æg/m3',
       ' PM2.5  in æg/m3', 'Ammonia - NH3  in æg/m3', 'O3   in æg/m3',
       'CO  in mg/m3', ' Benzene  in æg/m3', 'AQI'],
      dtype='object')

In [4]:
# renaming columns
air_quality.columns = ['id', 'month', 'PM10', 'SO2', 'NOx', 'PM25', 'NH3', 'O3', 'CO', 'benzene', 'AQI']
air_quality

Unnamed: 0,id,month,PM10,SO2,NOx,PM25,NH3,O3,CO,benzene,AQI
0,1,Jan-17,174.0,26.4,35.0,79,25.0,107.6,0.90,0.70,149.0
1,2,Feb-17,143.0,35.1,40.3,75,31.0,103.0,0.90,0.90,129.0
2,3,Mar-17,142.0,32.1,30.9,59,26.0,80.7,0.80,0.50,128.0
3,4,Apr-17,117.0,50.9,36.3,75,36.0,79.5,0.90,0.70,111.0
4,5,May-17,,41.6,25.2,53,28.0,70.0,0.50,0.50,
...,...,...,...,...,...,...,...,...,...,...,...
67,68,Aug-22,89.0,12.0,35.0,22,26.0,15.2,0.64,0.09,67.0
68,69,Sep-22,92.0,13.0,37.3,30,31.0,10.0,0.70,0.06,76.0
69,70,Oct-22,93.0,12.0,30.7,46,28.0,9.7,0.67,0.07,101.0
70,71,Nov-22,87.0,10.4,28.2,78,30.0,12.1,1.52,0.03,166.0


In [5]:
# checking data types
air_quality.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72 entries, 0 to 71
Data columns (total 11 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   id       72 non-null     int64  
 1   month    72 non-null     object 
 2   PM10     66 non-null     float64
 3   SO2      71 non-null     float64
 4   NOx      70 non-null     float64
 5   PM25     72 non-null     int64  
 6   NH3      72 non-null     float64
 7   O3       72 non-null     float64
 8   CO       72 non-null     float64
 9   benzene  72 non-null     float64
 10  AQI      67 non-null     float64
dtypes: float64(8), int64(2), object(1)
memory usage: 6.3+ KB


In [6]:
air_quality.head()

Unnamed: 0,id,month,PM10,SO2,NOx,PM25,NH3,O3,CO,benzene,AQI
0,1,Jan-17,174.0,26.4,35.0,79,25.0,107.6,0.9,0.7,149.0
1,2,Feb-17,143.0,35.1,40.3,75,31.0,103.0,0.9,0.9,129.0
2,3,Mar-17,142.0,32.1,30.9,59,26.0,80.7,0.8,0.5,128.0
3,4,Apr-17,117.0,50.9,36.3,75,36.0,79.5,0.9,0.7,111.0
4,5,May-17,,41.6,25.2,53,28.0,70.0,0.5,0.5,


In [7]:
# selecting only non-null aqi values
aqi = air_quality.loc[air_quality.AQI.notnull()]

In [8]:
# creating new column with quality classification
# with for loop

for index, row in aqi.iterrows():
    if row.AQI <50:
        aqi.loc[index, 'level_of_concern'] = 'good'
    elif row.AQI <100:
        aqi.loc[index, 'level_of_concern'] = 'moderate'
    elif row.AQI <150:
        aqi.loc[index, 'level_of_concern'] = 'unhealthy for sensitive'
    elif row.AQI <200:
        aqi.loc[index, 'level_of_concern'] = 'unhealthy'
    elif row.AQI <300:
        aqi.loc[index, 'level_of_concern'] = 'very unhealthy'
    elif row.AQI >301:
        aqi.loc[index, 'level_of_concern'] = 'hazardous'
    else: # for any null values / mistakes
        aqi.loc[index, 'level_of_concern'] = '-1'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  aqi.loc[index, 'level_of_concern'] = 'unhealthy for sensitive'


In [9]:
# value counts for level of concern by number of months

months_count = aqi.level_of_concern.value_counts().reset_index().rename(columns = {'index':'level_of_concern', 'level_of_concern':'months_count'})
months_count

Unnamed: 0,level_of_concern,months_count
0,unhealthy for sensitive,37
1,moderate,27
2,unhealthy,3


In [17]:
# add year column & separate month & year, then convert month into integer

# first convert month column to datetime
aqi.dtypes

id                    int64
month                object
PM10                float64
SO2                 float64
NOx                 float64
PM25                  int64
NH3                 float64
O3                  float64
CO                  float64
benzene             float64
AQI                 float64
level_of_concern     object
dtype: object

In [19]:
aqi.month

0     Jan-17
1     Feb-17
2     Mar-17
3     Apr-17
6     Jul-17
       ...  
67    Aug-22
68    Sep-22
69    Oct-22
70    Nov-22
71    Dec-22
Name: month, Length: 67, dtype: object