# Exploring data by pollutant

In [None]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

%matplotlib inline

In [None]:
# Locations of the raw CSV inputs (relative paths).
path_raw_original = '../data/raw/original/'
filename_pollutants = 'Measurement_item_info.csv'
filename_instruments = 'Measurement_info.csv'

# df_pollutants: item code / item name lookup table.
# df: the measurement records that the rest of the notebook works on.
df_pollutants = pd.read_csv('{}{}'.format(path_raw_original, filename_pollutants))
df = pd.read_csv('{}{}'.format(path_raw_original, filename_instruments))


In [None]:
# Preview the first six rows of the pollutant lookup table.
df_pollutants.head(n=6)

In [None]:
# Replace the item code with the item name and shorten the column names.
#
# The columns are renamed first and every later step addresses the *new*
# names, so this cell can be re-run on an already-processed `df` without a
# KeyError: `rename` ignores labels that are no longer present, `replace`
# leaves values that are not keys of the mapping untouched, and
# `pd.to_datetime` leaves an already-datetime column as it is.

# {item code -> item name}, built from the pollutant lookup table
pollutants_dict = df_pollutants.set_index('Item code')['Item name'].to_dict()

df = (
    df.rename(columns={"Measurement date": "datetime",
                       "Station code": "station",
                       "Item code": "pollutant",
                       "Average value": "value",
                       "Instrument status": "operational"})
      .assign(pollutant=lambda d: d['pollutant'].replace(pollutants_dict),
              datetime=lambda d: pd.to_datetime(d['datetime']))
)
df

In [None]:
# Sanity check: evaluates to True if at least one cell anywhere in df is missing.

df.isna().to_numpy().any()

In [None]:
# Keep only the rows whose reading is strictly positive.
# NOTE(review): besides negative readings this also discards rows where
# value == 0 — confirm that dropping zeros is intended.

df = df.loc[df['value'].gt(0)]
df

In [None]:
# Split the measurements by pollutant and, within each pollutant, by
# instrument status code; every subset is kept in `pollutants` and pickled.
#
# pollutants[<pollutant name>] maps:
#   'all'  -> every row for that pollutant
#   'not0' -> rows whose status code is anything other than 0
#   <code> -> rows with exactly that status code

# DataFrame.to_pickle does not create missing parent directories, so make
# sure the output folder exists before writing (e.g. on a fresh checkout).
path_processed = '../data/processed/'
os.makedirs(path_processed, exist_ok=True)

pollutants = {}

for p in ['SO2', 'NO2', 'CO', 'O3', 'PM10', 'PM2.5']:
    sub = df.loc[df['pollutant'] == p]
    pollutants[p] = {}
    pollutants[p]['all'] = sub
    pollutants[p]['not0'] = sub.loc[sub['operational'] != 0]
    for c in [0, 1, 2, 4, 8, 9]:
        pollutants[p][c] = sub.loc[sub['operational'] == c]

    # one file per subset, named <pollutant>_<key>.pkl
    for k, subset in pollutants[p].items():
        subset.to_pickle(path_processed + '{}_{}.pkl'.format(p, k))
        

### Plotting some example pollutants and instrument status codes

In [None]:
# Scatter of every SO2 reading recorded with instrument status code 0.
# The trailing semicolon keeps the Axes repr out of the cell output.
pollutants['SO2'][0].plot.scatter(x='datetime', y='value', figsize=(12, 6),
                                  title='SO2 at normal instrument operation across all stations');

In [None]:
# Overlay SO2 readings by instrument status code on one set of axes:
# status 0 in grey as the backdrop, then codes 9, 8, 4 and 2 on top, each in
# its own colour. Every call after the first draws onto ax1 via `ax=ax1`.
so2_by_status = pollutants['SO2']
so2_scatter_kw = dict(x='datetime', y='value', figsize=(12, 6))

ax1 = so2_by_status[0].plot.scatter(color='grey', label='0 - normal operation',
                                    title='Non-normal operational data points for SO2 pollutant',
                                    **so2_scatter_kw)
ax2 = so2_by_status[9].plot.scatter(color='r', ax=ax1, label='9 - abnormal data',
                                    **so2_scatter_kw)
ax3 = so2_by_status[8].plot.scatter(color='g', ax=ax1, label='8 - under repair',
                                    **so2_scatter_kw)
ax4 = so2_by_status[4].plot.scatter(color='y', ax=ax1, label='4 - power cut off',
                                    **so2_scatter_kw)
ax5 = so2_by_status[2].plot.scatter(color='orange', ax=ax1, label='2 - abnormal',
                                    **so2_scatter_kw)


In [None]:
# Same overlay of SO2 readings by instrument status code, but with the y-axis
# limited to (0, 0.4) on the first call so the bulk of the readings is
# visible. Every call after the first draws onto ax1 via `ax=ax1`.
so2_by_status = pollutants['SO2']
so2_scatter_kw = dict(x='datetime', y='value', figsize=(12, 6))

ax1 = so2_by_status[0].plot.scatter(ylim=(0,0.4), color='grey', label='0 - normal operation',
                                    title='Non-normal operational data points for SO2 pollutant',
                                    **so2_scatter_kw)
ax2 = so2_by_status[9].plot.scatter(color='r', ax=ax1, label='9 - abnormal data',
                                    **so2_scatter_kw)
ax3 = so2_by_status[8].plot.scatter(color='g', ax=ax1, label='8 - under repair',
                                    **so2_scatter_kw)
ax4 = so2_by_status[4].plot.scatter(color='y', ax=ax1, label='4 - power cut off',
                                    **so2_scatter_kw)
ax5 = so2_by_status[2].plot.scatter(color='orange', ax=ax1, label='2 - abnormal',
                                    **so2_scatter_kw)


In [None]:
# Percentage of SO2 rows that fall under each instrument status code.
pollutants['SO2']['all']['operational'].value_counts(normalize=True).mul(100)

In [None]:
# Summary statistics over all SO2 rows, regardless of instrument status code.
so2_all = pollutants['SO2']['all']
so2_all.describe()