In [3]:
import pandas as pd
import numpy as np
import datetime

# import functions made by us
import functions

# The data files are not included in the repository because they are very large (60GB, 1.5GB, 100MB)

# Example reading SQL

In [1]:
# We tried to use SQL with traffic flow and parque2013, but one is too big (60GB in total) and the other
# gives us an error, so in this cell we import just one month of air data as an example.
import pymysql
from sqlalchemy import create_engine

from getpass import getpass  # To get the password without showing the input

In [5]:
# Ask for the MySQL password interactively so it is never stored in the notebook.
from urllib.parse import quote_plus

password = getpass()
# Percent-encode the password before embedding it in the URL: characters such as
# '@', '/', ':' or '%' would otherwise be read as URL delimiters and break the connection string.
connection_string = 'mysql+pymysql://root:'+quote_plus(password)+'@localhost/aire'
engine = create_engine(connection_string)
# Read the whole abr_mo01 table (the one-month air data example) into a DataFrame.
example = pd.read_sql_query('SELECT * FROM abr_mo01', engine)
example.head(5)

········


Unnamed: 0,PROVINCIA,MUNICIPIO,ESTACION,MAGNITUD,PUNTO_MUESTREO,ANO,MES,DIA,H01,V01,...,H20,V20,H21,V21,H22,V22,H23,V23,H24,V24
0,28,79,4,1,28079004_1_38,2001,4,1,20,V,...,9,V,11,V,18,V,27,V,34,V
1,28,79,4,1,28079004_1_38,2001,4,2,17,V,...,12,V,14,V,15,V,13,V,11,V
2,28,79,4,1,28079004_1_38,2001,4,3,11,V,...,9,V,10,V,11,V,10,V,9,V
3,28,79,4,1,28079004_1_38,2001,4,4,8,V,...,10,V,10,V,10,V,9,V,8,V
4,28,79,4,1,28079004_1_38,2001,4,5,8,V,...,9,V,9,V,11,V,13,V,14,V


## Reading air_quality data

In [2]:
# Reading air_quality data (one CSV per month, years 2010-2019) and concatenating all of it
anios = range(2010,2020)
folders = ["Anio"+str(anio) for anio in anios]
months = ["ene","feb","mar","abr","may","jun","jul","ago","sep","oct","nov","dic"]

# Collect every monthly frame first and concatenate once at the end: calling
# pd.concat inside the loop re-copies all previously loaded rows on every iteration.
monthly_frames = []

for folder in folders:
    # File names are "<month>_mo<last two digits of the year>.csv", e.g. "ene_mo10.csv"
    months_anio = [month_name+"_mo"+folder[-2:]+".csv" for month_name in months]

    for month in months_anio:
        new_month = pd.read_csv("./air_quality/"+folder+"/"+month,sep=";",encoding="latin-1")
        monthly_frames.append(new_month)

# Each file keeps its own row labels here, so the index repeats across files.
all_data = pd.concat(monthly_frames, axis=0)

## Cleaning data

In [3]:
# Station codes that lie within zone A+B; every other station is discarded
ZONE_AB_STATIONS = [38,48,4,35,8,49,47,11,39,50]

# Give the concatenated frame a clean 0..n-1 index before filtering
all_data = all_data.reset_index(drop=True)

# Keep only the rows measured at a zone A+B station, then renumber the rows again
in_zone = all_data['ESTACION'].isin(ZONE_AB_STATIONS)
all_data = all_data.loc[in_zone].reset_index(drop=True)

## Processing data

In [4]:
# Names of the hourly H-XX columns and of their companion V-XX columns (XX = 01..24)
hours = range(1, 25)
haches = ['H%02d' % hour for hour in hours]
uves = ['V%02d' % hour for hour in hours]

# Interleaved list in file order: H01, V01, H02, V02, ..., H24, V24
V_haches = [name for pair in zip(haches, uves) for name in pair]

In [5]:
# For every row, average the values H-XX whose flag V-XX equals 'V'; any other flag
# means that measure is not reliable, so it is not counted.
# The whole frame is processed column-wise in one pass: looping with iterrows() and
# writing one cell at a time through .loc is far slower, and it relied on the row
# labels being 0..n-1 for its progress counter.

# Boolean matrix (rows x 24), True where the flag is 'V'. Taking .values drops the
# V-XX labels so the mask lines up positionally with the H-XX columns below.
valid = all_data[uves].eq('V').values

# Replace unreliable readings by 0 so they add nothing to the row total.
# skipna=False keeps the total NaN when a reading flagged 'V' is itself missing,
# exactly as a plain running sum over the row would.
total = all_data[haches].where(valid, 0).sum(axis=1, skipna=False)
count = pd.Series(valid.sum(axis=1), index=all_data.index)

# If there are no reliable values for that day the mean is NaN
# (the zero count is masked to NaN instead of dividing by zero).
all_data['MEAN'] = total / count.where(count > 0)

print('100.0% completed.',end='\r')

100.0% completed.

In [6]:
# Averaging, for each magnitud (particle) and day, the measurements of the different stations
day_keys = ['MAGNITUD','ANO','MES','DIA']
all_data = all_data.groupby(day_keys)['MEAN'].mean().reset_index()

In [7]:
# Combining year-month-day into a single datetime column.
# pd.to_datetime assembles dates directly from columns named year / month / day,
# which avoids building and re-parsing one 'Y-M-D' string per row.
date_parts = all_data[['ANO','MES','DIA']].rename(columns={'ANO':'year','MES':'month','DIA':'day'})
all_data['FECHA'] = pd.to_datetime(date_parts)

In [8]:
# Keeping only the rows dated 2013-01-01 or later, then renumbering the rows
from_2013 = all_data['FECHA'] >= '2013-01-01'
all_data = all_data.loc[from_2013].reset_index(drop=True)

In [9]:
# Passing the frame through the project helper functions.aggregate_time2 on column 'FECHA' with days=15.
# Presumably this adds the 'time_range' column (first day of each time-range) that the next cell groups by.
# NOTE(review): with days=15 the ranges look like 15-day buckets rather than weeks — confirm in functions.py.
all_data = functions.aggregate_time2(all_data,'FECHA',days=15)

100.0% completed.

In [10]:
# Averaging the daily means of each magnitud within every time_range
bucket_keys = ['MAGNITUD','time_range']
all_data = all_data.groupby(bucket_keys)['MEAN'].mean().reset_index()

In [11]:
# Snake_case column names before saving: lower-case first, then spaces become underscores
lowered = [name.lower() for name in all_data.columns]
all_data.columns = pd.Index(lowered).str.replace(' ','_')
all_data

Unnamed: 0,magnitud,time_range,mean
0,1,2013-01-01,7.362202
1,1,2013-01-16,5.173782
2,1,2013-01-31,6.855948
3,1,2013-02-15,6.101359
4,1,2013-03-02,4.555366
...,...,...,...
2299,44,2019-10-27,0.114512
2300,44,2019-11-11,0.095403
2301,44,2019-11-26,0.091150
2302,44,2019-12-11,0.073052


In [13]:
# Writing the cleaned table to disk, without the row index
output_path = './air_quality/clean_air.csv'
all_data.to_csv(output_path, index=False)