In [38]:
import pandas as pd
import numpy as np
import os
import glob

In [2]:
canada_air_data_dir = "../data/raw_data/Canada/air/"

## Canada air pollutants data prep

### Pollutants in consideration

PM2.5: Airborne Particulate Matter measuring 2.5 micrometers or smaller<br>
PM10: Airborne Particulate Matter measuring < 10 micrometers > 2.5 micrometers<br>
NO2: Nitrogen Dioxide<br>
SO2: Sulphur Dioxide<br>
CO: Carbon Monoxide<br>
O3: Ozone<br>

In [27]:
def prepCAair(pollutant, groupby=True):
    try:
        df_00_16 = pd.read_csv(os.path.join(canada_air_data_dir, pollutant.upper()+'_00_16.csv'), parse_dates=[0], usecols=[0, 5])
        df_17 = pd.read_csv(os.path.join(canada_air_data_dir, pollutant.upper()+'_17.csv'), parse_dates=[0], usecols=[0, 7])
        
    except FileNotFoundError:
        print('File not found. Accepted values {o3, co, no2, so2, pm25, pm10} case-indifferent')
        
    df = df_00_16.append(df_17, ignore_index=True)
    assert ['DATE_PST', 'RAW_VALUE'] == df.columns.values.tolist()
    
    df.rename(columns={'DATE_PST':'year', 'RAW_VALUE':pollutant.lower()+'_mean'}, inplace=True)
    df['year'] = df['year'].dt.year
    
    df = df[df.year < 2018]
    df = df[df[df.columns[1]] >= 0]
    if groupby:
        df_groupby = df.groupby('year').mean()
        return df_groupby
    return df

In [29]:
# Carbon Monoxide: CO
df_CA_air_CO = prepCAair('co')
df_CA_air_CO['co_units'] = 'ppm'

Unnamed: 0_level_0,co_mean,co_units
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,0.464702,ppm
2001,0.429632,ppm
2002,0.431412,ppm


In [30]:
# Ozone: O3
df_CA_air_O3 = prepCAair('o3')
df_CA_air_O3['o3_units'] = 'ppb'

Unnamed: 0_level_0,o3_mean,o3_units
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,14.981322,ppb
2001,16.44008,ppb
2002,16.30448,ppb


In [31]:
# Nitrogen DiOxide : NO2
df_CA_air_NO2 = prepCAair('no2')
df_CA_air_NO2['no2_units'] = 'ppb'

Unnamed: 0_level_0,no2_mean,no2_units
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,13.304986,ppb
2001,12.443451,ppb
2002,12.235973,ppb


In [32]:
# Sulfur Dioxide: SO2
df_CA_air_SO2 = prepCAair('so2')
df_CA_air_SO2['so2_units'] = 'ppb'

Unnamed: 0_level_0,so2_mean,so2_units
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,3.918739,ppb
2001,2.541681,ppb
2002,2.544733,ppb


In [33]:
# Particulate matter 2.5 micrometers and smaller in size
df_CA_air_PM25 = prepCAair('pm25')
df_CA_air_PM25['pm25_units'] = 'μg/m3'

Unnamed: 0_level_0,pm25_mean,pm25_units
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,5.977311,ug/m3
2001,5.384297,ug/m3
2002,5.973789,ug/m3


In [34]:
# Particulate matter 10 micrometers and smaller
df_CA_air_PM10 = prepCAair('pm10')
df_CA_air_PM10['pm10_units'] = 'μg/m3'

Unnamed: 0_level_0,pm10_mean,pm10_units
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,13.731744,ug/m3
2001,13.258304,ug/m3
2002,14.189306,ug/m3


In [36]:
df_CA_air = pd.concat([df_CA_air_CO, df_CA_air_O3, df_CA_air_NO2, df_CA_air_SO2, df_CA_air_PM25, df_CA_air_PM10], axis=1)
df_CA_air.head()

Unnamed: 0_level_0,co_mean,co_units,o3_mean,o3_units,no2_mean,no2_units,so2_mean,so2_units,pm25_mean,pm25_units,pm10_mean,pm10_units
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2000,0.464702,ppm,14.981322,ppb,13.304986,ppb,3.918739,ppb,5.977311,ug/m3,13.731744,ug/m3
2001,0.429632,ppm,16.44008,ppb,12.443451,ppb,2.541681,ppb,5.384297,ug/m3,13.258304,ug/m3
2002,0.431412,ppm,16.30448,ppb,12.235973,ppb,2.544733,ppb,5.973789,ug/m3,14.189306,ug/m3
2003,0.421084,ppm,17.127153,ppb,11.975094,ppb,3.047193,ppb,6.28433,ug/m3,14.939431,ug/m3
2004,0.424777,ppm,16.083696,ppb,11.233853,ppb,2.629271,ppb,6.001286,ug/m3,14.737286,ug/m3


In [37]:
df_CA_air.to_csv("../data/transformed_data/Canada/df_ca_air.csv")

## Canada fuel economy data prep

In [39]:
canada_fuel_data_dir = "../data/raw_data/Canada/fuel/"

In [54]:
CA_fuel_csv_glob = glob.glob(os.path.join(canada_fuel_data_dir, "*.csv"))

In [167]:
df_CA_fuel = pd.concat((pd.read_csv(f) for f in CA_fuel_csv_glob), ignore_index=True)
df_CA_fuel.head()

Unnamed: 0,YEAR,MAKE,MODEL,VEHICLE CLASS,ENGINE SIZE (L),CYLINDERS,TRANSMISSION,FUEL TYPE,CITY (L/100 km),HWY (L/100 km),COMB (L/100 km),COMB (mpg),CO2 EMISSIONS (g/km),CO2 RATING,SMOG RATING,CO2 EMISSIONS (g/km).1
0,2015,ACURA,ILX,COMPACT,2.0,4,AS5,Z,9.7,6.7,8.3,34,191.0,,,
1,2015,ACURA,ILX,COMPACT,2.4,4,M6,Z,10.8,7.4,9.3,30,214.0,,,
2,2015,ACURA,ILX HYBRID,COMPACT,1.5,4,AV7,Z,6.0,6.1,6.1,46,140.0,,,
3,2015,ACURA,MDX SH-AWD,SUV - SMALL,3.5,6,AS6,Z,12.7,9.1,11.1,25,255.0,,,
4,2015,ACURA,RDX AWD,SUV - SMALL,3.5,6,AS6,Z,12.1,8.7,10.6,27,244.0,,,


In [168]:
df_CA_fuel.columns = df_CA_fuel.columns.str.lower()
df_CA_fuel.columns = df_CA_fuel.columns.str.replace("\s+\(", "(" )

In [169]:
df_CA_fuel.head()

Unnamed: 0,year,make,model,vehicle class,engine size(l),cylinders,transmission,fuel type,city(l/100 km),hwy(l/100 km),comb(l/100 km),comb(mpg),co2 emissions(g/km),co2 rating,smog rating,co2 emissions(g/km).1
0,2015,ACURA,ILX,COMPACT,2.0,4,AS5,Z,9.7,6.7,8.3,34,191.0,,,
1,2015,ACURA,ILX,COMPACT,2.4,4,M6,Z,10.8,7.4,9.3,30,214.0,,,
2,2015,ACURA,ILX HYBRID,COMPACT,1.5,4,AV7,Z,6.0,6.1,6.1,46,140.0,,,
3,2015,ACURA,MDX SH-AWD,SUV - SMALL,3.5,6,AS6,Z,12.7,9.1,11.1,25,255.0,,,
4,2015,ACURA,RDX AWD,SUV - SMALL,3.5,6,AS6,Z,12.1,8.7,10.6,27,244.0,,,


In [170]:
# After renaming columns, we now have 2 duplicates

# Since these are from different datasets, it doesnt look like there is a row where both of these values are missing
(df_CA_fuel['co2 emissions(g/km)'].isna().sum(axis=1) > 1).sum()

0

In [171]:
co2_emissions = df_CA_fuel['co2 emissions(g/km)'].sum(axis=1)
df_CA_fuel.drop('co2 emissions(g/km)', axis=1, inplace=True)
df_CA_fuel['co2 emissions(g/km)'] = co2_emissions
df_CA_fuel.head()

Unnamed: 0,year,make,model,vehicle class,engine size(l),cylinders,transmission,fuel type,city(l/100 km),hwy(l/100 km),comb(l/100 km),comb(mpg),co2 rating,smog rating,co2 emissions(g/km)
0,2015,ACURA,ILX,COMPACT,2.0,4,AS5,Z,9.7,6.7,8.3,34,,,191.0
1,2015,ACURA,ILX,COMPACT,2.4,4,M6,Z,10.8,7.4,9.3,30,,,214.0
2,2015,ACURA,ILX HYBRID,COMPACT,1.5,4,AV7,Z,6.0,6.1,6.1,46,,,140.0
3,2015,ACURA,MDX SH-AWD,SUV - SMALL,3.5,6,AS6,Z,12.7,9.1,11.1,25,,,255.0
4,2015,ACURA,RDX AWD,SUV - SMALL,3.5,6,AS6,Z,12.1,8.7,10.6,27,,,244.0


In [172]:
transmission_decode_dict = {
    'A':'Automatic', 
    'AM':'Automated manual', 
    'AS':'Automatic with select shift',
    'AV':'Continuously variable',
    'M':'Manual'
}

fuel_type_decode_dict = {
    'X':'Regular',
    'Z':'Premium',
    'D':'Diesel',
    'E':'Ethanol (E85)',
    'N': 'Natural gas'    
}

In [175]:
df_CA_fuel['num_gears'] = df_CA_fuel.transmission.str.extract("(\d+)")
df_CA_fuel.transmission = df_CA_fuel.transmission.str.extract("([A-Z]+)")
df_CA_fuel.transmission = df_CA_fuel.transmission.map(transmission_decode_dict)
df_CA_fuel['fuel type'] = df_CA_fuel['fuel type'].map(fuel_type_decode_dict)

In [179]:
# CVT transmission is an diffferent approach to varying ratio gear sets, 
#    with a ratio pulley system most commonly used instead
# A gif explaining how it works: https://i.imgur.com/AmGPVy9.gifv

df_CA_fuel[df_CA_fuel.num_gears.isna()].transmission.value_counts()

Continuously variable    606
Name: transmission, dtype: int64

In [183]:
df_CA_fuel.num_gears.fillna(0,inplace=True)
df_CA_fuel.drop(['co2 rating', 'smog rating'], axis=1, inplace=True)
df_CA_fuel.columns = df_CA_fuel.columns.str.replace("0 ", "")
df_CA_fuel.columns = df_CA_fuel.columns.str.replace(" ", "_")

In [194]:
df_CA_fuel.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17553 entries, 0 to 17552
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   year                 17553 non-null  int64  
 1   make                 17553 non-null  object 
 2   model                17553 non-null  object 
 3   vehicle_class        17553 non-null  object 
 4   engine_size(l)       17553 non-null  float64
 5   cylinders            17553 non-null  int64  
 6   transmission         17553 non-null  object 
 7   fuel_type            17553 non-null  object 
 8   city(l/10km)         17553 non-null  float64
 9   hwy(l/10km)          17553 non-null  float64
 10  comb(l/10km)         17553 non-null  float64
 11  comb(mpg)            17553 non-null  int64  
 12  co2_emissions(g/km)  17553 non-null  float64
 13  num_gears            17553 non-null  object 
dtypes: float64(5), int64(3), object(6)
memory usage: 1.9+ MB


In [195]:
df_CA_fuel.describe()

Unnamed: 0,year,engine_size(l),cylinders,city(l/10km),hwy(l/10km),comb(l/10km),comb(mpg),co2_emissions(g/km)
count,17553.0,17553.0,17553.0,17553.0,17553.0,17553.0,17553.0,17553.0
mean,2009.168404,3.413046,5.9105,12.852561,8.847018,11.051342,27.351051,248.701589
std,4.972865,1.325078,1.7914,3.524094,2.302029,2.942049,7.354517,58.818511
min,2000.0,0.8,2.0,3.5,3.2,3.6,11.0,83.0
25%,2005.0,2.4,4.0,10.5,7.3,9.1,22.0,208.0
50%,2009.0,3.3,6.0,12.3,8.3,10.5,27.0,242.0
75%,2013.0,4.3,8.0,14.8,10.2,12.8,31.0,285.0
max,2017.0,8.4,16.0,30.6,20.6,26.1,78.0,570.0


In [200]:
df_CA_fuel.head()

Unnamed: 0,year,make,model,vehicle_class,engine_size(l),cylinders,transmission,fuel_type,city(l/10km),hwy(l/10km),comb(l/10km),comb(mpg),co2_emissions(g/km),num_gears
0,2015,ACURA,ILX,COMPACT,2.0,4,Automatic with select shift,Premium,9.7,6.7,8.3,34,191.0,5
1,2015,ACURA,ILX,COMPACT,2.4,4,Manual,Premium,10.8,7.4,9.3,30,214.0,6
2,2015,ACURA,ILX HYBRID,COMPACT,1.5,4,Continuously variable,Premium,6.0,6.1,6.1,46,140.0,7
3,2015,ACURA,MDX SH-AWD,SUV - SMALL,3.5,6,Automatic with select shift,Premium,12.7,9.1,11.1,25,255.0,6
4,2015,ACURA,RDX AWD,SUV - SMALL,3.5,6,Automatic with select shift,Premium,12.1,8.7,10.6,27,244.0,6


In [202]:
df_CA_fuel.to_csv('../data/transformed_data/Canada/df_CA_fuel.csv',index=False)

In [199]:
df_CA_fuel.groupby(['year','vehicle_class', 'engine_size(l)', 'cylinders']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,city(l/10km),hwy(l/10km),comb(l/10km),comb(mpg),co2_emissions(g/km)
year,vehicle_class,engine_size(l),cylinders,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2000,COMPACT,1.5,4,8.000000,5.800000,7.000000,40.750000,161.000000
2000,COMPACT,1.6,4,8.875000,6.700000,7.900000,35.750000,181.750000
2000,COMPACT,1.8,4,9.913333,7.066667,8.633333,33.133333,198.666667
2000,COMPACT,1.9,4,7.512500,5.125000,6.462500,44.750000,159.625000
2000,COMPACT,2.0,4,9.884211,6.921053,8.547368,33.157895,196.631579
...,...,...,...,...,...,...,...,...
2017,TWO-SEATER,6.5,12,23.250000,13.800000,19.000000,14.750000,447.000000
2017,TWO-SEATER,8.4,10,19.900000,12.600000,16.600000,17.000000,387.000000
2017,VAN - PASSENGER,3.5,6,16.200000,12.800000,14.600000,19.000000,344.000000
2017,VAN - PASSENGER,3.7,6,19.850000,15.750000,18.000000,16.500000,348.500000
