In [7]:
import pandas as pd
import glob
import numpy as np
import os
# Root folder of the raw input files, given as a relative path.
# Cells below combine it with sub-paths either via os.path.join or by plain string concatenation.
data_dir = "../../data/raw_data/"

## US fuel economy data

In [29]:
# Read the US fuel-economy CSV, keeping only a positional subset of its columns.
us_fuel_data = "USA/fuel/US_fuel_econ_84-17.csv"
us_fuel_path = os.path.join(data_dir, us_fuel_data)
us_fuel_usecols = [1, 2, 3, 4, 5, 6, 10, 11, 12, 13, 14, 15, 17, 18, 24, 25, 35, 36]
df_us_fuel = pd.read_csv(us_fuel_path, usecols=us_fuel_usecols)

# Only rows with Year >= 2000 are analysed; rows are renumbered from 0 afterwards.
df_us_fuel = df_us_fuel.loc[df_us_fuel["Year"].ge(2000)].reset_index(drop=True)

In [30]:
# Normalise headers: lowercase, cut everything from the first " (" onwards,
# and turn the remaining spaces into underscores.
cleaned_headers = []
for header in df_us_fuel.columns:
    base_name = header.lower().split(' (')[0]
    cleaned_headers.append(base_name.replace(' ', '_'))
df_us_fuel.columns = cleaned_headers

# The 'class' column is exposed as 'veh_class' from here on.
df_us_fuel = df_us_fuel.rename(columns={'class': 'veh_class'})

In [47]:
# Prefer the unrounded highway figure whenever it is strictly positive; otherwise
# (zero, negative or missing) keep the rounded value already stored in 'highway_mpg'.
# The two columns are addressed by name instead of by a substring match, so the result
# does not depend on how many columns contain "highway" or on their left-to-right order.
highway_unrounded = df_us_fuel['unrounded_highway_mpg']
df_us_fuel['highway_mpg'] = highway_unrounded.where(highway_unrounded > 0.0, df_us_fuel['highway_mpg'])

# The helper column has been folded into 'highway_mpg' and is no longer needed.
df_us_fuel = df_us_fuel.drop(columns='unrounded_highway_mpg')

In [49]:
# Prefer the unrounded city figure whenever it is strictly positive; otherwise
# (zero, negative or missing) keep the rounded value already stored in 'city_mpg'.
# The two columns are addressed by name instead of by a substring match, so the result
# does not depend on how many columns contain "city" or on their left-to-right order.
city_unrounded = df_us_fuel['unrounded_city_mpg']
df_us_fuel['city_mpg'] = city_unrounded.where(city_unrounded > 0.0, df_us_fuel['city_mpg'])

# The helper column has been folded into 'city_mpg' and is no longer needed.
df_us_fuel = df_us_fuel.drop(columns='unrounded_city_mpg')

In [55]:
# Prefer the unrounded combined figure whenever it is strictly positive; otherwise
# (zero, negative or missing) keep the rounded value already stored in 'combined_mpg'.
# The two columns are addressed by name instead of by a substring match, so the result
# does not depend on how many columns contain "combined" or on their left-to-right order.
combined_unrounded = df_us_fuel['unrounded_combined_mpg']
df_us_fuel['combined_mpg'] = combined_unrounded.where(combined_unrounded > 0.0, df_us_fuel['combined_mpg'])

# The helper column has been folded into 'combined_mpg' and is no longer needed.
df_us_fuel = df_us_fuel.drop(columns='unrounded_combined_mpg')

In [51]:
# Row count per primary fuel (fuel_type_1); vehicles whose primary fuel is electricity will be dropped.
df_us_fuel['fuel_type_1'].value_counts()

Regular Gasoline     11729
Premium Gasoline      7981
Diesel                 179
Electricity            123
Midgrade Gasoline       77
Natural Gas             47
Name: fuel_type_1, dtype: int64

In [52]:
# Remove vehicles whose primary fuel is electricity, then renumber the rows.
# A boolean mask is used instead of feeding np.where() positions to drop(): drop() interprets
# its argument as index *labels*, which only coincide with positions on a freshly reset index.
df_us_fuel = df_us_fuel[df_us_fuel['fuel_type_1'] != 'Electricity'].reset_index(drop=True)

In [53]:
# Recode the charger columns as 1/0 flags: 1 where the column holds its marker letter, 0 otherwise.
for charger_col, marker in (('turbocharger', 'T'), ('supercharger', 'S')):
    df_us_fuel[charger_col] = np.where(df_us_fuel[charger_col] == marker, 1, 0)

In [56]:
# Number of missing values per column.
df_us_fuel.isnull().sum()

year                   0
make                   0
model                  0
veh_class              0
drive                  1
transmission           0
engine_cylinders       0
engine_displacement    0
turbocharger           0
supercharger           0
fuel_type              0
fuel_type_1            0
city_mpg               0
highway_mpg            0
combined_mpg           0
dtype: int64

In [57]:
# Show the rows that have no drive type recorded.
missing_drive = df_us_fuel['drive'].isna()
display(df_us_fuel[missing_drive])

# Fill the gap for the Lotus Esprit V8 with 'Rear-Wheel Drive'.
# The row is located by make/model rather than by a hard-coded position, and the value is
# written through a single .loc call: the chained form df['drive'].iat[...] = ... assigns
# into an intermediate Series and is not guaranteed to reach the DataFrame.
is_esprit_v8 = (df_us_fuel['make'] == 'Lotus') & (df_us_fuel['model'] == 'Esprit V8')
df_us_fuel.loc[missing_drive & is_esprit_v8, 'drive'] = 'Rear-Wheel Drive'

Unnamed: 0,year,make,model,veh_class,drive,transmission,engine_cylinders,engine_displacement,turbocharger,supercharger,fuel_type,fuel_type_1,city_mpg,highway_mpg,combined_mpg
2306,2002,Lotus,Esprit V8,Two Seaters,,Manual 5-Speed,8.0,3.5,1,0,Premium,Premium Gasoline,14.0,21.0,16.0


In [59]:
# Persist the cleaned fuel-economy table without the row index.
us_fuel_out_path = '../../data/transformed_data/USA/df_us_fuel.csv'
df_us_fuel.to_csv(us_fuel_out_path, index=False)

## US air quality data

**The major air pollutants most commonly measured are PM2.5, PM10, NO2, SO2, CO, and O3**
* `PM2.5`: Airborne Particulate Matter measuring 2.5 micrometers or smaller
* `PM10`: Airborne Particulate Matter measuring between 2.5 and 10 micrometers
* `NO2`: Nitrogen Dioxide
* `SO2`: Sulphur Dioxide
* `CO`: Carbon Monoxide
* `O3`: Ozone


### PM10 and PM2.5 data that is missing from the main dataset

#### PM10

# Collect the daily PM10 files.  The broad "daily*.csv" pattern also matches the
# "daily_88101*.csv" files that the PM2.5 section loads, so those are excluded here
# to keep PM2.5 readings out of the PM10 averages.
pm25_daily_files = set(glob.glob(data_dir + "/USA/air/daily_88101*.csv"))
pm10_glob = sorted(f for f in glob.glob(data_dir + "/USA/air/daily*.csv") if f not in pm25_daily_files)

# Stack all files into one frame; the column at position 2 is parsed as dates.
df_pm10 = pd.concat((pd.read_csv(f, parse_dates=[2]) for f in pm10_glob))

#__This will not run until the files are extracted from the zip archive__

# snake_case the headers, index the frame by local date and derive the calendar year from it.
df_pm10.columns = [label.lower().replace(" ", "_") for label in df_pm10.columns]
df_pm10 = df_pm10.set_index('date_local')
df_pm10['year'] = df_pm10.index.year

# Label each reading by its sample duration: '24 HOUR' samples vs. everything else.
pm10_is_24h = df_pm10['sample_duration'].eq('24 HOUR')
df_pm10['parameter_name'] = np.where(pm10_is_24h, 'pm10_24h', 'pm10')

# One row per year, one column per label, holding the average of 'arithmetic_mean'.
df_pm10_pivot = pd.pivot_table(df_pm10, values='arithmetic_mean', index='year', columns='parameter_name')
df_pm10_pivot = df_pm10_pivot.rename_axis("", axis=1)
df_pm10_pivot = df_pm10_pivot.rename(columns={'pm10':'pm10_mean', 'pm10_24h': 'pm10_24h_mean'})

# Attach a constant units column for each PM10 series.
for pm10_units_col in ('pm10_units', 'pm10_24h_units'):
    df_pm10_pivot[pm10_units_col] = 'μg/m3'

#### PM2.5

# Gather every daily_88101*.csv file and stack them into a single frame.
pm25_pattern = data_dir + '/USA/air/daily_88101*.csv'
pm25_glob = glob.glob(pm25_pattern)
pm25_frames = [pd.read_csv(csv_path) for csv_path in pm25_glob]
df_pm25 = pd.concat(pm25_frames)

#__This will not run until the files are extracted from the zip archive__

# Discard the first eight columns, then remove four columns that are not used further.
pm25_unused_cols = ['Pollutant Standard', 'Event Type', 'Observation Count', 'Observation Percent']
df_pm25 = df_pm25[df_pm25.columns[8:]].drop(columns=pm25_unused_cols)

# Keep the first five remaining columns and move 'Date Local' out of the columns into a DatetimeIndex.
df_pm25 = df_pm25[df_pm25.columns[:5]]
pm25_local_dates = pd.DatetimeIndex(df_pm25['Date Local'])
df_pm25 = df_pm25.drop(columns='Date Local')
df_pm25.index = pm25_local_dates

# snake_case the headers and derive the calendar year from the date index.
df_pm25.columns = [label.lower().replace(" ", "_") for label in df_pm25.columns]
df_pm25['year'] = df_pm25.index.year

# Discard the '24-HR BLK AVG' rows, then label what is left by sample duration:
# '24 HOUR' samples vs. everything else.
pm25_not_block_avg = df_pm25['sample_duration'] != '24-HR BLK AVG'
df_pm25 = df_pm25[pm25_not_block_avg].assign(
    parameter_name=lambda frame: np.where(frame['sample_duration'] == '24 HOUR', 'pm2.5_24h', 'pm2.5')
)

# One row per year, one column per label, holding the average of 'arithmetic_mean'.
df_pm25_pivot = pd.pivot_table(df_pm25, values='arithmetic_mean', index='year', columns='parameter_name')
df_pm25_pivot = df_pm25_pivot.rename_axis("", axis=1)
df_pm25_pivot = df_pm25_pivot.rename(columns={'pm2.5':'pm2.5_mean', 'pm2.5_24h': 'pm2.5_24h_mean'})

# Attach a constant units column for each PM2.5 series.
for pm25_units_col in ('pm2.5_units', 'pm2.5_24h_units'):
    df_pm25_pivot[pm25_units_col] = 'μg/m3'

#### 2017

In [109]:
# Read five columns (selected by position) of the 2017 annual-by-monitor file and snake_case the headers.
us_air2017_path = data_dir + "/USA/air/annual_conc_by_monitor_2017.csv"
df_us_air2017 = pd.read_csv(us_air2017_path, usecols=[8, 9, 13, 14, 27])
df_us_air2017 = df_us_air2017.rename(columns=lambda header: header.lower().replace(' ', '_'))

In [110]:
# The EPA provides granular details on PM pollutants; collapse every parameter name that
# contains "PM2.5" / "PM 2.5" into the single label "PM2.5".
# Raw strings avoid the invalid "\s" escape warning, the dot is escaped so it only matches a
# literal ".", and na=False makes missing names count as "no match" instead of truthy NaN.
df_us_air2017_pm25mask = df_us_air2017['parameter_name'].str.contains(r'PM\s?2\.5', na=False).values
df_us_air2017.loc[df_us_air2017_pm25mask, 'parameter_name'] = "PM2.5"

# Same generalisation for names containing "PM10" / "PM 10".
df_us_air2017_pm10mask = df_us_air2017['parameter_name'].str.contains(r'PM\s?10', na=False).values
df_us_air2017.loc[df_us_air2017_pm10mask, 'parameter_name'] = 'PM10'

# Swap the long gas names for their chemical shorthand (exact, whole-value matches only).
pollutant_short_names = {
    'Nitrogen dioxide (NO2)': "NO2",
    'Sulfur dioxide': "SO2",
    'Carbon monoxide': "CO",
    'Ozone': "O3",
}
df_us_air2017['parameter_name'] = df_us_air2017['parameter_name'].replace(pollutant_short_names)

In [111]:
# Restrict to these six pollutants, as they are the most commonly reported and observed.
tracked_pollutants = ["PM2.5", 'PM10', "NO2", "SO2", "CO", "O3"]
df_us_air2017 = df_us_air2017[df_us_air2017['parameter_name'].isin(tracked_pollutants)]

# Keep only the '1 HOUR' and '24 HOUR' samples, then renumber the rows.
kept_durations = ['1 HOUR', '24 HOUR']
df_us_air2017 = df_us_air2017[df_us_air2017['sample_duration'].isin(kept_durations)].reset_index(drop=True)

In [112]:
# Long unit descriptions -> short unit symbols (the "(LC)" and "(25 C)" variants share a symbol).
us_air2017_unit_mapper = dict([
    ('Micrograms/cubic meter (LC)', 'μg/m3'),
    ('Parts per million', 'ppm'),
    ('Parts per billion', 'ppb'),
    ('Micrograms/cubic meter (25 C)', 'μg/m3'),
    ('Nanograms/cubic meter (LC)', 'ng/m3'),
    ('Nanograms/cubic meter (25 C)', 'ng/m3'),
])

In [113]:
# Translate the long unit descriptions into short symbols (descriptions missing from the mapper become NaN).
df_us_air2017['units_of_measure'] = df_us_air2017['units_of_measure'].map(us_air2017_unit_mapper)

# Discard readings reported in ng/m3 and renumber the rows.
# A boolean mask is used instead of feeding np.where() positions to drop(): drop() interprets
# its argument as index *labels*, which only coincide with positions on a freshly reset index.
df_us_air2017 = df_us_air2017[df_us_air2017['units_of_measure'] != 'ng/m3'].reset_index(drop=True)

In [114]:
# Fold the sample duration into the pollutant name ('_24H' suffix for '24 HOUR' samples),
# after which the sample_duration column itself is redundant and removed.
flattened_names = []
for pollutant, duration in zip(df_us_air2017['parameter_name'], df_us_air2017['sample_duration']):
    flattened_names.append(pollutant + '_24H' if duration == '24 HOUR' else pollutant)
df_us_air2017['parameter_name'] = flattened_names
df_us_air2017 = df_us_air2017.drop(columns='sample_duration')

In [115]:
# Map each pollutant label to its unit symbol.
# Only the (parameter_name, units_of_measure) group keys are needed, so size() is used to
# obtain them; unlike mean() it does not aggregate (or choke on) the remaining columns.
# If a pollutant appears with more than one unit, the last pair in sorted key order wins.
param_unit_pairs = df_us_air2017.groupby(['parameter_name', 'units_of_measure']).size().index
us_air2017_paramunits_dict = {param: units for param, units in param_unit_pairs}

In [116]:
# One row per year, one column per pollutant label, holding the average of 'arithmetic_mean';
# both axis names are blanked.
pivoted_us_air_2017 = pd.pivot_table(df_us_air2017, values='arithmetic_mean', index='year', columns='parameter_name')
pivoted_us_air_2017 = pivoted_us_air_2017.rename_axis('').rename_axis("", axis=1)
pivoted_us_air_2017

Unnamed: 0,CO,NO2,O3,PM10,PM10_24H,PM2.5,PM2.5_24H,SO2
,,,,,,,,
2017.0,0.290675,12.672481,0.047358,22.240733,15.376554,7.429756,1.854176,3.005513


In [117]:
# Add a '<pollutant>_units' column next to every pollutant column.
# list(...) snapshots the current columns so the loop does not see the ones it adds.
for pollutant in list(pivoted_us_air_2017.columns):
    pivoted_us_air_2017[pollutant + '_units'] = us_air2017_paramunits_dict[pollutant]

# Lowercase all headers; value columns (those without '_units') get a '_mean' suffix.
pivoted_us_air_2017.columns = [
    label if '_units' in label else label + '_mean'
    for label in pivoted_us_air_2017.columns.str.lower()
]
pivoted_us_air_2017

Unnamed: 0,co_mean,no2_mean,o3_mean,pm10_mean,pm10_24h_mean,pm2.5_mean,pm2.5_24h_mean,so2_mean,co_units,no2_units,o3_units,pm10_units,pm10_24h_units,pm2.5_units,pm2.5_24h_units,so2_units
,,,,,,,,,,,,,,,,
2017.0,0.290675,12.672481,0.047358,22.240733,15.376554,7.429756,1.854176,3.005513,ppm,ppb,ppm,μg/m3,μg/m3,μg/m3,μg/m3,ppb


#### 2000 - 2016

In [171]:
# Read nine columns (selected by position) of the 2000-2016 file, parsing the first as dates,
# then snake_case the headers and derive the calendar year.
us_air_2000_2016_path = data_dir + "/USA/air/pollution_us_2000_2016.csv"
df_us_air2 = pd.read_csv(us_air_2000_2016_path, usecols=[0, 1, 2, 4, 5, 7, 8, 10, 11], parse_dates=[0])
df_us_air2 = df_us_air2.rename(columns=lambda header: header.lower().replace(' ', '_'))
df_us_air2['year'] = df_us_air2['date_local'].dt.year

In [174]:
# The full date is no longer needed once 'year' has been derived from it.
df_us_air2 = df_us_air2.drop(columns='date_local')

In [177]:
# Abbreviate the unit descriptions throughout the frame.
df_us_air2 = df_us_air2.replace(to_replace=['Parts per billion', 'Parts per million'], value=['ppb', 'ppm'])

# Take the unit of every '*_units' column from the first row.
# NOTE(review): this presumes each unit column is constant across rows - confirm.
unit_columns = [header for header in df_us_air2.columns if '_units' in header]
usair2_units = df_us_air2[unit_columns].iloc[0]

In [284]:
# Average every numeric column per year.  numeric_only=True skips the text-valued
# '*_units' columns explicitly; without it pandas >= 2.0 raises TypeError on them.
df_usa2_grouped = df_us_air2.groupby('year').mean(numeric_only=True)

# Re-attach the unit columns as constants taken from usair2_units.
for c in usair2_units.index:
    df_usa2_grouped[c] = usair2_units[c]

# Order the columns alphabetically.
df_usa2_grouped = df_usa2_grouped.sort_index(axis=1)

In [290]:
# Add the yearly PM10 / PM2.5 columns to the 2000-2016 table (joined on the year index),
# then stack the 2017 row underneath.  pd.concat replaces DataFrame.append, which was
# removed in pandas 2.0; both align on column names and keep the existing index.
df_us_air_2000_2016 = df_usa2_grouped.join([df_pm10_pivot, df_pm25_pivot])
df_us_air_table = pd.concat([df_us_air_2000_2016, pivoted_us_air_2017])

# Write the combined table, labelling the index column 'year'.
df_us_air_table.to_csv('../../data/transformed_data/USA/df_us_air.csv', index_label = 'year')