In [1]:
import pandas as pd
import numpy as np
import os
import glob

### Pollutants in consideration

PM2.5: Airborne Particulate Matter measuring 2.5 micrometers or smaller<br>
PM10: Airborne Particulate Matter measuring < 10 micrometers > 2.5 micrometers<br>
NO2: Nitrogen Dioxide<br>
SO2: Sulphur Dioxide<br>
CO: Carbon Monoxide<br>
O3: Ozone<br>

In [688]:
uk_data_air_path = '../data/raw_data/UK/air/'
df_uk_air = pd.read_csv(os.path.join(uk_data_air_path, 'air-quality-data-continuous.csv'), parse_dates=[0])

In [689]:
df_uk_air['year'] = df_uk_air['Date Time'].dt.year
df_uk_air = df_uk_air[(df_uk_air.year >= 2000) & (df_uk_air.year < 2018)]
df_uk_air = df_uk_air.sort_values('Date Time').reset_index(drop=True)

In [690]:
df_uk_air['day'] = df_uk_air['Date Time'].dt.day
df_uk_air['month'] = df_uk_air['Date Time'].dt.month
df_uk_air_gb = df_uk_air.groupby(['year', 'month', 'day']).mean().reset_index()

uk_air_gb_dates = df_uk_air['Date Time'].dt.date.unique()
df_uk_air_gb = df_uk_air_gb.set_index(uk_air_gb_dates)
df_uk_air_gb.index = pd.DatetimeIndex(df_uk_air_gb.index)
# df_uk_air_gb

Unnamed: 0,year,month,day,NO2,PM10,PM2.5,CO,O3,SO2
2000-01-01,2000,1,1,40.510303,19.125000,,0.618750,18.250000,6.687500
2000-01-02,2000,1,2,47.422500,12.590909,,0.622727,40.181818,4.727273
2000-01-03,2000,1,3,48.576053,16.421053,,0.815789,40.947368,6.368421
2000-01-04,2000,1,4,70.966905,19.210526,,0.752632,16.526316,7.473684
2000-01-05,2000,1,5,46.120889,22.666667,,0.309524,50.285714,4.238095
...,...,...,...,...,...,...,...,...,...
2017-12-27,2017,12,27,25.105361,7.480650,3.314286,,57.942970,
2017-12-28,2017,12,28,56.028080,19.095034,10.750000,,32.140911,
2017-12-29,2017,12,29,32.421077,17.165568,6.900000,,57.908504,
2017-12-30,2017,12,30,21.583102,34.635833,11.835714,,70.208096,


In [691]:
del df_uk_air_gb['month']
del df_uk_air_gb['day']

__Secondary dataset to fill missing values of `SO2`__

In [692]:
uk_so2 = pd.read_csv(os.path.join(uk_data_air_path, 'so2_12_18.csv'), parse_dates=[0])
uk_so2 = uk_so2.drop(columns=uk_so2.columns[uk_so2.columns.str.contains('Status')])
uk_so2 = uk_so2.replace('No data', '-1')

uk_so2_dates = uk_so2['Date']
uk_so2 = uk_so2[uk_so2.columns[1:]].astype('int')

In [693]:
uk_so2.replace(-1, np.nan, inplace=True)
uk_so2_rowmeans = uk_so2.mean(axis=1, skipna=True)
uk_so2_rowmeans.set_axis(uk_so2_dates, inplace=True)
# uk_so2_rowmeans

In [694]:
df_uk_air_gb['SO2'].iloc[df_uk_air_gb.index.isin(uk_so2_rowmeans.index)] = uk_so2_rowmeans

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


__Carbon Monoxide data__

In [707]:
uk_co = pd.read_csv(os.path.join(uk_data_air_path, 'co.csv'), parse_dates=[0], index_col=[0]) #mg/m3
uk_co = uk_co.drop(columns = uk_co.columns[uk_co.columns.str.contains('Status')])
uk_co.replace('No data', '-1.', inplace=True)
uk_co = uk_co.astype('float32')

In [708]:
uk_co.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1826 entries, 2013-01-01 to 2017-12-31
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Carbon monoxide    1826 non-null   float32
 1   Carbon monoxide.1  1826 non-null   float32
 2   Carbon monoxide.2  1826 non-null   float32
 3   Carbon monoxide.3  1826 non-null   float32
 4   Carbon monoxide.4  1826 non-null   float32
 5   Carbon monoxide.5  1826 non-null   float32
 6   Carbon monoxide.6  1826 non-null   float32
dtypes: float32(7)
memory usage: 64.2 KB


In [710]:
uk_co.replace(-1, np.nan, inplace=True)
uk_co_daily_means = pd.Series(np.nanmean(uk_co, axis=1), index=uk_co.index)
df_uk_air_gb['CO'].iloc[df_uk_air_gb.index.isin(uk_co_daily_means.index)] = uk_co_daily_means

__Particulate Matter 2.5 data__

In [732]:
uk_pm25 = pd.read_csv(os.path.join(uk_data_air_path, 'pm2_5.csv'), parse_dates=[0], index_col=[0]) #micrograms/m3
uk_pm25 = uk_pm25.drop(columns = uk_pm25.columns[uk_pm25.columns.str.contains('Status')])
uk_pm25.replace('No data', '-1', inplace=True)
uk_pm25 = uk_pm25.astype('int')

In [733]:
uk_pm25.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2922 entries, 2000-01-01 to 2007-12-31
Data columns (total 6 columns):
 #   Column                                        Non-Null Count  Dtype
---  ------                                        --------------  -----
 0   PM2.5 particulate matter (Hourly measured)    2922 non-null   int32
 1   PM2.5 particulate matter (Hourly measured).1  2922 non-null   int32
 2   PM2.5 particulate matter (Hourly measured).2  2922 non-null   int32
 3   PM2.5 particulate matter (Hourly measured).3  2922 non-null   int32
 4   PM2.5 particulate matter (Hourly measured).4  2922 non-null   int32
 5   PM2.5 particulate matter (Hourly measured).5  2922 non-null   int32
dtypes: int32(6)
memory usage: 91.3 KB


In [740]:
uk_pm25.replace(-1, np.nan, inplace=True)
uk_pm25_daily_means = pd.Series(np.nanmean(uk_pm25, axis=1), index=uk_pm25.index)
df_uk_air_gb['PM2.5'].iloc[df_uk_air_gb.index.isin(uk_pm25_daily_means.index)] = uk_pm25_daily_means

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [745]:
df_uk_air_gb.set_axis(df_uk_air_gb.columns.str.lower(), axis=1, inplace=True)
uk_air_col_mapper = {col:col+'_mean' for col in df_uk_air_gb.columns[1:]}
# uk_air_col_mapper

In [771]:
df_uk_air_gb.rename(columns=uk_air_col_mapper, inplace=True)
df_uk_air_gb = df_uk_air_gb.groupby('year').mean()
# df_uk_air_gb

In [769]:
units = {
    'no2_units':'μg/m3',
    'pm10_units':'μg/m3',
    'pm2.5_units':'μg/m3',
    'co_units':'mg/m3',
    'o3_units':'μg/m3',
    'so2_units':'μg/m3'
}

for unit in units:
    df_uk_air_gb[unit] = units[unit]

In [773]:
df_uk_air_gb.sort_index(axis=1, inplace=True)
df_uk_air_gb

Unnamed: 0_level_0,co_mean,co_units,no2_mean,no2_units,o3_mean,o3_units,pm10_mean,pm10_units,pm2.5_mean,pm2.5_units,so2_mean,so2_units
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2000,0.597871,mg/m3,41.150895,μg/m3,38.749863,μg/m3,25.678764,μg/m3,15.658015,μg/m3,8.952743,μg/m3
2001,0.582164,mg/m3,27.611768,μg/m3,37.732052,μg/m3,23.491575,μg/m3,15.020091,μg/m3,8.116022,μg/m3
2002,0.502731,mg/m3,38.097511,μg/m3,37.932929,μg/m3,25.706487,μg/m3,14.018265,μg/m3,6.801325,μg/m3
2003,0.56795,mg/m3,50.176843,μg/m3,39.940277,μg/m3,28.753549,μg/m3,14.319178,μg/m3,7.146593,μg/m3
2004,0.389436,mg/m3,44.198753,μg/m3,40.175748,μg/m3,24.267985,μg/m3,13.249317,μg/m3,3.945776,μg/m3
2005,0.347381,mg/m3,47.046574,μg/m3,45.021357,μg/m3,24.026207,μg/m3,13.28242,μg/m3,3.233386,μg/m3
2006,0.327601,mg/m3,49.989958,μg/m3,43.157261,μg/m3,22.264746,μg/m3,14.413333,μg/m3,2.852252,μg/m3
2007,0.383096,mg/m3,47.758656,μg/m3,42.822425,μg/m3,20.767027,μg/m3,12.487626,μg/m3,2.305862,μg/m3
2008,0.376759,mg/m3,47.85944,μg/m3,44.080978,μg/m3,21.101994,μg/m3,13.100326,μg/m3,2.090882,μg/m3
2009,0.371968,mg/m3,47.914751,μg/m3,44.008868,μg/m3,19.222127,μg/m3,13.315674,μg/m3,2.239381,μg/m3


In [808]:
df_uk_air_gb.to_csv('../data/transformed_data/UK/df_uk_air.csv')

## Fuel

In [775]:
uk_data_fuel_path = '../data/raw_data/UK/fuel/'

2000

In [776]:
uk_fuel_00_glob = glob.glob(os.path.join(uk_data_fuel_path, r'uk_fuel_2000/*.csv'))
df_uk_fuel_00 = pd.concat((pd.read_csv(f, encoding='cp850') for f in uk_fuel_00_glob), ignore_index=True)
df_uk_fuel_00 = df_uk_fuel_00.drop(columns=df_uk_fuel_00.columns[[2, 3, 4, 8, 9, 10, 14, 15, 16, 17, 18, 19, 20, 21, 22]])
df_uk_fuel_00['year'] = 2000

df_uk_fuel_00.rename(columns={df_uk_fuel_00.columns[0]:'Manufacturer',\
                              df_uk_fuel_00.columns[2]:'Transmission'},\
                     inplace=True)

df_uk_fuel_00.head(1)

Unnamed: 0,Manufacturer,Model,Transmission,Engine Capacity,Fuel Type,Imperial (cold),Imperial Extra-urban,Imperial Combined,year
0,FORD,Fiesta,M5,1242,Petrol,32.5,48.7,40.9,2000


2001

In [777]:
uk_fuel_01_glob = glob.glob(os.path.join(uk_data_fuel_path, r'uk_fuel_2001/*.csv'))
df_uk_fuel_01 = pd.concat((pd.read_csv(f, encoding='cp850') for f in uk_fuel_01_glob), ignore_index=True)
df_uk_fuel_01 = df_uk_fuel_01.drop(columns=df_uk_fuel_01.columns[[2, 3, 7, 8, 9, 13, 14, 15, 16, 17, 18, 19, 20, 21]])
df_uk_fuel_01['year'] = 2001

df_uk_fuel_01.rename(columns={df_uk_fuel_01.columns[0]:'Manufacturer',\
                              df_uk_fuel_01.columns[2]:'Transmission'},\
                     inplace=True)

df_uk_fuel_01.head(1)

Unnamed: 0,Manufacturer,Model,Transmission,Engine Capacity,Fuel Type,Imperial (cold),Imperial Extra-urban,Imperial Combined,year
0,AUDI,Audi A2 (Standard & SE),M5,1390,Petrol,34.4,60.1,47.1,2001


2002

In [778]:
uk_fuel_02_glob = glob.glob(os.path.join(uk_data_fuel_path, r'uk_fuel_2002/*.csv'))
df_uk_fuel_02 = pd.concat((pd.read_csv(f, encoding='cp850', usecols=[0,1,2,3,4,8,9,10]) for f in uk_fuel_02_glob),\
                          ignore_index=True)

df_uk_fuel_02.rename(columns={df_uk_fuel_02.columns[0]:'Manufacturer',\
                              df_uk_fuel_02.columns[2]:'Transmission'},\
                     inplace=True)

df_uk_fuel_02['year'] = 2002
df_uk_fuel_02.head(1)

Unnamed: 0,Manufacturer,Model,Transmission,Engine Capacity,Fuel Type,Imperial Urban (Cold),Imperial Extra-Urban,Imperial Combined,year
0,ALFA ROMEO,156 Saloon 2002 MY Range,M5,1970,Petrol,23.2,42.8,32.8,2002


2003

In [779]:
uk_fuel_03_glob = glob.glob(os.path.join(uk_data_fuel_path, r'uk_fuel_2003/*.csv'))
df_uk_fuel_03 = pd.concat((pd.read_csv(f, encoding='cp850', usecols=[0,1,2,3,4,8,9,10]) for f in uk_fuel_03_glob),\
                          ignore_index=True)

df_uk_fuel_03.rename(columns={df_uk_fuel_03.columns[0]:'Manufacturer',\
                              df_uk_fuel_03.columns[2]:'Transmission'},\
                     inplace=True)

df_uk_fuel_03['year'] = 2003
df_uk_fuel_03.head(1)

Unnamed: 0,Manufacturer,Model,Transmission,Engine Capacity,Fuel Type,Imperial Urban (Cold),Imperial Extra-Urban,Imperial Combined,year
0,ALFA ROMEO,147 Range,M6,3179,Petrol,15.6,32.8,23.3,2003


2004

In [780]:
uk_fuel_04_glob = glob.glob(os.path.join(uk_data_fuel_path, r'uk_fuel_2004/*.csv'))
df_uk_fuel_04 = pd.concat((pd.read_csv(f, encoding='cp850', usecols=[0,1,2,3,4,8,9,10]) for f in uk_fuel_04_glob),\
                          ignore_index=True)

df_uk_fuel_04.rename(columns={df_uk_fuel_04.columns[0]:'Manufacturer', \
                              df_uk_fuel_04.columns[2]:'Transmission'}, \
                     inplace=True)

df_uk_fuel_04['year'] = 2004
df_uk_fuel_04.head(1)

Unnamed: 0,Manufacturer,Model,Transmission,Engine Capacity,Fuel Type,Imperial Urban (Cold),Imperial Extra-Urban,Imperial Combined,year
0,ALFA ROMEO,147 Range,M6,3179,Petrol,15.6,32.8,23.3,2004


2005

In [781]:
uk_fuel_05_glob = glob.glob(os.path.join(uk_data_fuel_path, r'uk_fuel_2005/*.csv'))
df_uk_fuel_05 = pd.concat((pd.read_csv(f, encoding='cp850', usecols=[0,1,2,3,4,8,9,10]) for f in uk_fuel_05_glob),\
                          ignore_index=True)

df_uk_fuel_05.rename(columns={df_uk_fuel_05.columns[0]:'Manufacturer',\
                              df_uk_fuel_05.columns[2]:'Transmission',\
                             df_uk_fuel_05.columns[3]:'Engine Capacity'},\
                     inplace=True)

df_uk_fuel_05['year'] = 2005
df_uk_fuel_05.head(1)

Unnamed: 0,Manufacturer,Model,Transmission,Engine Capacity,Fuel Type,Imperial Urban (Cold),Imperial Extra-Urban,Imperial Comb-ined,year
0,ALFA ROMEO,147,M6,3179,Petrol,15.6,32.8,23.3,2005


2006

In [782]:
uk_fuel_06_glob = glob.glob(os.path.join(uk_data_fuel_path, r'uk_fuel_2006/*.csv'))
df_uk_fuel_06 = pd.concat((pd.read_csv(f, encoding='cp850', usecols=[0,1,2,3,4,8,9,10]) for f in uk_fuel_06_glob),\
                          ignore_index=True)

df_uk_fuel_06.rename(columns={df_uk_fuel_06.columns[0]:'Manufacturer',\
                              df_uk_fuel_06.columns[2]:'Transmission',\
                             df_uk_fuel_06.columns[3]:'Engine Capacity'},\
                     inplace=True)

df_uk_fuel_06['year'] = 2006
df_uk_fuel_06.head(1)

Unnamed: 0,Manufacturer,Model,Transmission,Engine Capacity,Fuel Type,Imperial Urban (Cold),Imperial Extra-Urban,Imperial Combined,year
0,ALFA ROMEO,Alfa 147,M5,1598,Petrol,25.2,44.1,34.4,2006


2007

In [783]:
uk_fuel_07_glob = glob.glob(os.path.join(uk_data_fuel_path, r'uk_fuel_2007/*.csv'))
df_uk_fuel_07 = pd.concat((pd.read_csv(f, encoding='cp850', usecols=[0,1,2,3,4,8,9,10]) for f in uk_fuel_07_glob),\
                          ignore_index=True)

df_uk_fuel_07.rename(columns={df_uk_fuel_07.columns[0]:'Manufacturer',\
                              df_uk_fuel_07.columns[2]:'Transmission',\
                             df_uk_fuel_07.columns[3]:'Engine Capacity'},\
                     inplace=True)

df_uk_fuel_07['year'] = 2007
df_uk_fuel_07.head(1)

Unnamed: 0,Manufacturer,Model,Transmission,Engine Capacity,Fuel Type,Imperial Urban (Cold),Imperial Extra-Urban,Imperial Combined,year
0,ALFA ROMEO,Alfa 147,M5,1598,Petrol,25.2,44.1,34.4,2007


2008

In [784]:
uk_fuel_08_glob = glob.glob(os.path.join(uk_data_fuel_path, r'uk_fuel_2008/*.csv'))
df_uk_fuel_08 = pd.concat((pd.read_csv(f, encoding='cp850', usecols=[0,1,2,3,4,8,9,10]) for f in uk_fuel_08_glob),\
                          ignore_index=True)

df_uk_fuel_08.rename(columns={df_uk_fuel_08.columns[0]:'Manufacturer',\
                              df_uk_fuel_08.columns[2]:'Transmission',\
                             df_uk_fuel_08.columns[3]:'Engine Capacity'},\
                     inplace=True)

df_uk_fuel_08['year']=2008
df_uk_fuel_08.head(1)

Unnamed: 0,Manufacturer,Model,Transmission,Engine Capacity,Fuel Type,Imperial Urban (Cold),Imperial Extra-Urban,Imperial Combined,year
0,ALFA ROMEO,147,M5,1598,Petrol,25.2,44.1,34.4,2008


2009

In [785]:
uk_fuel_09_glob = glob.glob(os.path.join(uk_data_fuel_path, r'uk_fuel_2009/*.csv'))
df_uk_fuel_09 = pd.concat((pd.read_csv(f, encoding='cp850', usecols=[0,1,2,3,4,8,9,10]) for f in uk_fuel_09_glob),\
                          ignore_index=True)

df_uk_fuel_09.rename(columns={df_uk_fuel_09.columns[0]:'Manufacturer',\
                              df_uk_fuel_09.columns[2]:'Transmission',\
                             df_uk_fuel_09.columns[3]:'Engine Capacity'},\
                     inplace=True)

df_uk_fuel_09['year']=2009
df_uk_fuel_09.head(1)

Unnamed: 0,Manufacturer,Model,Transmission,Engine Capacity,Fuel Type,Imperial Urban (Cold),Imperial Extra-Urban,Imperial Combined,year
0,ABARTH,500,M5,1368,Petrol,33.2,52.3,43.5,2009


2010

In [786]:
uk_fuel_10_glob = glob.glob(os.path.join(uk_data_fuel_path, r'uk_fuel_2010/*.csv'))
df_uk_fuel_10 = pd.concat((pd.read_csv(f, encoding='cp850', usecols=[0,1,2,3,4,8,9,10]) for f in uk_fuel_10_glob),\
                          ignore_index=True)

df_uk_fuel_10.rename(columns={df_uk_fuel_10.columns[0]:'Manufacturer',\
                              df_uk_fuel_10.columns[2]:'Transmission',\
                             df_uk_fuel_10.columns[3]:'Engine Capacity'},\
                     inplace=True)

df_uk_fuel_10['year']=2010
df_uk_fuel_10.head(1)

Unnamed: 0,Manufacturer,Model,Transmission,Engine Capacity,Fuel Type,Imperial Urban (Cold),Imperial Extra-Urban,Imperial Combined,year
0,ABARTH,Grande Punto,M6,1368,Petrol,30.1,54.3,42.2,2010


2011

In [787]:
uk_fuel_11_glob = glob.glob(os.path.join(uk_data_fuel_path, r'uk_fuel_2011/*.xls'))
df_uk_fuel_11 = pd.concat((pd.read_excel(f, encoding='cp850', usecols=[0,1,2,3,4,8,9,10]) for f in uk_fuel_11_glob),\
                          ignore_index=True)

df_uk_fuel_11.rename(columns={df_uk_fuel_11.columns[0]:'Manufacturer',\
                              df_uk_fuel_11.columns[2]:'Transmission',\
                             df_uk_fuel_11.columns[3]:'Engine Capacity'},\
                     inplace=True)

df_uk_fuel_11['year']=2011
df_uk_fuel_11.head(1)

Unnamed: 0,Manufacturer,Model,Transmission,Engine Capacity,Fuel Type,Imperial Urban (Cold),Imperial Extra-Urban,Imperial Combined,year
0,CHEVROLET,Aveo,M5,1206,Petrol,39.2,61.4,51.4,2011


2012

In [788]:
uk_fuel_12_glob = glob.glob(os.path.join(uk_data_fuel_path, r'uk_fuel_2012/*.csv'))
df_uk_fuel_12 = pd.concat((pd.read_csv(f, encoding='cp850', usecols=[0,1,3,4,5,9,10,11]) for f in uk_fuel_12_glob),\
                          ignore_index=True)

df_uk_fuel_12.rename(columns={df_uk_fuel_12.columns[0]:'Manufacturer',\
                              df_uk_fuel_12.columns[2]:'Transmission',\
                             df_uk_fuel_12.columns[3]:'Engine Capacity'},\
                     inplace=True)

df_uk_fuel_12['year']=2012
df_uk_fuel_12.head(1)

Unnamed: 0,Manufacturer,Model,Transmission,Engine Capacity,Fuel Type,Imperial Urban (Cold),Imperial Extra-Urban,Imperial Combined,year
0,KIA,Sedona,A4,2199,Diesel,28.2,45.6,37.7,2012


2013

In [789]:
uk_fuel_13_glob = glob.glob(os.path.join(uk_data_fuel_path, r'uk_fuel_2013/*.csv'))
df_uk_fuel_13 = pd.concat((pd.read_csv(f, encoding='cp850', usecols=[0,1,3,4,5,9,10,11]) for f in uk_fuel_13_glob),\
                          ignore_index=True)

df_uk_fuel_13.rename(columns={df_uk_fuel_13.columns[0]:'Manufacturer',\
                              df_uk_fuel_13.columns[2]:'Transmission',\
                             df_uk_fuel_13.columns[3]:'Engine Capacity'},\
                     inplace=True)

df_uk_fuel_13['year']=2013
df_uk_fuel_13.head(1)

Unnamed: 0,Manufacturer,Model,Transmission,Engine Capacity,Fuel Type,Imperial Urban (Cold),Imperial Extra-Urban,Imperial Combined,year
0,ABARTH,500,SAT5,1368.0,Petrol,33.6,52.3,43.5,2013


2014

In [790]:
uk_fuel_14_glob = glob.glob(os.path.join(uk_data_fuel_path, r'uk_fuel_2014/*.csv'))
df_uk_fuel_14 = pd.concat((pd.read_csv(f, encoding='cp850', usecols=[0,1,3,4,5,9,10,11]) for f in uk_fuel_14_glob),\
                          ignore_index=True)

df_uk_fuel_14.rename(columns={df_uk_fuel_14.columns[0]:'Manufacturer',\
                              df_uk_fuel_14.columns[2]:'Transmission',\
                             df_uk_fuel_14.columns[3]:'Engine Capacity'},\
                     inplace=True)

df_uk_fuel_14['year']=2014
df_uk_fuel_14.head(1)

Unnamed: 0,Manufacturer,Model,Transmission,Engine Capacity,Fuel Type,Imperial Urban (Cold),Imperial Extra-Urban,Imperial Combined,Transmission.1,year
0,CITROEN,C-Zero,,,Electricity,,,,,2014


In [791]:
# Though the datasets only have 1 "Transmission" column in the same column index, it's duplicated here.
df_uk_fuel_14.Transmission.head(4)

Unnamed: 0,Transmission,Transmission.1
0,,
1,,
2,,
3,,


In [792]:
# There are no rows where both Transmission columns hold data, only rows with 1 or both columns missing values
df_uk_fuel_14[df_uk_fuel_14.Transmission.isna().sum(axis=1) == 0]

Unnamed: 0,Manufacturer,Model,Transmission,Engine Capacity,Fuel Type,Imperial Urban (Cold),Imperial Extra-Urban,Imperial Combined,Transmission.1,year


In [793]:
uk_fuel14_transmission_vals = df_uk_fuel_14.Transmission.fillna("").sum(axis=1)
display(uk_fuel14_transmission_vals[5:12])

df_uk_fuel_14.drop('Transmission',axis=1, inplace=True)
df_uk_fuel_14['Transmission'] = uk_fuel14_transmission_vals
df_uk_fuel_14.Transmission.replace("", np.nan, inplace=True)
df_uk_fuel_14.head(3)

5          
6          
7          
8          
9     E-CVT
10    E-CVT
11    E-CVT
dtype: object

Unnamed: 0,Manufacturer,Model,Engine Capacity,Fuel Type,Imperial Urban (Cold),Imperial Extra-Urban,Imperial Combined,year,Transmission
0,CITROEN,C-Zero,,Electricity,,,,2014,
1,PEUGEOT,iOn,,Electricity,,,,2014,
2,RENAULT,Fluence Z.E.,,Electricity,,,,2014,


In [794]:
uk_fuel14_colOrder = [
    'Manufacturer',
    'Model',
    'Transmission',
    'Engine Capacity',
    'Fuel Type',
    'Imperial Urban (Cold)',
    'Imperial Extra-Urban',
    'Imperial Combined',
    'year'
 ]
df_uk_fuel_14 = df_uk_fuel_14[uk_fuel14_colOrder]
df_uk_fuel_14.head(1)

Unnamed: 0,Manufacturer,Model,Transmission,Engine Capacity,Fuel Type,Imperial Urban (Cold),Imperial Extra-Urban,Imperial Combined,year
0,CITROEN,C-Zero,,,Electricity,,,,2014


2015

In [795]:
uk_fuel_15_glob = glob.glob(os.path.join(uk_data_fuel_path, r'uk_fuel_2015/*.csv'))
df_uk_fuel_15 = pd.concat((pd.read_csv(f, encoding='cp850', usecols=[0,1,2,3,4,8,9,10]) for f in uk_fuel_15_glob),\
                          ignore_index=True)

df_uk_fuel_15.rename(columns={df_uk_fuel_15.columns[0]:'Manufacturer',\
                              df_uk_fuel_15.columns[2]:'Transmission',\
                             df_uk_fuel_15.columns[3]:'Engine Capacity'},\
                     inplace=True)

df_uk_fuel_15['year']=2015
df_uk_fuel_15.head(1)

Unnamed: 0,Manufacturer,Model,Transmission,Engine Capacity,Fuel Type,Imperial Urban (Cold),Imperial Extra-Urban,Imperial Combined,year
0,ALFA ROMEO,Giulietta,M6,1598.0,Diesel,56.5,83.1,70.6,2015


2016

In [796]:
uk_fuel_16_glob = glob.glob(os.path.join(uk_data_fuel_path, r'uk_fuel_2016/*.csv'))
df_uk_fuel_16 = pd.concat((pd.read_csv(f, encoding='cp850', usecols=[0,1,2,3,4,8,9,10]) for f in uk_fuel_16_glob),\
                          ignore_index=True)

df_uk_fuel_16.rename(columns={df_uk_fuel_16.columns[0]:'Manufacturer',\
                              df_uk_fuel_16.columns[2]:'Transmission',\
                             df_uk_fuel_16.columns[3]:'Engine Capacity'},\
                     inplace=True)

df_uk_fuel_16['year']=2016
df_uk_fuel_16.head(1)

Unnamed: 0,Manufacturer,Model,Transmission,Engine Capacity,Fuel Type,Imperial Urban (Cold),Imperial Extra-Urban,Imperial Combined,year
0,ABARTH,"500, 2012 onwards",M5,1368.0,Petrol,35.8,57.6,47.1,2016


2017

In [797]:
uk_fuel_17_glob = glob.glob(os.path.join(uk_data_fuel_path, r'uk_fuel_2017/*.csv'))
df_uk_fuel_17 = pd.concat((pd.read_csv(f, encoding='cp850', usecols=[0,1,2,3,4,8,9,10]) for f in uk_fuel_17_glob),\
                          ignore_index=True)

df_uk_fuel_17.rename(columns={df_uk_fuel_17.columns[0]:'Manufacturer',\
                              df_uk_fuel_17.columns[2]:'Transmission',\
                             df_uk_fuel_17.columns[3]:'Engine Capacity'},\
                     inplace=True)

df_uk_fuel_17['year']=2017
df_uk_fuel_17.head(1)

Unnamed: 0,Manufacturer,Model,Transmission,Engine Capacity,Fuel Type,Imperial Urban (Cold),Imperial Extra-Urban,Imperial Combined,year
0,ALFA ROMEO,"MiTo Series 3, September 2016 Onwards",M5,1248.0,Diesel,65.7,97.4,83.1,2017


In [798]:
list_of_df_uk_fuel = [
    df_uk_fuel_00,
    df_uk_fuel_01,
    df_uk_fuel_02,
    df_uk_fuel_03,
    df_uk_fuel_04,
    df_uk_fuel_05,
    df_uk_fuel_06,
    df_uk_fuel_07,
    df_uk_fuel_08,
    df_uk_fuel_09,
    df_uk_fuel_10,
    df_uk_fuel_11,
    df_uk_fuel_12,
    df_uk_fuel_13,
    df_uk_fuel_14,
    df_uk_fuel_15,
    df_uk_fuel_16,
    df_uk_fuel_17
]
cols = []
for df in list_of_df_uk_fuel:
    cols.append(df.columns.values)
    
pd.DataFrame(cols, index=range(2000,2018))

Unnamed: 0,0,1,2,3,4,5,6,7,8
2000,Manufacturer,Model,Transmission,Engine Capacity,Fuel Type,Imperial (cold),Imperial Extra-urban,Imperial Combined,year
2001,Manufacturer,Model,Transmission,Engine Capacity,Fuel Type,Imperial (cold),Imperial Extra-urban,Imperial Combined,year
2002,Manufacturer,Model,Transmission,Engine Capacity,Fuel Type,Imperial Urban (Cold),Imperial Extra-Urban,Imperial Combined,year
2003,Manufacturer,Model,Transmission,Engine Capacity,Fuel Type,Imperial Urban (Cold),Imperial Extra-Urban,Imperial Combined,year
2004,Manufacturer,Model,Transmission,Engine Capacity,Fuel Type,Imperial Urban (Cold),Imperial Extra-Urban,Imperial Combined,year
2005,Manufacturer,Model,Transmission,Engine Capacity,Fuel Type,Imperial Urban (Cold),Imperial Extra-Urban,Imperial Comb-ined,year
2006,Manufacturer,Model,Transmission,Engine Capacity,Fuel Type,Imperial Urban (Cold),Imperial Extra-Urban,Imperial Combined,year
2007,Manufacturer,Model,Transmission,Engine Capacity,Fuel Type,Imperial Urban (Cold),Imperial Extra-Urban,Imperial Combined,year
2008,Manufacturer,Model,Transmission,Engine Capacity,Fuel Type,Imperial Urban (Cold),Imperial Extra-Urban,Imperial Combined,year
2009,Manufacturer,Model,Transmission,Engine Capacity,Fuel Type,Imperial Urban (Cold),Imperial Extra-Urban,Imperial Combined,year


In [799]:
# Standardize column names
for df in list_of_df_uk_fuel:
    rename_cols_dict = {df.columns[0]:'make',
                       df.columns[1]:'model',
                       df.columns[2]:'transmission',
                       df.columns[3]:'engine_size',
                       df.columns[4]:'fuel_type',
                       df.columns[5]:'city_mpg',
                       df.columns[6]:'highway_mpg',
                       df.columns[7]:'combined_mpg'}
    
    
    df.rename(columns=rename_cols_dict, inplace=True)

In [800]:
df_uk_fuel = pd.concat(list_of_df_uk_fuel, ignore_index=True)
df_uk_fuel.head(20)

Unnamed: 0,make,model,transmission,engine_size,fuel_type,city_mpg,highway_mpg,combined_mpg,year
0,FORD,Fiesta,M5,1242,Petrol,32.5,48.7,40.9,2000
1,FORD,Fiesta,M5,1242,Petrol,32.1,49.6,40.9,2000
2,FORD,Fiesta,M5,1596,Petrol,29.7,47.1,38.7,2000
3,FORD,New Galaxy,M6,2792,Petrol,19.1,33.2,26.2,2000
4,SEAT,Alhambra,M6,1984,Petrol,21.7,36.7,29.1,2000
5,SEAT,Alhambra,M6,1781,Petrol,21.1,38.2,29.4,2000
6,SKODA,Fabia Hatch,M5,1390,Petrol,29.4,49.6,39.8,2000
7,SKODA,Fabia Hatch,M5,1390,Petrol,29.4,49.6,39.8,2000
8,SKODA,Octavia Hatch,M5,1984,Petrol,25.4,44.8,34.9,2000
9,SKODA,Octavia Hatch,M5,1984,Petrol,25.4,44.8,34.9,2000


In [801]:
# Drop electric vehicles and vehicles that are hybrid electric/fossil fuel
df_uk_fuel = df_uk_fuel.drop(np.where(df_uk_fuel.fuel_type.str.contains('Electric'))[0]).reset_index(drop=True)
df_uk_fuel['model'] = [str(model) for model in df_uk_fuel.model]

In [802]:
uk_fuel_MYidx = df_uk_fuel[np.where(df_uk_fuel.model.str.contains("MY\d+$"), True, False)].index

In [803]:
correcting_models = [model[:1] for model in df_uk_fuel.iloc[uk_fuel_MYidx].model.str.split("MY")]
df_uk_fuel.loc[uk_fuel_MYidx, 'model'] = correcting_models

In [807]:
df_uk_fuel.to_csv("../data/transformed_data/UK/df_uk_fuel.csv", index=False)

transmission_decode_dict = {
    'A':'automatic',
    'SS':'sequential shift',
    'SM':'semi-auto',
    'L':'low ratio',
    'NE':'normal economy ratio',
    'N':'normal ratio',
    'Q':'4 wheel drive',
    'T':'turbocharger',
    'x2':'high and low gears',
    'M':'manual',
    'F':'front wheel drive',
    '/s':'close ratio',
    'SAT':'semi-auto',
}