In [1]:
import pandas as pd
from sources.decoding import decoding 

## Global Power Plants

In [2]:
global_power_plants_path = '../globalpowerplantdatabasev120/global_power_plants_clean.csv'
cols = list(pd.read_csv(global_power_plants_path, nrows=1))
cols = [i for i in cols if 'Unnamed' not in i]
power_plants = pd.read_csv(global_power_plants_path, usecols=cols) # , index_col=['country']

In [3]:
power_plants.shape, power_plants.columns

((29910, 10),
 Index(['country', 'country_long', 'name', 'gppd_idnr', 'capacity_mw',
        'latitude', 'longitude', 'primary_fuel', 'estimated_generation_gwh',
        'continent'],
       dtype='object'))

## Geodata collected using google geolocalization API

In [4]:
geo_path = '../globalpowerplantdatabasev120/geo_df.csv'
cols = list(pd.read_csv(geo_path, nrows=1))
cols = [i for i in cols if 'Unnamed' not in i]
geo_df = pd.read_csv(geo_path, usecols=cols, encoding='utf8')

In [5]:
geo_df.shape, geo_df.columns

((29910, 3), Index(['locality', 'city', 'state'], dtype='object'))

## AirQuality data collected using weatherbit API

In [6]:
air_path = '../globalpowerplantdatabasev120/air_df.csv'
cols = list(pd.read_csv(air_path, nrows=1))
cols = [i for i in cols if 'Unnamed' not in i]
air_df = pd.read_csv(air_path, usecols=cols)

In [7]:
air_df.shape, air_df.columns

((29910, 6), Index(['aqi', 'pm10', 'co', 'o3', 'so2', 'no2'], dtype='object'))

## Now let's merge the three previously datasets on one DataFrame
### Note: the goal is to build a dataframe so it can be possible to analyze the air quality state per state in Spain, in relation to the quantity and the type of power plants around

In [8]:
spain_geo = power_plants.merge(geo_df, how='inner', left_on=power_plants.index, right_on=geo_df.index)
spain_geo.set_index(['country'], inplace=True)
spain_geo = spain_geo[spain_geo.index.isin(['ESP'])]
spain_geo.shape, air_df.shape

((614, 13), (29910, 6))

In [9]:
spain_geo.drop(columns=['key_0'], inplace=True)
spain_geo.reset_index(level=0, inplace=True)

In [10]:
spain_geo_air = spain_geo.merge(air_df, how='inner', left_on=spain_geo.index, right_on=air_df.index)
# The weather API just allow to perform 500 requests, 20 were used for testing
spain_geo_air = spain_geo_air.iloc[0:480]
# drop some columns that won't be used
spain_geo_air = spain_geo_air.drop(columns=['key_0', 'country', 'country_long', 'locality', 'continent'])
# rename state column to autonomous_community

In [11]:
spain_geo_air['city'] = spain_geo_air['city'].apply(decoding)
spain_geo_air['state'] = spain_geo_air['state'].apply(decoding)

In [12]:
spanish_state_standard = {
    'Andalucía': ['Andalucía', 'Andalusia'],
    'Canarias': ['Canarias', 'Canary Islands'],
    'Cataluña': ['Catalunya', 'Cataluña'],
    'Islas Baleares': ['Illes Balears', 'Islas Baleares']
}

def standardize_state(string,standard):
    for key_state,state_list in standard.items():
        if string in state_list:
            return key_state
        return string

In [13]:
spain_geo_air.state = spain_geo_air.state.apply(standardize_state, standard=spanish_state_standard)

In [14]:
spain_geo_air.state.unique()

array(['Principado de Asturias', 'Castilla-La Mancha', 'Catalunya',
       'Castilla y León', 'Andalucía', 'Cantabria', 'Galicia',
       'Illes Balears', 'Aragón', 'País Vasco', 'Canarias', 'La Rioja',
       'Extremadura', 'Comunidad Valenciana', 'Euskadi', 'Canary Islands',
       'Región de Murcia', 'Navarra', 'Valencian Community', 'Ceuta',
       'Cataluña', 'Brong Ahafo Region', 'Jubbada Dhexe',
       'Islas Baleares', 'Comunidad de Madrid', 'Melilla',
       'Kilimanjaro Region', 'Garissa County'], dtype=object)

In [15]:
save_path = '../globalpowerplantdatabasev120/spain_geo_air.csv'

spain_geo_air.to_csv(save_path)