In [9]:
import pandas as pd
import numpy as np
import pycountry
import pycountry_convert
import json
import time
import urllib.error
import urllib.parse
import urllib.request
import html
import os
from dotenv import load_dotenv

# Load the new dataset:

In [10]:
path = '/Users/gerardovitaleerrico/Documents/DataCamp/globalpowerplantdatabasev120/global_power_plant.csv'
cols = list(pd.read_csv(path, nrows=1))
cols = [i for i in cols if 'Unnamed' not in i]
power_plants = pd.read_csv(path, usecols=cols) # index_col=['country']

## Normalizing some variables

In [11]:
power_plants[
    ['country_long', 'name', 'primary_fuel']
] = power_plants[['country_long', 'name', 'primary_fuel']].apply(lambda x: x.str.lower())

In [12]:
power_plants.head()

Unnamed: 0,country,country_long,name,gppd_idnr,capacity_mw,latitude,longitude,primary_fuel,estimated_generation_gwh
0,AFG,afghanistan,kajaki hydroelectric power plant afghanistan,GEODB0040538,33.0,32.322,65.119,hydro,
1,AFG,afghanistan,mahipar hydroelectric power plant afghanistan,GEODB0040541,66.0,34.556,69.4787,hydro,
2,AFG,afghanistan,naghlu dam hydroelectric power plant afghanistan,GEODB0040534,100.0,34.641,69.717,hydro,
3,AFG,afghanistan,nangarhar (darunta) hydroelectric power plant ...,GEODB0040536,11.55,34.4847,70.3633,hydro,
4,AFG,afghanistan,northwest kabul power plant afghanistan,GEODB0040540,42.0,34.5638,69.1134,gas,


## Adding continent as a new column

In [20]:
def get_continent(alpha_3):
    try:
        alpha_2 = pycountry_convert.country_alpha3_to_country_alpha2(alpha_3.upper())
        continent = pycountry_convert.country_alpha2_to_continent_code(alpha_2)
        if continent == 'NA':
            return 'NA*'
        else:
            return continent
    except:
        return np.nan

### Due to the pandas convention of interpreting 'NA' as NaN value, the North America code 'NA' has been change to 'NA*'

In [21]:
get_continent('usa')

'NA*'

In [22]:
power_plants['continent'] = power_plants['country'].apply(get_continent)

### There are 5 "countries" that don't have ISO 3166-1 code

In [23]:
power_plants['continent'].unique()

array(['AS', 'EU', 'AF', nan, 'SA', 'OC', 'NA*'], dtype=object)

In [24]:
power_plants[power_plants['continent'].isnull()]

Unnamed: 0,country,country_long,name,gppd_idnr,capacity_mw,latitude,longitude,primary_fuel,estimated_generation_gwh,continent
61,ATA,antarctica,mcmurdo station generator,WRI1023843,6.6,-77.847,166.6605,oil,,
62,ATA,antarctica,ross island,WRI1022458,1.0,-77.8428,166.7271,wind,,
13996,KOS,kosovo,kosovo a coal power plant kosovo,GEODB0042698,800.0,42.6767,21.0867,coal,2852.503383,
13997,KOS,kosovo,kosovo b coal power plant kosovo,GEODB0042699,678.0,42.6937,21.057,coal,2417.496617,
29886,ESH,western sahara,dakhla ic power plant western sahara,GEODB0042583,23.4,23.6816,-15.9594,oil,,


In [25]:
save_path_1 = '/Users/gerardovitaleerrico/Documents/DataCamp/globalpowerplantdatabasev120/global_power_plants_clean.csv'
power_plants.to_csv(save_path_1)

## Creating a numpy array [ lat, lng ] = [ latitude, longitude ]

In [11]:
latlng_serie = power_plants[['latitude', 'longitude']]

In [12]:
latlng_array = latlng_serie.to_numpy()

In [13]:
latlng_array, latlng_array.shape

(array([[ 32.322 ,  65.119 ],
        [ 34.556 ,  69.4787],
        [ 34.641 ,  69.717 ],
        ...,
        [-17.9167,  25.85  ],
        [-18.3835,  26.47  ],
        [-16.5222,  28.7619]]),
 (29910, 2))

****
# Geocode function =>
## Creating a new DataFrame with the following columns, based on the geolocation:
- Locality
- City
- Stade

In [14]:
load_dotenv('../global-power-plants-pipeline/.env')
GEO_KEY = os.getenv("GEO_KEY")
BASE_URL = "https://maps.googleapis.com/maps/api/geocode/json"

In [15]:
def geocode(lat, lng):
    params = urllib.parse.urlencode({"latlng": f"{lat},{lng}", "key": API_KEY,})
    url = f"{BASE_URL}?{params}"
    current_delay = 0.1  # Set the initial retry delay to 100ms.
    max_delay = 5  # Set the maximum retry delay to 5 seconds.
    
    while True:
        try:
            # Get the API response.
            response = urllib.request.urlopen(url)
        except urllib.error.URLError:
            pass  # Fall through to the retry loop.
        else:
            # If we didn't get an IOError then parse the result.
            result = json.load(response)

            if result["status"] == "OK":
                return result['results'][0]['address_components']
            elif result["status"] != "UNKNOWN_ERROR":
                # Many API errors cannot be fixed by a retry, e.g. INVALID_REQUEST or
                # ZERO_RESULTS. There is no point retrying these requests.
                raise Exception(result["error_message"])

        if current_delay > max_delay:
            raise Exception("Too many retry attempts.")
        
        print("Waiting", current_delay, "seconds before retrying.")

        time.sleep(current_delay)
        current_delay *= 2  # Increase the delay each time we retry.

In [16]:
def apply_geocode():    
    geo_array = np.empty((29910, 3), dtype='S25')
    type_1 = 'locality'
    type_2 = 'administrative_area_level_2'
    type_3 = 'administrative_area_level_1'

    for index,lat_lng in enumerate(latlng_array):
        geo_data = geocode(lat_lng[0], lat_lng[1])
        for i in range(len(geo_data)):
            if type_1 in geo_data[i]['types']:
                geo_array[index][0] = geo_data[i]['long_name'].encode('utf8')
            elif type_2 in geo_data[i]['types']:
                geo_array[index][1] = geo_data[i]['long_name'].encode('utf8')
            elif type_3 in geo_data[i]['types']:
                geo_array[index][2] = geo_data[i]['long_name'].encode('utf8')
    
    return geo_array

In [None]:
geo_array = apply_geocode()
geo_df = pd.DataFrame(geo_array, columns=['locality', 'city', 'state'])

save_path = '/Users/gerardovitaleerrico/Documents/DataCamp/globalpowerplantdatabasev120/geo_df.csv'

geo_df.to_csv(save_path)

****
# AirQuality function =>
## Creating another DataFrame, the air quality df associate with the geolocation [ lat, lon ], just for Country == Sapin due to the API's request limitation
### The data added by the AirQuality API describe the air quality in the area, when data is found; on the contrary, it would describe the closest area/locality/city.
- "aqi":
- "pm10":
- "co":
- "o3":
- "so2":
- "no2":

In [17]:
load_dotenv('../global-power-plants-pipeline/.env')
AIR_KEY = os.getenv("AIR_KEY")
BASE_URL = "https://api.weatherbit.io/v2.0/current/airquality"

In [18]:
def air_quality(lat, lng):
    params = urllib.parse.urlencode({"lat":f"{lat}", "lon":f"{lng}", "key": API_KEY,})
    url = f"{BASE_URL}?{params}"
    current_delay = 0.1  # Set the initial retry delay to 100ms.
    max_delay = 5  # Set the maximum retry delay to 5 seconds.
    
    while True:
        try:
            # Get the API response.
            response = urllib.request.urlopen(url)
        except urllib.error.URLError:
            pass  # Fall through to the retry loop.
        else:
            # If we didn't get an IOError then parse the result.
            result = json.load(response)
            return result['data'][0]

        if current_delay > max_delay:
            raise Exception("Too many retry attempts.")
        
        
        print("Waiting", current_delay, "seconds before retrying.")

        time.sleep(current_delay)
        current_delay *= 2  # Increase the delay each time we retry.

In [19]:
spain_df  = power_plants[power_plants.index.isin(['ESP'])]

In [20]:
latlng_spain = spain_df[['latitude', 'longitude']]

In [21]:
latlng_spain = latlng_spain.to_numpy()

In [22]:
latlng_spain, latlng_spain.shape

(array([], shape=(0, 2), dtype=float64), (0, 2))

In [23]:
pollutants = ["aqi","pm10","co","o3","so2","no2"]

def apply_aq():
    air_array = np.empty((29910, 6))

    for index,lat_lng in enumerate(latlng_spain[:480]):
        air_data = air_quality(lat_lng[0], lat_lng[1])
        for i in range(len(air_data)):
            for j,k in enumerate(pollutants):
                if k in air_data.keys():
                    air_array[index][j] = air_data[k]
                else:
                    air_array[index][j] = np.nan
    
    return air_array

In [None]:
air_array = apply_aq()
air_df = pd.DataFrame(air_array, columns=pollutants)

save_path = '/Users/gerardovitaleerrico/Documents/DataCamp/globalpowerplantdatabasev120/air_df.csv'

air_df.to_csv(save_path)