In [1]:
!pip install reverse_geocoder

import pandas as pd
import io
import reverse_geocoder as rg



In [2]:
def read_clean_and_split(csv_path="global_power_plant_database.csv"):
    """Read the global power plant CSV and split it by generation-data availability.

    Drops the columns that are not needed for the analysis, indexes the rows
    by ``gppd_idnr`` and partitions the plants into three non-overlapping
    frames.

    Parameters
    ----------
    csv_path : str, path-like or file-like, optional
        Source handed to ``pandas.read_csv``. Defaults to the file name this
        notebook has always used, so calls without arguments behave as before.

    Returns
    -------
    df_ts_complete : pandas.DataFrame
        Plants whose 2013-2017 generation values are all present.
    df_ts_incomplete : pandas.DataFrame
        Plants with at least one, but not all, of those values present.
    df_estimate : pandas.DataFrame
        Plants with no reported generation value at all but a non-null
        ``estimated_generation_gwh``.
    """
    generation_cols = [f'generation_gwh_{year}' for year in range(2013, 2018)]
    irrelevant_cols = ['country_long', 'name', 'url', 'geolocation_source', 'wepp_id']

    df_all = (pd.read_csv(csv_path)
                .drop(columns=irrelevant_cols)
                .set_index('gppd_idnr'))

    reported = df_all[generation_cols].notna()
    has_any_ts = reported.any(axis=1)
    has_all_ts = reported.all(axis=1)
    has_estimate = df_all['estimated_generation_gwh'].notna()

    # Explicit copies so callers can add or modify columns on the results
    # without pandas treating them as slices of df_all.
    df_ts_complete = df_all[has_all_ts].copy()
    df_ts_incomplete = df_all[has_any_ts & ~has_all_ts].copy()
    # Estimate-only plants: requiring "no reported value" keeps this frame
    # disjoint from the two time-series frames above.
    df_estimate = df_all[~has_any_ts & has_estimate].copy()
    return df_ts_complete, df_ts_incomplete, df_estimate

In [3]:
# Only the first element of the returned tuple (complete time series) is needed here.
df_ts_complete = read_clean_and_split()[0]

In [4]:
# Take an explicit copy: df_US is modified in place by later cells, and
# writing into a boolean-indexed slice triggers SettingWithCopyWarning.
df_US = df_ts_complete[df_ts_complete['country'] == 'USA'].copy()

In [5]:
def coordinate_to_state(df):
    """Add a ``us_state`` column to ``df`` by reverse-geocoding its coordinates.

    The reverse-geocoding library works with K-D trees, so it might produce
    inaccurate results for border regions.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``latitude`` and ``longitude`` columns. Modified in place.

    Returns
    -------
    None
        ``df`` gains a ``us_state`` column at position 1 holding the
        ``admin1`` field of each geocoder result. If the column already
        exists, its values are refreshed instead.
    """
    coordinates = tuple(zip(df['latitude'], df['longitude']))

    # Skip the geocoder call entirely when there is nothing to look up.
    geo_infos = rg.search(coordinates) if coordinates else []
    locations = [item['admin1'] for item in geo_infos]

    if 'us_state' in df.columns:
        # Re-running the cell: DataFrame.insert would raise ValueError for a
        # column that already exists, so overwrite the values instead.
        df['us_state'] = locations
    else:
        df.insert(1, 'us_state', locations)


In [6]:
def fix_misclassification(df):
    """Correct obvious mistakes made by ``coordinate_to_state``.

    The corrections are based on looking up the locations manually. Individual
    plants are fixed first; every remaining row still carrying one of the
    listed non-US region names is then mapped to a single US state. Ontario
    has plant-level fixes only.

    Parameters
    ----------
    df : pandas.DataFrame
        Indexed by plant id and containing a ``us_state`` column. Modified
        in place; nothing is returned.
    """
    # Plant id -> correct state, grouped by the region the geocoder reported.
    plant_overrides = {
        'USA0054249': 'Idaho',       # British Columbia
        'USA0056829': 'Maine',       # Quebec
        'USA0000120': 'Arizona',     # Baja California
        'USA0006369': 'Michigan',    # Ontario
        'USA0010487': 'Minnesota',   # Ontario
        'USA0002694': 'New York',    # Ontario
    }
    # Reported region -> state for every row not covered by an override.
    region_to_state = {
        'British Columbia': 'Washington',
        'Quebec': 'New York',
        'Baja California': 'California',
        'Chukotskiy Avtonomnyy Okrug': 'Alaska',
        'Tamaulipas': 'Texas',
        'Yukon': 'Alaska',
    }

    for plant_id, state in plant_overrides.items():
        # Assigning to a label that is not in the index would append a new,
        # otherwise empty row, so only touch plants that are present.
        if plant_id in df.index:
            df.loc[plant_id, 'us_state'] = state

    for region, state in region_to_state.items():
        df.loc[df['us_state'] == region, 'us_state'] = state

In [7]:

# Order matters: coordinate_to_state adds the 'us_state' column that
# fix_misclassification then corrects. Both calls modify df_US in place.
coordinate_to_state(df_US)
fix_misclassification(df_US)

Loading formatted geocoded file...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [8]:
# Number of plants per assigned state, most frequent first.
df_US.loc[:, 'us_state'].value_counts()

California          912
New York            327
Texas               320
Minnesota           255
Iowa                213
North Carolina      207
Michigan            199
Pennsylvania        185
Illinois            183
New Jersey          182
Wisconsin           169
Massachusetts       160
Washington          132
Idaho               127
Ohio                126
Alaska              125
Florida             125
Colorado            124
Oregon              124
Kansas              117
Virginia            116
Georgia             113
Indiana             100
Missouri             95
Arizona              93
South Carolina       93
Nebraska             90
Maine                89
Oklahoma             77
Connecticut          75
Louisiana            75
Alabama              72
Utah                 67
New Mexico           65
Nevada               63
Maryland             63
New Hampshire        60
Wyoming              58
Vermont              57
Tennessee            53
Arkansas             52
Montana         

In [10]:
import geopandas

In [11]:
!pip install plotly



In [12]:
# Reload the raw CSV with every column kept; the plotting cells below use
# 'name', which read_clean_and_split drops.
df = pd.read_csv("global_power_plant_database.csv")
df

Unnamed: 0,country,country_long,name,gppd_idnr,capacity_mw,latitude,longitude,primary_fuel,other_fuel1,other_fuel2,...,url,geolocation_source,wepp_id,year_of_capacity_data,generation_gwh_2013,generation_gwh_2014,generation_gwh_2015,generation_gwh_2016,generation_gwh_2017,estimated_generation_gwh
0,AFG,Afghanistan,Kajaki Hydroelectric Power Plant Afghanistan,GEODB0040538,33.00,32.3220,65.1190,Hydro,,,...,http://globalenergyobservatory.org,GEODB,1009793,2017.0,,,,,,
1,AFG,Afghanistan,Mahipar Hydroelectric Power Plant Afghanistan,GEODB0040541,66.00,34.5560,69.4787,Hydro,,,...,http://globalenergyobservatory.org,GEODB,1009795,2017.0,,,,,,
2,AFG,Afghanistan,Naghlu Dam Hydroelectric Power Plant Afghanistan,GEODB0040534,100.00,34.6410,69.7170,Hydro,,,...,http://globalenergyobservatory.org,GEODB,1009797,2017.0,,,,,,
3,AFG,Afghanistan,Nangarhar (Darunta) Hydroelectric Power Plant ...,GEODB0040536,11.55,34.4847,70.3633,Hydro,,,...,http://globalenergyobservatory.org,GEODB,1009787,2017.0,,,,,,
4,AFG,Afghanistan,Northwest Kabul Power Plant Afghanistan,GEODB0040540,42.00,34.5638,69.1134,Gas,,,...,http://globalenergyobservatory.org,GEODB,,2017.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29905,ZMB,Zambia,Ndola,WRI1022386,50.00,-12.9667,28.6333,Oil,,,...,http://www.erb.org.zm/reports/EnergySectorRepo...,Power Africa,1089529,,,,,,,120.872642
29906,ZMB,Zambia,Nkana,WRI1022384,20.00,-12.8167,28.2000,Oil,,,...,http://www.erb.org.zm/reports/EnergySectorRepo...,Power Africa,1043097,,,,,,,48.349057
29907,ZMB,Zambia,Victoria Falls,WRI1022380,108.00,-17.9167,25.8500,Hydro,,,...,http://www.erb.org.zm/reports/EnergySectorRepo...,Power Africa,1033763,,,,,,,702.100000
29908,ZWE,Zimbabwe,Hwange Coal Power Plant Zimbabwe,GEODB0040404,920.00,-18.3835,26.4700,Coal,,,...,http://globalenergyobservatory.org,GEODB,1033856,2017.0,,,,,,4397.000000


In [16]:
import plotly.express as px
import geopandas as gpd


# World map of all plants, coloured by primary fuel. Columns are passed by
# name so plotly resolves them against `df` instead of receiving separately
# indexed Series.
fig = px.scatter_geo(
    df,
    lat="latitude",
    lon="longitude",
    hover_name="name",
    color="primary_fuel",
)
fig.show()

In [34]:
# NOTE(review): this rebinds df_US, which an earlier cell built from
# df_ts_complete and extended with 'us_state'. The frame below comes from the
# raw df and has no such column. Consider a distinct name; confirm that no
# later cell relies on this one first.
df_US = df[df['country'] == 'USA']
df_US

Unnamed: 0,country,country_long,name,gppd_idnr,capacity_mw,latitude,longitude,primary_fuel,other_fuel1,other_fuel2,...,url,geolocation_source,wepp_id,year_of_capacity_data,generation_gwh_2013,generation_gwh_2014,generation_gwh_2015,generation_gwh_2016,generation_gwh_2017,estimated_generation_gwh
20849,USA,United States of America,12 Applegate Solar LLC,USA0059371,1.9,40.2003,-74.5761,Solar,,,...,http://www.eia.gov/electricity/data/browser/,U.S. Energy Information Administration,,2017.0,2.41461,2.35000,2.430000,2.492000,2.27600,
20850,USA,United States of America,126 Grove Solar LLC,USA0060858,2.0,42.0761,-71.4227,Solar,,,...,http://www.eia.gov/electricity/data/browser/,U.S. Energy Information Administration,,2017.0,,,,2.416000,2.25100,0.891855
20851,USA,United States of America,1420 Coil Av #C,USA0057310,1.3,33.7943,-118.2414,Solar,,,...,http://www.eia.gov/electricity/data/browser/,U.S. Energy Information Administration,,2017.0,1.52800,2.14900,1.515000,1.592000,1.66000,
20852,USA,United States of America,145 Talmadge Solar,USA0057458,3.8,40.5358,-74.3913,Solar,,,...,http://www.eia.gov/electricity/data/browser/,U.S. Energy Information Administration,67644,2017.0,5.03600,4.52400,4.802000,5.051000,4.81900,
20853,USA,United States of America,1515 S Caron Road,USA0007770,4.2,41.9084,-89.0466,Gas,,,...,http://www.eia.gov/electricity/data/browser/,U.S. Energy Information Administration,52308,2017.0,0.21500,0.17800,0.271000,0.306000,0.26400,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29530,USA,United States of America,Zion Energy Center,USA0055392,596.7,42.4776,-87.8950,Gas,Oil,,...,http://www.eia.gov/electricity/data/browser/,U.S. Energy Information Administration,49140,2017.0,0.00000,63.65600,132.434005,435.493999,462.06300,
29531,USA,United States of America,Zion Landfill Gas to Energy Facility,USA0056871,7.0,42.4803,-87.8861,Waste,,,...,http://www.eia.gov/electricity/data/browser/,U.S. Energy Information Administration,50113,2017.0,43.59200,49.13400,46.699000,42.480000,38.66900,
29532,USA,United States of America,Zorn,USA0001368,18.0,38.2803,-85.7023,Gas,,,...,http://www.eia.gov/electricity/data/browser/,U.S. Energy Information Administration,25154,2017.0,0.20300,0.07800,1.058000,0.062000,0.01500,
29533,USA,United States of America,Zotos International WPGF,USA0057648,3.4,42.8869,-76.9683,Wind,,,...,http://www.eia.gov/electricity/data/browser/,U.S. Energy Information Administration,,2017.0,3.67100,2.46015,2.489000,1.670000,1.23349,


In [17]:
# Same scatter plot restricted to the USA map scope. Columns are passed by
# name so plotly resolves them against `df`.
# NOTE(review): all rows of df are still handed to plotly; only the map scope
# is restricted. Confirm whether the USA-only subset was intended here.
fig = px.scatter_geo(
    df,
    lat="latitude",
    lon="longitude",
    hover_name="name",
    color="primary_fuel",
    scope="usa",
)
fig.show()