In [1]:
#!pip install reverse_geocoder

import pandas as pd
import io
import reverse_geocoder as rg

In [2]:
# Output
#  df_ts_complete: df with values where time series is complete
#  df_estimate: df where all time series values are NaN but there is an estimate
#  df_ts_incomplete: df with values where at least one of time series values is NaN
# 
# Reads global powerplant data data from CSV, drops irrelevant columns, splits df (see output), 
# and re-indexes df with gppdf_idnr as index
def read_clean_and_split():  
    df = pd.read_csv("..\data\global_power_plant_database.csv")
    df_all = df.drop(['country_long', 'name', 'url', 'geolocation_source', 'wepp_id'], axis=1)
    
    df_ts = (df_all.dropna(how='all', subset=['generation_gwh_2013', 'generation_gwh_2014', 'generation_gwh_2015', 'generation_gwh_2016', 'generation_gwh_2017'])
                      .set_index('gppd_idnr'))
    df_ts_complete = df_ts.dropna(how='any', subset=['generation_gwh_2013', 'generation_gwh_2014', 'generation_gwh_2015', 'generation_gwh_2016', 'generation_gwh_2017'])
    df_estimate = (df_all.dropna(how='any', subset=['estimated_generation_gwh'])
                   .set_index('gppd_idnr'))
    df_ts_incomplete = df_ts[df_ts[['generation_gwh_2013', 'generation_gwh_2014', 'generation_gwh_2015', 'generation_gwh_2016', 'generation_gwh_2017']].isnull().any(axis=1)]
    return df_ts_complete, df_ts_incomplete, df_estimate    

In [3]:
df_ts_complete, *_ = read_clean_and_split()

In [4]:
df_US = df_ts_complete[df_ts_complete['country'] == 'USA']

In [7]:
# Adds a column with us_state to dataframe df
# The used library for reverse geocoding works with K-D trees.
# Might produce inaccurate results for border regions
def coordinate_to_state(df):
    
    coordinates = [*zip(df['latitude'], df['longitude'])]
    #Uncomment below to include (lat, long) tuple as new column in df
    #df.insert(4, 'coordinates', coordinates)

    geo_infos = rg.search(tuple(coordinates))

    locations = list()

    for item in geo_infos:
        locations.append(item['admin1'])

    df.insert(1, 'us_state', locations)   


In [8]:
# Fixes the obvious mistakes made by coordinate_to_state function
# Based on looking up the locations manually
def fix_misclassification(df):
    #British Columbia
    df.at['USA0054249', 'us_state'] = 'Idaho'
    df.loc[df['us_state']=='British Columbia', 'us_state'] = 'Washington'
    
    #Quebec
    df.at['USA0056829', 'us_state'] = 'Maine'
    df.loc[df['us_state']=='Quebec', 'us_state'] = 'New York'
    
    #Baja California
    df.at['USA0000120', 'us_state'] = 'Arizona'
    df.loc[df['us_state']=='Baja California', 'us_state'] = 'California'
    
    #Ontario 
    df.at['USA0006369', 'us_state'] = 'Michigan'
    df.at['USA0010487', 'us_state'] = 'Minnesota'
    df.at['USA0002694', 'us_state'] = 'New York'
    
    #Chukotskiy Avtonomnyy Okrug
    df.loc[df['us_state']=='Chukotskiy Avtonomnyy Okrug', 'us_state'] = 'Alaska'
    
    #Tamaulipas
    df.loc[df['us_state']=='Tamaulipas', 'us_state'] = 'Texas'
    
    #Yukon
    df.loc[df['us_state']=='Yukon', 'us_state'] = 'Alaska'   

In [10]:

coordinate_to_state(df_US)
fix_misclassification(df_US)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [13]:
df_US['us_state'].value_counts()

California          912
New York            327
Texas               320
Minnesota           255
Iowa                213
North Carolina      207
Michigan            199
Pennsylvania        185
Illinois            183
New Jersey          182
Wisconsin           169
Massachusetts       160
Washington          132
Idaho               127
Ohio                126
Florida             125
Alaska              125
Oregon              124
Colorado            124
Kansas              117
Virginia            116
Georgia             113
Indiana             100
Missouri             95
Arizona              93
South Carolina       93
Nebraska             90
Maine                89
Oklahoma             77
Connecticut          75
Louisiana            75
Alabama              72
Utah                 67
New Mexico           65
Nevada               63
Maryland             63
New Hampshire        60
Wyoming              58
Vermont              57
Tennessee            53
Arkansas             52
Montana         