# Assign NERC labels to plants using 860 data and k-nearest neighbors

In [92]:
%matplotlib inline
import matplotlib.pyplot as plt
# import geopandas as gpd
# from shapely.geometry import Point
# from geopandas import GeoDataFrame
import os
from os.path import join
import pandas as pd
# import fiona
# from urllib.request import urlopen
# from zipfile import ZipFile
# from io import BytesIO
from sklearn import neighbors
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from collections import Counter


cwd = os.getcwd()
data_path = join(cwd, '..', 'Data storage')

## Load data

### Load EIA facility data
Only need to keep the plant id, year (as a check that plants don't move between years), and lat/lon

In [48]:
path = os.path.join(data_path, 'Facility gen fuels and CO2 2017-08-31.zip')
facility_df = pd.read_csv(path)
facility_df['state'] = facility_df['geography'].str[-2:]

In [49]:
plants = facility_df.loc[:, ['plant id', 'year', 'lat', 'lon', 'state']]
plants.drop_duplicates(inplace=True)

### Load known NERC labels from EIA-860
Current NERCS go back to 2011. Use that, 2015, and the 2016 early release.

In [81]:
path = join(data_path, 'EIA downloads', 'eia8602015', '2___Plant_Y2015.xlsx')
nercs2015 = pd.read_excel(path, skiprows=0, parse_cols='C,L')
nercs2015.columns = ['plant id', 'nerc']
nercs2015['year'] = 2015

path = join(data_path, 'EIA downloads', 'eia8602016er', '2___Plant_Y2016_Early_Release.xlsx')
nercs2016 = pd.read_excel(path, skiprows=[0, 1], parse_cols='D,M')
nercs2016.columns = ['plant id', 'nerc']
nercs2016['year'] = 2016

path = join(data_path, 'EIA downloads', 'eia8602011', 'Plant.xlsx')
nercs2012 = pd.read_excel(path, skiprows=0, parse_cols='B,J')
nercs2012.columns = ['plant id', 'nerc']
nercs2012['year'] = 2012

In [82]:
nercs = pd.concat([nercs2012, nercs2015, nercs2016]).drop_duplicates(subset=['plant id', 'nerc'])

### Look for plants listed with different NERC labels
There are 22 plants duplicated. Two of them don't have a NERC label in one of the years. The largest move is from MRO to other regions (12), with most of those to SPP (7). After that, moves from RFC (5) to MRO (3) and SERC (2).

The plants that have duplicate NERC region labels represent a small fraction of national generation, but one that is growing over time. By 2016 they consist of 0.15% of national generation.

In [83]:
for df_ in [nercs2012, nercs2015, nercs2016, nercs]:
    print('{} total records'.format(len(df_)))
    print('{} unique plants'.format(len(df_['plant id'].unique())))

6855 total records
6855 unique plants
8928 total records
8928 unique plants
9610 total records
9610 unique plants
9997 total records
9975 unique plants


In [99]:
dup_plants = nercs.loc[nercs['plant id'].duplicated(keep=False), 'plant id'].unique()
dup_plants

array([   66,  1120,  1121,  7757,  7848,  7847,  6280, 57251, 57252,
          70,   899,  1168, 57449, 55836, 56266, 56106, 56856, 56985,
       57622, 57623, 57650, 58690])

In [100]:
region_list = []
for plant in dup_plants:
    regions = nercs.loc[nercs['plant id'] == plant, 'nerc'].unique()
#     regions = regions.tolist()
    region_list.append(regions)
Counter(tuple(x) for x in region_list)

Counter({('ASCC', nan): 2,
         ('MRO', 'RFC'): 2,
         ('MRO', 'SERC'): 1,
         ('MRO', 'SPP'): 7,
         ('MRO', 'WECC'): 2,
         ('RFC', 'MRO'): 3,
         ('RFC', 'SERC'): 2,
         ('SERC', 'SPP'): 1,
         ('SPP', 'SERC'): 2})

In [97]:
(facility_df.loc[facility_df['plant id'].isin(plants), :]
            .groupby('year')['generation (MWh)'].sum()
 / facility_df.loc[:, :]
              .groupby('year')['generation (MWh)'].sum())

year
2001    0.000345
2002    0.000269
2003    0.000262
2004    0.000313
2005    0.000426
2006    0.000514
2007    0.000509
2008    0.000527
2009    0.000631
2010    0.000683
2011    0.000763
2012    0.001268
2013    0.001119
2014    0.001032
2015    0.001420
2016    0.001590
2017    0.000878
Name: generation (MWh), dtype: float64

### Some plants in EIA-860 don't have NERC labels. Drop them now.

In [17]:
nan_plants = nercs.loc[nercs.isnull().any(axis=1)]
len(nan_plants)

35

In [16]:
nercs.loc[nercs['plant id'].isin(nan_plants)]

Unnamed: 0,plant id,nerc


In [18]:
nercs.dropna(inplace=True)

## Clean and prep data for KNN

In [19]:
# nercs = ['SERC', 'RFC', 'SPP', 'NPCC', 'WECC', 'MRO', 'TRE', 'FRCC']
nerc_names = nercs.nerc.unique()
nerc_names

array(['SERC', 'RFC', 'SPP', 'NPCC', 'WECC', 'MRO', 'TRE', 'HICC', 'ASCC',
       'FRCC'], dtype=object)

In [50]:
df = pd.merge(plants, nercs, on=['plant id'], how='left')

In [51]:
df.columns

Index(['plant id', 'year', 'lat', 'lon', 'state', 'nerc'], dtype='object')

Drop plants that don't have lat/lon data (using just lon to check), and then drop duplicates. If any plants have kept the same plant id but moved over time (maybe a diesel generator?) they will show up twice.

In [52]:
cols = ['plant id', 'lat', 'lon', 'nerc', 'state']
df_slim = (df.loc[:, cols].dropna(subset=['lon']).drop_duplicates())

In [108]:
len(df_slim)

8441

Separate out the list of plants where we don't have NERC labels from EIA-860.

In [107]:
unknown = df_slim.loc[df_slim.nerc.isnull()]

In [109]:
print("{} plants don't have NERC labels\n".format(len(unknown)))
print(unknown.head())

266 plants don't have NERC labels

       plant id        lat         lon nerc state
10892     61172  21.452002 -158.187888  NaN    HI
12757     58425  61.130000 -150.243611  NaN    AK
12799     58380  61.286000 -149.610000  NaN    AK
13166     58277  20.886700 -156.337800  NaN    HI
19106     58651  21.328056 -158.040000  NaN    HI


### Create X and y matricies
X is lat/lon

y is the NERC label

For both, I'm only using plants where we have all data (no `NaN`s). Not doing any transformation of the lat/lon at this time. There is certainly some error here, as KNN will use the Euclidian distance to calculate nearest neighbors. Not sure how I plan on dealing with this, or if it is even necessary.

In [55]:
X = df_slim.loc[df_slim.notnull().all(axis=1), ['lat', 'lon']]
y = df_slim.loc[df_slim.notnull().all(axis=1), 'nerc']

In [56]:
len(X)

8175

In [30]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

## GridSearch to find the best parameters

### Regular KNN classifier
Run gridsearch testing parameter values for weights, n_neighbors, and p. Interestingly, the score is slightly higher with p=1 (use Manhattan distance rather than Euclidean).

In [105]:
knn = neighbors.KNeighborsClassifier()

params = {'weights': ['uniform', 'distance'],
          'n_neighbors': [3, 5, 10, 15, 20],
          'p': [1, 2]
         }

clf_knn = GridSearchCV(knn, params, n_jobs=-1)

clf_knn.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'weights': ['uniform', 'distance'], 'n_neighbors': [3, 5, 10, 15, 20], 'p': [1, 2]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [104]:
clf_knn.best_estimator_, clf_knn.best_score_

(KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
            metric_params=None, n_jobs=1, n_neighbors=10, p=2,
            weights='distance'), 0.96439656746394009)

In [106]:
clf_knn.best_estimator_, clf_knn.best_score_

(KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
            metric_params=None, n_jobs=1, n_neighbors=5, p=1,
            weights='distance'), 0.96457914916925325)

### Radius neighbors classifier
Doesn't work as well as KNN

In [45]:
rnn = neighbors.RadiusNeighborsClassifier()

params = {'weights': ['uniform', 'distance'],
          'radius': [10, 50, 100],
          'leaf_size': [3, 5, 10],
          'p': [1, 2]}

clf_rnn = GridSearchCV(rnn, params, n_jobs=-1)

clf_rnn.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=RadiusNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
             metric_params=None, outlier_label=None, p=2, radius=1.0,
             weights='uniform'),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'weights': ['uniform', 'distance'], 'radius': [10, 50, 100], 'leaf_size': [3, 5, 10], 'p': [1, 2]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [46]:
clf_rnn.best_estimator_, clf_rnn.best_score_

(RadiusNeighborsClassifier(algorithm='auto', leaf_size=3, metric='minkowski',
              metric_params=None, outlier_label=None, p=1, radius=10,
              weights='distance'), 0.94431257987949613)

## Use best KNN parameters to predict NERC for unknown plants

In [111]:
unknown.loc[:, 'nerc'] = clf.predict(unknown.loc[:, ['lat', 'lon']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Ensuring that no plants in Alaska or Hawaii are assigned to continental NERCs, or the other way around.

In [117]:
print(unknown.loc[unknown.state.isin(['AK', 'HI']), 'nerc'].unique())
print(unknown.loc[unknown.nerc.isin(['HICC', 'ASCC']), 'state'].unique())

['HICC' 'ASCC']
['HI' 'AK']


In [119]:
Counter(unknown['nerc'])

Counter({'ASCC': 14,
         'FRCC': 4,
         'HICC': 10,
         'MRO': 8,
         'NPCC': 30,
         'RFC': 52,
         'SERC': 39,
         'SPP': 21,
         'TRE': 23,
         'WECC': 65})

## Export plants with lat/lon, state, and nerc

In [64]:
unknown.head()

Unnamed: 0,plant id,lat,lon,nerc,state
10892,61172,21.452002,-158.187888,HICC,HI
12757,58425,61.13,-150.243611,ASCC,AK
12799,58380,61.286,-149.61,ASCC,AK
13166,58277,20.8867,-156.3378,HICC,HI
19106,58651,21.328056,-158.04,HICC,HI


In [121]:
unknown.tail()

Unnamed: 0,plant id,lat,lon,nerc,state
89046,499,37.643611,-120.7575,WECC,CA
89073,7478,32.738889,-114.700278,WECC,AZ
89076,56197,35.301389,-77.631111,SERC,NC
89099,56508,35.226389,-119.628333,WECC,CA
89148,596,39.733889,-75.564444,RFC,DE


In [65]:
df_slim.head()

Unnamed: 0,plant id,lat,lon,nerc,state
0,10360,44.4936,-88.0303,MRO,WI
11,10052,40.7995,-124.2028,WECC,CA
26,10036,43.11391,-71.894001,NPCC,NH
41,10377,37.2939,-77.2697,SERC,VA
56,10362,35.7322,-95.2939,SPP,OK


In [122]:
labeled = pd.concat([df_slim.loc[df_slim.notnull().all(axis=1)], unknown])

In [123]:
labeled.loc[labeled.nerc.isnull()]

Unnamed: 0,plant id,lat,lon,nerc,state


In [124]:
path = join(data_path, 'Facility labels', 'Facility locations_knn.csv')
labeled.to_csv(path, index=False)