# Assign NERC labels to plants using 860 data and k-nearest neighbors

In [99]:
%matplotlib inline
import matplotlib.pyplot as plt
import os
from os.path import join
import pandas as pd
from sklearn import neighbors, metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from collections import Counter


cwd = os.getcwd()
data_path = join(cwd, '..', 'Data storage')

## Load data
This loads facility data that has been assembled from the EIA bulk data file, and EIA-860 excel files. The EIA-860 excel files need to be downloaded manually.

### Load EIA facility data
Only need to keep the plant id, year (as a check that plants don't move between years), and lat/lon

In [20]:
path = os.path.join(data_path, 'Facility gen fuels and CO2 2017-08-31.zip')
facility_df = pd.read_csv(path)
facility_df['state'] = facility_df['geography'].str[-2:]

In [21]:
plants = facility_df.loc[:, ['plant id', 'year', 'lat', 'lon', 'state']]
plants.drop_duplicates(inplace=True)

### Load known NERC labels from EIA-860
Current NERCS go back to 2012. Use that, 2015, and the 2016 early release.

In [9]:
eia_base_path = join(data_path, 'EIA downloads')
file_860_info = {
    2011: {'io': join(eia_base_path, 'eia8602011', 'Plant.xlsx'),
           'skiprows': 0,
           'parse_cols': 'B,J'},
    2012: {'io': join(eia_base_path, 'eia8602012', 'PlantY2012.xlsx'),
           'skiprows': 0,
           'parse_cols': 'B,J'},
    2013: {'io': join(eia_base_path, 'eia8602013', '2___Plant_Y2013.xlsx'),
           'skiprows': 0,
           'parse_cols': 'C,L'},
    2014: {'io': join(eia_base_path, 'eia8602014', '2___Plant_Y2014.xlsx'),
           'skiprows': 0,
           'parse_cols': 'C,L'},
    2015: {'io': join(eia_base_path, 'eia8602015', '2___Plant_Y2015.xlsx'),
           'skiprows': 0,
           'parse_cols': 'C,L'},
    2016: {'io': join(eia_base_path, 'eia8602016er',
                        '2___Plant_Y2016_Early_Release.xlsx'),
           'skiprows': [0, 1],
           'parse_cols': 'D,M'}
}

In [13]:
eia_nercs = {}
for key, args in file_860_info.items():
    eia_nercs[key] = pd.read_excel(**args)
    eia_nercs[key].columns = ['plant id', 'nerc']
    eia_nercs[key]['year'] = key

In [81]:
path = join(data_path, 'EIA downloads', 'eia8602015', '2___Plant_Y2015.xlsx')
nercs2015 = pd.read_excel(path, skiprows=0, parse_cols='C,L')
nercs2015.columns = ['plant id', 'nerc']
nercs2015['year'] = 2015

path = join(data_path, 'EIA downloads', 'eia8602016er', '2___Plant_Y2016_Early_Release.xlsx')
nercs2016 = pd.read_excel(path, skiprows=[0, 1], parse_cols='D,M')
nercs2016.columns = ['plant id', 'nerc']
nercs2016['year'] = 2016

path = join(data_path, 'EIA downloads', 'eia8602011', 'Plant.xlsx')
nercs2012 = pd.read_excel(path, skiprows=0, parse_cols='B,J')
nercs2012.columns = ['plant id', 'nerc']
nercs2012['year'] = 2012

In [14]:
nercs = pd.concat(eia_nercs.values()).drop_duplicates(subset=['plant id', 'nerc'])

### Look for plants listed with different NERC labels
There are 30 plants duplicated. Five of them don't have a NERC label in one of the years. The largest move is from MRO to other regions (12), with most of those to SPP (7). After that, moves from RFC (5) to MRO (3) and SERC (2). There are also some moves from WECC and FRCC to HICC/ASCC - these might be diesel generators that get moved.

The plants that have duplicate NERC region labels represent a small fraction of national generation, but one that is growing over time. By 2016 they consist of 0.15% of national generation.

In [16]:
for df_ in list(eia_nercs.values()) + [nercs]:
    print('{} total records'.format(len(df_)))
    print('{} unique plants'.format(len(df_['plant id'].unique())))

6855 total records
6855 unique plants
7289 total records
7289 unique plants
8060 total records
8060 unique plants
8520 total records
8520 unique plants
8928 total records
8928 unique plants
9610 total records
9610 unique plants
10072 total records
10038 unique plants


In [23]:
dup_plants = nercs.loc[nercs['plant id'].duplicated(keep=False), 'plant id'].unique()
dup_plants

array([   66,  1120,  1121,  7757,  7848,  7847,  6280, 57251, 57252,
          70,   899,  1168, 57449, 55836, 56266, 56106, 56856, 56985,
       57622, 57623, 57650, 58469, 58117, 58278, 58511, 59027, 59037,
       58690, 58655, 58676])

In [24]:
region_list = []
for plant in dup_plants:
    regions = nercs.loc[nercs['plant id'] == plant, 'nerc'].unique()
#     regions = regions.tolist()
    region_list.append(regions)
Counter(tuple(x) for x in region_list)

Counter({('ASCC', nan): 2,
         ('FRCC', 'HICC'): 1,
         ('MRO', 'RFC'): 2,
         ('MRO', 'SERC'): 1,
         ('MRO', 'SPP'): 7,
         ('MRO', 'WECC'): 2,
         ('RFC', 'MRO'): 3,
         ('RFC', 'SERC'): 2,
         ('SERC', 'SPP'): 1,
         ('SPP', 'SERC'): 2,
         ('WECC', 'ASCC'): 2,
         ('WECC', 'HICC'): 1,
         (nan, 'WECC', 'ASCC'): 3,
         (nan, 'WECC', 'HICC'): 1})

In [25]:
(facility_df.loc[facility_df['plant id'].isin(dup_plants), :]
            .groupby('year')['generation (MWh)'].sum()
 / facility_df.loc[:, :]
              .groupby('year')['generation (MWh)'].sum())

year
2001    0.000345
2002    0.000269
2003    0.000262
2004    0.000313
2005    0.000426
2006    0.000514
2007    0.000509
2008    0.000527
2009    0.000631
2010    0.000683
2011    0.000763
2012    0.001286
2013    0.001138
2014    0.001052
2015    0.001442
2016    0.001590
2017    0.000878
Name: generation (MWh), dtype: float64

### Some plants in EIA-860 don't have NERC labels. Drop them now.
This is my training data. All of these plants should still be in my `plants` dataframe.

In [26]:
nan_plants = nercs.loc[nercs.isnull().any(axis=1)]
len(nan_plants)

40

In [28]:
nercs.loc[nercs['plant id'].isin(nan_plants['plant id'])]

Unnamed: 0,plant id,nerc,year
66,66,ASCC,2011
1933,70,ASCC,2011
1637,58277,,2012
1961,58405,,2012
4145,58469,,2012
6945,58117,,2012
6946,58278,,2012
7164,58380,,2012
7194,58425,,2012
7262,58511,,2012


In [18]:
nercs.dropna(inplace=True)

## Clean and prep data for KNN

In [44]:
df = pd.merge(plants, nercs.drop('year', axis=1), on=['plant id'], how='left')

In [45]:
df.columns

Index(['plant id', 'year', 'lat', 'lon', 'state', 'nerc'], dtype='object')

Drop plants that don't have lat/lon data (using just lon to check), and then drop duplicates. If any plants have kept the same plant id but moved over time (maybe a diesel generator?) or switched NERC they will show up twice.

In [46]:
df.loc[df.lon.isnull()].drop_duplicates(subset='plant id')

Unnamed: 0,plant id,year,lat,lon,state,nerc
80369,10851,2006,,,NJ,
80932,2666,2004,,,NY,
81768,50249,2004,,,TX,
82462,54243,2005,,,GA,
83119,56672,2010,,,MN,
87336,56168,2003,,,MN,
87552,55303,2008,32.295556,,MS,
87560,54516,2006,41.8875,,IL,
88212,50168,2003,,,LA,
88454,50313,2004,,,NJ,


In [47]:
df.loc[df.lat.isnull()].drop_duplicates(subset='plant id')

Unnamed: 0,plant id,year,lat,lon,state,nerc
80369,10851,2006,,,NJ,
80932,2666,2004,,,NY,
81768,50249,2004,,,TX,
82462,54243,2005,,,GA,
83119,56672,2010,,,MN,
87336,56168,2003,,,MN,
88212,50168,2003,,,LA,
88454,50313,2004,,,NJ,
89011,10257,2005,,,CA,


In [48]:
cols = ['plant id', 'lat', 'lon', 'nerc', 'state']
df_slim = (df.loc[:, cols].dropna(subset=['lon']).drop_duplicates(subset=['plant id', 'nerc']))

In [49]:
len(df_slim)

8450

In [51]:
df_slim.head()

Unnamed: 0,plant id,lat,lon,nerc,state
0,10360,44.4936,-88.0303,MRO,WI
11,10052,40.7995,-124.2028,WECC,CA
26,10036,43.11391,-71.894001,NPCC,NH
41,10377,37.2939,-77.2697,SERC,VA
56,10362,35.7322,-95.2939,SPP,OK


Separate out the list of plants where we don't have NERC labels from EIA-860.

In [52]:
unknown = df_slim.loc[df_slim.nerc.isnull()]

In [53]:
print("{} plants don't have NERC labels\n".format(len(unknown)))
print(unknown.head())

258 plants don't have NERC labels

       plant id        lat         lon nerc state
7034         70  55.615208 -131.354170  NaN    AK
10907     61172  21.452002 -158.187888  NaN    HI
12772     58425  61.130000 -150.243611  NaN    AK
12814     58380  61.286000 -149.610000  NaN    AK
13181     58277  20.886700 -156.337800  NaN    HI


### Create X and y matricies
X is lat/lon

y is the NERC label

For both, I'm only using plants where we have all data (no `NaN`s). Not doing any transformation of the lat/lon at this time. There is certainly some error here, as KNN will use the Euclidian distance to calculate nearest neighbors. Not sure how I plan on dealing with this, or if it is even necessary.

In [116]:
X = df_slim.loc[df_slim.notnull().all(axis=1), ['lat', 'lon']]
y = df_slim.loc[df_slim.notnull().all(axis=1), 'nerc']

# le = LabelEncoder()
# le.fit(y)

# y = le.transform(y)

In [117]:
len(X)

8192

In [118]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

## GridSearch to find the best parameters

### Regular KNN classifier
Run gridsearch testing parameter values for weights, n_neighbors, and p (use Euclidean or Manhattan distance).

With 15 neighbors, weights by distance, and Euclidean distance, the model is able to accurately predict the test sample NERC region with 96% accuracy. This varies by region, with the lowest accuracy scores for TRE and SPP (89% and 87%), and the highest accuracy scores for WECC and NPCC (each 99%). F1 scores tend to be similar to the accuracy, although TRE has slightly higher F1 (0.94 vs 0.89).

In [119]:
knn = neighbors.KNeighborsClassifier()

params = {'weights': ['uniform', 'distance'],
          'n_neighbors': [10, 15, 20],
          'p': [1, 2]
         }

clf_knn = GridSearchCV(knn, params, n_jobs=-1, iid=False, verbose=1)

clf_knn.fit(X_train, y_train)

Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:    0.7s finished


GridSearchCV(cv=None, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params={}, iid=False, n_jobs=-1,
       param_grid={'weights': ['uniform', 'distance'], 'n_neighbors': [10, 15, 20], 'p': [1, 2]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=1)

In [120]:
clf_knn.best_estimator_, clf_knn.best_score_

(KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
            metric_params=None, n_jobs=1, n_neighbors=15, p=2,
            weights='distance'), 0.96210623124413541)

In [121]:
clf_knn.score(X_test, y_test)

0.96042899408284022

In [129]:
nerc_labels = nercs.nerc.dropna().unique()

Accuracy score by region

In [133]:
for region in nerc_labels:
    mask = y_test == region
    
    X_masked = X_test[mask]
    y_hat_masked = clf_knn.predict(X_masked)
    y_test_masked = y_test[mask]
    
    accuracy = metrics.accuracy_score(y_test_masked, y_hat_masked)
    print('{} : {}'.format(region, accuracy))

SERC : 0.9537037037037037
RFC : 0.9454148471615721
SPP : 0.8671328671328671
NPCC : 0.9880597014925373
WECC : 0.9937810945273632
MRO : 0.9461279461279462
TRE : 0.8934426229508197
HICC : 0.9285714285714286
ASCC : 1.0
FRCC : 0.9298245614035088


F1 score by region

In [134]:
y_hat = clf_knn.predict(X_test)

for region in nerc_labels:
    f1 = metrics.f1_score(y_test, y_hat, labels=[region], average='macro')
    print('{} : {}'.format(region, f1))

SERC : 0.9460390355912743
RFC : 0.9454148471615721
SPP : 0.8611111111111112
NPCC : 0.9880597014925373
WECC : 0.9919304779639975
MRO : 0.9413735343383585
TRE : 0.9396551724137931
HICC : 0.962962962962963
ASCC : 0.988235294117647
FRCC : 0.954954954954955


In [135]:
metrics.f1_score(y_test, y_hat, average='micro')

0.96042899408284022

In [136]:
metrics.f1_score(y_test, y_hat, average='macro')

0.95197370921082081

## Use best KNN parameters to predict NERC for unknown plants

In [73]:
unknown.loc[:, 'nerc'] = clf_knn.predict(unknown.loc[:, ['lat', 'lon']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Ensuring that no plants in Alaska or Hawaii are assigned to continental NERCs, or the other way around.

In [74]:
print(unknown.loc[unknown.state.isin(['AK', 'HI']), 'nerc'].unique())
print(unknown.loc[unknown.nerc.isin(['HICC', 'ASCC']), 'state'].unique())

['ASCC' 'HICC']
['AK' 'HI']


In [75]:
Counter(unknown['nerc'])

Counter({'ASCC': 19,
         'FRCC': 4,
         'HICC': 10,
         'MRO': 9,
         'NPCC': 27,
         'RFC': 52,
         'SERC': 37,
         'SPP': 20,
         'TRE': 23,
         'WECC': 57})

## Export plants with lat/lon, state, and nerc

In [76]:
unknown.head()

Unnamed: 0,plant id,lat,lon,nerc,state
7034,70,55.615208,-131.35417,ASCC,AK
10907,61172,21.452002,-158.187888,HICC,HI
12772,58425,61.13,-150.243611,ASCC,AK
12814,58380,61.286,-149.61,ASCC,AK
13181,58277,20.8867,-156.3378,HICC,HI


In [77]:
unknown.tail()

Unnamed: 0,plant id,lat,lon,nerc,state
89101,499,37.643611,-120.7575,WECC,CA
89128,7478,32.738889,-114.700278,WECC,AZ
89131,56197,35.301389,-77.631111,SERC,NC
89154,56508,35.226389,-119.628333,WECC,CA
89203,596,39.733889,-75.564444,RFC,DE


In [78]:
df_slim.head()

Unnamed: 0,plant id,lat,lon,nerc,state
0,10360,44.4936,-88.0303,MRO,WI
11,10052,40.7995,-124.2028,WECC,CA
26,10036,43.11391,-71.894001,NPCC,NH
41,10377,37.2939,-77.2697,SERC,VA
56,10362,35.7322,-95.2939,SPP,OK


In [79]:
labeled = pd.concat([df_slim.loc[df_slim.notnull().all(axis=1)], unknown])

In [80]:
labeled.loc[labeled.nerc.isnull()]

Unnamed: 0,plant id,lat,lon,nerc,state


There are 11 facilities that don't show up in my labeled data - they didn't have lat/lon info.

In [83]:
facility_df.loc[~facility_df['plant id'].isin(labeled['plant id']),
                'plant id'].unique()

array([10851,  2666, 50249, 54243, 56672, 56168, 55303, 54516, 50168,
       50313, 10257])

In [84]:
path = join(data_path, 'Facility labels', 'Facility locations_knn.csv')
labeled.to_csv(path, index=False)