# Assign NERC labels to plants using 860 data and RandomForest

## Instructions
Make sure the `file_date` parameter below is set to whatever value you would like appended to file names.

Change the `most_recent_860_year` parameter below to match the most up-to-date EIA-860 annual data file. As of March 2018 this is 2016.

EIA-860 (annual) excel files will need to be [downloaded](https://www.eia.gov/electricity/data/eia860/) and unzipped to the `EIA downloads` folder. Make sure that all years from 2012 through the most recent data year are available. Also download the most recent [EIA-860m](https://www.eia.gov/electricity/data/eia860m/) to `EIA downloads`.

The most recent annual 860 file available (as of March 2018) represents 2016 data. When newer EIA-860 annual files are added the dictionary with pandas `read_excel` parameters will need to be updated. Note that EIA puts out an Early Release version of 860 with extra header rows and columns, so be sure to appropriately adjust the `skiprows` and `usecols` parameters if using an Early Release file.

The entire notebook can be run at once using *Run All Cells*

In [26]:
%matplotlib inline
import matplotlib.pyplot as plt
import os
from os.path import join
import pandas as pd
from sklearn import neighbors, metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from collections import Counter
from copy import deepcopy

from src.params import DATA_PATHS, DATA_DATE, LAST_ANNUAL_923_YEAR, EIA_860_NERC_INFO
from src.util import download_unzip, download_save


# cwd = os.getcwd()
# data_path = join(cwd, '..', 'Data storage')

## Load data
This loads facility data that has been assembled from the EIA bulk data file, and EIA-860 excel files. The EIA-860 excel files need to be downloaded manually.

In [104]:
def load_plants():
    """
    Build the table of plant locations used as model features.

    Reads ``FACILITY_DF`` from ``src.analysis.load_transformed_data`` and
    keeps the 'plant id', 'year', 'lat', 'lon' and 'state' columns.

    Because the most recent year(s) of the facility dataframe only include
    annually reporting facilities, rows for any year after
    ``LAST_ANNUAL_923_YEAR`` are replaced with a copy of the plant id,
    lat/lon, and state information from the last year with full data
    (``LAST_ANNUAL_923_YEAR``), re-stamped with the later year.

    Returns
    -------
    DataFrame
        De-duplicated rows with columns 'plant id', 'year', 'lat', 'lon',
        'state' and a fresh 0..n-1 index.
    """
    from src.analysis.load_transformed_data import FACILITY_DF

    plants = FACILITY_DF.loc[:, ['plant id', 'year', 'lat', 'lon', 'state']]

    last_full_year_plants = plants.loc[plants.year == LAST_ANNUAL_923_YEAR, :]

    # Start with every row up to the last full year, then append one re-stamped
    # copy of the last full year for each later year found in the data.
    # Collecting everything in a single list means pd.concat never receives an
    # empty list, which would raise when no later years are present.
    frames = [plants.loc[plants.year <= LAST_ANNUAL_923_YEAR, :]]
    frames.extend(
        last_full_year_plants.assign(year=year)
        for year in plants['year'].unique()
        if year > LAST_ANNUAL_923_YEAR
    )

    plants = pd.concat(frames)
    plants.drop_duplicates(inplace=True)
    plants.reset_index(inplace=True, drop=True)

    return plants

In [105]:
plants = load_plants()

In [106]:
plants.head(10)

Unnamed: 0,plant id,year,lat,lon,state
0,10120,2017,31.164772,-81.478724,GA
1,10120,2016,31.164772,-81.478724,GA
2,10120,2015,31.164772,-81.478724,GA
3,10120,2014,31.164772,-81.478724,GA
4,10120,2013,31.164772,-81.478724,GA
5,10120,2012,31.164772,-81.478724,GA
6,10120,2011,31.164772,-81.478724,GA
7,10120,2010,31.164772,-81.478724,GA
8,10120,2009,31.164772,-81.478724,GA
9,10120,2008,31.164772,-81.478724,GA


### Load known NERC labels from EIA-860
Current NERCS go back to 2012. Use all annual 860 files from 2012 through the most recent available. Extend the dictionary of dictionaries below with any files available after 2016. `io`, `skiprows`, and `usecols` are all input parameters for the Pandas `read_excel` function.

In [7]:
# Keyword arguments for pandas `read_excel` ('io', 'skiprows', 'usecols'),
# keyed by EIA-860 data year. 2011 is not included.
# NOTE(review): `data_path` must already be defined when this cell runs.
eia_base_path = join(data_path, 'EIA downloads')

# The 2012 workbook has its own file name and column letters.
file_860_info = {
    2012: dict(
        io=join(eia_base_path, 'eia8602012', 'PlantY2012.xlsx'),
        skiprows=0,
        usecols='B,J',
    ),
}

# 2013 through 2017 share one naming pattern and the same column letters.
file_860_info.update({
    data_year: dict(
        io=join(eia_base_path, f'eia860{data_year}',
                f'2___Plant_Y{data_year}.xlsx'),
        skiprows=0,
        usecols='C,L',
    )
    for data_year in range(2013, 2018)
})

In [58]:
def extract_860_nerc_labels(year):
    """
    Read plant id / NERC region pairs from an annual EIA-860 plant workbook.

    Years up to and including 2012 all read the 2012 workbook; later years
    read their own workbook. If the workbook is not on disk, the matching zip
    is fetched with `download_unzip`, trying the archive URL first and the
    non-archive URL if that raises ValueError.

    Parameters
    ----------
    year : int
        Year to stamp on the returned rows (and, after 2012, the data year of
        the workbook that is read).

    Returns
    -------
    DataFrame
        Columns 'plant id', 'nerc', 'year'.
    """
    eia860_dir = DATA_PATHS['eia860']

    if year <= 2012:
        data_year = 2012
        workbook = eia860_dir / 'eia8602012' / 'PlantY2012.xlsx'
        excel_cols = 'B,J'
    else:
        data_year = year
        workbook = eia860_dir / f'eia860{year}' / f'2___Plant_Y{year}.xlsx'
        excel_cols = 'C,L'

    if not workbook.exists():
        eia860_url = 'https://www.eia.gov/electricity/data/eia860/'
        unzip_dir = workbook.parent
        try:
            download_unzip(
                eia860_url + f'archive/xls/eia860{data_year}.zip', unzip_dir
            )
        except ValueError:
            # Fall back to the non-archive location of the zip file.
            download_unzip(eia860_url + f'xls/eia860{data_year}.zip', unzip_dir)

    eia_nercs = pd.read_excel(io=workbook, skiprows=1, usecols=excel_cols)
    eia_nercs.columns = ['plant id', 'nerc']
    # Stamp the requested year, which can be earlier than the data year.
    eia_nercs['year'] = year

    return eia_nercs

In [59]:
test = extract_860_nerc_labels(2015)

In [60]:
test.head()

Unnamed: 0,plant id,nerc,year
0,2,SERC,2015
1,3,SERC,2015
2,4,SERC,2015
3,7,SERC,2015
4,8,SERC,2015


In [77]:
nercs = pd.concat(
    [extract_860_nerc_labels(year) for year in range(2001, LAST_ANNUAL_923_YEAR + 1)]
)

In [62]:
nercs.head()

Unnamed: 0,plant id,nerc,year
0,10867,SERC,2001
1,50903,RFC,2001
2,10671,SPP,2001
3,2527,NPCC,2001
4,3305,SERC,2001


In [63]:
nercs.tail()

Unnamed: 0,plant id,nerc,year
10122,61955,RFC,2017
10123,61956,SERC,2017
10124,61957,SPP,2017
10125,61958,NPCC,2017
10126,61959,NPCC,2017


### Define training data (has all info)

In [113]:
training = nercs.dropna()
unknown = nercs.loc[nercs['nerc'].isnull(), :]

In [118]:
unknown.head()

Unnamed: 0,plant id,nerc,year
1637,58277,,2001
1961,58405,,2001
4145,58469,,2001
6945,58117,,2001
6946,58278,,2001


### Some plants in EIA-860 don't have NERC labels. Drop them now.
This is my training data. All of these plants should still be in my `plants` dataframe.

# Rows of `nercs` with a missing value in any column.
has_missing = nercs.isnull().any(axis=1)

# Map each year to the plant ids that have a missing value in that year.
years = nercs.year.unique()
nan_plants = {
    yr: nercs.loc[(nercs.year == yr) & has_missing, 'plant id'].tolist()
    for yr in years
}

# Flat list of every (plant, year) occurrence with a missing value.
all_nan = [plant_id for ids in nan_plants.values() for plant_id in ids]

# number of plants that don't have a nerc in at least one year
len(all_nan)

# drop all the rows without a nerc value
nercs.dropna(inplace=True)

nan_plants[2017]

## Load EIA-860m for some info on recent facilities
The EIA-860m (monthly) data file has an up-to-date list of all operating power plants and their associated balancing authority. It does not list the NERC region, so it can't be used to assign NERC labels for all plants. But in some cases knowing the state and balancing authority is enough to make a good guess about which NERC a plant is in.

Assigning NERC region labels has the lowest accuracy for plants in SPP and TRE. To compensate, I'm going to assume that anything in TX or OK and SWPP balancing authority is in SPP. On the flip side, if it's in TX and ERCOT I'll assign it to TRE.

Only do this for plants that come online since the most recent 860 annual data.

**NOTE**
Because I'm looking at units that came online in 2017, some of the plant ids will already exist.

In [111]:
def label_new_spp_ercot(filename=None):
    """
    Guess SPP/TRE NERC labels for recently added plants from EIA-860M.

    Downloads the EIA-860M generator workbook if it is not already saved
    (either the most recent month listed on the 860m website or a specific
    file), reads its 'Operating' sheet, and keeps rows whose operating year is
    after LAST_ANNUAL_923_YEAR. Rows in TX or OK with balancing authority
    code 'SWPP' are labeled 'SPP'; rows in TX with balancing authority code
    'ERCO' are labeled 'TRE'. Rows with any missing value (including rows
    that received no label) are dropped.

    Parameters
    ----------
    filename : str, optional
        the excel filename to download, in format <month>_generator<year>.xlsx
        (the default is None, in which case the most recent file is determined
        from the 860m website)

    Returns
    -------
    DataFrame
        De-duplicated rows with columns 'plant id', 'nerc', 'year', where
        'year' holds the operating year.
    """
    base_url = 'https://www.eia.gov/electricity/data/eia860m/'

    if not filename:
        # Scrape the 860m website; the first table entry is 'Month year'.
        listing = pd.read_html(base_url, header=0, flavor='lxml')[0]
        month, year = listing['EIA 860M'][0].split()
        filename = f'{month.lower()}_generator{year}.xlsx'

    save_path = DATA_PATHS['eia860m'] / filename
    if not save_path.exists():
        download_save(url=base_url + f'xls/{filename}', save_path=save_path)

    operating = pd.read_excel(
        save_path,
        sheet_name='Operating',
        skipfooter=1,
        usecols='C,F,P,AE',
        skiprows=1,
    )
    operating.columns = operating.columns.str.lower()

    # Only plants that came online after the last annual data year.
    came_online_later = operating['operating year'] > LAST_ANNUAL_923_YEAR
    m860 = operating.loc[came_online_later].copy()

    state = m860['plant state']
    balancing_authority = m860['balancing authority code']

    in_spp = state.isin(['TX', 'OK']) & (balancing_authority == 'SWPP')
    m860.loc[in_spp, 'nerc'] = 'SPP'

    in_tre = state.isin(['TX']) & (balancing_authority == 'ERCO')
    m860.loc[in_tre, 'nerc'] = 'TRE'

    labels = m860.dropna().reset_index(drop=True)
    labels = labels[['plant id', 'nerc', 'operating year']]
    labels.columns = ['plant id', 'nerc', 'year']

    return labels.drop_duplicates()

### Append my 2017 SPP and TRE guesses to the full nerc dataframe

In [112]:
def add_new_spp_tre_labels(nercs):
    """
    Append the SPP/TRE guesses from `label_new_spp_ercot` to `nercs`.

    Parameters
    ----------
    nercs : DataFrame
        Existing NERC labels; not modified.

    Returns
    -------
    DataFrame
        `nercs` followed by the rows returned by `label_new_spp_ercot()`.
    """
    return pd.concat([nercs, label_new_spp_ercot()])

In [114]:
# nercs = pd.concat([nercs, new_spp_ercot])

training = add_new_spp_tre_labels(training)

## Clean and prep data for KNN

In [107]:
plants.head()

Unnamed: 0,plant id,year,lat,lon,state
0,10120,2017,31.164772,-81.478724,GA
1,10120,2016,31.164772,-81.478724,GA
2,10120,2015,31.164772,-81.478724,GA
3,10120,2014,31.164772,-81.478724,GA
4,10120,2013,31.164772,-81.478724,GA


In [82]:
nercs.tail()

Unnamed: 0,plant id,nerc,year
10122,61955,RFC,2017
10123,61956,SERC,2017
10124,61957,SPP,2017
10125,61958,NPCC,2017
10126,61959,NPCC,2017


Checked to make sure the type of merge doesn't matter once rows without nerc values are dropped

In [115]:
training_features = pd.merge(training, plants, on=['plant id', 'year'], how='inner')
training_features.dropna(inplace=True)

In [116]:
training_features.sample(5)

Unnamed: 0,plant id,nerc,year,lat,lon,state
11599,976,SERC,2003,37.619747,-88.953114,IL
28226,54710,MRO,2007,42.490556,-90.745,IA
19978,1844,RFC,2005,42.2717,-84.9411,MI
57463,71,ASCC,2012,57.6861,-152.895,AK
12658,54365,FRCC,2003,27.87082,-81.825061,FL


In [117]:
training_features.loc[training_features.duplicated()]

Unnamed: 0,plant id,nerc,year,lat,lon,state


In [35]:
omitted = set(df['plant id'].unique()) - set(nercs['plant id'].unique())

In [36]:
df.head()

Unnamed: 0,plant id,year,lat,lon,state,nerc
0,936,2017,38.6106,-89.3583,IL,SERC
1,936,2016,38.6106,-89.3583,IL,SERC
2,936,2015,38.6106,-89.3583,IL,SERC
3,936,2014,38.6106,-89.3583,IL,SERC
4,936,2013,38.6106,-89.3583,IL,SERC


In [37]:
df.tail()

Unnamed: 0,plant id,year,lat,lon,state,nerc
110046,57729,2018,33.058611,-117.119444,CA,WECC
110047,57716,2018,40.949536,-76.04739,PA,RFC
110048,57583,2018,64.8167,-147.725,AK,ASCC
110049,57710,2018,42.556501,-73.377477,NY,NPCC
110050,58826,2018,39.728611,-84.208333,OH,RFC


In [38]:
df.columns

Index(['plant id', 'year', 'lat', 'lon', 'state', 'nerc'], dtype='object')

Drop plants that don't have lat/lon data (using just lon to check), and then drop duplicates. If any plants have kept the same plant id but moved over time (maybe a diesel generator?) or switched NERC they will show up twice.

In [39]:
cols = ['plant id', 'lat', 'lon', 'nerc', 'state', 'year']
df_slim = (df.loc[:, cols].dropna(subset=['lon'])
             .drop_duplicates(subset=['plant id', 'year', 'nerc']))

In [40]:
df_slim.tail()

Unnamed: 0,plant id,lat,lon,nerc,state,year
110046,57729,33.058611,-117.119444,WECC,CA,2018
110047,57716,40.949536,-76.04739,RFC,PA,2018
110048,57583,64.8167,-147.725,ASCC,AK,2018
110049,57710,42.556501,-73.377477,NPCC,NY,2018
110050,58826,39.728611,-84.208333,RFC,OH,2018


Separate out the list of plants where we don't have NERC labels from EIA-860.

In [41]:
unknown = df_slim.loc[df_slim.nerc.isnull()].copy()

In [42]:
print("{} plants/years don't have NERC labels\n".format(len(unknown)))
print(unknown.head())

1604 plants/years don't have NERC labels

      plant id      lat       lon nerc state  year
2032        66  59.4545 -135.3131  NaN    AK  2017
2033        66  59.4545 -135.3131  NaN    AK  2016
2034        66  59.4545 -135.3131  NaN    AK  2015
2035        66  59.4545 -135.3131  NaN    AK  2014
2036        66  59.4545 -135.3131  NaN    AK  2013


In [43]:
unknown.tail()

Unnamed: 0,plant id,lat,lon,nerc,state,year
105025,58277,20.8867,-156.3378,,HI,2018
109895,10672,30.4225,-81.6064,,FL,2018
110004,57932,38.82,-90.818056,,MO,2018
110005,55098,26.208,-98.3992,,TX,2018
110006,50730,33.4885,-86.9212,,AL,2018


### Create X and y matrices
X is lat/lon and year

y is the NERC label

For both, I'm only using plants where we have all data (no `NaN`s). Not doing any transformation of the lat/lon at this time.

In [None]:
def train_model(training_df):
    """
    Placeholder for a model-training entry point (not implemented).

    Parameters
    ----------
    training_df : DataFrame
        Intended training data; currently unused.

    Raises
    ------
    NotImplementedError
        Always. A bare signature with no body is a syntax error, so the stub
        fails explicitly until a body is written.
    """
    raise NotImplementedError('train_model has not been implemented')

In [44]:
# Rows of df_slim with no missing value in any column.
complete_rows = df_slim.notnull().all(axis=1)

# Features are lat/lon and year; the target is the NERC label.
X = df_slim.loc[complete_rows, ['lat', 'lon', 'year']]
y = df_slim.loc[complete_rows, 'nerc']

In [45]:
len(X)

108300

In [46]:
# Make sure that unknown and X include all records from df_slim
len(X) + len(unknown) - len(df_slim)

0

In [47]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

## GridSearch to find the best parameters in a RandomForest Classifier

I previously used k-nearest neighbors with just lat/lon as input features. The problem is that some facilities don't have lat/lon data. They do usually have a state geography label though. Categorical labels don't work well in KNN, but the combination of lat/lon and a state label will work well in a tree model. RandomForest is usually a quite effective tree model and my results are more accurate with this than they were with KNN.

In [48]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
def classify_rf(X, y, feature_cols, unknown, verbose=1):
    """
    Grid-search a RandomForest classifier and label the unknown plants.

    Parameters
    ----------
    X : DataFrame
        Training features; only the `feature_cols` columns are used.
    y : array-like
        Training labels, aligned with the rows of `X`.
    feature_cols : list
        Column names used as features in both `X` and `unknown`.
    unknown : DataFrame
        Rows to label. Its 'nerc' column is overwritten in place.
    verbose : int, optional
        Verbosity passed to GridSearchCV (the default is 1).

    Returns
    -------
    DataFrame
        `unknown`, with 'nerc' set to the predictions of the best estimator.
    """
    rf = RandomForestClassifier()
    params = dict(
        n_estimators = [10, 25, 50],
        min_samples_split = [2, 5, 10],
        min_samples_leaf = [1, 3, 5],
    )

    # NOTE(review): `iid` was removed in later scikit-learn releases; drop it
    # if the installed version rejects the argument.
    clf_rf = GridSearchCV(rf, params, n_jobs=-1, iid=False, verbose=verbose)
    clf_rf.fit(X[feature_cols], y)

    # Predict on the `unknown` argument (a misspelled name here previously
    # raised NameError).
    unknown.loc[:, 'nerc'] = clf_rf.predict(unknown[feature_cols])

    return unknown

def classify_latlon():
    """
    Placeholder for a lat/lon classification entry point (not implemented).

    Raises
    ------
    NotImplementedError
        Always. A signature without a colon or body is a syntax error, so the
        stub fails explicitly until a body is written.
    """
    raise NotImplementedError('classify_latlon has not been implemented')

In [49]:
# Hyperparameter grid searched for the lat/lon/year model.
params = {
    'n_estimators': [5, 10, 25, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 3, 5],
}
rf = RandomForestClassifier()

clf_rf = GridSearchCV(
    estimator=rf,
    param_grid=params,
    n_jobs=-1,
    iid=False,
    verbose=1,
)
# Left as the last expression so the notebook shows the fitted search object.
clf_rf.fit(X_train, y_train)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   20.4s
[Parallel(n_jobs=-1)]: Done 108 out of 108 | elapsed:   57.8s finished


GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=False, n_jobs=-1,
       param_grid={'n_estimators': [5, 10, 25, 50], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 3, 5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [50]:
clf_rf.best_estimator_, clf_rf.best_score_

(RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
             max_depth=None, max_features='auto', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=25, n_jobs=1,
             oob_score=False, random_state=None, verbose=0,
             warm_start=False), 0.9802648406321238)

In [51]:
clf_rf.score(X_test, y_test)

0.9820364307898934

In [52]:
nerc_labels = nercs.nerc.dropna().unique()

Accuracy score by region

In [53]:
# Score the lat/lon model separately on the test rows of each NERC region.
for region in nerc_labels:
    in_region = y_test == region
    region_accuracy = metrics.accuracy_score(
        y_test[in_region], clf_rf.predict(X_test[in_region])
    )
    print(f'{region} : {region_accuracy}')

WECC : 0.9984802431610942
NPCC : 0.9926676730644813
RFC : 0.9676161410647677
SERC : 0.9771043771043771
SPP : 0.9272338148742643
TRE : 0.9808087731322824
MRO : 0.9820689655172414
FRCC : 0.9697732997481109
ASCC : 1.0
HICC : 1.0


F1 score by region

In [54]:
# Predict once on the full test set, then report F1 one region at a time by
# restricting `labels` to that region.
y_hat = clf_rf.predict(X_test)

for region in nerc_labels:
    region_f1 = metrics.f1_score(
        y_test, y_hat, labels=[region], average='macro'
    )
    print(f'{region} : {region_f1}')

WECC : 0.9982779578606159
NPCC : 0.9933103150625809
RFC : 0.9690949227373067
SERC : 0.9725201072386058
SPP : 0.9462189462189461
TRE : 0.9808087731322824
MRO : 0.9755651975336835
FRCC : 0.9821428571428571
ASCC : 1.0
HICC : 0.9979959919839679


## Plants without lat/lon
Use just the state for plants that don't have lat/lon info. Less accurate, especially where NERC regions cross state lines, but better than nothing.

Need to start with the `lon` column so I can filter to only unknown facilities that also don't have lon

In [55]:
cols = ['plant id', 'nerc', 'state', 'year', 'lon']
df_state_slim = (df.loc[:, cols].dropna(subset=['state']).copy())

In [56]:
df_state_slim.head()

Unnamed: 0,plant id,nerc,state,year,lon
0,936,SERC,IL,2017,-89.3583
1,936,SERC,IL,2016,-89.3583
2,936,SERC,IL,2015,-89.3583
3,936,SERC,IL,2014,-89.3583
4,936,SERC,IL,2013,-89.3583


In [57]:
len(df_state_slim)

110026

### Encode state names as numbers for use in sklearn

In [58]:
le = LabelEncoder()
df_state_slim.loc[:, 'enc state'] = le.fit_transform(df_state_slim.loc[:, 'state'].tolist())

In [59]:
len(df_state_slim)

110026

In [60]:
unknown_state = df_state_slim.loc[(df_state_slim.nerc.isnull()) &
                                  (df_state_slim.lon.isnull())].copy()

In [61]:
len(unknown_state), len(unknown)

(116, 1604)

In [62]:
# Rows of df_state_slim with no missing value in any column.
state_complete = df_state_slim.notnull().all(axis=1)

# Features are the encoded state and year; the target is the NERC label.
X_state = df_state_slim.loc[state_complete, ['enc state', 'year']].copy()
y_state = df_state_slim.loc[state_complete, 'nerc'].copy()

In [63]:
X_state_train, X_state_test, y_state_train, y_state_test = train_test_split(
    X_state, y_state, test_size=0.33, random_state=42)

In [64]:
# Hyperparameter grid searched for the state/year model.
params = {
    'n_estimators': [5, 10, 25, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 3, 5],
}
rf = RandomForestClassifier()

clf_rf_state = GridSearchCV(
    estimator=rf,
    param_grid=params,
    n_jobs=-1,
    iid=False,
    verbose=1,
)
# Left as the last expression so the notebook shows the fitted search object.
clf_rf_state.fit(X_state_train, y_state_train)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   16.0s
[Parallel(n_jobs=-1)]: Done 108 out of 108 | elapsed:   39.9s finished


GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=False, n_jobs=-1,
       param_grid={'n_estimators': [5, 10, 25, 50], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 3, 5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [65]:
clf_rf_state.best_estimator_, clf_rf_state.best_score_

(RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
             max_depth=None, max_features='auto', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=10,
             min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
             oob_score=False, random_state=None, verbose=0,
             warm_start=False), 0.9408513917858344)

In [66]:
clf_rf_state.score(X_state_test, y_state_test)

0.939199776161164

Accuracy score by region

In [244]:
# Distinct, non-null NERC labels in the label table.
nerc_labels = nercs.nerc.dropna().unique()

# Score the state model separately on the test rows of each NERC region.
for region in nerc_labels:
    in_region = y_state_test == region
    region_accuracy = metrics.accuracy_score(
        y_state_test[in_region],
        clf_rf_state.predict(X_state_test[in_region]),
    )
    print(f'{region} : {region_accuracy}')

RFC : 0.9243697478991597
NPCC : 0.9938519744620478
WECC : 0.9943912900032993
SERC : 0.8831431726168568
SPP : 0.5765453495089543
TRE : 1.0
MRO : 0.9614311088556204
FRCC : 0.9973992197659298
ASCC : 1.0
HICC : 1.0


F1 score by region

In [245]:
# Predict once on the full state test set, then report F1 one region at a
# time by restricting `labels` to that region.
y_state_hat = clf_rf_state.predict(X_state_test)

for region in nerc_labels:
    region_f1 = metrics.f1_score(
        y_state_test, y_state_hat, labels=[region], average='macro'
    )
    print(f'{region} : {region_f1}')

RFC : 0.9088405397961994
NPCC : 0.9958535718516763
WECC : 0.9933534743202416
SERC : 0.8819259395387301
SPP : 0.7300658376005852
TRE : 0.8763183125599233
MRO : 0.9512929952297263
FRCC : 0.962962962962963
ASCC : 0.9991474850809888
HICC : 0.9975669099756691


## Use best RandomForest parameters to predict NERC for unknown plants

In [67]:
unknown.loc[:, 'nerc'] = clf_rf.predict(unknown.loc[:, ['lat', 'lon', 'year']])
unknown_state.loc[:, 'nerc'] = clf_rf_state.predict(unknown_state.loc[:, ['enc state', 'year']])

Ensuring that no plants in Alaska or Hawaii are assigned to continental NERCs, or the other way around.

In [68]:
print(unknown.loc[unknown.state.isin(['AK', 'HI']), 'nerc'].unique())
print(unknown.loc[unknown.nerc.isin(['HICC', 'ASCC']), 'state'].unique())

['ASCC' 'HICC' 'WECC']
['AK' 'HI']


In [69]:
Counter(unknown['nerc'])

Counter({'ASCC': 63,
         'HICC': 51,
         'SERC': 324,
         'WECC': 378,
         'FRCC': 7,
         'TRE': 153,
         'NPCC': 252,
         'SPP': 98,
         'RFC': 219,
         'MRO': 59})

In [249]:
unknown.head()

Unnamed: 0,plant id,lat,lon,nerc,state,year
28205,3823,38.27,-78.035,SERC,VA,2009
28206,3823,38.27,-78.035,SERC,VA,2005
28207,3823,38.27,-78.035,SERC,VA,2004
28208,3823,38.27,-78.035,SERC,VA,2003
28209,3823,38.27,-78.035,SERC,VA,2002


In [250]:
unknown_state.head()

Unnamed: 0,plant id,nerc,state,year,lon,enc state
84867,10851,RFC,NJ,2006,,31
84868,10851,RFC,NJ,2005,,31
84869,10851,RFC,NJ,2004,,31
84870,10851,RFC,NJ,2002,,31
84871,10851,RFC,NJ,2001,,31


## Export plants with lat/lon, state, and nerc

In [251]:
nercs.tail()

Unnamed: 0,plant id,nerc,year
24,61309,TRE,2017
25,61362,TRE,2017
26,61409,TRE,2017
27,61410,TRE,2017
28,61411,TRE,2017


In [252]:
unknown.head()

Unnamed: 0,plant id,lat,lon,nerc,state,year
28205,3823,38.27,-78.035,SERC,VA,2009
28206,3823,38.27,-78.035,SERC,VA,2005
28207,3823,38.27,-78.035,SERC,VA,2004
28208,3823,38.27,-78.035,SERC,VA,2003
28209,3823,38.27,-78.035,SERC,VA,2002


In [253]:
unknown_state.tail()

Unnamed: 0,plant id,nerc,state,year,lon,enc state
92615,10257,WECC,CA,2004,,4
92616,10257,WECC,CA,2003,,4
92617,10257,WECC,CA,2002,,4
92618,10257,WECC,CA,2001,,4
92782,56508,WECC,CA,2007,,4


In [267]:
len(unknown_state['plant id'].unique())

31

In [254]:
df_slim.head()

Unnamed: 0,plant id,lat,lon,nerc,state,year
0,1001,39.9242,-87.4244,RFC,IN,2016
1,1001,39.9242,-87.4244,RFC,IN,2015
2,1001,39.9242,-87.4244,RFC,IN,2014
3,1001,39.9242,-87.4244,RFC,IN,2013
4,1001,39.9242,-87.4244,RFC,IN,2012


In [70]:
# Stack three groups: rows of df_slim with no missing values, rows labeled by
# the lat/lon model, and rows labeled by the state model (which carry no
# lat/lon columns).
fully_known = df_slim.loc[df_slim.notnull().all(axis=1)]
state_labeled = unknown_state.loc[:, ['plant id', 'nerc', 'state', 'year']]

labeled = pd.concat([fully_known, unknown, state_labeled])

In [71]:
labeled.tail()

Unnamed: 0,lat,lon,nerc,plant id,state,year
101370,,,WECC,10257,CA,2005
101371,,,WECC,10257,CA,2004
101372,,,WECC,10257,CA,2003
101373,,,WECC,10257,CA,2002
101374,,,WECC,10257,CA,2001


In [72]:
labeled.loc[labeled.nerc.isnull()]

Unnamed: 0,lat,lon,nerc,plant id,state,year


There are facilities that don't show up in my labeled data (the output below lists 105 plant ids).

In [258]:
facility_df.loc[~facility_df['plant id'].isin(labeled['plant id']), 'plant id'].unique()

array([57116, 57794, 58690, 58236, 58098, 57913, 57400, 57628, 61084,
       60539, 60540, 60991, 61079, 60383, 61172, 61020, 61221, 61021,
       61022, 60682, 60688, 61407, 60689, 61330, 61357, 60414, 60366,
       60983, 60989, 60372, 60658, 60581, 60583, 60901, 60902, 60905,
       60883, 60885, 60856, 60552, 60467, 60145, 60152, 59308, 59220,
       59309, 59315, 60237, 60306, 60303, 60340, 59665, 59666, 59684,
       59827, 59061, 60033, 59066, 60210, 60261, 61039, 61040, 61048,
       60258, 61050, 60346, 59689, 59690, 59691, 59764, 61261, 61268,
       61303, 59812, 59875, 60043, 59888, 61561, 60122, 59245, 59193,
       59004, 60655, 60987, 61512, 59206, 60436, 60217, 59712, 59940,
       60785, 61222, 61422, 55314, 55952,  7704,  6339, 55975, 55073,
       50728, 60569, 60570, 61197, 60690, 60506])

In [259]:
len(labeled), len(nercs)

(100822, 132244)

In [260]:
nerc_labels

array(['RFC', 'NPCC', 'WECC', 'SERC', 'SPP', 'TRE', 'MRO', 'FRCC', 'ASCC',
       'HICC'], dtype=object)

In [261]:
mro_2016 = set(labeled.loc[(labeled.nerc == 'MRO') &
                                  (labeled.year == 2016), 'plant id'])
mro_2017 = set(labeled.loc[(labeled.nerc == 'MRO') &
                                  (labeled.year == 2017), 'plant id'])

In [262]:
(set(nercs.loc[(nercs.nerc=='MRO') &
              (nercs.year==2017),'plant id'])
 - mro_2017)

{1052,
 1058,
 1175,
 1189,
 1218,
 1771,
 1889,
 1914,
 1918,
 1932,
 1960,
 1995,
 2008,
 2217,
 2791,
 2821,
 2822,
 3334,
 3343,
 3347,
 3348,
 3996,
 4048,
 4054,
 4062,
 4121,
 4140,
 7376,
 7377,
 7602,
 7706,
 7882,
 7956,
 8014,
 8025,
 8057,
 10476,
 54713,
 54930,
 55315,
 55638,
 55764,
 55834,
 56072,
 56183,
 56366,
 57048,
 57116,
 57255,
 57659,
 58236,
 58434,
 58903,
 59053,
 59197,
 59223,
 59224,
 59225,
 59226,
 59227,
 59228,
 59230,
 59231,
 59307,
 59684,
 59875,
 59902,
 59903,
 60066,
 60203,
 60254,
 60503,
 60504,
 60505,
 60519,
 60520,
 60521,
 60522,
 60523,
 60524,
 60525,
 60526,
 60527,
 60528,
 60529,
 60530,
 60531,
 60532,
 60533,
 60534,
 60564,
 60595,
 60631,
 60632,
 60647,
 60674,
 60694,
 60695,
 60711,
 60712,
 60713,
 60714,
 60715,
 60716,
 60717,
 60795,
 60823,
 60830,
 60832,
 60833,
 60834,
 60835,
 60836,
 60837,
 60838,
 60873,
 60887,
 60888,
 60889,
 60890,
 60891,
 60892,
 60893,
 60894,
 60895,
 60905,
 60934,
 60935,
 60936,
 609

In [263]:
# For each region, count plants labeled with it in 2016 but not in 2017.
for nerc in nerc_labels:
    in_region = labeled.nerc == nerc
    ids_2016 = set(labeled.loc[in_region & (labeled.year == 2016), 'plant id'])
    ids_2017 = set(labeled.loc[in_region & (labeled.year == 2017), 'plant id'])
    n_dropped = len(ids_2016 - ids_2017)

    print(f'{n_dropped} plants dropped in {nerc}')

0 plants dropped in RFC
0 plants dropped in NPCC
0 plants dropped in WECC
0 plants dropped in SERC
0 plants dropped in SPP
0 plants dropped in TRE
0 plants dropped in MRO
0 plants dropped in FRCC
0 plants dropped in ASCC
0 plants dropped in HICC


In [264]:
(set(labeled.loc[(labeled.nerc == 'MRO') &
              (labeled.year == 2016), 'plant id'])
 - set(labeled.loc[(labeled.nerc == 'MRO') &
              (labeled.year == 2017), 'plant id']))

set()

In [73]:
path = join(data_path, 'Facility labels', 'Facility locations_RF.csv')
labeled.to_csv(path, index=False)