## **Outage Repredition EDA**

Factors to look at while doing repredictions: 
1. Current datetime - Time since the outage? 
    - Is the current time more than the Estimated restoration time
    - What is the treshold for outages greater than 24 hours / 1440 minutes staying into the system
    - When are outages moved out of the live tables 
2. No of outages in last N hours
3. No of customer out 
4. Live outages in system 


In [1]:
import csv
import math
import time
import warnings
import operator
import statistics
import requests
import json
import seaborn as sns
import pandas as pd
import numpy as np
import geopy.distance
import matplotlib.pyplot as plt

from dateutil.parser import parse
from datetime import datetime
from datetime import date, timedelta
from scipy import stats
from IPython.display import display_html
from multiprocessing import Pool
from sklearn.model_selection import train_test_split
from google.cloud import storage

plt.style.use('fivethirtyeight')
warnings.filterwarnings('ignore')
%matplotlib inline

pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.options.display.float_format = '{:.2f}'.format

## **Read necessary files from GCS Bucket**

In [2]:
today = date.today()
yesterday = today - timedelta(days = 1)
yesterday = yesterday.strftime('%Y-%m-%d')
today = today.strftime('%Y-%m-%d')
print(today)
print(yesterday)
client = storage.Client()
BUCKET_NAME = 'aes-datahub-0002-raw'
bucket = client.get_bucket(BUCKET_NAME)

blobs = bucket.list_blobs(prefix='OMS/'+today)
dirlist = []

for blob in blobs:
    dirlist.append(str(blob.name))

2020-11-23
2020-11-22


In [3]:
matching_facility = [s for s in dirlist if "FACILITY_IPL_Daily" in s]
matching_live_facility = [s for s in matching_facility if "HIS" in s]
print(matching_live_facility)
print('\n')

['OMS/2020-11-23/HIS_FACILITY_IPL_Daily_202011230600.csv']




In [4]:
matching_location = [s for s in dirlist if "LOCATION_IPL_Daily" in s]
matching_live_location = [s for s in matching_location if "HIS" in s]
print(matching_live_location)
print('\n')

['OMS/2020-11-23/HIS_LOCATION_IPL_Daily_202011230600.csv']




In [5]:
bucket_name = 'gs://aes-datahub-0001-raw/'

live_df_facility_job_his = pd.read_csv(bucket_name + matching_live_facility[-1],encoding = "ISO-8859-1",sep=",")
df_facility_job_his = live_df_facility_job_his.copy(deep=True)
print(df_facility_job_his.shape)

(21350, 70)


In [6]:
bucket_name = 'gs://aes-datahub-0001-raw/'

live_df_location_his = pd.read_csv(bucket_name + matching_live_location[-1],encoding = "ISO-8859-1",sep=",")
df_his_location = live_df_location_his.copy(deep=True)
print(df_his_location.shape)

(20376, 71)


In [7]:
print(list(df_facility_job_his.columns))

['FAC_JOB_ID', 'CIRCT_ID', 'MAJ_OTG_ID', 'EQUIP_STN_NO', 'DIST_NO', 'HOST_SEQ_ID', 'PRIORITY_VAL', 'CUST_QTY', 'CLUE_CD', 'CLUE_DESC', 'CREATION_DATETIME', 'CALL_QTY', 'KEY_CUST_QTY', 'SPLIT_FAC_JOB_FLG', 'CAUSE_CD', 'CAUSE_DESC', 'OCCURN_CD', 'OCCURN_DESC', 'CLIMATIC_CD', 'CLIMATIC_DESC', 'CITY_NAM', 'LOC_DESC', 'WRK_ORD_NUM', 'COMMENT_TEXT', 'CALL_ID', 'KVA_VAL', 'BOOK_NO', 'ADDRESS', 'CIRCT_NAM', 'CLUE_CD2', 'INSERTED_DATE', 'DOWNSTREAM_KVA_VAL', 'DOWNSTREAM_CUST_QTY', 'COMPL_DATETIME', 'TOT_LOSS_POWER_FLG', 'ISOLATED_TO_CUST_FLG', 'PLANNED_OUTAGE_FLG', 'ROUTINE_FLG', 'DNI_EQUIP_TYPE', 'SUBST_ID', 'WORK_ORD_1_NO', 'WORK_ORD_2_NO', 'WORK_ORD_3_NO', 'WORK_ORD_4_NO', 'WORK_ORD_5_NO', 'ENERGIZED_DATETIME', 'DISPLAY_TEXT', 'POLICE_OPERATOR_ID', 'POLICE_INC_NO', 'FIRE_OPERATOR_ID', 'FIRE_INC_NO', 'CAD_ID', 'STRCTUR_NO', 'FAC_JOB_PARENT_ID', 'MAJ_INCIDENT_FLG', 'MAJ_INCIDENT_CAUSE', 'ZONE_DESC', 'DIST_DESC', 'ZONE_ID', 'GEO_DIST_NO', 'ETR_DATETIME', 'SUBST_SHUTDOWN_FLG', 'HIS_FAC_JOB_COMME

In [8]:
# changing the date format

df_facility_job_his["CREATION_DATETIME"] = pd.to_datetime(df_facility_job_his["CREATION_DATETIME"], errors ='coerce')
df_facility_job_his["ENERGIZED_DATETIME"] = pd.to_datetime(df_facility_job_his["ENERGIZED_DATETIME"], errors ='coerce')
df_facility_job_his["ETR_DATETIME"] = pd.to_datetime(df_facility_job_his["ETR_DATETIME"], errors ='coerce')

# creating blue sky flags

df_facility_job_his['BLUE_SKY_FLG'] = ( (df_facility_job_his.TOT_LOSS_POWER_FLG == 'T') & 
                                           ((df_facility_job_his.MAJ_OTG_ID == 0) | (df_facility_job_his.MAJ_OTG_ID.isnull())) &
                                           ((df_facility_job_his.ISOLATED_TO_CUST_FLG == 'F') | (df_facility_job_his.ISOLATED_TO_CUST_FLG.isnull())) &
                                           ((df_facility_job_his.ROUTINE_FLG == 'F') | (df_facility_job_his.ROUTINE_FLG.isnull())) &
                                           ((df_facility_job_his.ENERGIZED_DATETIME - 
                                             df_facility_job_his.CREATION_DATETIME).dt.total_seconds().div(60).round(2) > 5) 
                                          & (df_facility_job_his.CREATION_DATETIME.dt.year > 2019) & (df_facility_job_his.CREATION_DATETIME.dt.year <= 2020))
print("Total blue sky Events: ", len(df_facility_job_his[df_facility_job_his.BLUE_SKY_FLG == True]))

# creating storm event flags

df_facility_job_his['STORM_EVENT_FLG'] = ( (df_facility_job_his.TOT_LOSS_POWER_FLG == 'T') & 
                                           ((df_facility_job_his.MAJ_OTG_ID != 0) & (df_facility_job_his.MAJ_OTG_ID.notnull())) &
                                           ((df_facility_job_his.ISOLATED_TO_CUST_FLG == 'F') | (df_facility_job_his.ISOLATED_TO_CUST_FLG.isnull())) &
                                           ((df_facility_job_his.ROUTINE_FLG == 'F') | (df_facility_job_his.ROUTINE_FLG.isnull())) &
                                           ((df_facility_job_his.ENERGIZED_DATETIME -
                                             df_facility_job_his.CREATION_DATETIME).dt.total_seconds().div(60).round(2) > 5) 
                                          & (df_facility_job_his.CREATION_DATETIME.dt.year > 2019) & (df_facility_job_his.CREATION_DATETIME.dt.year <= 2020))
print("Total Storm Events: ", len(df_facility_job_his[df_facility_job_his.STORM_EVENT_FLG == True]))


######################################################################################################################################################################################################
######################################################################### APPLYING FILTERS FOR CORRECT DATA INPUTS####################################################################################
######################################################################################################################################################################################################
print("\n")
print("After Filtering Creation_datetime from 2020 & greater")
df_facility_job_his = df_facility_job_his[(df_facility_job_his.CREATION_DATETIME.dt.year > 2019) & (df_facility_job_his.CREATION_DATETIME.dt.year <= 2020)]
_incident_ = len(df_facility_job_his[['INCIDENT_ID','STRCTUR_NO']].drop_duplicates())
print("Rows", len(df_facility_job_his))
print("blue sky events", len(df_facility_job_his[df_facility_job_his.BLUE_SKY_FLG == True]))
print("Storm events", len(df_facility_job_his[df_facility_job_his.STORM_EVENT_FLG == True]))
print("Number of incident id", df_facility_job_his.INCIDENT_ID.nunique())
print(df_facility_job_his.shape)
print("\n")

# customer quantity greater than 0
print('Filter for customer quantity greater than 0')
# print("****QC Check****")
print("Rows left after checking for INCIDENTS whose CUSTOMER QUANTITY IS > 0")
df_facility_job_his = df_facility_job_his[(df_facility_job_his.CUST_QTY > 0)]
print(df_facility_job_his.shape)
print("\n")

# equip_stn_no is not NCC and not null
print('Filter for equp_stn_no is not NCC or not null')
# print("****QC Check****")
print("Rows left after checking that EQUIP_STN_NO is not from <<NON CONNECTED CUSTOMERS>>")
df_facility_job_his = df_facility_job_his[(df_facility_job_his.EQUIP_STN_NO != '<NCC>') & (df_facility_job_his.EQUIP_STN_NO.notnull())]
print(df_facility_job_his.shape)
print("\n")


# removing NAN from DNI_EQUIP_TYPE, CIRCT_ID, STRCTUR_NO
print('Removing NAN from DNI_EQIP_TYPE, CICRT_ID, STRCTUR_NO')
# print("****QC Check****")
print("Rows left after checking CIRCT_ID is not 0 and not null, STRCTUR_NO is not null and DNI_EQIP_TYPE is not null")
df_facility_job_his = df_facility_job_his[(df_facility_job_his.CIRCT_ID != 0)]
df_facility_job_his = df_facility_job_his[~df_facility_job_his.CIRCT_ID.isnull()]
df_facility_job_his = df_facility_job_his[~df_facility_job_his.STRCTUR_NO.isnull()]
df_facility_job_his = df_facility_job_his[~df_facility_job_his.DNI_EQUIP_TYPE.isnull()]
print(df_facility_job_his.shape)
print("\n")

# removing CLUE_CD which start with 0 but does not start with 00
print('Removing CLUE_CD which start with 0 but do not start with 00')
# print("****QC Check****")
print("Rows left after filtering for CLUE CODES which start with 0 but do not start with 00")
df_facility_job_his = df_facility_job_his[(df_facility_job_his.CLUE_CD.str[:1] == '0') & (df_facility_job_his.CLUE_CD.str[:2] != '00')]
df_facility_job_his = df_facility_job_his[df_facility_job_his.CLUE_CD != '01']
print(df_facility_job_his.shape)
print("\n")

# removing occurence codes starting with cancel, found ok and duplicate
print('Removing CLUE_CD which start with 0 but do not start with 00')
# print("****QC Check****")

print("Rows left after removing OCCURN_CD which have descriptions starting with CANCEL, FOUND OK or DUPLICATE")
occur_remov = [30003001, 33003301, 33003302, 34003400, 34003401, 34003402, 34003403, 34003404, 34003405, 34003406, 34003407, 34003408, 34003409, 35003500,
                35003501, 35003502, 35003503, 35003504, 35003505, 35003506, 35003507, 35003508, 36003600, 36003601, 36003602, 36003603, 36003604, 36003605,
                36003606, 36003607, 36003608, 37003703, 38003802, 38003803, 38003804, 38003807, 39003910, 41004100, 41004101, 41004102, 48004800, 48004802,
                48004803, 49004900, 49004901, 49004902, 50005000, 50005001, 50005002, 52005200, 52005201, 52005202, 52005203, 52005204, 52005205, 52005206,
                52005207, 53005300, 53005301, 53005302, 53005303, 53005304, 53005305, 53005306, 53005307, 53005308, 53005309, 53005310, 54005400, 54005401,
                54005402, 54005403, 54005404, 54005405, 34003410, 30003000, 36503650, 36503651, 36503652, 36503653, 36503654, 36503655, 36503656, 36503657,
                36503658]
df_facility_job_his = df_facility_job_his[~(df_facility_job_his.OCCURN_CD.isin(occur_remov))]
print(df_facility_job_his.shape)
print("\n")

Total blue sky Events:  2782
Total Storm Events:  166


After Filtering Creation_datetime from 2020 & greater
Rows 21350
blue sky events 2782
Storm events 166
Number of incident id 18881
(21350, 72)


Filter for customer quantity greater than 0
Rows left after checking for INCIDENTS whose CUSTOMER QUANTITY IS > 0
(12707, 72)


Filter for equp_stn_no is not NCC or not null
Rows left after checking that EQUIP_STN_NO is not from <<NON CONNECTED CUSTOMERS>>
(12692, 72)


Removing NAN from DNI_EQIP_TYPE, CICRT_ID, STRCTUR_NO
Rows left after checking CIRCT_ID is not 0 and not null, STRCTUR_NO is not null and DNI_EQIP_TYPE is not null
(12692, 72)


Removing CLUE_CD which start with 0 but do not start with 00
Rows left after filtering for CLUE CODES which start with 0 but do not start with 00
(8174, 72)


Removing CLUE_CD which start with 0 but do not start with 00
Rows left after removing OCCURN_CD which have descriptions starting with CANCEL, FOUND OK or DUPLICATE
(8174, 72)




In [9]:
df_fac_final = df_facility_job_his.copy(deep=True)
print("Rows", len(df_fac_final))
_incident_ = len(df_fac_final[['INCIDENT_ID','STRCTUR_NO']].drop_duplicates())
print("Number of incident id", df_fac_final.INCIDENT_ID.nunique())
print("Unique structure no",_incident_)
print(df_fac_final.shape)

Rows 8174
Number of incident id 6675
Unique structure no 7420
(8174, 72)


In [10]:
df_event_flg = df_fac_final[['INCIDENT_ID','STRCTUR_NO','CIRCT_ID' ,'DNI_EQUIP_TYPE','STORM_EVENT_FLG']]
def event_flag(group):
    group = group.reset_index(drop = True)
    if(group.STORM_EVENT_FLG.sum() >=1):
        group['EVENT'] = 'STORM'
        return group
    else:
        group['EVENT'] = 'BLUE SKY'
        return group
df_event_flg = df_event_flg.groupby(['INCIDENT_ID','STRCTUR_NO', 'CIRCT_ID', 'DNI_EQUIP_TYPE'], as_index = False).apply(event_flag).reset_index(drop = True)

In [11]:
df_check = df_fac_final.groupby(['INCIDENT_ID','STRCTUR_NO', 'CIRCT_ID', 'DNI_EQUIP_TYPE']).nunique()
df_check.sum()

FAC_JOB_ID              8174
CIRCT_ID                7423
MAJ_OTG_ID              7425
EQUIP_STN_NO            8150
DIST_NO                 7423
HOST_SEQ_ID                0
PRIORITY_VAL            7490
CUST_QTY                8068
CLUE_CD                 7670
CLUE_DESC               7670
CREATION_DATETIME       7457
CALL_QTY                7975
KEY_CUST_QTY            7726
SPLIT_FAC_JOB_FLG       4663
CAUSE_CD                4144
CAUSE_DESC              4091
OCCURN_CD               7423
OCCURN_DESC             7205
CLIMATIC_CD             7411
CLIMATIC_DESC           7411
CITY_NAM                7225
LOC_DESC                7413
WRK_ORD_NUM                0
COMMENT_TEXT            3599
CALL_ID                 8174
KVA_VAL                 7936
BOOK_NO                    0
ADDRESS                 7912
CIRCT_NAM               7423
CLUE_CD2                  52
INSERTED_DATE           7448
DOWNSTREAM_KVA_VAL      8010
DOWNSTREAM_CUST_QTY     8068
COMPL_DATETIME          7465
TOT_LOSS_POWER

In [12]:
df_numerical = df_fac_final.groupby(['INCIDENT_ID','STRCTUR_NO', 'CIRCT_ID', 'DNI_EQUIP_TYPE' ], as_index = False).agg({'CUST_QTY':'sum','CALL_QTY':'sum','KEY_CUST_QTY':'sum',
                                                                                                                        'DOWNSTREAM_CUST_QTY':'sum','KVA_VAL':'mean',
                                                                                                                        'DOWNSTREAM_KVA_VAL':'mean', 'FAC_JOB_ID': 'max',
                                                                                                                        'ETR_DATETIME': 'max', 'CREATION_DATETIME': 'min',
                                                                                                                        'MAJ_OTG_ID' : 'max','ENERGIZED_DATETIME': 'max',
                                                                                                                        'SUBST_ID': 'min', 'COMMENT_TEXT' : 'last'})

In [13]:
print(df_numerical.shape)
display(df_numerical.head())

(7423, 17)


Unnamed: 0,INCIDENT_ID,STRCTUR_NO,CIRCT_ID,DNI_EQUIP_TYPE,CUST_QTY,CALL_QTY,KEY_CUST_QTY,DOWNSTREAM_CUST_QTY,KVA_VAL,DOWNSTREAM_KVA_VAL,FAC_JOB_ID,ETR_DATETIME,CREATION_DATETIME,MAJ_OTG_ID,ENERGIZED_DATETIME,SUBST_ID,COMMENT_TEXT
0,2001535383,433-B/6,3209,1TBOH,1,1,0,1,0.0,0.0,2002739794,2020-08-24 21:15:00,2020-08-24 17:31:47,0,2020-08-24 19:05:39,320,"PER CUSTOMER, METER IS MISSING. RB"
1,2001535387,564-A/706,1105,3TPUG,1,1,0,1,500.0,500.0,2002739798,2020-08-24 23:00:00,2020-08-24 17:49:35,0,2020-08-24 19:05:08,110,STATES RESET CIRCUIT BREAKER BOX AND STILL ONL...
2,2001535403,421BA/157,2905,1TBOH,13,11,2,13,0.0,50.0,2002739836,2020-08-25 05:15:00,2020-08-24 19:38:22,0,2020-08-25 03:55:00,290,STATES IT WAS FLICKERING SO HE RESET CIRCUIT B...
3,2001535408,411-B/275,5455,1TBOH,1,1,0,1,0.0,0.0,2002739823,2020-08-24 23:15:00,2020-08-24 19:41:07,0,2020-08-24 20:37:10,28,(317) 333-1008 SPANISH SPEAKING CUSTOMER...CEL...
4,2001535414,205-A/390,2852,PADELB,2,3,0,2,25.0,25.0,2002739899,2020-08-24 22:00:00,2020-08-24 20:08:39,0,2020-08-24 21:57:17,285,


In [14]:
df_numerical = pd.merge(df_numerical, df_event_flg, on = ['INCIDENT_ID','STRCTUR_NO', 'CIRCT_ID', 'DNI_EQUIP_TYPE' ], how='left')

In [15]:
df_numerical.drop_duplicates(keep='first', inplace=True)
print(df_numerical.shape)
display(df_numerical.head())

(7425, 19)


Unnamed: 0,INCIDENT_ID,STRCTUR_NO,CIRCT_ID,DNI_EQUIP_TYPE,CUST_QTY,CALL_QTY,KEY_CUST_QTY,DOWNSTREAM_CUST_QTY,KVA_VAL,DOWNSTREAM_KVA_VAL,FAC_JOB_ID,ETR_DATETIME,CREATION_DATETIME,MAJ_OTG_ID,ENERGIZED_DATETIME,SUBST_ID,COMMENT_TEXT,STORM_EVENT_FLG,EVENT
0,2001535383,433-B/6,3209,1TBOH,1,1,0,1,0.0,0.0,2002739794,2020-08-24 21:15:00,2020-08-24 17:31:47,0,2020-08-24 19:05:39,320,"PER CUSTOMER, METER IS MISSING. RB",False,BLUE SKY
1,2001535387,564-A/706,1105,3TPUG,1,1,0,1,500.0,500.0,2002739798,2020-08-24 23:00:00,2020-08-24 17:49:35,0,2020-08-24 19:05:08,110,STATES RESET CIRCUIT BREAKER BOX AND STILL ONL...,False,BLUE SKY
2,2001535403,421BA/157,2905,1TBOH,13,11,2,13,0.0,50.0,2002739836,2020-08-25 05:15:00,2020-08-24 19:38:22,0,2020-08-25 03:55:00,290,STATES IT WAS FLICKERING SO HE RESET CIRCUIT B...,False,BLUE SKY
3,2001535408,411-B/275,5455,1TBOH,1,1,0,1,0.0,0.0,2002739823,2020-08-24 23:15:00,2020-08-24 19:41:07,0,2020-08-24 20:37:10,28,(317) 333-1008 SPANISH SPEAKING CUSTOMER...CEL...,False,BLUE SKY
4,2001535414,205-A/390,2852,PADELB,2,3,0,2,25.0,25.0,2002739899,2020-08-24 22:00:00,2020-08-24 20:08:39,0,2020-08-24 21:57:17,285,,False,BLUE SKY


In [16]:
# creating day night flag for outages

df_numerical['DAY_FLAG'] = df_numerical.CREATION_DATETIME.dt.hour.apply(lambda x: 1 if ((x >= 6) & (x<18)) else 0)
df_numerical['TTR'] = (df_numerical.ENERGIZED_DATETIME - df_numerical.CREATION_DATETIME).dt.total_seconds().div(60).round(4)

## **Adding Live Outages**

In [17]:
df_numerical.rename({'FAC_JOB_ID' : 'OUTAGE_ID'}, axis=1, inplace=True)

In [18]:
def count_outage(group):
    group = group.reset_index(drop = True)
    group['LIVE_OUTAGE'] = len(df_numerical[(df_numerical.CREATION_DATETIME < group.CREATION_DATETIME[0]) & (df_numerical.ENERGIZED_DATETIME > group.CREATION_DATETIME[0])])
    return group

def grouping_fn(df):
    liveoutage = df.groupby(['OUTAGE_ID'], as_index=False).apply(count_outage)
    return liveoutage

if __name__ == '__main__':
    starttime = time.time()
    with Pool(30) as p:
            live_outage = p.map(grouping_fn, [df_numerical[:5000], df_numerical[5000:10000], df_numerical[10000:15000],
                                  df_numerical[15000:20000], df_numerical[20000:25000], df_numerical[25000:30000],
                                  df_numerical[30000:35000], df_numerical[35000:40000], df_numerical[40000:50000],
                                  df_numerical[50000:55000], df_numerical[55000:60000], df_numerical[60000:65000],
                                  df_numerical[65000:70000], df_numerical[70000:75000], df_numerical[75000:80000],
                                  df_numerical[80000:90000], df_numerical[90000:100000], df_numerical[100000:105000],
                                  df_numerical[105000:]])
    print('That took {} seconds'.format(time.time() - starttime))

That took 24.220856428146362 seconds


In [19]:
ads_final=pd.concat(live_outage)
ads_final.reset_index(drop=True, inplace=True)
ads_final.head()

Unnamed: 0,INCIDENT_ID,STRCTUR_NO,CIRCT_ID,DNI_EQUIP_TYPE,CUST_QTY,CALL_QTY,KEY_CUST_QTY,DOWNSTREAM_CUST_QTY,KVA_VAL,DOWNSTREAM_KVA_VAL,OUTAGE_ID,ETR_DATETIME,CREATION_DATETIME,MAJ_OTG_ID,ENERGIZED_DATETIME,SUBST_ID,COMMENT_TEXT,STORM_EVENT_FLG,EVENT,DAY_FLAG,TTR,LIVE_OUTAGE
0,2001535383,433-B/6,3209,1TBOH,1,1,0,1,0.0,0.0,2002739794,2020-08-24 21:15:00,2020-08-24 17:31:47,0,2020-08-24 19:05:39,320,"PER CUSTOMER, METER IS MISSING. RB",False,BLUE SKY,1,93.87,0
1,2001535387,564-A/706,1105,3TPUG,1,1,0,1,500.0,500.0,2002739798,2020-08-24 23:00:00,2020-08-24 17:49:35,0,2020-08-24 19:05:08,110,STATES RESET CIRCUIT BREAKER BOX AND STILL ONL...,False,BLUE SKY,1,75.55,1
2,2001535408,411-B/275,5455,1TBOH,1,1,0,1,0.0,0.0,2002739823,2020-08-24 23:15:00,2020-08-24 19:41:07,0,2020-08-24 20:37:10,28,(317) 333-1008 SPANISH SPEAKING CUSTOMER...CEL...,False,BLUE SKY,0,56.05,1
3,2001535414,205-A/51,2852,FUSE,7,13,0,7,100.0,100.0,2002739830,2020-08-24 22:00:00,2020-08-24 20:08:39,0,2020-08-25 00:50:00,285,,False,BLUE SKY,0,281.35,2
4,2001535417,697-B/109,2210,1TBOH,1,1,0,1,0.0,0.0,2002739833,2020-08-25 00:45:00,2020-08-24 21:08:40,0,2020-08-24 22:19:47,220,STATES THEIR SERVICE LINE IS SEVERED AND THEY ...,False,BLUE SKY,0,71.12,3


## **Outage Feature -Adding OUTAGES in last N hours**

In [20]:
def count_outage_minutes(group):
    group = group.reset_index(drop = True)
    df_temp = ads_final[['OUTAGE_ID','CREATION_DATETIME']]
    df_temp['minutes'] = (group['CREATION_DATETIME'][0] - ads_final['CREATION_DATETIME']).dt.total_seconds().div(60)
    df_temp = df_temp[df_temp.minutes > 0]
    group['Outages_in_last_1hr'] = len(df_temp[df_temp.minutes <= 60])
    group['Outages_in_last_2hr'] = len(df_temp[df_temp.minutes <= 120])
    group['Outages_in_last_3hr'] = len(df_temp[df_temp.minutes <= 180])
    group['Outages_in_last_4hr'] = len(df_temp[df_temp.minutes <= 240])
    group['Outages_in_last_5hr'] = len(df_temp[df_temp.minutes <= 300])
    group['Outages_in_last_6hr'] = len(df_temp[df_temp.minutes <= 360])
    group['Outages_in_last_7hr'] = len(df_temp[df_temp.minutes <= 420])
    group['Outages_in_last_8hr'] = len(df_temp[df_temp.minutes <= 480])
    group['Outages_in_last_9hr'] = len(df_temp[df_temp.minutes <= 540])
    group['Outages_in_last_10hr'] = len(df_temp[df_temp.minutes <= 600])
    return group

def grouping_fn_minutes(df):
    liveoutage = df.groupby(['OUTAGE_ID'], as_index=False).apply(count_outage_minutes)
    return liveoutage

if __name__ == '__main__':
    starttime = time.time()
    with Pool(30) as p:
            live_outage_minutes = p.map(grouping_fn_minutes, [ads_final[:5000], ads_final[5000:10000], ads_final[10000:15000],
                                  ads_final[15000:20000], ads_final[20000:25000], ads_final[25000:30000],
                                  ads_final[30000:35000], ads_final[35000:40000], ads_final[40000:50000],
                                  ads_final[50000:55000], ads_final[55000:60000], ads_final[60000:65000],
                                  ads_final[65000:70000], ads_final[70000:75000], ads_final[75000:80000],
                                  ads_final[80000:90000], ads_final[90000:100000], ads_final[100000:105000],
                                  ads_final[105000:]])
    print('That took {} seconds'.format(time.time() - starttime))

That took 96.62513160705566 seconds


In [21]:
ads_final=pd.concat(live_outage_minutes)
ads_final.reset_index(drop=True, inplace=True)
ads_final.head()

Unnamed: 0,INCIDENT_ID,STRCTUR_NO,CIRCT_ID,DNI_EQUIP_TYPE,CUST_QTY,CALL_QTY,KEY_CUST_QTY,DOWNSTREAM_CUST_QTY,KVA_VAL,DOWNSTREAM_KVA_VAL,OUTAGE_ID,ETR_DATETIME,CREATION_DATETIME,MAJ_OTG_ID,ENERGIZED_DATETIME,SUBST_ID,COMMENT_TEXT,STORM_EVENT_FLG,EVENT,DAY_FLAG,TTR,LIVE_OUTAGE,Outages_in_last_1hr,Outages_in_last_2hr,Outages_in_last_3hr,Outages_in_last_4hr,Outages_in_last_5hr,Outages_in_last_6hr,Outages_in_last_7hr,Outages_in_last_8hr,Outages_in_last_9hr,Outages_in_last_10hr
0,2001535383,433-B/6,3209,1TBOH,1,1,0,1,0.0,0.0,2002739794,2020-08-24 21:15:00,2020-08-24 17:31:47,0,2020-08-24 19:05:39,320,"PER CUSTOMER, METER IS MISSING. RB",False,BLUE SKY,1,93.87,0,0,0,0,0,0,0,0,0,0,0
1,2001535387,564-A/706,1105,3TPUG,1,1,0,1,500.0,500.0,2002739798,2020-08-24 23:00:00,2020-08-24 17:49:35,0,2020-08-24 19:05:08,110,STATES RESET CIRCUIT BREAKER BOX AND STILL ONL...,False,BLUE SKY,1,75.55,1,1,1,1,1,1,1,1,1,1,1
2,2001535408,411-B/275,5455,1TBOH,1,1,0,1,0.0,0.0,2002739823,2020-08-24 23:15:00,2020-08-24 19:41:07,0,2020-08-24 20:37:10,28,(317) 333-1008 SPANISH SPEAKING CUSTOMER...CEL...,False,BLUE SKY,0,56.05,1,1,2,3,3,3,3,3,3,3,3
3,2001535414,205-A/51,2852,FUSE,7,13,0,7,100.0,100.0,2002739830,2020-08-24 22:00:00,2020-08-24 20:08:39,0,2020-08-25 00:50:00,285,,False,BLUE SKY,0,281.35,2,2,2,4,4,4,4,4,4,4,4
4,2001535417,697-B/109,2210,1TBOH,1,1,0,1,0.0,0.0,2002739833,2020-08-25 00:45:00,2020-08-24 21:08:40,0,2020-08-24 22:19:47,220,STATES THEIR SERVICE LINE IS SEVERED AND THEY ...,False,BLUE SKY,0,71.12,3,0,4,4,6,6,6,6,6,6,6


## **Read from Predictions table**

In [22]:
DF_PRED = 'SELECT * FROM `aes-analytics-0002.mds_outage_restoration.IPL_Predictions` where Last_Updated is not NULL'
DF_PRED = pd.read_gbq(DF_PRED, project_id="aes-analytics-0002")
print(DF_PRED.shape)

(1296, 10)


In [23]:
DF_PRED.dtypes

OUTAGE_ID                      object
INCIDENT_ID                     int64
STRCTUR_NO                     object
CIRCT_ID                        int64
DNI_EQUIP_TYPE                 object
Creation_Time                  object
Estimated_Restoration_Time     object
ETR                           float64
Weather_Profile                object
Last_Updated                   object
dtype: object

In [24]:
DF_PRED['Creation_Time'] = pd.to_datetime(DF_PRED['Creation_Time'], errors='coerce')
DF_PRED['Estimated_Restoration_Time'] = pd.to_datetime(DF_PRED['Estimated_Restoration_Time'], errors='coerce')

In [25]:
DF_PRED.head()

Unnamed: 0,OUTAGE_ID,INCIDENT_ID,STRCTUR_NO,CIRCT_ID,DNI_EQUIP_TYPE,Creation_Time,Estimated_Restoration_Time,ETR,Weather_Profile,Last_Updated
0,2001560016254--/512360FUSE,2001560016,254--/51,2360,FUSE,2020-11-16 12:28:31,2020-11-16 15:30:00,177.0,Hot Days with Chance of Rain,2020-11-16 19:05
1,2001560021254--/432360FUSE,2001560021,254--/43,2360,FUSE,2020-11-16 12:29:06,2020-11-16 14:40:00,133.0,Hot Days with Chance of Rain,2020-11-16 19:05
2,2001560087391-B/471201FUSE,2001560087,391-B/47,1201,FUSE,2020-11-16 14:18:44,2020-11-16 16:10:00,112.0,Hot Days with Chance of Rain,2020-11-16 21:05
3,2001560087391-B/471201FUSE,2001560087,391-B/47,1201,FUSE,2020-11-16 14:18:44,2020-11-16 16:10:00,113.0,Hot Days with Chance of Rain,2020-11-16 21:37
4,2001560087391-B/471201FUSE,2001560087,391-B/47,1201,FUSE,2020-11-16 14:18:44,2020-11-16 16:00:00,100.0,Hot Days with Chance of Rain,2020-11-16 20:31


In [26]:
df_merged = pd.merge(DF_PRED, ads_final,
                     on=['INCIDENT_ID', 'STRCTUR_NO', 'CIRCT_ID', 'DNI_EQUIP_TYPE'],
                     how='left',
                     suffixes=('','_y'))

In [27]:
df_merged = df_merged[['OUTAGE_ID', 'INCIDENT_ID', 'STRCTUR_NO', 'CIRCT_ID', 'DNI_EQUIP_TYPE', 'Creation_Time',
                       'Estimated_Restoration_Time', 'ETR', 'Weather_Profile', 'Last_Updated', 'CUST_QTY',
                       'ETR_DATETIME', 'CREATION_DATETIME', 'ENERGIZED_DATETIME', 'DAY_FLAG', 'TTR',
                       'LIVE_OUTAGE', 'Outages_in_last_1hr', 'Outages_in_last_2hr', 'Outages_in_last_3hr', 'Outages_in_last_4hr', 'Outages_in_last_5hr',
                       'Outages_in_last_6hr', 'Outages_in_last_7hr', 'Outages_in_last_8hr', 'Outages_in_last_9hr', 'Outages_in_last_10hr']]

In [28]:
print(df_merged.OUTAGE_ID.nunique())
print(df_merged.shape)

488
(1296, 27)


In [29]:
unique_df_first = df_merged.drop_duplicates(subset='OUTAGE_ID', keep="first")
print(unique_df_first.shape)

(488, 27)


In [30]:
unique_df_last = df_merged.drop_duplicates(subset='OUTAGE_ID', keep="last")
print(unique_df_last.shape)

(488, 27)


In [31]:
unstack_df = df_merged.groupby('OUTAGE_ID')['ETR'].apply(lambda x: pd.Series(list(x))).unstack()
unstack_df.head()
print(unstack_df.shape)

(488, 21)


In [32]:
df_merged.head()

Unnamed: 0,OUTAGE_ID,INCIDENT_ID,STRCTUR_NO,CIRCT_ID,DNI_EQUIP_TYPE,Creation_Time,Estimated_Restoration_Time,ETR,Weather_Profile,Last_Updated,CUST_QTY,ETR_DATETIME,CREATION_DATETIME,ENERGIZED_DATETIME,DAY_FLAG,TTR,LIVE_OUTAGE,Outages_in_last_1hr,Outages_in_last_2hr,Outages_in_last_3hr,Outages_in_last_4hr,Outages_in_last_5hr,Outages_in_last_6hr,Outages_in_last_7hr,Outages_in_last_8hr,Outages_in_last_9hr,Outages_in_last_10hr
0,2001560016254--/512360FUSE,2001560016,254--/51,2360,FUSE,2020-11-16 12:28:31,2020-11-16 15:30:00,177.0,Hot Days with Chance of Rain,2020-11-16 19:05,1.0,1900-01-01 00:00:00,2020-11-16 12:28:31,2020-11-16 13:34:26,1.0,65.92,89.0,9.0,25.0,42.0,56.0,72.0,81.0,84.0,85.0,86.0,86.0
1,2001560021254--/432360FUSE,2001560021,254--/43,2360,FUSE,2020-11-16 12:29:06,2020-11-16 14:40:00,133.0,Hot Days with Chance of Rain,2020-11-16 19:05,5.0,1900-01-01 00:00:00,2020-11-16 12:29:06,2020-11-16 13:46:00,1.0,76.9,90.0,10.0,26.0,43.0,57.0,73.0,82.0,85.0,86.0,87.0,87.0
2,2001560087391-B/471201FUSE,2001560087,391-B/47,1201,FUSE,2020-11-16 14:18:44,2020-11-16 16:10:00,112.0,Hot Days with Chance of Rain,2020-11-16 21:05,77.0,2020-11-16 19:00:00,2020-11-16 14:18:44,2020-11-16 17:33:00,1.0,194.27,60.0,14.0,34.0,46.0,57.0,77.0,91.0,107.0,114.0,116.0,117.0
3,2001560087391-B/471201FUSE,2001560087,391-B/47,1201,FUSE,2020-11-16 14:18:44,2020-11-16 16:10:00,113.0,Hot Days with Chance of Rain,2020-11-16 21:37,77.0,2020-11-16 19:00:00,2020-11-16 14:18:44,2020-11-16 17:33:00,1.0,194.27,60.0,14.0,34.0,46.0,57.0,77.0,91.0,107.0,114.0,116.0,117.0
4,2001560087391-B/471201FUSE,2001560087,391-B/47,1201,FUSE,2020-11-16 14:18:44,2020-11-16 16:00:00,100.0,Hot Days with Chance of Rain,2020-11-16 20:31,77.0,2020-11-16 19:00:00,2020-11-16 14:18:44,2020-11-16 17:33:00,1.0,194.27,60.0,14.0,34.0,46.0,57.0,77.0,91.0,107.0,114.0,116.0,117.0


In [33]:
#Join Unstack_df and unique_df
final_1 = pd.merge(unstack_df,unique_df_last,on='OUTAGE_ID',how='left')
print(final_1.shape)
final_1.head()

(488, 48)


Unnamed: 0,OUTAGE_ID,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,INCIDENT_ID,STRCTUR_NO,CIRCT_ID,DNI_EQUIP_TYPE,Creation_Time,Estimated_Restoration_Time,ETR,Weather_Profile,Last_Updated,CUST_QTY,ETR_DATETIME,CREATION_DATETIME,ENERGIZED_DATETIME,DAY_FLAG,TTR,LIVE_OUTAGE,Outages_in_last_1hr,Outages_in_last_2hr,Outages_in_last_3hr,Outages_in_last_4hr,Outages_in_last_5hr,Outages_in_last_6hr,Outages_in_last_7hr,Outages_in_last_8hr,Outages_in_last_9hr,Outages_in_last_10hr
0,2001558684230-A/1202805FUSE,186.0,,,,,,,,,,,,,,,,,,,,,2001558684,230-A/120,2805,FUSE,2020-11-15 08:59:31,2020-11-15 12:10:00,186.0,Strong Breeze with Sudden Rain,2020-11-16 19:05,2.0,2020-11-15 13:30:00,2020-11-15 08:59:31,2020-11-16 14:25:27,1.0,1765.93,11.0,24.0,30.0,30.0,35.0,37.0,40.0,45.0,45.0,46.0,48.0
1,2001558699EDIT20000632072805CUT,191.0,188.0,188.0,188.0,188.0,,,,,,,,,,,,,,,,,2001558699,EDIT2000063207,2805,CUT,2020-11-15 09:06:42,2020-11-15 12:10:00,188.0,Strong Breeze with Sudden Rain,2020-11-16 20:37,4.0,1900-01-01 00:00:00,2020-11-15 09:06:42,2020-11-16 16:52:00,1.0,1905.3,15.0,27.0,36.0,36.0,40.0,43.0,46.0,51.0,51.0,52.0,54.0
2,2001558705788--/7125081TBOH,167.0,,,,,,,,,,,,,,,,,,,,,2001558705,788--/71,2508,1TBOH,2020-11-15 09:09:33,2020-11-15 12:00:00,167.0,Strong Breeze with Sudden Rain,2020-11-16 19:05,1.0,1900-01-01 00:00:00,2020-11-15 09:09:33,2020-11-16 14:45:07,1.0,1775.57,15.0,28.0,37.0,37.0,41.0,42.0,47.0,52.0,52.0,53.0,55.0
3,2001558710EDIT20000649482456CUT,183.0,,,,,,,,,,,,,,,,,,,,,2001558710,EDIT2000064948,2456,CUT,2020-11-15 09:15:25,2020-11-15 12:20:00,183.0,Strong Breeze with Sudden Rain,2020-11-16 19:05,2.0,2020-11-15 13:45:00,2020-11-15 09:15:20,2020-11-16 13:42:33,1.0,1707.22,20.0,32.0,42.0,42.0,46.0,47.0,50.0,57.0,57.0,58.0,59.0
4,2001558816170-A/253107FUSE,237.0,231.0,229.0,232.0,230.0,226.0,202.0,202.0,225.0,,,,,,,,,,,,,2001558816,170-A/25,3107,FUSE,2020-11-15 09:32:20,2020-11-15 13:20:00,225.0,Strong Breeze with Sudden Rain,2020-11-16 22:05,2.0,1900-01-01 00:00:00,2020-11-15 09:32:20,2020-11-16 19:46:00,1.0,2053.67,62.0,72.0,85.0,85.0,89.0,90.0,92.0,100.0,100.0,100.0,101.0


In [34]:
#renaming columns
final_1 = final_1.rename(columns = {0:'Pred_1',
                                1:'Pred_2',
                                2:'Pred_3',
                                3:'Pred_4',
                                4:'Pred_5',
                                5:'Pred_6',
                                6:'Pred_7',
                                7:'Pred_8',
                                8:'Pred_9',
                                9:'Pred_10',
                                10:'Pred_11',
                                11:'Pred_12',
                                12:'Pred_13',
                                13:'Pred_14',
                                14:'Pred_15',
                                15:'Pred_16',
                                16:'Pred_17',
                                17:'Pred_18',
                                18:'Pred_19',
                                19:'Pred_20',
                                20:'Pred_21'}) 

In [35]:
final_1.head()

Unnamed: 0,OUTAGE_ID,Pred_1,Pred_2,Pred_3,Pred_4,Pred_5,Pred_6,Pred_7,Pred_8,Pred_9,Pred_10,Pred_11,Pred_12,Pred_13,Pred_14,Pred_15,Pred_16,Pred_17,Pred_18,Pred_19,Pred_20,Pred_21,INCIDENT_ID,STRCTUR_NO,CIRCT_ID,DNI_EQUIP_TYPE,Creation_Time,Estimated_Restoration_Time,ETR,Weather_Profile,Last_Updated,CUST_QTY,ETR_DATETIME,CREATION_DATETIME,ENERGIZED_DATETIME,DAY_FLAG,TTR,LIVE_OUTAGE,Outages_in_last_1hr,Outages_in_last_2hr,Outages_in_last_3hr,Outages_in_last_4hr,Outages_in_last_5hr,Outages_in_last_6hr,Outages_in_last_7hr,Outages_in_last_8hr,Outages_in_last_9hr,Outages_in_last_10hr
0,2001558684230-A/1202805FUSE,186.0,,,,,,,,,,,,,,,,,,,,,2001558684,230-A/120,2805,FUSE,2020-11-15 08:59:31,2020-11-15 12:10:00,186.0,Strong Breeze with Sudden Rain,2020-11-16 19:05,2.0,2020-11-15 13:30:00,2020-11-15 08:59:31,2020-11-16 14:25:27,1.0,1765.93,11.0,24.0,30.0,30.0,35.0,37.0,40.0,45.0,45.0,46.0,48.0
1,2001558699EDIT20000632072805CUT,191.0,188.0,188.0,188.0,188.0,,,,,,,,,,,,,,,,,2001558699,EDIT2000063207,2805,CUT,2020-11-15 09:06:42,2020-11-15 12:10:00,188.0,Strong Breeze with Sudden Rain,2020-11-16 20:37,4.0,1900-01-01 00:00:00,2020-11-15 09:06:42,2020-11-16 16:52:00,1.0,1905.3,15.0,27.0,36.0,36.0,40.0,43.0,46.0,51.0,51.0,52.0,54.0
2,2001558705788--/7125081TBOH,167.0,,,,,,,,,,,,,,,,,,,,,2001558705,788--/71,2508,1TBOH,2020-11-15 09:09:33,2020-11-15 12:00:00,167.0,Strong Breeze with Sudden Rain,2020-11-16 19:05,1.0,1900-01-01 00:00:00,2020-11-15 09:09:33,2020-11-16 14:45:07,1.0,1775.57,15.0,28.0,37.0,37.0,41.0,42.0,47.0,52.0,52.0,53.0,55.0
3,2001558710EDIT20000649482456CUT,183.0,,,,,,,,,,,,,,,,,,,,,2001558710,EDIT2000064948,2456,CUT,2020-11-15 09:15:25,2020-11-15 12:20:00,183.0,Strong Breeze with Sudden Rain,2020-11-16 19:05,2.0,2020-11-15 13:45:00,2020-11-15 09:15:20,2020-11-16 13:42:33,1.0,1707.22,20.0,32.0,42.0,42.0,46.0,47.0,50.0,57.0,57.0,58.0,59.0
4,2001558816170-A/253107FUSE,237.0,231.0,229.0,232.0,230.0,226.0,202.0,202.0,225.0,,,,,,,,,,,,,2001558816,170-A/25,3107,FUSE,2020-11-15 09:32:20,2020-11-15 13:20:00,225.0,Strong Breeze with Sudden Rain,2020-11-16 22:05,2.0,1900-01-01 00:00:00,2020-11-15 09:32:20,2020-11-16 19:46:00,1.0,2053.67,62.0,72.0,85.0,85.0,89.0,90.0,92.0,100.0,100.0,100.0,101.0


In [36]:
final_1["ETR_MAX"] =final_1[["Pred_1", "Pred_2", 'Pred_3', 'Pred_4', 'Pred_5', 'Pred_6', 'Pred_7', 'Pred_8', 'Pred_9', 'Pred_10',
              'Pred_11', 'Pred_12', 'Pred_13', 'Pred_14', 'Pred_15', 'Pred_16', 'Pred_17', 'Pred_18', 'Pred_19', 'Pred_20', 'Pred_21']].max(axis=1)

final_1["ETR_MIN"] =final_1[["Pred_1", "Pred_2", 'Pred_3', 'Pred_4', 'Pred_5', 'Pred_6', 'Pred_7', 'Pred_8', 'Pred_9', 'Pred_10',
              'Pred_11', 'Pred_12', 'Pred_13', 'Pred_14', 'Pred_15', 'Pred_16', 'Pred_17', 'Pred_18', 'Pred_19', 'Pred_20', 'Pred_21']].min(axis=1)

In [37]:
unique_df_first.rename({'Last_Updated': 'First_Updated'}, axis=1, inplace=True)
unique_df_first = unique_df_first[['OUTAGE_ID', 'First_Updated']]
unique_df_first.head()

Unnamed: 0,OUTAGE_ID,First_Updated
0,2001560016254--/512360FUSE,2020-11-16 19:05
1,2001560021254--/432360FUSE,2020-11-16 19:05
2,2001560087391-B/471201FUSE,2020-11-16 21:05
6,2001560089491XB/202716FUSE,2020-11-16 20:31
7,2001560132581F-/472757FUSE,2020-11-16 21:05


In [42]:
merged_final = pd.merge(final_1, unique_df_first, on=['OUTAGE_ID'], how='left')
merged_final.head()

Unnamed: 0,OUTAGE_ID,Pred_1,Pred_2,Pred_3,Pred_4,Pred_5,Pred_6,Pred_7,Pred_8,Pred_9,Pred_10,Pred_11,Pred_12,Pred_13,Pred_14,Pred_15,Pred_16,Pred_17,Pred_18,Pred_19,Pred_20,Pred_21,INCIDENT_ID,STRCTUR_NO,CIRCT_ID,DNI_EQUIP_TYPE,Creation_Time,Estimated_Restoration_Time,ETR,Weather_Profile,Last_Updated,CUST_QTY,ETR_DATETIME,CREATION_DATETIME,ENERGIZED_DATETIME,DAY_FLAG,TTR,LIVE_OUTAGE,Outages_in_last_1hr,Outages_in_last_2hr,Outages_in_last_3hr,Outages_in_last_4hr,Outages_in_last_5hr,Outages_in_last_6hr,Outages_in_last_7hr,Outages_in_last_8hr,Outages_in_last_9hr,Outages_in_last_10hr,ETR_MAX,ETR_MIN,First_Updated
0,2001558684230-A/1202805FUSE,186.0,,,,,,,,,,,,,,,,,,,,,2001558684,230-A/120,2805,FUSE,2020-11-15 08:59:31,2020-11-15 12:10:00,186.0,Strong Breeze with Sudden Rain,2020-11-16 14:05:00-05:00,2.0,2020-11-15 13:30:00,2020-11-15 08:59:31,2020-11-16 14:25:27,1.0,1765.93,11.0,24.0,30.0,30.0,35.0,37.0,40.0,45.0,45.0,46.0,48.0,186.0,186.0,2020-11-16 19:05
1,2001558699EDIT20000632072805CUT,191.0,188.0,188.0,188.0,188.0,,,,,,,,,,,,,,,,,2001558699,EDIT2000063207,2805,CUT,2020-11-15 09:06:42,2020-11-15 12:10:00,188.0,Strong Breeze with Sudden Rain,2020-11-16 15:37:00-05:00,4.0,1900-01-01 00:00:00,2020-11-15 09:06:42,2020-11-16 16:52:00,1.0,1905.3,15.0,27.0,36.0,36.0,40.0,43.0,46.0,51.0,51.0,52.0,54.0,191.0,188.0,2020-11-16 19:05
2,2001558705788--/7125081TBOH,167.0,,,,,,,,,,,,,,,,,,,,,2001558705,788--/71,2508,1TBOH,2020-11-15 09:09:33,2020-11-15 12:00:00,167.0,Strong Breeze with Sudden Rain,2020-11-16 14:05:00-05:00,1.0,1900-01-01 00:00:00,2020-11-15 09:09:33,2020-11-16 14:45:07,1.0,1775.57,15.0,28.0,37.0,37.0,41.0,42.0,47.0,52.0,52.0,53.0,55.0,167.0,167.0,2020-11-16 19:05
3,2001558710EDIT20000649482456CUT,183.0,,,,,,,,,,,,,,,,,,,,,2001558710,EDIT2000064948,2456,CUT,2020-11-15 09:15:25,2020-11-15 12:20:00,183.0,Strong Breeze with Sudden Rain,2020-11-16 14:05:00-05:00,2.0,2020-11-15 13:45:00,2020-11-15 09:15:20,2020-11-16 13:42:33,1.0,1707.22,20.0,32.0,42.0,42.0,46.0,47.0,50.0,57.0,57.0,58.0,59.0,183.0,183.0,2020-11-16 19:05
4,2001558816170-A/253107FUSE,237.0,231.0,229.0,232.0,230.0,226.0,202.0,202.0,225.0,,,,,,,,,,,,,2001558816,170-A/25,3107,FUSE,2020-11-15 09:32:20,2020-11-15 13:20:00,225.0,Strong Breeze with Sudden Rain,2020-11-16 17:05:00-05:00,2.0,1900-01-01 00:00:00,2020-11-15 09:32:20,2020-11-16 19:46:00,1.0,2053.67,62.0,72.0,85.0,85.0,89.0,90.0,92.0,100.0,100.0,100.0,101.0,237.0,202.0,2020-11-16 19:05


In [43]:
merged_final.isnull().sum()

OUTAGE_ID                       0
Pred_1                          0
Pred_2                        276
Pred_3                        338
Pred_4                        378
Pred_5                        407
Pred_6                        427
Pred_7                        443
Pred_8                        450
Pred_9                        460
Pred_10                       470
Pred_11                       473
Pred_12                       477
Pred_13                       479
Pred_14                       481
Pred_15                       483
Pred_16                       485
Pred_17                       485
Pred_18                       485
Pred_19                       485
Pred_20                       485
Pred_21                       485
INCIDENT_ID                     0
STRCTUR_NO                      0
CIRCT_ID                        0
DNI_EQUIP_TYPE                  0
Creation_Time                   0
Estimated_Restoration_Time      0
ETR                             0
Weather_Profil

In [48]:
merged_final['Last_Updated'] = pd.to_datetime(merged_final['Last_Updated'], format = '%Y-%m-%d %H:%M:%S', errors ='coerce')
merged_final['First_Updated'] = pd.to_datetime(merged_final['First_Updated'], format = '%Y-%m-%d %H:%M:%S', errors ='coerce')

merged_final['Creation_Time'] = pd.to_datetime(merged_final['Creation_Time'], format = '%Y-%m-%d %H:%M:%S', errors ='coerce')
merged_final['CREATION_DATETIME'] = pd.to_datetime(merged_final['CREATION_DATETIME'], format = '%Y-%m-%d %H:%M:%S', errors ='coerce')

merged_final['ENERGIZED_DATETIME'] = pd.to_datetime(merged_final['ENERGIZED_DATETIME'], format = '%Y-%m-%d %H:%M:%S', errors ='coerce')
merged_final['Estimated_Restoration_Time'] = pd.to_datetime(merged_final['Estimated_Restoration_Time'], format = '%Y-%m-%d %H:%M:%S', errors ='coerce')
merged_final['ETR_DATETIME'] = pd.to_datetime(merged_final['ETR_DATETIME'], format = '%Y-%m-%d %H:%M:%S', errors ='coerce')

# merged_final['Last_Updated'] = merged_final['Last_Updated'].dt.tz_localize('UTC')
merged_final['Last_Updated'] = merged_final['Last_Updated'].dt.tz_convert('US/Eastern')

merged_final['First_Updated'] = merged_final['First_Updated'].dt.tz_localize('UTC')
merged_final['First_Updated'] = merged_final['First_Updated'].dt.tz_convert('US/Eastern')

In [49]:
#Saving the DF with repredictions in parallel with min, max reprediction and their diffference
merged_final.to_csv('gs://aes-analytics-0002-curated/Outage_Restoration/Test/Outage_final_11232020.csv', index=False)