In [1]:
import csv
import math
import time
import warnings
import operator
import statistics
import requests
import json
import seaborn as sns
import pandas as pd
import numpy as np
import geopy.distance
import matplotlib.pyplot as plt

from dateutil.parser import parse
from datetime import datetime
from datetime import date, timedelta
from scipy import stats
from IPython.display import display_html
from multiprocessing import Pool
from sklearn.model_selection import train_test_split
from google.cloud import storage

plt.style.use('fivethirtyeight')
warnings.filterwarnings('ignore')
%matplotlib inline

pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.options.display.float_format = '{:.2f}'.format

## **Read necessary files from GCS Bucket**

In [2]:
today = date.today()
yesterday = today - timedelta(days = 2)
yesterday = yesterday.strftime('%Y-%m-%d')
print(today)
print(yesterday)
client = storage.Client()
BUCKET_NAME = 'aes-datahub-0002-raw'
bucket = client.get_bucket(BUCKET_NAME)

blobs = bucket.list_blobs(prefix='OMS/'+yesterday)
dirlist = []

for blob in blobs:
    dirlist.append(str(blob.name))

2020-11-18
2020-11-16


In [3]:
matching_facility = [s for s in dirlist if "FACILITY_IPL_Daily" in s]
matching_live_facility = [s for s in matching_facility if "HIS" in s]
print(matching_live_facility)
print('\n')

['OMS/2020-11-16/HIS_FACILITY_IPL_Daily_202011160600.csv']




In [4]:
matching_location = [s for s in dirlist if "LOCATION_IPL_Daily" in s]
matching_live_location = [s for s in matching_location if "HIS" in s]
print(matching_live_location)
print('\n')

['OMS/2020-11-16/HIS_LOCATION_IPL_Daily_202011160600.csv']




In [5]:
bucket_name = 'gs://aes-datahub-0001-raw/'

live_df_facility_job_his = pd.read_csv(bucket_name + matching_live_facility[-1],encoding = "ISO-8859-1",sep=",")
df_facility_job_his = live_df_facility_job_his.copy(deep=True)
print(df_facility_job_his.shape)

(20551, 70)


In [6]:
bucket_name = 'gs://aes-datahub-0001-raw/'

live_df_location_his = pd.read_csv(bucket_name + matching_live_location[-1],encoding = "ISO-8859-1",sep=",")
df_his_location = live_df_location_his.copy(deep=True)
print(df_his_location.shape)

(19655, 71)


In [7]:
print(list(df_facility_job_his.columns))

['FAC_JOB_ID', 'CIRCT_ID', 'MAJ_OTG_ID', 'EQUIP_STN_NO', 'DIST_NO', 'HOST_SEQ_ID', 'PRIORITY_VAL', 'CUST_QTY', 'CLUE_CD', 'CLUE_DESC', 'CREATION_DATETIME', 'CALL_QTY', 'KEY_CUST_QTY', 'SPLIT_FAC_JOB_FLG', 'CAUSE_CD', 'CAUSE_DESC', 'OCCURN_CD', 'OCCURN_DESC', 'CLIMATIC_CD', 'CLIMATIC_DESC', 'CITY_NAM', 'LOC_DESC', 'WRK_ORD_NUM', 'COMMENT_TEXT', 'CALL_ID', 'KVA_VAL', 'BOOK_NO', 'ADDRESS', 'CIRCT_NAM', 'CLUE_CD2', 'INSERTED_DATE', 'DOWNSTREAM_KVA_VAL', 'DOWNSTREAM_CUST_QTY', 'COMPL_DATETIME', 'TOT_LOSS_POWER_FLG', 'ISOLATED_TO_CUST_FLG', 'PLANNED_OUTAGE_FLG', 'ROUTINE_FLG', 'DNI_EQUIP_TYPE', 'SUBST_ID', 'WORK_ORD_1_NO', 'WORK_ORD_2_NO', 'WORK_ORD_3_NO', 'WORK_ORD_4_NO', 'WORK_ORD_5_NO', 'ENERGIZED_DATETIME', 'DISPLAY_TEXT', 'POLICE_OPERATOR_ID', 'POLICE_INC_NO', 'FIRE_OPERATOR_ID', 'FIRE_INC_NO', 'CAD_ID', 'STRCTUR_NO', 'FAC_JOB_PARENT_ID', 'MAJ_INCIDENT_FLG', 'MAJ_INCIDENT_CAUSE', 'ZONE_DESC', 'DIST_DESC', 'ZONE_ID', 'GEO_DIST_NO', 'ETR_DATETIME', 'SUBST_SHUTDOWN_FLG', 'HIS_FAC_JOB_COMME

In [8]:
######################################################################################################################################################################################################
######################################################################### APPLYING FILTERS FOR CORRECT DATA INPUTS####################################################################################
######################################################################################################################################################################################################

# customer quantity greater than 0
print('Filter for customer quantity greater than 0')
# print("****QC Check****")
print("Rows left after checking for INCIDENTS whose CUSTOMER QUANTITY IS > 0")
df_facility_job_his = df_facility_job_his[(df_facility_job_his.CUST_QTY > 0)]
print(df_facility_job_his.shape)
print("\n")

# equip_stn_no is not NCC and not null
print('Filter for equp_stn_no is not NCC or not null')
# print("****QC Check****")
print("Rows left after checking that EQUIP_STN_NO is not from <<NON CONNECTED CUSTOMERS>>")
df_facility_job_his = df_facility_job_his[(df_facility_job_his.EQUIP_STN_NO != '<NCC>') & (df_facility_job_his.EQUIP_STN_NO.notnull())]
print(df_facility_job_his.shape)
print("\n")


# removing NAN from DNI_EQUIP_TYPE, CIRCT_ID, STRCTUR_NO
print('Removing NAN from DNI_EQIP_TYPE, CICRT_ID, STRCTUR_NO')
# print("****QC Check****")
print("Rows left after checking CIRCT_ID is not 0 and not null, STRCTUR_NO is not null and DNI_EQIP_TYPE is not null")
df_facility_job_his = df_facility_job_his[(df_facility_job_his.CIRCT_ID != 0)]
df_facility_job_his = df_facility_job_his[~df_facility_job_his.CIRCT_ID.isnull()]
df_facility_job_his = df_facility_job_his[~df_facility_job_his.STRCTUR_NO.isnull()]
df_facility_job_his = df_facility_job_his[~df_facility_job_his.DNI_EQUIP_TYPE.isnull()]
print(df_facility_job_his.shape)
print("\n")

# removing CLUE_CD which start with 0 but does not start with 00
print('Removing CLUE_CD which start with 0 but do not start with 00')
# print("****QC Check****")
print("Rows left after filtering for CLUE CODES which start with 0 but do not start with 00")
df_facility_job_his = df_facility_job_his[(df_facility_job_his.CLUE_CD.str[:1] == '0') & (df_facility_job_his.CLUE_CD.str[:2] != '00')]
df_facility_job_his = df_facility_job_his[df_facility_job_his.CLUE_CD != '01']
print(df_facility_job_his.shape)
print("\n")

# removing occurence codes starting with cancel, found ok and duplicate
print('Removing CLUE_CD which start with 0 but do not start with 00')
# print("****QC Check****")
print("Rows left after removing OCCURN_CD which have descriptions starting with CANCEL, FOUND OK or DUPLICATE")
occur_remov = [30003001, 33003301, 33003302, 34003400, 34003401, 34003402, 34003403, 34003404, 34003405, 34003406, 34003407, 34003408, 34003409, 35003500,
                35003501, 35003502, 35003503, 35003504, 35003505, 35003506, 35003507, 35003508, 36003600, 36003601, 36003602, 36003603, 36003604, 36003605,
                36003606, 36003607, 36003608, 37003703, 38003802, 38003803, 38003804, 38003807, 39003910, 41004100, 41004101, 41004102, 48004800, 48004802,
                48004803, 49004900, 49004901, 49004902, 50005000, 50005001, 50005002, 52005200, 52005201, 52005202, 52005203, 52005204, 52005205, 52005206,
                52005207, 53005300, 53005301, 53005302, 53005303, 53005304, 53005305, 53005306, 53005307, 53005308, 53005309, 53005310, 54005400, 54005401,
                54005402, 54005403, 54005404, 54005405, 34003410, 30003000, 36503650, 36503651, 36503652, 36503653, 36503654, 36503655, 36503656, 36503657,
                36503658]
df_facility_job_his = df_facility_job_his[~(df_facility_job_his.OCCURN_CD.isin(occur_remov))]
print(df_facility_job_his.shape)
print("\n")

Filter for customer quantity greater than 0
Rows left after checking for INCIDENTS whose CUSTOMER QUANTITY IS > 0
(12146, 70)


Filter for equp_stn_no is not NCC or not null
Rows left after checking that EQUIP_STN_NO is not from <<NON CONNECTED CUSTOMERS>>
(12132, 70)


Removing NAN from DNI_EQIP_TYPE, CICRT_ID, STRCTUR_NO
Rows left after checking CIRCT_ID is not 0 and not null, STRCTUR_NO is not null and DNI_EQIP_TYPE is not null
(12132, 70)


Removing CLUE_CD which start with 0 but do not start with 00
Rows left after filtering for CLUE CODES which start with 0 but do not start with 00
(7750, 70)


Removing CLUE_CD which start with 0 but do not start with 00
Rows left after removing OCCURN_CD which have descriptions starting with CANCEL, FOUND OK or DUPLICATE
(7750, 70)




In [9]:
df_fac_final = df_facility_job_his.copy(deep=True)
print("Rows", len(df_fac_final))
_incident_ = len(df_fac_final[['INCIDENT_ID','STRCTUR_NO']].drop_duplicates())
print("Number of incident id", df_fac_final.INCIDENT_ID.nunique())
print("Unique structure no",_incident_)
print(df_fac_final.shape)

Rows 7750
Number of incident id 6305
Unique structure no 7043
(7750, 70)


In [10]:
df_fac_final["CREATION_DATETIME"] = pd.to_datetime(df_fac_final["CREATION_DATETIME"], errors ='coerce')
df_fac_final["ENERGIZED_DATETIME"] = pd.to_datetime(df_fac_final["ENERGIZED_DATETIME"], errors ='coerce')
df_fac_final["ETR_DATETIME"] = pd.to_datetime(df_fac_final["ETR_DATETIME"], errors ='coerce')

In [11]:
df_fac_final['TTR'] = (df_fac_final.ENERGIZED_DATETIME - df_fac_final.CREATION_DATETIME).dt.total_seconds().div(60).round(4)

In [12]:
df_fac_final_30 = df_fac_final[df_fac_final['TTR'] <= 30]
df_fac_final_30.reset_index(drop=True, inplace=True)

In [16]:
print(min(df_fac_final_30['CREATION_DATETIME']), max(df_fac_final_30['CREATION_DATETIME']))
print(min(df_fac_final_30['ENERGIZED_DATETIME']), max(df_fac_final_30['ENERGIZED_DATETIME']))

2020-08-18 00:11:27 2020-11-15 21:37:51
2020-08-18 00:23:00 2020-11-15 22:07:48


In [13]:
print(df_fac_final.shape)
print(df_fac_final_30.shape)
df_fac_final_30.head()

(7750, 71)
(3382, 71)


Unnamed: 0,FAC_JOB_ID,CIRCT_ID,MAJ_OTG_ID,EQUIP_STN_NO,DIST_NO,HOST_SEQ_ID,PRIORITY_VAL,CUST_QTY,CLUE_CD,CLUE_DESC,CREATION_DATETIME,CALL_QTY,KEY_CUST_QTY,SPLIT_FAC_JOB_FLG,CAUSE_CD,CAUSE_DESC,OCCURN_CD,OCCURN_DESC,CLIMATIC_CD,CLIMATIC_DESC,CITY_NAM,LOC_DESC,WRK_ORD_NUM,COMMENT_TEXT,CALL_ID,KVA_VAL,BOOK_NO,ADDRESS,CIRCT_NAM,CLUE_CD2,INSERTED_DATE,DOWNSTREAM_KVA_VAL,DOWNSTREAM_CUST_QTY,COMPL_DATETIME,TOT_LOSS_POWER_FLG,ISOLATED_TO_CUST_FLG,PLANNED_OUTAGE_FLG,ROUTINE_FLG,DNI_EQUIP_TYPE,SUBST_ID,WORK_ORD_1_NO,WORK_ORD_2_NO,WORK_ORD_3_NO,WORK_ORD_4_NO,WORK_ORD_5_NO,ENERGIZED_DATETIME,DISPLAY_TEXT,POLICE_OPERATOR_ID,POLICE_INC_NO,FIRE_OPERATOR_ID,FIRE_INC_NO,CAD_ID,STRCTUR_NO,FAC_JOB_PARENT_ID,MAJ_INCIDENT_FLG,MAJ_INCIDENT_CAUSE,ZONE_DESC,DIST_DESC,ZONE_ID,GEO_DIST_NO,ETR_DATETIME,SUBST_SHUTDOWN_FLG,HIS_FAC_JOB_COMMENT,CIRCT_DESC,SUBST_DESC,DCNID_VAL,CREW_REMARKS,INCIDENT_ID,CLUE_DESC2,DIST_ABBREV,TTR
0,2002741658,3502,0,1539024C_C,1,,2,10,09IV,IVR\POWER OUT,2020-08-27 18:14:23,2,0,F,30003002.0,NO CAUSE\NOT PATROLLED,52005201,CANCEL\BY CUSTOMER,N,Normal - Clear,INDIANAPOLIS,** 3050 ARAGON WOODS CT,,,2033520993,25.0,,3050 ARAGON WOODS CT,CRAWFORDSVILLE NO. 2,,2020-08-27 18:41:18,25.0,10,2020-08-27 18:39:00,F,F,F,F,1TPUG,350,,,,,,2020-08-27 18:39:00,421YA/204,,,,,,421YA/204,0,F,NO CAUSE\NOT PATROLLED,IPL,IPL,1,,2020-08-27 22:45:00,,,CRAWFORDSVILLE NO. 2,CRAWFORDSVILLE RD,,,2001536828,No Description Available for clue code 2:,IPL,24.62
1,2002741660,2205,0,1473675_A,1,,2,52,09IV,IVR\POWER OUT,2020-08-27 18:39:56,1,0,,30003002.0,NO CAUSE\NOT PATROLLED,52005201,CANCEL\BY CUSTOMER,N,Normal - Clear,INDIANAPOLIS,E/S MADISON AVE N/MORGAN,,MAINT TO CHECK BREAKERS. RB,2033521036,100.0,,4725 MADISON AVE STE 48 1200E/S,SOUTH NO. 5,,2020-08-27 18:46:43,100.0,52,2020-08-27 18:47:02,F,F,F,F,FUSE,220,,,,,,2020-08-27 18:47:02,698-A/88,,,,,,698-A/88,0,F,NO CAUSE\NOT PATROLLED,IPL,IPL,1,,2020-08-27 23:15:00,,,SOUTH NO. 5,SOUTH,,,2001536829,No Description Available for clue code 2:,IPL,7.1
2,2002741662,3356,0,1375570C_C,1,,2,24,09IV,IVR\POWER OUT,2020-08-27 18:48:03,1,0,F,30003002.0,NO CAUSE\NOT PATROLLED,52005201,CANCEL\BY CUSTOMER,N,Normal - Clear,INDIANAPOLIS,** 7304 CREEKBROOK DR 1500E/S,,MAINT TO CHECK BREAKERS. RB,2033521064,75.0,,7304 CREEKBROOK DR 1500E/S,EDGEWOOD NO. 6,,2020-08-27 18:53:57,75.0,24,2020-08-27 18:54:18,F,F,F,F,1TPUG,335,,,,,,2020-08-27 18:54:18,758--/346,,,,,,758--/346,0,F,NO CAUSE\NOT PATROLLED,IPL,IPL,1,,2020-08-27 23:30:00,,,EDGEWOOD NO. 6,EDGEWOOD,,,2001536831,No Description Available for clue code 2:,IPL,6.25
3,2002741665,2907,0,2348029C_C,1,,2,8,09IV,IVR\POWER OUT,2020-08-27 19:04:52,1,0,F,39003904.0,MISCELLANEOUS\CUSTOMER EQUIPMENT,52005205,CANCEL\CELL NET ON,N,Normal - Clear,INDIANAPOLIS,** 4072 LITTLE BIGHORN DR,,,2033521075,50.0,,4072 LITTLE BIGHORN DR,GERMAN CHURCH NO. 7,,2020-08-27 19:10:53,50.0,8,2020-08-27 19:10:38,F,F,F,F,1TPUG,290,,,,,,2020-08-27 19:10:38,421F-/466,,,,,,421F-/466,0,F,MISCELLANEOUS\CUSTOMER EQUIPMENT,IPL,IPL,1,,2020-08-27 23:45:00,,,GERMAN CHURCH NO. 7,GERMAN CHURCH,,,2001536834,No Description Available for clue code 2:,IPL,5.77
4,2002741670,2359,0,1691806ABC_A,1,,2,1,09WB,WEB\POWER OUT,2020-08-27 19:54:39,1,1,F,30003002.0,NO CAUSE\NOT PATROLLED,38003807,SUBSTATION\MOMENTARY INTERRUPTION,N,Normal - Clear,INDIANAPOLIS,** 8278 GEORGETOWN RD 4800W/N,,,2033521103,666.7,,8278 GEORGETOWN RD 4800W/N,GEORGETOWN NO. 9,,2020-08-27 20:07:16,666.7,1,2020-08-27 20:05:25,F,F,F,F,3TPUG,235,,,,,,2020-08-27 20:05:25,197-B/272,,,,,,197-B/272,0,F,NO CAUSE\NOT PATROLLED,IPL,IPL,1,,2020-08-28 01:00:00,,,GEORGETOWN NO. 9,GEORGETOWN,,,2001536838,No Description Available for clue code 2:,IPL,10.77


In [14]:
df_fac_final_30.to_csv('gs://aes-analytics-0002-curated/Outage_Restoration/Historical_Data/Master_Dataset/Outage_Less_Than_30_MINS.csv', index=False)

In [2]:
from datetime import datetime
datetime.now().strftime("%Y-%m-%d %H:%M")

'2020-11-19 11:24'