## **STORM ADS CREATION V1.0.1**
v1.0.1 - Create the data set used to generate predictions for recent storms.

In [1]:
import csv
import math
import time
import warnings
import operator
import statistics
import requests
import json
import seaborn as sns
import pandas as pd
import numpy as np
import geopy.distance
import matplotlib.pyplot as plt
from scipy.stats import linregress

from dateutil.parser import parse
from datetime import datetime
from datetime import date, timedelta
from scipy import stats
from IPython.display import display_html
from multiprocessing import Pool
from sklearn.model_selection import train_test_split
from google.cloud import storage

plt.style.use('fivethirtyeight')
warnings.filterwarnings('ignore')
%matplotlib inline

pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.options.display.float_format = '{:.2f}'.format

## **Read necessary files from GCS Bucket**

In [2]:
# Snapshot date whose OMS extracts are processed. It is hard-coded for this
# run; the alternatives left commented out were date.today() and '2020-09-08'.
yesterday = '2020-11-28'
print(yesterday)

# Connect to the raw bucket and collect the name of every object stored
# under the OMS/<date> prefix.
client = storage.Client()
BUCKET_NAME = 'aes-datahub-0002-raw'
bucket = client.get_bucket(BUCKET_NAME)
blobs = bucket.list_blobs(prefix=f'OMS/{yesterday}')

dirlist = [str(blob.name) for blob in blobs]

2020-11-28


In [3]:
# Narrow the blob listing to the daily FACILITY extracts, then keep only the
# ones whose name contains "HIS".
matching_facility = list(filter(lambda path: "FACILITY_IPL_Daily" in path, dirlist))
matching_live_facility = list(filter(lambda path: "HIS" in path, matching_facility))
print(matching_live_facility)
print('\n')

['OMS/2020-11-28/HIS_FACILITY_IPL_Daily_202011280600.csv']




In [4]:
# Narrow the blob listing to the daily LOCATION extracts, then keep only the
# ones whose name contains "HIS".
matching_location = list(filter(lambda path: "LOCATION_IPL_Daily" in path, dirlist))
matching_live_location = list(filter(lambda path: "HIS" in path, matching_location))
print(matching_live_location)
print('\n')

['OMS/2020-11-28/HIS_LOCATION_IPL_Daily_202011280600.csv']




In [5]:
# NOTE(review): the blob listing was taken from bucket 'aes-datahub-0002-raw',
# but this read targets 'aes-datahub-0001-raw' - confirm that is intended.
bucket_name = 'gs://aes-datahub-0001-raw/'

# Read the last matching facility extract and work on a deep copy so the raw
# frame stays untouched.
live_df_facility_job_his = pd.read_csv(
    f"{bucket_name}{matching_live_facility[-1]}",
    sep=",",
    encoding="ISO-8859-1",
)
df_facility_job_his = live_df_facility_job_his.copy(deep=True)
print(df_facility_job_his.shape)

(20728, 70)


In [6]:
# NOTE(review): the blob listing was taken from bucket 'aes-datahub-0002-raw',
# but this read targets 'aes-datahub-0001-raw' - confirm that is intended.
bucket_name = 'gs://aes-datahub-0001-raw/'

# Read the last matching location extract and work on a deep copy so the raw
# frame stays untouched.
live_df_location_his = pd.read_csv(
    f"{bucket_name}{matching_live_location[-1]}",
    sep=",",
    encoding="ISO-8859-1",
)
df_his_location = live_df_location_his.copy(deep=True)
print(df_his_location.shape)

(19778, 71)


In [7]:
print(list(df_facility_job_his.columns))

['FAC_JOB_ID', 'CIRCT_ID', 'MAJ_OTG_ID', 'EQUIP_STN_NO', 'DIST_NO', 'HOST_SEQ_ID', 'PRIORITY_VAL', 'CUST_QTY', 'CLUE_CD', 'CLUE_DESC', 'CREATION_DATETIME', 'CALL_QTY', 'KEY_CUST_QTY', 'SPLIT_FAC_JOB_FLG', 'CAUSE_CD', 'CAUSE_DESC', 'OCCURN_CD', 'OCCURN_DESC', 'CLIMATIC_CD', 'CLIMATIC_DESC', 'CITY_NAM', 'LOC_DESC', 'WRK_ORD_NUM', 'COMMENT_TEXT', 'CALL_ID', 'KVA_VAL', 'BOOK_NO', 'ADDRESS', 'CIRCT_NAM', 'CLUE_CD2', 'INSERTED_DATE', 'DOWNSTREAM_KVA_VAL', 'DOWNSTREAM_CUST_QTY', 'COMPL_DATETIME', 'TOT_LOSS_POWER_FLG', 'ISOLATED_TO_CUST_FLG', 'PLANNED_OUTAGE_FLG', 'ROUTINE_FLG', 'DNI_EQUIP_TYPE', 'SUBST_ID', 'WORK_ORD_1_NO', 'WORK_ORD_2_NO', 'WORK_ORD_3_NO', 'WORK_ORD_4_NO', 'WORK_ORD_5_NO', 'ENERGIZED_DATETIME', 'DISPLAY_TEXT', 'POLICE_OPERATOR_ID', 'POLICE_INC_NO', 'FIRE_OPERATOR_ID', 'FIRE_INC_NO', 'CAD_ID', 'STRCTUR_NO', 'FAC_JOB_PARENT_ID', 'MAJ_INCIDENT_FLG', 'MAJ_INCIDENT_CAUSE', 'ZONE_DESC', 'DIST_DESC', 'ZONE_ID', 'GEO_DIST_NO', 'ETR_DATETIME', 'SUBST_SHUTDOWN_FLG', 'HIS_FAC_JOB_COMME

In [8]:
###############################################################################
# APPLYING FILTERS FOR CORRECT DATA INPUTS
#
# Each step narrows df_facility_job_his and prints the remaining shape so the
# effect of every filter can be checked in the cell output.
###############################################################################

# customer quantity greater than 0
print('Filter for customer quantity greater than 0')
print("Rows left after checking for INCIDENTS whose CUSTOMER QUANTITY IS > 0")
df_facility_job_his = df_facility_job_his[(df_facility_job_his.CUST_QTY > 0)]
print(df_facility_job_his.shape)
print("\n")

# equip_stn_no is not NCC and not null
print('Filter for equp_stn_no is not NCC or not null')
print("Rows left after checking that EQUIP_STN_NO is not from <<NON CONNECTED CUSTOMERS>>")
df_facility_job_his = df_facility_job_his[(df_facility_job_his.EQUIP_STN_NO != '<NCC>') & (df_facility_job_his.EQUIP_STN_NO.notnull())]
print(df_facility_job_his.shape)
print("\n")


# removing NAN from DNI_EQUIP_TYPE, CIRCT_ID, STRCTUR_NO (and CIRCT_ID == 0)
print('Removing NAN from DNI_EQIP_TYPE, CICRT_ID, STRCTUR_NO')
print("Rows left after checking CIRCT_ID is not 0 and not null, STRCTUR_NO is not null and DNI_EQIP_TYPE is not null")
df_facility_job_his = df_facility_job_his[(df_facility_job_his.CIRCT_ID != 0)]
df_facility_job_his = df_facility_job_his[~df_facility_job_his.CIRCT_ID.isnull()]
df_facility_job_his = df_facility_job_his[~df_facility_job_his.STRCTUR_NO.isnull()]
df_facility_job_his = df_facility_job_his[~df_facility_job_his.DNI_EQUIP_TYPE.isnull()]
print(df_facility_job_his.shape)
print("\n")

# keeping only CLUE_CD which start with 0 but do not start with 00, then
# dropping the two specific codes '09OD' and '01'.
# The banner previously said "Removing", which is the opposite of what the
# mask does: rows matching the pattern are the ones retained.
print('Keeping only CLUE_CD which start with 0 but do not start with 00')
print("Rows left after filtering for CLUE CODES which start with 0 but do not start with 00")
df_facility_job_his = df_facility_job_his[(df_facility_job_his.CLUE_CD.str[:1] == '0') & (df_facility_job_his.CLUE_CD.str[:2] != '00')]
df_facility_job_his = df_facility_job_his[df_facility_job_his.CLUE_CD != '09OD']
df_facility_job_his = df_facility_job_his[df_facility_job_his.CLUE_CD != '01']
print(df_facility_job_his.shape)
print("\n")

# removing occurrence codes whose descriptions start with cancel, found ok
# and duplicate.
# The banner previously repeated the CLUE_CD message (copy-paste); it now
# names the filter that is actually applied here.
print('Removing OCCURN_CD for CANCEL, FOUND OK and DUPLICATE occurrences')
print("Rows left after removing OCCURN_CD which have descriptions starting with CANCEL, FOUND OK or DUPLICATE")
occur_remov = [30003001, 33003301, 33003302, 34003400, 34003401, 34003402, 34003403, 34003404, 34003405, 34003406, 34003407, 34003408, 34003409, 35003500,
                35003501, 35003502, 35003503, 35003504, 35003505, 35003506, 35003507, 35003508, 36003600, 36003601, 36003602, 36003603, 36003604, 36003605,
                36003606, 36003607, 36003608, 37003703, 38003802, 38003803, 38003804, 38003807, 39003910, 41004100, 41004101, 41004102, 48004800, 48004802,
                48004803, 49004900, 49004901, 49004902, 50005000, 50005001, 50005002, 52005200, 52005201, 52005202, 52005203, 52005204, 52005205, 52005206,
                52005207, 53005300, 53005301, 53005302, 53005303, 53005304, 53005305, 53005306, 53005307, 53005308, 53005309, 53005310, 54005400, 54005401,
                54005402, 54005403, 54005404, 54005405, 34003410, 30003000, 36503650, 36503651, 36503652, 36503653, 36503654, 36503655, 36503656, 36503657,
                36503658]
df_facility_job_his = df_facility_job_his[~(df_facility_job_his.OCCURN_CD.isin(occur_remov))]
print(df_facility_job_his.shape)
print("\n")

Filter for customer quantity greater than 0
Rows left after checking for INCIDENTS whose CUSTOMER QUANTITY IS > 0
(12368, 70)


Filter for equp_stn_no is not NCC or not null
Rows left after checking that EQUIP_STN_NO is not from <<NON CONNECTED CUSTOMERS>>
(12351, 70)


Removing NAN from DNI_EQIP_TYPE, CICRT_ID, STRCTUR_NO
Rows left after checking CIRCT_ID is not 0 and not null, STRCTUR_NO is not null and DNI_EQIP_TYPE is not null
(12351, 70)


Removing CLUE_CD which start with 0 but do not start with 00
Rows left after filtering for CLUE CODES which start with 0 but do not start with 00
(7501, 70)


Removing CLUE_CD which start with 0 but do not start with 00
Rows left after removing OCCURN_CD which have descriptions starting with CANCEL, FOUND OK or DUPLICATE
(7501, 70)




In [9]:
# Freeze the filtered facility jobs into their own frame and report how many
# rows, distinct incidents and distinct (incident, structure) pairs it holds.
df_fac_final = df_facility_job_his.copy(deep=True)
_incident_ = df_fac_final[['INCIDENT_ID', 'STRCTUR_NO']].drop_duplicates().shape[0]

print("Rows", df_fac_final.shape[0])
print("Number of incident id", df_fac_final.INCIDENT_ID.nunique())
print("Unique structure no", _incident_)
print(df_fac_final.shape)

Rows 7501
Number of incident id 6319
Unique structure no 6972
(7501, 70)


In [10]:
# For every (incident, structure, circuit, equipment type) group, count the
# distinct values held by each remaining column, then display the column totals.
df_check = df_fac_final.groupby(
    by=['INCIDENT_ID', 'STRCTUR_NO', 'CIRCT_ID', 'DNI_EQUIP_TYPE']
).nunique()
df_check.sum()

FAC_JOB_ID              7501
MAJ_OTG_ID              6975
EQUIP_STN_NO            7485
DIST_NO                 6973
HOST_SEQ_ID                0
PRIORITY_VAL            7030
CUST_QTY                7436
CLUE_CD                 7174
CLUE_DESC               7174
CREATION_DATETIME       6991
CALL_QTY                7385
KEY_CUST_QTY            7182
SPLIT_FAC_JOB_FLG       4644
CAUSE_CD                3859
CAUSE_DESC              3806
OCCURN_CD               6971
OCCURN_DESC             6802
CLIMATIC_CD             6961
CLIMATIC_DESC           6961
CITY_NAM                6776
LOC_DESC                6958
WRK_ORD_NUM                0
COMMENT_TEXT            3231
CALL_ID                 7501
KVA_VAL                 7343
BOOK_NO                    0
ADDRESS                 7375
CIRCT_NAM               6973
CLUE_CD2                  50
INSERTED_DATE           6996
DOWNSTREAM_KVA_VAL      7401
DOWNSTREAM_CUST_QTY     7436
COMPL_DATETIME          6999
TOT_LOSS_POWER_FLG      6746
ISOLATED_TO_CU

In [11]:
# Collapse the facility jobs to one row per (incident, structure, circuit,
# equipment type): quantities are summed, KVA values averaged, and the
# id / timestamp columns reduced with min or max as listed below.
df_numerical = (
    df_fac_final
    .groupby(['INCIDENT_ID', 'STRCTUR_NO', 'CIRCT_ID', 'DNI_EQUIP_TYPE'], as_index=False)
    .agg({
        'CUST_QTY': 'sum',
        'CALL_QTY': 'sum',
        'KEY_CUST_QTY': 'sum',
        'DOWNSTREAM_CUST_QTY': 'sum',
        'KVA_VAL': 'mean',
        'DOWNSTREAM_KVA_VAL': 'mean',
        'FAC_JOB_ID': 'max',
        'ETR_DATETIME': 'max',
        'CREATION_DATETIME': 'min',
        'MAJ_OTG_ID': 'max',
        'ENERGIZED_DATETIME': 'max',
        'SUBST_ID': 'min',
    })
)

In [12]:
df_numerical.head()

Unnamed: 0,INCIDENT_ID,STRCTUR_NO,CIRCT_ID,DNI_EQUIP_TYPE,CUST_QTY,CALL_QTY,KEY_CUST_QTY,DOWNSTREAM_CUST_QTY,KVA_VAL,DOWNSTREAM_KVA_VAL,FAC_JOB_ID,ETR_DATETIME,CREATION_DATETIME,MAJ_OTG_ID,ENERGIZED_DATETIME,SUBST_ID
0,2001537668,737--/72,1758,FUSE,7,4,0,7,170.0,170.0,2002742730,2020-08-29 23:30:00,2020-08-29 18:51:07,0,2020-08-29 23:18:00,175
1,2001537674,301-B/21,2807,1TBOH,1,1,0,1,0.0,0.0,2002742736,2020-08-30 13:15:00,2020-08-29 19:35:29,0,2020-08-30 11:42:31,280
2,2001537677,670-A/166,2203,FUSE,1,1,0,1,100.0,100.0,2002742741,2020-08-30 00:30:00,2020-08-29 19:54:06,0,2020-08-29 20:53:00,220
3,2001537695,650-B/59,1702,FUSE,11,1,0,11,50.0,50.0,2002742761,2020-08-30 03:30:00,2020-08-29 23:59:25,0,2020-08-30 04:07:00,170
4,2001537697,264-B/72,2809,1TBOH,8,8,0,8,50.0,50.0,2002742762,2020-08-30 12:00:00,2020-08-30 03:26:42,0,2020-08-30 10:22:00,280


In [13]:
print(df_numerical.shape)

(6973, 16)


In [14]:
# Reduce the categorical columns to one row per (incident, structure,
# circuit, equipment type), keeping the last value seen in each group.
df_fac_final = (
    df_fac_final
    .groupby(['INCIDENT_ID', 'STRCTUR_NO', 'CIRCT_ID', 'DNI_EQUIP_TYPE'], as_index=False)
    .agg({
        'PRIORITY_VAL': 'last',
        'OCCURN_DESC': 'last',
        'CAUSE_DESC': 'last',
        'CLUE_DESC': 'last',
        'CITY_NAM': 'last',
    })
)
df_fac_final.head()

Unnamed: 0,INCIDENT_ID,STRCTUR_NO,CIRCT_ID,DNI_EQUIP_TYPE,PRIORITY_VAL,OCCURN_DESC,CAUSE_DESC,CLUE_DESC,CITY_NAM
0,2001537668,737--/72,1758,FUSE,2,CONDUCTOR/WIRE\PRIMARY DOWN,TREE\TREE ON PRI (OUTSIDE TRIM ZONE),WEB\POWER OUT,INDIANAPOLIS
1,2001537674,301-B/21,2807,1TBOH,2,SERVICE\LOOSE CONNECTION,OH EQUIPMENT\LOOSE CONNECTION,IVR\POWER OUT,INDIANAPOLIS
2,2001537677,670-A/166,2203,FUSE,2,CONDUCTOR/WIRE\OTHER,OH EQUIPMENT\BAD OR BROKEN INSULATOR,PART LIGHTS - DIM/BRIGHT\PART OUT (RESET BREAK...,INDIANAPOLIS
3,2001537695,650-B/59,1702,FUSE,2,"FUSE\OPEN, FUSE BLOWN",TREE\TREE ON PRI (INSIDE TRIM ZONE),IVR\POWER OUT,INDIANAPOLIS
4,2001537697,264-B/72,2809,1TBOH,2,CUTOUT\DAMAGED,OH EQUIPMENT\BAD CUTOUT OR BARREL,WEB\POWER OUT,INDIANAPOLIS


In [15]:
df_fac_final.dtypes

INCIDENT_ID        int64
STRCTUR_NO        object
CIRCT_ID           int64
DNI_EQUIP_TYPE    object
PRIORITY_VAL       int64
OCCURN_DESC       object
CAUSE_DESC        object
CLUE_DESC         object
CITY_NAM          object
dtype: object

## **Adding extra columns like Day Night flag and creating dependent variable TTR column**

In [16]:
# Parse the three timestamp columns; values that cannot be parsed become NaT.
for _dt_col in ("CREATION_DATETIME", "ENERGIZED_DATETIME", "ETR_DATETIME"):
    df_numerical[_dt_col] = pd.to_datetime(df_numerical[_dt_col], errors='coerce')

In [17]:
# Day/night flag: 1 when the outage was created between 06:00 and 17:59.
_creation_hour = df_numerical.CREATION_DATETIME.dt.hour
df_numerical['DAY_FLAG'] = _creation_hour.apply(lambda hour: 1 if 6 <= hour < 18 else 0)

# TTR: minutes between creation and energization, rounded to 4 decimals.
_restore_delta = df_numerical.ENERGIZED_DATETIME - df_numerical.CREATION_DATETIME
df_numerical['TTR'] = _restore_delta.dt.total_seconds().div(60).round(4)

In [18]:
print(df_numerical.shape)

(6973, 18)


## **CLUE CODE CLEAN**

In [19]:
# One-hot encode PRIORITY_VAL for the levels 1, 2, 3 and 5, then drop the
# original column.
df_fac_final['PRIORITY_VAL'] = df_fac_final['PRIORITY_VAL'].astype(float)

for _dummy_name, _level in (
    ('PRIORITY_VAL_1.0', 1),
    ('PRIORITY_VAL_2.0', 2),
    ('PRIORITY_VAL_3.0', 3),
    ('PRIORITY_VAL_5.0', 5),
):
    df_fac_final[_dummy_name] = df_fac_final['PRIORITY_VAL'].apply(
        lambda priority, level=_level: 1 if priority == level else 0
    )

df_fac_final.drop(columns=['PRIORITY_VAL'], inplace=True)


def _clean_city(name):
    """Map any name containing 'INDIAN' to 'INDIANAPOLIS' and NaN to 'NO_CITY'."""
    if 'INDIAN' in str(name):
        return 'INDIANAPOLIS'
    if name != name:  # only NaN is unequal to itself
        return 'NO_CITY'
    return name


df_fac_final['CITY_NAM'] = df_fac_final['CITY_NAM'].apply(_clean_city)

# The description columns are searched as text below, so force them to str.
for _text_col in ('CLUE_DESC', 'CAUSE_DESC', 'OCCURN_DESC'):
    df_fac_final[_text_col] = df_fac_final[_text_col].astype(str)

In [20]:
# Keyword flags derived from the CLUE / CAUSE / OCCURN description texts.
# Every flag is 1 when the description contains the listed keyword(s) and 0
# otherwise; the rule tables below keep the original column creation order.


def _keyword_flag(text, any_of, none_of=()):
    """Return 1 if text contains any of `any_of` and none of `none_of`, else 0."""
    found = any(token in text for token in any_of)
    vetoed = any(token in text for token in none_of)
    return 1 if found and not vetoed else 0


# segregation of clue code desc
# (flag column, keyword, whether the description is lower-cased before matching)
_clue_rules = [
    ('POLE_CLUE_FLG', 'pole', True),
    ('PART_LIGHT_CLUE_FLG', 'part lights', True),
    ('EMERGENCY_CLUE_FLG', 'emergency', True),
    ('POWER_OUT_CLUE_FLG', 'power out', True),
    ('TREE_CLUE_FLG', 'tree', True),
    ('WIRE_DOWN_CLUE_FLG', 'wire down', True),
    ('IVR_CLUE_FLG', 'ivr', True),
    ('EQUIPMENT_CLUE_FLG', 'EQUIPMENT', False),
    ('TRANSFORMER_CLUE_FLG', 'TRANSFORMER', False),
    ('OPEN_DEVICE_CLUE_FLG', 'OPEN DEVICE', False),
]
for _flag, _keyword, _lowered in _clue_rules:
    df_fac_final[_flag] = df_fac_final.CLUE_DESC.apply(
        lambda desc, kw=_keyword, low=_lowered: _keyword_flag(desc.lower() if low else desc, (kw,))
    )


# segregation of cause desc (searched on a temporary NaN-filled copy)
df_fac_final['CAUSE_DESC1'] = df_fac_final[['CAUSE_DESC']].fillna('0')
# (flag column, keywords - any one of them sets the flag)
_cause_rules = [
    ('OH_CAUSE_FLG', ('OH', 'O.H.')),
    ('UG_CAUSE_FLG', ('UG', 'U.G.')),
    ('ANIMAL_CAUSE_FLG', ('ANIMAL',)),
    ('WEATHER_CAUSE_FLG', ('WEATHER',)),
    ('WEATHER_COLD_CAUSE_FLG', ('COLD',)),
    ('WEATHER_LIGHTNING_CAUSE_FLG', ('LIGHTNING',)),
    ('WEATHER__SNOW_CAUSE_FLG', ('SNOW',)),
    ('WEATHER__WIND_CAUSE_FLG', ('WIND',)),
    ('WEATHER__HEAT_CAUSE_FLG', ('HEAT',)),
    ('WEATHER__FLOOD_CAUSE_FLG', ('FLOOD',)),
    ('PUBLIC_CAUSE_FLG', ('PUBLIC',)),
    ('STREET_CAUSE_FLG', ('ST ',)),
    ('SUBSTATION_CAUSE_FLG', ('SUBSTATION',)),
    ('TREE_CAUSE_FLG', ('TREE',)),
    ('MISCELLANEOUS_CAUSE_FLG', ('MISCELLANEOUS',)),
    ('CUST_REQUEST_CAUSE_FLG', ('CUSTOMER REQUEST',)),
    ('NO_CAUSE_FLG', ('NO CAUSE',)),
    ('PLANNED_CAUSE_FLG', ('PLANNED WORK',)),
    ('NO_OUTAGE_CAUSE_FLG', ('NO OUTAGE',)),
]
for _flag, _keywords in _cause_rules:
    df_fac_final[_flag] = df_fac_final.CAUSE_DESC1.apply(
        lambda desc, kws=_keywords: _keyword_flag(desc, kws)
    )


# segregation of OCCURN desc
# (flag column, keywords that set the flag, keywords that cancel it)
_occurn_rules = [
    ('FUSE_OCCURN_FLG', ('FUSE',), ('FUSE NOT',)),
    ('CUST_EQUIP_OCCURN_FLG', ('CUSTOMER EQUIP',), ()),
    ('POLE_OCCURN_FLG', ('POLE',), ()),
    ('TRANSFORMER_OCCURN_FLG', ('TRANSFORMER',), ()),
    ('METER_OCCURN_FLG', ('METER',), ()),
    ('SERVICE_OCCURN_FLG', ('SERVICE',), ()),
    ('CABLE_OCCURN_FLG', ('CABLE',), ()),
    ('ST_OCCURN_FLG', ('ST',), ()),
    ('FIRE_OCCURN_FLG', ('FIRE',), ()),
    ('FOUND_OPEN_OCCURN_FLG', ('FOUND OPEN',), ('NOT FOUND OPEN',)),
    ('PUBLIC_SAFETY_OCCURN_FLG', ('SAFETY',), ()),
    ('WIRE_OCCURN_FLG', ('WIRE',), ()),
    ('SWITCH_OCCURN_FLG', ('SWITCH',), ()),
    ('CUTOUT_OCCURN_FLG', ('CUTOUT',), ()),
    ('REGULATOR_OCCURN_FLG', ('REGULATOR',), ()),
    ('CAP_BANK_OCCURN_FLG', ('CAP BANK',), ()),
    ('OH_OCCURN_FLG', ('OH',), ()),
    ('RECLOSER_OCCURN_FLG', ('RECLOSER',), ()),
]
for _flag, _keywords, _vetoes in _occurn_rules:
    df_fac_final[_flag] = df_fac_final.OCCURN_DESC.apply(
        lambda desc, kws=_keywords, no=_vetoes: _keyword_flag(desc, kws, no)
    )

# The temporary cause column is no longer needed.
df_fac_final = df_fac_final.drop(columns=['CAUSE_DESC1'])

In [21]:
# Build the categorical feature frame: sum every flag within each
# (incident, structure, circuit, equipment type) group, clamp the sums back
# to 0/1, and keep the key columns followed by the flags in `dummy_col` order.
_group_keys = ['INCIDENT_ID', 'STRCTUR_NO', 'CIRCT_ID', 'DNI_EQUIP_TYPE']

dummy_col = [
    # clue flags
    'POLE_CLUE_FLG', 'PART_LIGHT_CLUE_FLG', 'EMERGENCY_CLUE_FLG', 'POWER_OUT_CLUE_FLG',
    'OPEN_DEVICE_CLUE_FLG', 'TREE_CLUE_FLG', 'WIRE_DOWN_CLUE_FLG', 'IVR_CLUE_FLG',
    'EQUIPMENT_CLUE_FLG', 'TRANSFORMER_CLUE_FLG',
    # cause flags
    'OH_CAUSE_FLG', 'UG_CAUSE_FLG', 'ANIMAL_CAUSE_FLG', 'WEATHER_CAUSE_FLG',
    'WEATHER_COLD_CAUSE_FLG', 'PUBLIC_CAUSE_FLG', 'WEATHER_LIGHTNING_CAUSE_FLG',
    'WEATHER__SNOW_CAUSE_FLG', 'WEATHER__WIND_CAUSE_FLG', 'WEATHER__HEAT_CAUSE_FLG',
    'CUST_REQUEST_CAUSE_FLG', 'WEATHER__FLOOD_CAUSE_FLG', 'STREET_CAUSE_FLG',
    'SUBSTATION_CAUSE_FLG', 'TREE_CAUSE_FLG', 'MISCELLANEOUS_CAUSE_FLG',
    'NO_CAUSE_FLG', 'PLANNED_CAUSE_FLG', 'NO_OUTAGE_CAUSE_FLG',
    # occurrence flags
    'FUSE_OCCURN_FLG', 'CUST_EQUIP_OCCURN_FLG', 'POLE_OCCURN_FLG', 'TRANSFORMER_OCCURN_FLG',
    'METER_OCCURN_FLG', 'SERVICE_OCCURN_FLG', 'CABLE_OCCURN_FLG', 'ST_OCCURN_FLG',
    'FIRE_OCCURN_FLG', 'FOUND_OPEN_OCCURN_FLG', 'PUBLIC_SAFETY_OCCURN_FLG',
    'WIRE_OCCURN_FLG', 'SWITCH_OCCURN_FLG', 'REGULATOR_OCCURN_FLG', 'CUTOUT_OCCURN_FLG',
    'CAP_BANK_OCCURN_FLG', 'RECLOSER_OCCURN_FLG', 'OH_OCCURN_FLG',
    # priority dummies
    'PRIORITY_VAL_1.0', 'PRIORITY_VAL_2.0', 'PRIORITY_VAL_3.0', 'PRIORITY_VAL_5.0',
]

df_fac_cat = (
    df_fac_final
    .groupby(_group_keys, as_index=False)
    .agg({flag_col: 'sum' for flag_col in dummy_col})
)

# A group sum of 1 or more means the flag was set on at least one row.
for _flag_col in dummy_col:
    df_fac_cat[_flag_col] = df_fac_cat[_flag_col].apply(lambda total: 1 if total >= 1 else 0)

df_fac_cat = df_fac_cat[_group_keys + dummy_col].drop_duplicates()

QC check: confirm the row count is retained after aggregation (this note originally cites 111163 rows; the run shown here yields 6973).

In [22]:
print(df_fac_cat.shape)

(6973, 55)


### **CITY**

In [23]:
%%time

# city treatment
def cat_city_treat(group):
    """Give every row of a group the same city when the group has several.

    If CITY_NAM holds more than one distinct value, the whole column is set
    to the first value that is not 'NO_CITY'; otherwise the group is
    returned untouched. The group is modified in place and returned.
    """
    if group.CITY_NAM.nunique() <= 1:
        return group

    real_cities = group.loc[group.CITY_NAM != 'NO_CITY', 'CITY_NAM'].unique()
    group.CITY_NAM = real_cities[0]
    return group
# Run the city treatment on every (incident, structure, circuit, equipment
# type) group, using only the key columns plus CITY_NAM.
df_treated = (
    df_fac_final[['INCIDENT_ID', 'STRCTUR_NO', 'CIRCT_ID', 'DNI_EQUIP_TYPE', 'CITY_NAM']]
    .groupby(['INCIDENT_ID', 'STRCTUR_NO', 'CIRCT_ID', 'DNI_EQUIP_TYPE'], as_index=False)
    .apply(cat_city_treat)
)

CPU times: user 3.69 s, sys: 62.5 ms, total: 3.75 s
Wall time: 3.73 s


In [24]:
len(df_treated[['INCIDENT_ID','STRCTUR_NO', 'CIRCT_ID', 'DNI_EQUIP_TYPE', 'CITY_NAM']].drop_duplicates())

6973

QC check complete

In [25]:
# Keep one city row per group key, then attach CITY_NAM to the flag frame.
df_treated = df_treated[
    ['INCIDENT_ID', 'STRCTUR_NO', 'CIRCT_ID', 'DNI_EQUIP_TYPE', 'CITY_NAM']
].drop_duplicates()
df_fac_cat = df_fac_cat.merge(
    df_treated,
    on=['INCIDENT_ID', 'STRCTUR_NO', 'CIRCT_ID', 'DNI_EQUIP_TYPE'],
)

In [26]:
print(df_fac_cat.shape)

(6973, 56)


In [27]:
df_fac_cat.head()

Unnamed: 0,INCIDENT_ID,STRCTUR_NO,CIRCT_ID,DNI_EQUIP_TYPE,POLE_CLUE_FLG,PART_LIGHT_CLUE_FLG,EMERGENCY_CLUE_FLG,POWER_OUT_CLUE_FLG,OPEN_DEVICE_CLUE_FLG,TREE_CLUE_FLG,WIRE_DOWN_CLUE_FLG,IVR_CLUE_FLG,EQUIPMENT_CLUE_FLG,TRANSFORMER_CLUE_FLG,OH_CAUSE_FLG,UG_CAUSE_FLG,ANIMAL_CAUSE_FLG,WEATHER_CAUSE_FLG,WEATHER_COLD_CAUSE_FLG,PUBLIC_CAUSE_FLG,WEATHER_LIGHTNING_CAUSE_FLG,WEATHER__SNOW_CAUSE_FLG,WEATHER__WIND_CAUSE_FLG,WEATHER__HEAT_CAUSE_FLG,CUST_REQUEST_CAUSE_FLG,WEATHER__FLOOD_CAUSE_FLG,STREET_CAUSE_FLG,SUBSTATION_CAUSE_FLG,TREE_CAUSE_FLG,MISCELLANEOUS_CAUSE_FLG,NO_CAUSE_FLG,PLANNED_CAUSE_FLG,NO_OUTAGE_CAUSE_FLG,FUSE_OCCURN_FLG,CUST_EQUIP_OCCURN_FLG,POLE_OCCURN_FLG,TRANSFORMER_OCCURN_FLG,METER_OCCURN_FLG,SERVICE_OCCURN_FLG,CABLE_OCCURN_FLG,ST_OCCURN_FLG,FIRE_OCCURN_FLG,FOUND_OPEN_OCCURN_FLG,PUBLIC_SAFETY_OCCURN_FLG,WIRE_OCCURN_FLG,SWITCH_OCCURN_FLG,REGULATOR_OCCURN_FLG,CUTOUT_OCCURN_FLG,CAP_BANK_OCCURN_FLG,RECLOSER_OCCURN_FLG,OH_OCCURN_FLG,PRIORITY_VAL_1.0,PRIORITY_VAL_2.0,PRIORITY_VAL_3.0,PRIORITY_VAL_5.0,CITY_NAM
0,2001537668,737--/72,1758,FUSE,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,INDIANAPOLIS
1,2001537674,301-B/21,2807,1TBOH,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,INDIANAPOLIS
2,2001537677,670-A/166,2203,FUSE,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,INDIANAPOLIS
3,2001537695,650-B/59,1702,FUSE,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,INDIANAPOLIS
4,2001537697,264-B/72,2809,1TBOH,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,INDIANAPOLIS


## **X Y Coordinate treatment**

In [28]:
# Keep only the location columns needed downstream, de-duplicated.
df_his_location = df_his_location[
    ['INCIDENT_ID', 'STRCTUR_NO', 'GEO_X_COORD', 'GEO_Y_COORD', 'PRIMARY_LOC_FLG']
].drop_duplicates()

# Restrict to incidents and structures that survived the facility filters.
# NOTE(review): the two membership tests are independent, so a row is kept
# when its INCIDENT_ID and its STRCTUR_NO each occur somewhere in
# df_fac_final, not necessarily together on the same row.
_known_incident = df_his_location.INCIDENT_ID.isin(df_fac_final.INCIDENT_ID)
_known_structure = df_his_location.STRCTUR_NO.isin(df_fac_final.STRCTUR_NO)
df_his_location = df_his_location[_known_incident & _known_structure]

### **Changing X Y coordinates to LAT and LONG**

In [29]:
%%time
# function to convert geo_x, geo_y coordinate to lat, long

def change_to_loc(df):
    """Convert one row's GEO_X_COORD / GEO_Y_COORD into (latitude, longitude).

    Parameters
    ----------
    df : row-like object (used with DataFrame.apply(axis=1) in this notebook)
        Must expose GEO_X_COORD (easting) and GEO_Y_COORD (northing) values
        that float() can convert.

    Returns
    -------
    tuple
        (latitude, longitude) in decimal degrees; the longitude is negated
        before being returned.

    Notes
    -----
    NOTE(review): the constants (false northing 250000, false easting 100000,
    scale 0.9999666667, semi-major axis 6378137, reference angle
    1.4951653925 rad ~= 85.667 degrees) look like an inverse Transverse
    Mercator projection for the NAD83 State Plane "Indiana East" zone -
    confirm against the source of the coordinates.
    """
    demnorthing = df.GEO_Y_COORD
    demeasting = df.GEO_X_COORD
    # 0.3048 is the feet-to-metres factor, so the inputs are presumably in
    # feet - TODO confirm the unit of GEO_X_COORD / GEO_Y_COORD.
    northing = float(demnorthing) * 0.3048
    easting = float(demeasting) * 0.3048
    # Remove the false northing (250000), add a fixed offset and scale to an
    # angle in radians.
    om = (northing - 250000 + 4151863.7425) / 6367236.89768
    # Series correction of `om` using even powers of cos(om).
    fo = om + (math.sin(om) * math.cos(om)) * (0.005022893948 + 0.000029370625 * math.pow(math.cos(om), 2) + 0.000000235059 * math.pow(math.cos(om), 4) + 0.000000002181 * math.pow(math.cos(om), 6))
    tf = math.sin(fo) / math.cos(fo)  # tan(fo)
    nf2 = 0.00673949677548 * math.pow(math.cos(fo), 2)
    # Scaled radius term evaluated at `fo`.
    rn = 0.9999666667 * 6378137 / math.pow((1 - 0.0066943800229034 * math.pow(math.sin(fo), 2)), 0.5)
    # Easting with the false easting (100000) removed, normalised by rn.
    q = (easting - 100000) / rn
    # Even-order coefficients feed the latitude series ...
    b2 = -0.5 * tf * (1 + nf2)
    b4 = -(1 / 12) * (5 + (3 * math.pow(tf, 2)) + (nf2 * (1 - 9 * math.pow(tf, 2)) - 4 * math.pow(nf2, 2)))
    b6 = (1 / 360) * (61 + (90 * math.pow(tf, 2)) + (45 * math.pow(tf, 4)) + (nf2 * (46 - (252 * math.pow(tf, 2)) - (90 * math.pow(tf, 4)))))
    lat = fo + b2 * math.pow(q, 2) * (1 + math.pow(q, 2) * (b4 + b6 * math.pow(q, 2)))
    # ... and odd-order coefficients feed the longitude series.
    b3 = -(1 / 6) * (1 + 2 * math.pow(tf, 2) + nf2)
    b5 = (1 / 120) * (5 + 28 * math.pow(tf, 2) + 24 * math.pow(tf, 4) + nf2 * (6 + 8 * math.pow(tf, 2)))
    b7 = -(1 / 5040) * (61 + 662 * math.pow(tf, 2) + 1320 * math.pow(tf, 4) + 720 * math.pow(tf, 6))
    l = q * (1 + math.pow(q, 2) * (b3 + math.pow(q, 2) * (b5 + b7 * math.pow(q, 2))))
    # Offset from the reference angle 1.4951653925 rad (~= 85.667 degrees).
    lon = 1.4951653925 - l / math.cos(fo)
    # 57.2957795131 = 180 / pi converts radians to degrees; the longitude
    # sign is flipped here.
    coord = [(lat * 57.2957795131), (-1 * lon * 57.2957795131)]
    return coord[0],coord[1]

# Convert every row to (lat, long) and split the pairs into two new columns.
_lat_long_pairs = df_his_location.apply(change_to_loc, axis=1)
df_his_location['LAT'], df_his_location['LONG'] = zip(*_lat_long_pairs)

CPU times: user 157 ms, sys: 3.08 ms, total: 160 ms
Wall time: 158 ms


In [30]:
(df_his_location['INCIDENT_ID'].astype(str) + df_his_location['STRCTUR_NO'].astype(str)).nunique(), len(df_his_location)

(6979, 7010)

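Some incident/structure pairs have more than one location (7010 rows for 6979 unique pairs), so the locations need to be consolidated into a single point per pair.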

In [31]:
%%time
def loc_treatment(group):
    """Collapse differing coordinates within one group to a single midpoint.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of one group; must contain ``LAT`` and ``LONG`` in degrees.

    Returns
    -------
    pandas.DataFrame
        The group with a fresh 0..n-1 index. If the rows disagree on LAT or
        LONG, both columns are overwritten on every row with the midpoint
        obtained by averaging the points as unit vectors; otherwise the rows
        are returned unchanged.
    """
    group = group.reset_index(drop=True)

    # Nothing to reconcile when every row already agrees on both coordinates.
    if group.LAT.nunique() <= 1 and group.LONG.nunique() <= 1:
        return group

    # Accumulate each point as a unit vector (x, y, z).
    sum_x = 0.0
    sum_y = 0.0
    sum_z = 0.0
    for lat_deg, lon_deg in zip(group.LAT, group.LONG):
        lat_rad = math.radians(float(lat_deg))
        lon_rad = math.radians(float(lon_deg))
        sum_x += math.cos(lat_rad) * math.cos(lon_rad)
        sum_y += math.cos(lat_rad) * math.sin(lon_rad)
        sum_z += math.sin(lat_rad)

    n_points = len(group)
    mean_x = sum_x / n_points
    mean_y = sum_y / n_points
    mean_z = sum_z / n_points

    # Convert the mean vector back to spherical coordinates.
    mid_lon = math.atan2(mean_y, mean_x)
    horizontal = math.sqrt(mean_x * mean_x + mean_y * mean_y)
    mid_lat = math.atan2(mean_z, horizontal)

    group['LAT'] = math.degrees(mid_lat)
    group['LONG'] = math.degrees(mid_lon)
    return group

# Apply loc_treatment to each (INCIDENT_ID, STRCTUR_NO) group so that rows
# sharing a key carry the same LAT/LONG afterwards.
df_his_location = df_his_location.groupby(['INCIDENT_ID','STRCTUR_NO'], as_index = False).apply(loc_treatment)

CPU times: user 5.87 s, sys: 53.9 ms, total: 5.93 s
Wall time: 5.92 s


In [32]:
df_his_location = df_his_location[['INCIDENT_ID','STRCTUR_NO','LAT','LONG']].drop_duplicates()

In [33]:
(df_his_location['INCIDENT_ID'].astype(str) + df_his_location['STRCTUR_NO'].astype(str)).nunique(), len(df_his_location)

(6979, 6979)

In [34]:
df_his_location['LAT'] = df_his_location['LAT'].astype(float)
df_his_location['LONG'] = df_his_location['LONG'].astype(float)
df_his_location['LAT'].min(), df_his_location['LAT'].max()

(39.4832691399331, 39.98145328913072)

In [35]:
df_his_location['LONG'].min(), df_his_location['LONG'].max()

(-86.71752245388984, -85.91733054132811)

In [36]:
# function to add zone feature to the ads according to geo coordinates
def add_zone_feature(df):
    """Return the quadrant label of a row relative to a fixed centre point.

    Parameters
    ----------
    df : mapping-like row with ``'LAT'`` and ``'LONG'`` entries convertible
        with ``float()``.

    Returns
    -------
    str
        ``'ZONE1'`` (LAT below / LONG below the centre), ``'ZONE2'`` (LAT
        below / LONG not below), ``'ZONE3'`` (LAT not below / LONG not below)
        or ``'ZONE4'`` (LAT not below / LONG below). Values equal to the
        centre count as "not below".
    """
    center_lat = 39.7684
    center_long = -86.1581

    lat_below = float(df['LAT']) < center_lat
    long_below = float(df['LONG']) < center_long

    # (LAT below centre, LONG below centre) -> zone label
    zone_by_quadrant = {
        (True, True): 'ZONE1',
        (True, False): 'ZONE2',
        (False, False): 'ZONE3',
        (False, True): 'ZONE4',
    }
    return zone_by_quadrant[(lat_below, long_below)]

df_his_location['ZONE'] = df_his_location.apply(add_zone_feature, axis=1)
print(df_his_location['ZONE'].unique())

['ZONE2' 'ZONE4' 'ZONE3' 'ZONE1']


In [37]:
# Left-join the location columns of df_his_location onto df_fac_cat by
# (INCIDENT_ID, STRCTUR_NO); every df_fac_cat row is kept even when no
# matching location row exists.
df_fac_cat = pd.merge(df_fac_cat, df_his_location, on = ['INCIDENT_ID','STRCTUR_NO'], how = "left")

In [38]:
print(df_fac_cat.shape)
df_fac_cat.head()

(6973, 59)


Unnamed: 0,INCIDENT_ID,STRCTUR_NO,CIRCT_ID,DNI_EQUIP_TYPE,POLE_CLUE_FLG,PART_LIGHT_CLUE_FLG,EMERGENCY_CLUE_FLG,POWER_OUT_CLUE_FLG,OPEN_DEVICE_CLUE_FLG,TREE_CLUE_FLG,WIRE_DOWN_CLUE_FLG,IVR_CLUE_FLG,EQUIPMENT_CLUE_FLG,TRANSFORMER_CLUE_FLG,OH_CAUSE_FLG,UG_CAUSE_FLG,ANIMAL_CAUSE_FLG,WEATHER_CAUSE_FLG,WEATHER_COLD_CAUSE_FLG,PUBLIC_CAUSE_FLG,WEATHER_LIGHTNING_CAUSE_FLG,WEATHER__SNOW_CAUSE_FLG,WEATHER__WIND_CAUSE_FLG,WEATHER__HEAT_CAUSE_FLG,CUST_REQUEST_CAUSE_FLG,WEATHER__FLOOD_CAUSE_FLG,STREET_CAUSE_FLG,SUBSTATION_CAUSE_FLG,TREE_CAUSE_FLG,MISCELLANEOUS_CAUSE_FLG,NO_CAUSE_FLG,PLANNED_CAUSE_FLG,NO_OUTAGE_CAUSE_FLG,FUSE_OCCURN_FLG,CUST_EQUIP_OCCURN_FLG,POLE_OCCURN_FLG,TRANSFORMER_OCCURN_FLG,METER_OCCURN_FLG,SERVICE_OCCURN_FLG,CABLE_OCCURN_FLG,ST_OCCURN_FLG,FIRE_OCCURN_FLG,FOUND_OPEN_OCCURN_FLG,PUBLIC_SAFETY_OCCURN_FLG,WIRE_OCCURN_FLG,SWITCH_OCCURN_FLG,REGULATOR_OCCURN_FLG,CUTOUT_OCCURN_FLG,CAP_BANK_OCCURN_FLG,RECLOSER_OCCURN_FLG,OH_OCCURN_FLG,PRIORITY_VAL_1.0,PRIORITY_VAL_2.0,PRIORITY_VAL_3.0,PRIORITY_VAL_5.0,CITY_NAM,LAT,LONG,ZONE
0,2001537668,737--/72,1758,FUSE,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,INDIANAPOLIS,39.68,-86.05,ZONE2
1,2001537674,301-B/21,2807,1TBOH,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,INDIANAPOLIS,39.86,-86.2,ZONE4
2,2001537677,670-A/166,2203,FUSE,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,INDIANAPOLIS,39.71,-86.12,ZONE2
3,2001537695,650-B/59,1702,FUSE,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,INDIANAPOLIS,39.72,-86.03,ZONE2
4,2001537697,264-B/72,2809,1TBOH,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,INDIANAPOLIS,39.88,-86.18,ZONE4


In [39]:
df_ads = pd.merge(df_numerical, df_fac_cat, on = ['INCIDENT_ID','STRCTUR_NO', 'CIRCT_ID', 'DNI_EQUIP_TYPE'], how = "inner")

In [40]:
print(df_fac_cat.shape)
print(df_numerical.shape)
print(df_ads.shape)
df_ads.head()

(6973, 59)
(6973, 18)
(6973, 73)


Unnamed: 0,INCIDENT_ID,STRCTUR_NO,CIRCT_ID,DNI_EQUIP_TYPE,CUST_QTY,CALL_QTY,KEY_CUST_QTY,DOWNSTREAM_CUST_QTY,KVA_VAL,DOWNSTREAM_KVA_VAL,FAC_JOB_ID,ETR_DATETIME,CREATION_DATETIME,MAJ_OTG_ID,ENERGIZED_DATETIME,SUBST_ID,DAY_FLAG,TTR,POLE_CLUE_FLG,PART_LIGHT_CLUE_FLG,EMERGENCY_CLUE_FLG,POWER_OUT_CLUE_FLG,OPEN_DEVICE_CLUE_FLG,TREE_CLUE_FLG,WIRE_DOWN_CLUE_FLG,IVR_CLUE_FLG,EQUIPMENT_CLUE_FLG,TRANSFORMER_CLUE_FLG,OH_CAUSE_FLG,UG_CAUSE_FLG,ANIMAL_CAUSE_FLG,WEATHER_CAUSE_FLG,WEATHER_COLD_CAUSE_FLG,PUBLIC_CAUSE_FLG,WEATHER_LIGHTNING_CAUSE_FLG,WEATHER__SNOW_CAUSE_FLG,WEATHER__WIND_CAUSE_FLG,WEATHER__HEAT_CAUSE_FLG,CUST_REQUEST_CAUSE_FLG,WEATHER__FLOOD_CAUSE_FLG,STREET_CAUSE_FLG,SUBSTATION_CAUSE_FLG,TREE_CAUSE_FLG,MISCELLANEOUS_CAUSE_FLG,NO_CAUSE_FLG,PLANNED_CAUSE_FLG,NO_OUTAGE_CAUSE_FLG,FUSE_OCCURN_FLG,CUST_EQUIP_OCCURN_FLG,POLE_OCCURN_FLG,TRANSFORMER_OCCURN_FLG,METER_OCCURN_FLG,SERVICE_OCCURN_FLG,CABLE_OCCURN_FLG,ST_OCCURN_FLG,FIRE_OCCURN_FLG,FOUND_OPEN_OCCURN_FLG,PUBLIC_SAFETY_OCCURN_FLG,WIRE_OCCURN_FLG,SWITCH_OCCURN_FLG,REGULATOR_OCCURN_FLG,CUTOUT_OCCURN_FLG,CAP_BANK_OCCURN_FLG,RECLOSER_OCCURN_FLG,OH_OCCURN_FLG,PRIORITY_VAL_1.0,PRIORITY_VAL_2.0,PRIORITY_VAL_3.0,PRIORITY_VAL_5.0,CITY_NAM,LAT,LONG,ZONE
0,2001537668,737--/72,1758,FUSE,7,4,0,7,170.0,170.0,2002742730,2020-08-29 23:30:00,2020-08-29 18:51:07,0,2020-08-29 23:18:00,175,0,266.88,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,INDIANAPOLIS,39.68,-86.05,ZONE2
1,2001537674,301-B/21,2807,1TBOH,1,1,0,1,0.0,0.0,2002742736,2020-08-30 13:15:00,2020-08-29 19:35:29,0,2020-08-30 11:42:31,280,0,967.03,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,INDIANAPOLIS,39.86,-86.2,ZONE4
2,2001537677,670-A/166,2203,FUSE,1,1,0,1,100.0,100.0,2002742741,2020-08-30 00:30:00,2020-08-29 19:54:06,0,2020-08-29 20:53:00,220,0,58.9,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,INDIANAPOLIS,39.71,-86.12,ZONE2
3,2001537695,650-B/59,1702,FUSE,11,1,0,11,50.0,50.0,2002742761,2020-08-30 03:30:00,2020-08-29 23:59:25,0,2020-08-30 04:07:00,170,0,247.58,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,INDIANAPOLIS,39.72,-86.03,ZONE2
4,2001537697,264-B/72,2809,1TBOH,8,8,0,8,50.0,50.0,2002742762,2020-08-30 12:00:00,2020-08-30 03:26:42,0,2020-08-30 10:22:00,280,0,415.3,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,INDIANAPOLIS,39.88,-86.18,ZONE4


## **ADD NO OF OUTAGES FOR CLUE, CAUSE, OCCURN**

In [41]:
df_ads['CREATION_DATETIME'] = pd.to_datetime(df_ads['CREATION_DATETIME'])
df_ads['Date'] = df_ads['CREATION_DATETIME'].dt.date

# Flag column -> name of its per-day total. The same mapping drives both the
# aggregation and the renaming, so the two cannot drift apart.
daily_total_names = {
    'POWER_OUT_CLUE_FLG' : 'NO_OF_POWER_OUT_CLUE_PER_DAY',
    'OPEN_DEVICE_CLUE_FLG' : 'NO_OF_OPEN_DEVICE_CLUE_PER_DAY',
    'IVR_CLUE_FLG' : 'NO_OF_IVR_CLUE_PER_DAY',
    'ANIMAL_CAUSE_FLG' : 'NO_OF_ANIMAL_CAUSE_PER_DAY',
    'WIRE_OCCURN_FLG' : 'NO_OF_WIRE_OCCURN_PER_DAY',
}

# Sum each flag per creation date, then give the sums their feature names.
df_no_of_outages = (
    df_ads.groupby(['Date'], as_index=False)
    .agg({flag: 'sum' for flag in daily_total_names})
    .rename(columns=daily_total_names)
)

df_no_of_outages.head()

Unnamed: 0,Date,NO_OF_POWER_OUT_CLUE_PER_DAY,NO_OF_OPEN_DEVICE_CLUE_PER_DAY,NO_OF_IVR_CLUE_PER_DAY,NO_OF_ANIMAL_CAUSE_PER_DAY,NO_OF_WIRE_OCCURN_PER_DAY
0,2020-08-29,3,0,2,0,2
1,2020-08-30,39,0,16,13,0
2,2020-08-31,38,0,17,6,1
3,2020-09-01,62,0,28,6,1
4,2020-09-02,55,0,26,3,2


In [42]:
# Columns kept in the analytical data set, in output order.
ads_columns = [
    'FAC_JOB_ID', 'INCIDENT_ID', 'STRCTUR_NO', 'CREATION_DATETIME', 'ENERGIZED_DATETIME', 'CIRCT_ID', 'DNI_EQUIP_TYPE',
    'SUBST_ID', 'CALL_QTY', 'DOWNSTREAM_CUST_QTY', 'KEY_CUST_QTY', 'ETR_DATETIME', 'CUST_QTY', 'DOWNSTREAM_KVA_VAL',
    'KVA_VAL', 'DAY_FLAG', 'TTR', 'MAJ_OTG_ID',
    # clue flags
    'POLE_CLUE_FLG', 'PART_LIGHT_CLUE_FLG', 'EMERGENCY_CLUE_FLG', 'POWER_OUT_CLUE_FLG', 'OPEN_DEVICE_CLUE_FLG',
    'TREE_CLUE_FLG', 'WIRE_DOWN_CLUE_FLG', 'IVR_CLUE_FLG', 'EQUIPMENT_CLUE_FLG', 'TRANSFORMER_CLUE_FLG',
    # cause flags
    'OH_CAUSE_FLG', 'UG_CAUSE_FLG', 'ANIMAL_CAUSE_FLG', 'WEATHER_CAUSE_FLG', 'WEATHER_COLD_CAUSE_FLG',
    'PUBLIC_CAUSE_FLG', 'WEATHER_LIGHTNING_CAUSE_FLG', 'WEATHER__SNOW_CAUSE_FLG', 'WEATHER__WIND_CAUSE_FLG',
    'WEATHER__HEAT_CAUSE_FLG', 'CUST_REQUEST_CAUSE_FLG', 'WEATHER__FLOOD_CAUSE_FLG', 'STREET_CAUSE_FLG',
    'SUBSTATION_CAUSE_FLG', 'TREE_CAUSE_FLG', 'MISCELLANEOUS_CAUSE_FLG', 'NO_CAUSE_FLG', 'PLANNED_CAUSE_FLG',
    'NO_OUTAGE_CAUSE_FLG',
    # occurrence flags
    'FUSE_OCCURN_FLG', 'CUST_EQUIP_OCCURN_FLG', 'POLE_OCCURN_FLG', 'TRANSFORMER_OCCURN_FLG',
    'METER_OCCURN_FLG', 'SERVICE_OCCURN_FLG', 'CABLE_OCCURN_FLG', 'ST_OCCURN_FLG', 'FIRE_OCCURN_FLG',
    'FOUND_OPEN_OCCURN_FLG', 'PUBLIC_SAFETY_OCCURN_FLG', 'WIRE_OCCURN_FLG', 'SWITCH_OCCURN_FLG',
    'REGULATOR_OCCURN_FLG', 'CUTOUT_OCCURN_FLG', 'CAP_BANK_OCCURN_FLG', 'RECLOSER_OCCURN_FLG', 'OH_OCCURN_FLG',
    # priority dummies, location and the temporary join key
    'PRIORITY_VAL_1.0', 'PRIORITY_VAL_2.0', 'PRIORITY_VAL_3.0', 'PRIORITY_VAL_5.0', 'CITY_NAM', 'LAT', 'LONG', 'ZONE', 'Date',
]

# Select the columns and rename FAC_JOB_ID -> OUTAGE_ID by name. This replaces
# a second, positional copy of the whole list in which only that first entry
# differed; a positional rename silently mislabels columns if the two lists
# ever get out of step.
df_ads = df_ads[ads_columns].rename(columns={'FAC_JOB_ID': 'OUTAGE_ID'})

In [43]:
df_ads = pd.merge(df_ads, df_no_of_outages, on=['Date'], how='left')

In [44]:
df_ads.drop(['Date'],axis=1,inplace=True)

In [45]:
print(df_ads.shape)
df_ads.head()

(6973, 78)


Unnamed: 0,OUTAGE_ID,INCIDENT_ID,STRCTUR_NO,CREATION_DATETIME,ENERGIZED_DATETIME,CIRCT_ID,DNI_EQUIP_TYPE,SUBST_ID,CALL_QTY,DOWNSTREAM_CUST_QTY,KEY_CUST_QTY,ETR_DATETIME,CUST_QTY,DOWNSTREAM_KVA_VAL,KVA_VAL,DAY_FLAG,TTR,MAJ_OTG_ID,POLE_CLUE_FLG,PART_LIGHT_CLUE_FLG,EMERGENCY_CLUE_FLG,POWER_OUT_CLUE_FLG,OPEN_DEVICE_CLUE_FLG,TREE_CLUE_FLG,WIRE_DOWN_CLUE_FLG,IVR_CLUE_FLG,EQUIPMENT_CLUE_FLG,TRANSFORMER_CLUE_FLG,OH_CAUSE_FLG,UG_CAUSE_FLG,ANIMAL_CAUSE_FLG,WEATHER_CAUSE_FLG,WEATHER_COLD_CAUSE_FLG,PUBLIC_CAUSE_FLG,WEATHER_LIGHTNING_CAUSE_FLG,WEATHER__SNOW_CAUSE_FLG,WEATHER__WIND_CAUSE_FLG,WEATHER__HEAT_CAUSE_FLG,CUST_REQUEST_CAUSE_FLG,WEATHER__FLOOD_CAUSE_FLG,STREET_CAUSE_FLG,SUBSTATION_CAUSE_FLG,TREE_CAUSE_FLG,MISCELLANEOUS_CAUSE_FLG,NO_CAUSE_FLG,PLANNED_CAUSE_FLG,NO_OUTAGE_CAUSE_FLG,FUSE_OCCURN_FLG,CUST_EQUIP_OCCURN_FLG,POLE_OCCURN_FLG,TRANSFORMER_OCCURN_FLG,METER_OCCURN_FLG,SERVICE_OCCURN_FLG,CABLE_OCCURN_FLG,ST_OCCURN_FLG,FIRE_OCCURN_FLG,FOUND_OPEN_OCCURN_FLG,PUBLIC_SAFETY_OCCURN_FLG,WIRE_OCCURN_FLG,SWITCH_OCCURN_FLG,REGULATOR_OCCURN_FLG,CUTOUT_OCCURN_FLG,CAP_BANK_OCCURN_FLG,RECLOSER_OCCURN_FLG,OH_OCCURN_FLG,PRIORITY_VAL_1.0,PRIORITY_VAL_2.0,PRIORITY_VAL_3.0,PRIORITY_VAL_5.0,CITY_NAM,LAT,LONG,ZONE,NO_OF_POWER_OUT_CLUE_PER_DAY,NO_OF_OPEN_DEVICE_CLUE_PER_DAY,NO_OF_IVR_CLUE_PER_DAY,NO_OF_ANIMAL_CAUSE_PER_DAY,NO_OF_WIRE_OCCURN_PER_DAY
0,2002742730,2001537668,737--/72,2020-08-29 18:51:07,2020-08-29 23:18:00,1758,FUSE,175,4,7,0,2020-08-29 23:30:00,7,170.0,170.0,0,266.88,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,INDIANAPOLIS,39.68,-86.05,ZONE2,3,0,2,0,2
1,2002742736,2001537674,301-B/21,2020-08-29 19:35:29,2020-08-30 11:42:31,2807,1TBOH,280,1,1,0,2020-08-30 13:15:00,1,0.0,0.0,0,967.03,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,INDIANAPOLIS,39.86,-86.2,ZONE4,3,0,2,0,2
2,2002742741,2001537677,670-A/166,2020-08-29 19:54:06,2020-08-29 20:53:00,2203,FUSE,220,1,1,0,2020-08-30 00:30:00,1,100.0,100.0,0,58.9,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,INDIANAPOLIS,39.71,-86.12,ZONE2,3,0,2,0,2
3,2002742761,2001537695,650-B/59,2020-08-29 23:59:25,2020-08-30 04:07:00,1702,FUSE,170,1,11,0,2020-08-30 03:30:00,11,50.0,50.0,0,247.58,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,INDIANAPOLIS,39.72,-86.03,ZONE2,3,0,2,0,2
4,2002742762,2001537697,264-B/72,2020-08-30 03:26:42,2020-08-30 10:22:00,2809,1TBOH,280,8,8,0,2020-08-30 12:00:00,8,50.0,50.0,0,415.3,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,INDIANAPOLIS,39.88,-86.18,ZONE4,39,0,16,13,0


In [46]:
ads = df_ads.copy(deep=True)
ads=ads[['OUTAGE_ID', 'LAT', 'LONG']]
ads.reset_index(drop=True,inplace=True)
ads.head()

Unnamed: 0,OUTAGE_ID,LAT,LONG
0,2002742730,39.68,-86.05
1,2002742736,39.86,-86.2
2,2002742741,39.71,-86.12
3,2002742761,39.72,-86.03
4,2002742762,39.88,-86.18


In [47]:
ads['LAT'] = pd.to_numeric(ads['LAT'])
ads['LONG'] = pd.to_numeric(ads['LONG'])
print(ads.dtypes)
ads.head()

OUTAGE_ID      int64
LAT          float64
LONG         float64
dtype: object


Unnamed: 0,OUTAGE_ID,LAT,LONG
0,2002742730,39.68,-86.05
1,2002742736,39.86,-86.2
2,2002742741,39.71,-86.12
3,2002742761,39.72,-86.03
4,2002742762,39.88,-86.18


In [48]:
# Coordinates of the 20 fixed reference markers; position i in each list
# belongs to marker i+1.
marker_latitudes = [
    39.9613, 39.8971, 39.9060, 39.9024, 39.8960,
    39.8339, 39.8412, 39.8381, 39.8386, 39.7579,
    39.7621, 39.7621, 39.7695, 39.6617, 39.6639,
    39.6702, 39.6744, 39.5909, 39.5295, 39.5475,
]
marker_longitudes = [
    -86.4034, -86.3045, -86.2001, -86.0738, -85.9783,
    -86.3155, -86.2056, -86.0985, -85.9811, -86.3155,
    -86.2042, -86.0923, -85.9708, -86.2935, -86.1823,
    -86.0669, -85.9557, -86.4212, -86.5874, -86.2743,
]

# Broadcast each constant into its own column: all Marker<i>_LAT columns
# first, then all Marker<i>_LONG columns.
for marker_no, marker_lat in enumerate(marker_latitudes, start=1):
    ads['Marker{}_LAT'.format(marker_no)] = marker_lat

for marker_no, marker_long in enumerate(marker_longitudes, start=1):
    ads['Marker{}_LONG'.format(marker_no)] = marker_long

In [49]:
# calculate distance from 2 lat long 

def haversine(p1, p2):
    """Return the haversine (great-circle) distance between two points in km.

    Parameters
    ----------
    p1, p2 : sequences whose first two items are latitude and longitude in
        degrees.

    Returns
    -------
    float
        Distance on a sphere of radius 6371 km.
    """
    earth_radius_km = 6371
    start = list(map(math.radians, p1))
    end = list(map(math.radians, p2))

    half_d_lat = (end[0] - start[0]) / 2
    half_d_lng = (end[1] - start[1]) / 2

    # Haversine term of the central angle between the two points.
    hav = math.pow(math.sin(half_d_lat), 2) + math.cos(start[0]) * math.cos(end[0]) * math.pow(math.sin(half_d_lng), 2)
    central_angle = 2 * math.atan2(math.sqrt(hav), math.sqrt(1 - hav))

    return earth_radius_km * central_angle


In [50]:
# calculate minimum distance

def minimum_distance(lat, long, marker1_lat, marker2_lat, marker3_lat, marker4_lat, marker5_lat, marker6_lat, marker7_lat, marker8_lat, marker9_lat, marker10_lat, marker11_lat,
                     marker12_lat, marker13_lat, marker14_lat, marker15_lat, marker16_lat, marker17_lat, marker18_lat, marker19_lat, marker20_lat, marker1_long, marker2_long,
                     marker3_long, marker4_long, marker5_long, marker6_long, marker7_long, marker8_long, marker9_long, marker10_long, marker11_long, marker12_long, marker13_long,
                     marker14_long, marker15_long, marker16_long, marker17_long, marker18_long, marker19_long, marker20_long):
    """Find the closest of 20 markers to the point ``(lat, long)``.

    Parameters
    ----------
    lat, long : float
        Coordinates of the point, in the units expected by ``haversine``.
    marker<i>_lat, marker<i>_long : float
        Coordinates of marker ``i`` for ``i`` in 1..20; all latitudes are
        passed first, followed by all longitudes.

    Returns
    -------
    tuple
        ``(distance, marker_number)`` where ``distance`` is the smallest
        ``haversine`` result and ``marker_number`` is 1-based; on ties the
        lowest-numbered marker wins. ``(None, None)`` when ``lat`` or
        ``long`` is NaN.
    """
    # Bail out before doing any distance work: a NaN coordinate cannot be
    # matched to a marker.
    if math.isnan(lat) or math.isnan(long):
        return None, None

    marker_lats = (marker1_lat, marker2_lat, marker3_lat, marker4_lat, marker5_lat, marker6_lat, marker7_lat,
                   marker8_lat, marker9_lat, marker10_lat, marker11_lat, marker12_lat, marker13_lat, marker14_lat,
                   marker15_lat, marker16_lat, marker17_lat, marker18_lat, marker19_lat, marker20_lat)
    marker_longs = (marker1_long, marker2_long, marker3_long, marker4_long, marker5_long, marker6_long, marker7_long,
                    marker8_long, marker9_long, marker10_long, marker11_long, marker12_long, marker13_long, marker14_long,
                    marker15_long, marker16_long, marker17_long, marker18_long, marker19_long, marker20_long)

    origin = (lat, long)
    dist_list = [haversine(origin, marker) for marker in zip(marker_lats, marker_longs)]

    # min() returns the first smallest entry, so ties resolve to the lowest index.
    min_index, min_value = min(enumerate(dist_list), key=operator.itemgetter(1))
    return min_value, min_index + 1

In [51]:
%%time
# Column names in the positional order minimum_distance expects:
# Marker1_LAT..Marker20_LAT followed by Marker1_LONG..Marker20_LONG.
marker_lat_cols = ['Marker{}_LAT'.format(i) for i in range(1, 21)]
marker_long_cols = ['Marker{}_LONG'.format(i) for i in range(1, 21)]

# Row-wise nearest-marker lookup; each result is a (distance, marker number) pair.
nearest_marker = ads.apply(
    lambda row: minimum_distance(row['LAT'], row['LONG'],
                                 *[row[col] for col in marker_lat_cols + marker_long_cols]),
    axis=1,
)

# Unzip the pairs into the two output columns.
ads['Min_Distance'], ads['Marker_Location'] = zip(*nearest_marker)

CPU times: user 1.22 s, sys: 2.49 ms, total: 1.22 s
Wall time: 1.22 s


In [52]:
ads = ads[['OUTAGE_ID', 'LAT', 'LONG', 'Min_Distance', 'Marker_Location']]
ads.head()

Unnamed: 0,OUTAGE_ID,LAT,LONG,Min_Distance,Marker_Location
0,2002742730,39.68,-86.05,1.53,16
1,2002742736,39.86,-86.2,2.33,7
2,2002742741,39.71,-86.12,6.23,16
3,2002742761,39.72,-86.03,6.82,13
4,2002742762,39.88,-86.18,3.36,3


In [53]:
# Turn the 1-based marker number into a label such as 'Marker16'.
# NOTE(review): rows for which minimum_distance returned None are stringified
# as well and come out as 'MarkerNone' (it appears in the printed uniques) --
# confirm that downstream consumers expect that label rather than a missing value.
ads['Marker_Location'] = 'Marker' + ads['Marker_Location'].astype(str)
print(ads.Marker_Location.unique())

['Marker16' 'Marker7' 'Marker13' 'Marker3' 'Marker6' 'Marker12' 'Marker4'
 'Marker11' 'Marker2' 'Marker10' 'Marker8' 'Marker15' 'Marker5' 'Marker14'
 'MarkerNone' 'Marker9' 'Marker17' 'Marker18' 'Marker19' 'Marker20'
 'Marker1']


In [54]:
ads.drop(['LAT','LONG'],axis=1,inplace=True)
ads.head()
print(df_ads.shape)
df_ads = pd.merge(df_ads, ads, how='left', on=['OUTAGE_ID'])
print(df_ads.shape)

(6973, 78)
(6973, 80)


In [55]:
print(list(df_ads.columns))

['OUTAGE_ID', 'INCIDENT_ID', 'STRCTUR_NO', 'CREATION_DATETIME', 'ENERGIZED_DATETIME', 'CIRCT_ID', 'DNI_EQUIP_TYPE', 'SUBST_ID', 'CALL_QTY', 'DOWNSTREAM_CUST_QTY', 'KEY_CUST_QTY', 'ETR_DATETIME', 'CUST_QTY', 'DOWNSTREAM_KVA_VAL', 'KVA_VAL', 'DAY_FLAG', 'TTR', 'MAJ_OTG_ID', 'POLE_CLUE_FLG', 'PART_LIGHT_CLUE_FLG', 'EMERGENCY_CLUE_FLG', 'POWER_OUT_CLUE_FLG', 'OPEN_DEVICE_CLUE_FLG', 'TREE_CLUE_FLG', 'WIRE_DOWN_CLUE_FLG', 'IVR_CLUE_FLG', 'EQUIPMENT_CLUE_FLG', 'TRANSFORMER_CLUE_FLG', 'OH_CAUSE_FLG', 'UG_CAUSE_FLG', 'ANIMAL_CAUSE_FLG', 'WEATHER_CAUSE_FLG', 'WEATHER_COLD_CAUSE_FLG', 'PUBLIC_CAUSE_FLG', 'WEATHER_LIGHTNING_CAUSE_FLG', 'WEATHER__SNOW_CAUSE_FLG', 'WEATHER__WIND_CAUSE_FLG', 'WEATHER__HEAT_CAUSE_FLG', 'CUST_REQUEST_CAUSE_FLG', 'WEATHER__FLOOD_CAUSE_FLG', 'STREET_CAUSE_FLG', 'SUBSTATION_CAUSE_FLG', 'TREE_CAUSE_FLG', 'MISCELLANEOUS_CAUSE_FLG', 'NO_CAUSE_FLG', 'PLANNED_CAUSE_FLG', 'NO_OUTAGE_CAUSE_FLG', 'FUSE_OCCURN_FLG', 'CUST_EQUIP_OCCURN_FLG', 'POLE_OCCURN_FLG', 'TRANSFORMER_OCCURN_F

## **ADD CYCLICITY ACCORDING TO HOUR**

In [56]:
df_ads['Hour'] = df_ads['CREATION_DATETIME'].dt.hour
print(df_ads['Hour'].unique())

[18 19 23  3  7  8  9 10 11 12 13 14 15 16 17 20 21 22  5  6  0  4  1  2]


In [57]:
# Map the hour (0-23) onto an angle of a 24-hour circle and store its sine
# and cosine, so that hour 23 and hour 0 end up next to each other.
hour_angle = df_ads.Hour * (2.0 * np.pi / 24)
df_ads['Hour_Sin'] = np.sin(hour_angle)
df_ads['Hour_Cos'] = np.cos(hour_angle)

In [58]:
df_ads.drop(['Hour'],axis=1,inplace=True)

## **MAJ_OTG_ID**

In [59]:
# ads_final = pd.merge(df_ads, maj_otg_df, on=['INCIDENT_ID', 'STRCTUR_NO'], how='left')
# print(ads_final.shape)
# ads_final.head()

In [60]:
# create a copy of the previous dataframe so that rerunning the earlier cells can be avoided
ads_final = df_ads.copy(deep=True)

## **ADD SUBSEQUENT OUTAGES**

In [61]:
# Temporary calendar-date key derived from the creation timestamp.
ads_final['Date'] = ads_final.CREATION_DATETIME.dt.date
# Dense rank of each row's CREATION_DATETIME within its date: the earliest
# timestamp of a day gets 1.0, identical timestamps share a rank, and ranks
# have no gaps.
ads_final['RANK_SUBSEQUENT_OUTAGES'] = ads_final.groupby('Date')['CREATION_DATETIME'].rank(method='dense', ascending=True)
# Drop the helper key together with Min_Distance, which is not carried forward.
ads_final.drop(['Date','Min_Distance'],axis=1,inplace=True)

In [62]:
print(ads_final.shape)
ads_final.head()

(6973, 82)


Unnamed: 0,OUTAGE_ID,INCIDENT_ID,STRCTUR_NO,CREATION_DATETIME,ENERGIZED_DATETIME,CIRCT_ID,DNI_EQUIP_TYPE,SUBST_ID,CALL_QTY,DOWNSTREAM_CUST_QTY,KEY_CUST_QTY,ETR_DATETIME,CUST_QTY,DOWNSTREAM_KVA_VAL,KVA_VAL,DAY_FLAG,TTR,MAJ_OTG_ID,POLE_CLUE_FLG,PART_LIGHT_CLUE_FLG,EMERGENCY_CLUE_FLG,POWER_OUT_CLUE_FLG,OPEN_DEVICE_CLUE_FLG,TREE_CLUE_FLG,WIRE_DOWN_CLUE_FLG,IVR_CLUE_FLG,EQUIPMENT_CLUE_FLG,TRANSFORMER_CLUE_FLG,OH_CAUSE_FLG,UG_CAUSE_FLG,ANIMAL_CAUSE_FLG,WEATHER_CAUSE_FLG,WEATHER_COLD_CAUSE_FLG,PUBLIC_CAUSE_FLG,WEATHER_LIGHTNING_CAUSE_FLG,WEATHER__SNOW_CAUSE_FLG,WEATHER__WIND_CAUSE_FLG,WEATHER__HEAT_CAUSE_FLG,CUST_REQUEST_CAUSE_FLG,WEATHER__FLOOD_CAUSE_FLG,STREET_CAUSE_FLG,SUBSTATION_CAUSE_FLG,TREE_CAUSE_FLG,MISCELLANEOUS_CAUSE_FLG,NO_CAUSE_FLG,PLANNED_CAUSE_FLG,NO_OUTAGE_CAUSE_FLG,FUSE_OCCURN_FLG,CUST_EQUIP_OCCURN_FLG,POLE_OCCURN_FLG,TRANSFORMER_OCCURN_FLG,METER_OCCURN_FLG,SERVICE_OCCURN_FLG,CABLE_OCCURN_FLG,ST_OCCURN_FLG,FIRE_OCCURN_FLG,FOUND_OPEN_OCCURN_FLG,PUBLIC_SAFETY_OCCURN_FLG,WIRE_OCCURN_FLG,SWITCH_OCCURN_FLG,REGULATOR_OCCURN_FLG,CUTOUT_OCCURN_FLG,CAP_BANK_OCCURN_FLG,RECLOSER_OCCURN_FLG,OH_OCCURN_FLG,PRIORITY_VAL_1.0,PRIORITY_VAL_2.0,PRIORITY_VAL_3.0,PRIORITY_VAL_5.0,CITY_NAM,LAT,LONG,ZONE,NO_OF_POWER_OUT_CLUE_PER_DAY,NO_OF_OPEN_DEVICE_CLUE_PER_DAY,NO_OF_IVR_CLUE_PER_DAY,NO_OF_ANIMAL_CAUSE_PER_DAY,NO_OF_WIRE_OCCURN_PER_DAY,Marker_Location,Hour_Sin,Hour_Cos,RANK_SUBSEQUENT_OUTAGES
0,2002742730,2001537668,737--/72,2020-08-29 18:51:07,2020-08-29 23:18:00,1758,FUSE,175,4,7,0,2020-08-29 23:30:00,7,170.0,170.0,0,266.88,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,INDIANAPOLIS,39.68,-86.05,ZONE2,3,0,2,0,2,Marker16,-1.0,-0.0,1.0
1,2002742736,2001537674,301-B/21,2020-08-29 19:35:29,2020-08-30 11:42:31,2807,1TBOH,280,1,1,0,2020-08-30 13:15:00,1,0.0,0.0,0,967.03,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,INDIANAPOLIS,39.86,-86.2,ZONE4,3,0,2,0,2,Marker7,-0.97,0.26,2.0
2,2002742741,2001537677,670-A/166,2020-08-29 19:54:06,2020-08-29 20:53:00,2203,FUSE,220,1,1,0,2020-08-30 00:30:00,1,100.0,100.0,0,58.9,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,INDIANAPOLIS,39.71,-86.12,ZONE2,3,0,2,0,2,Marker16,-0.97,0.26,3.0
3,2002742761,2001537695,650-B/59,2020-08-29 23:59:25,2020-08-30 04:07:00,1702,FUSE,170,1,11,0,2020-08-30 03:30:00,11,50.0,50.0,0,247.58,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,INDIANAPOLIS,39.72,-86.03,ZONE2,3,0,2,0,2,Marker13,-0.26,0.97,4.0
4,2002742762,2001537697,264-B/72,2020-08-30 03:26:42,2020-08-30 10:22:00,2809,1TBOH,280,8,8,0,2020-08-30 12:00:00,8,50.0,50.0,0,415.3,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,INDIANAPOLIS,39.88,-86.18,ZONE4,39,0,16,13,0,Marker3,0.71,0.71,1.0


## **ADDING LIVE OUTAGE**

In [63]:
def count_outage(group):
    """Add ``LIVE_OUTAGE``: outages still open when this group's first row was created.

    Reads the module-level ``ads_final`` frame and counts its rows whose
    ``CREATION_DATETIME`` is strictly earlier, and whose
    ``ENERGIZED_DATETIME`` is strictly later, than the ``CREATION_DATETIME``
    of the group's first row.

    Parameters
    ----------
    group : pandas.DataFrame
        Non-empty frame with a ``CREATION_DATETIME`` column.

    Returns
    -------
    pandas.DataFrame
        The group with a fresh 0..n-1 index and the count in ``LIVE_OUTAGE``
        on every row.
    """
    group = group.reset_index(drop=True)
    created_at = group.CREATION_DATETIME[0]

    started_before = ads_final.CREATION_DATETIME < created_at
    restored_after = ads_final.ENERGIZED_DATETIME > created_at

    group['LIVE_OUTAGE'] = len(ads_final[started_before & restored_after])
    return group

def grouping_fn(df):
    """Run ``count_outage`` on every ``OUTAGE_ID`` group of ``df``.

    Used as the per-chunk worker function for the process pool; returns the
    result of the group-wise apply.
    """
    per_outage = df.groupby(['OUTAGE_ID'], as_index=False)
    return per_outage.apply(count_outage)

if __name__ == '__main__':
    starttime = time.time()

    # Cut ads_final into consecutive row chunks derived from its length
    # instead of a hand-written list of slice boundaries, so that no empty
    # frames are dispatched. With 5000-row chunks the partition matches the
    # previous boundaries for the first 40000 rows.
    chunk_rows = 5000
    chunks = [ads_final[start:start + chunk_rows] for start in range(0, len(ads_final), chunk_rows)]
    if not chunks:
        # Keep one (empty) task so the result list is never empty.
        chunks = [ads_final]

    # Never start more workers than there are chunks (capped at 30 as before).
    with Pool(min(30, len(chunks))) as p:
        live_outage = p.map(grouping_fn, chunks)

    print('That took {} seconds'.format(time.time() - starttime))

That took 14.781800270080566 seconds


In [64]:
ads_final=pd.concat(live_outage)

In [65]:
print(ads_final.shape)
ads_final.reset_index(drop=True,inplace=True)
ads_final.head()

(6973, 83)


Unnamed: 0,OUTAGE_ID,INCIDENT_ID,STRCTUR_NO,CREATION_DATETIME,ENERGIZED_DATETIME,CIRCT_ID,DNI_EQUIP_TYPE,SUBST_ID,CALL_QTY,DOWNSTREAM_CUST_QTY,KEY_CUST_QTY,ETR_DATETIME,CUST_QTY,DOWNSTREAM_KVA_VAL,KVA_VAL,DAY_FLAG,TTR,MAJ_OTG_ID,POLE_CLUE_FLG,PART_LIGHT_CLUE_FLG,EMERGENCY_CLUE_FLG,POWER_OUT_CLUE_FLG,OPEN_DEVICE_CLUE_FLG,TREE_CLUE_FLG,WIRE_DOWN_CLUE_FLG,IVR_CLUE_FLG,EQUIPMENT_CLUE_FLG,TRANSFORMER_CLUE_FLG,OH_CAUSE_FLG,UG_CAUSE_FLG,ANIMAL_CAUSE_FLG,WEATHER_CAUSE_FLG,WEATHER_COLD_CAUSE_FLG,PUBLIC_CAUSE_FLG,WEATHER_LIGHTNING_CAUSE_FLG,WEATHER__SNOW_CAUSE_FLG,WEATHER__WIND_CAUSE_FLG,WEATHER__HEAT_CAUSE_FLG,CUST_REQUEST_CAUSE_FLG,WEATHER__FLOOD_CAUSE_FLG,STREET_CAUSE_FLG,SUBSTATION_CAUSE_FLG,TREE_CAUSE_FLG,MISCELLANEOUS_CAUSE_FLG,NO_CAUSE_FLG,PLANNED_CAUSE_FLG,NO_OUTAGE_CAUSE_FLG,FUSE_OCCURN_FLG,CUST_EQUIP_OCCURN_FLG,POLE_OCCURN_FLG,TRANSFORMER_OCCURN_FLG,METER_OCCURN_FLG,SERVICE_OCCURN_FLG,CABLE_OCCURN_FLG,ST_OCCURN_FLG,FIRE_OCCURN_FLG,FOUND_OPEN_OCCURN_FLG,PUBLIC_SAFETY_OCCURN_FLG,WIRE_OCCURN_FLG,SWITCH_OCCURN_FLG,REGULATOR_OCCURN_FLG,CUTOUT_OCCURN_FLG,CAP_BANK_OCCURN_FLG,RECLOSER_OCCURN_FLG,OH_OCCURN_FLG,PRIORITY_VAL_1.0,PRIORITY_VAL_2.0,PRIORITY_VAL_3.0,PRIORITY_VAL_5.0,CITY_NAM,LAT,LONG,ZONE,NO_OF_POWER_OUT_CLUE_PER_DAY,NO_OF_OPEN_DEVICE_CLUE_PER_DAY,NO_OF_IVR_CLUE_PER_DAY,NO_OF_ANIMAL_CAUSE_PER_DAY,NO_OF_WIRE_OCCURN_PER_DAY,Marker_Location,Hour_Sin,Hour_Cos,RANK_SUBSEQUENT_OUTAGES,LIVE_OUTAGE
0,2002742730,2001537668,737--/72,2020-08-29 18:51:07,2020-08-29 23:18:00,1758,FUSE,175,4,7,0,2020-08-29 23:30:00,7,170.0,170.0,0,266.88,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,INDIANAPOLIS,39.68,-86.05,ZONE2,3,0,2,0,2,Marker16,-1.0,-0.0,1.0,0
1,2002742736,2001537674,301-B/21,2020-08-29 19:35:29,2020-08-30 11:42:31,2807,1TBOH,280,1,1,0,2020-08-30 13:15:00,1,0.0,0.0,0,967.03,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,INDIANAPOLIS,39.86,-86.2,ZONE4,3,0,2,0,2,Marker7,-0.97,0.26,2.0,1
2,2002742741,2001537677,670-A/166,2020-08-29 19:54:06,2020-08-29 20:53:00,2203,FUSE,220,1,1,0,2020-08-30 00:30:00,1,100.0,100.0,0,58.9,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,INDIANAPOLIS,39.71,-86.12,ZONE2,3,0,2,0,2,Marker16,-0.97,0.26,3.0,2
3,2002742761,2001537695,650-B/59,2020-08-29 23:59:25,2020-08-30 04:07:00,1702,FUSE,170,1,11,0,2020-08-30 03:30:00,11,50.0,50.0,0,247.58,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,INDIANAPOLIS,39.72,-86.03,ZONE2,3,0,2,0,2,Marker13,-0.26,0.97,4.0,1
4,2002742762,2001537697,264-B/72,2020-08-30 03:26:42,2020-08-30 10:22:00,2809,1TBOH,280,8,8,0,2020-08-30 12:00:00,8,50.0,50.0,0,415.3,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,INDIANAPOLIS,39.88,-86.18,ZONE4,39,0,16,13,0,Marker3,0.71,0.71,1.0,2


## **OUTAGE FEATURES**

In [66]:
def count_outage_minutes(group):
    """Add ``Outages_in_last_<h>hr`` counts (h = 1..10) to the group.

    Reads the module-level ``ads_final`` frame. For each window of ``h``
    hours, counts the ``ads_final`` rows whose ``CREATION_DATETIME`` lies
    strictly before the ``CREATION_DATETIME`` of the group's first row and at
    most ``60 * h`` minutes before it.

    Parameters
    ----------
    group : pandas.DataFrame
        Non-empty frame with a ``CREATION_DATETIME`` column.

    Returns
    -------
    pandas.DataFrame
        The group with a fresh 0..n-1 index and ten new count columns,
        ``Outages_in_last_1hr`` ... ``Outages_in_last_10hr``, in that order.
    """
    group = group.reset_index(drop=True)
    reference_time = group['CREATION_DATETIME'][0]

    # Minutes elapsed between every outage's creation and the reference time,
    # computed as a standalone Series rather than assigned onto a column
    # subset of ads_final.
    elapsed_minutes = (reference_time - ads_final['CREATION_DATETIME']).dt.total_seconds().div(60)

    # Keep only outages created strictly before the reference time.
    earlier = elapsed_minutes[elapsed_minutes > 0]

    for hours in range(1, 11):
        group['Outages_in_last_{}hr'.format(hours)] = len(earlier[earlier <= 60 * hours])
    return group

def grouping_fn_minutes(df):
    """Apply ``count_outage_minutes`` to every ``OUTAGE_ID`` group of ``df``.

    Used as the worker function for the multiprocessing pool; returns the
    frame produced by the grouped apply.
    """
    return df.groupby(['OUTAGE_ID'], as_index=False).apply(count_outage_minutes)

if __name__ == '__main__':
    starttime = time.time()
    # Row offsets delimiting the contiguous slices of ads_final that are handed
    # to the pool (mostly 5000 rows each, three of 10000; the last slice is
    # open-ended so any remaining rows are included).
    chunk_bounds = [0, 5000, 10000, 15000, 20000, 25000, 30000, 35000, 40000,
                    50000, 55000, 60000, 65000, 70000, 75000, 80000, 90000,
                    100000, 105000, None]
    chunks = [ads_final[lo:hi] for lo, hi in zip(chunk_bounds[:-1], chunk_bounds[1:])]
    with Pool(30) as p:
        live_outage_minutes = p.map(grouping_fn_minutes, chunks)
    print('That took {} seconds'.format(time.time() - starttime))

That took 56.73664331436157 seconds


In [67]:
# Stitch the per-chunk frames returned by the worker pool back into one frame.
ads_final=pd.concat(live_outage_minutes)

In [68]:
print(ads_final.shape)
# Discard the existing index and renumber the rows 0..n-1.
ads_final.reset_index(drop=True,inplace=True)
ads_final.head()

(6973, 93)


Unnamed: 0,OUTAGE_ID,INCIDENT_ID,STRCTUR_NO,CREATION_DATETIME,ENERGIZED_DATETIME,CIRCT_ID,DNI_EQUIP_TYPE,SUBST_ID,CALL_QTY,DOWNSTREAM_CUST_QTY,KEY_CUST_QTY,ETR_DATETIME,CUST_QTY,DOWNSTREAM_KVA_VAL,KVA_VAL,DAY_FLAG,TTR,MAJ_OTG_ID,POLE_CLUE_FLG,PART_LIGHT_CLUE_FLG,EMERGENCY_CLUE_FLG,POWER_OUT_CLUE_FLG,OPEN_DEVICE_CLUE_FLG,TREE_CLUE_FLG,WIRE_DOWN_CLUE_FLG,IVR_CLUE_FLG,EQUIPMENT_CLUE_FLG,TRANSFORMER_CLUE_FLG,OH_CAUSE_FLG,UG_CAUSE_FLG,ANIMAL_CAUSE_FLG,WEATHER_CAUSE_FLG,WEATHER_COLD_CAUSE_FLG,PUBLIC_CAUSE_FLG,WEATHER_LIGHTNING_CAUSE_FLG,WEATHER__SNOW_CAUSE_FLG,WEATHER__WIND_CAUSE_FLG,WEATHER__HEAT_CAUSE_FLG,CUST_REQUEST_CAUSE_FLG,WEATHER__FLOOD_CAUSE_FLG,STREET_CAUSE_FLG,SUBSTATION_CAUSE_FLG,TREE_CAUSE_FLG,MISCELLANEOUS_CAUSE_FLG,NO_CAUSE_FLG,PLANNED_CAUSE_FLG,NO_OUTAGE_CAUSE_FLG,FUSE_OCCURN_FLG,CUST_EQUIP_OCCURN_FLG,POLE_OCCURN_FLG,TRANSFORMER_OCCURN_FLG,METER_OCCURN_FLG,SERVICE_OCCURN_FLG,CABLE_OCCURN_FLG,ST_OCCURN_FLG,FIRE_OCCURN_FLG,FOUND_OPEN_OCCURN_FLG,PUBLIC_SAFETY_OCCURN_FLG,WIRE_OCCURN_FLG,SWITCH_OCCURN_FLG,REGULATOR_OCCURN_FLG,CUTOUT_OCCURN_FLG,CAP_BANK_OCCURN_FLG,RECLOSER_OCCURN_FLG,OH_OCCURN_FLG,PRIORITY_VAL_1.0,PRIORITY_VAL_2.0,PRIORITY_VAL_3.0,PRIORITY_VAL_5.0,CITY_NAM,LAT,LONG,ZONE,NO_OF_POWER_OUT_CLUE_PER_DAY,NO_OF_OPEN_DEVICE_CLUE_PER_DAY,NO_OF_IVR_CLUE_PER_DAY,NO_OF_ANIMAL_CAUSE_PER_DAY,NO_OF_WIRE_OCCURN_PER_DAY,Marker_Location,Hour_Sin,Hour_Cos,RANK_SUBSEQUENT_OUTAGES,LIVE_OUTAGE,Outages_in_last_1hr,Outages_in_last_2hr,Outages_in_last_3hr,Outages_in_last_4hr,Outages_in_last_5hr,Outages_in_last_6hr,Outages_in_last_7hr,Outages_in_last_8hr,Outages_in_last_9hr,Outages_in_last_10hr
0,2002742730,2001537668,737--/72,2020-08-29 18:51:07,2020-08-29 23:18:00,1758,FUSE,175,4,7,0,2020-08-29 23:30:00,7,170.0,170.0,0,266.88,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,INDIANAPOLIS,39.68,-86.05,ZONE2,3,0,2,0,2,Marker16,-1.0,-0.0,1.0,0,0,0,0,0,0,0,0,0,0,0
1,2002742736,2001537674,301-B/21,2020-08-29 19:35:29,2020-08-30 11:42:31,2807,1TBOH,280,1,1,0,2020-08-30 13:15:00,1,0.0,0.0,0,967.03,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,INDIANAPOLIS,39.86,-86.2,ZONE4,3,0,2,0,2,Marker7,-0.97,0.26,2.0,1,1,1,1,1,1,1,1,1,1,1
2,2002742741,2001537677,670-A/166,2020-08-29 19:54:06,2020-08-29 20:53:00,2203,FUSE,220,1,1,0,2020-08-30 00:30:00,1,100.0,100.0,0,58.9,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,INDIANAPOLIS,39.71,-86.12,ZONE2,3,0,2,0,2,Marker16,-0.97,0.26,3.0,2,1,2,2,2,2,2,2,2,2,2
3,2002742761,2001537695,650-B/59,2020-08-29 23:59:25,2020-08-30 04:07:00,1702,FUSE,170,1,11,0,2020-08-30 03:30:00,11,50.0,50.0,0,247.58,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,INDIANAPOLIS,39.72,-86.03,ZONE2,3,0,2,0,2,Marker13,-0.26,0.97,4.0,1,0,0,0,0,2,3,3,3,3,3
4,2002742762,2001537697,264-B/72,2020-08-30 03:26:42,2020-08-30 10:22:00,2809,1TBOH,280,8,8,0,2020-08-30 12:00:00,8,50.0,50.0,0,415.3,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,INDIANAPOLIS,39.88,-86.18,ZONE4,39,0,16,13,0,Marker3,0.71,0.71,1.0,2,0,0,0,1,1,1,1,3,4,4


In [69]:
# Checkpoint: keep an independent deep copy of ads_final at this stage.
df_copy = ads_final.copy(deep=True)
# Uncomment the next line to restore ads_final from the checkpoint.
# ads_final = df_copy.copy(deep=True)

In [70]:
print(df_copy.shape)
print(ads_final.shape)

(6973, 93)
(6973, 93)


## **Day of the week features**

In [71]:
# Calendar features derived from the outage creation timestamp.
# dayofweek is 0 for Monday .. 6 for Sunday, matching the order of `days`.
days = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
creation_dayofweek = ads_final['CREATION_DATETIME'].dt.dayofweek

# Vectorised lookup of the day name; a missing timestamp yields NaN instead of
# raising inside a per-row lambda.
ads_final['Weekday'] = creation_dayofweek.map(dict(enumerate(days)))

# Saturday (5) and Sunday (6) count as weekend; missing timestamps give False.
ads_final['Weekend_flag'] = creation_dayofweek >= 5

In [72]:
print(list(ads_final.columns))
print(ads_final.shape)

['OUTAGE_ID', 'INCIDENT_ID', 'STRCTUR_NO', 'CREATION_DATETIME', 'ENERGIZED_DATETIME', 'CIRCT_ID', 'DNI_EQUIP_TYPE', 'SUBST_ID', 'CALL_QTY', 'DOWNSTREAM_CUST_QTY', 'KEY_CUST_QTY', 'ETR_DATETIME', 'CUST_QTY', 'DOWNSTREAM_KVA_VAL', 'KVA_VAL', 'DAY_FLAG', 'TTR', 'MAJ_OTG_ID', 'POLE_CLUE_FLG', 'PART_LIGHT_CLUE_FLG', 'EMERGENCY_CLUE_FLG', 'POWER_OUT_CLUE_FLG', 'OPEN_DEVICE_CLUE_FLG', 'TREE_CLUE_FLG', 'WIRE_DOWN_CLUE_FLG', 'IVR_CLUE_FLG', 'EQUIPMENT_CLUE_FLG', 'TRANSFORMER_CLUE_FLG', 'OH_CAUSE_FLG', 'UG_CAUSE_FLG', 'ANIMAL_CAUSE_FLG', 'WEATHER_CAUSE_FLG', 'WEATHER_COLD_CAUSE_FLG', 'PUBLIC_CAUSE_FLG', 'WEATHER_LIGHTNING_CAUSE_FLG', 'WEATHER__SNOW_CAUSE_FLG', 'WEATHER__WIND_CAUSE_FLG', 'WEATHER__HEAT_CAUSE_FLG', 'CUST_REQUEST_CAUSE_FLG', 'WEATHER__FLOOD_CAUSE_FLG', 'STREET_CAUSE_FLG', 'SUBSTATION_CAUSE_FLG', 'TREE_CAUSE_FLG', 'MISCELLANEOUS_CAUSE_FLG', 'NO_CAUSE_FLG', 'PLANNED_CAUSE_FLG', 'NO_OUTAGE_CAUSE_FLG', 'FUSE_OCCURN_FLG', 'CUST_EQUIP_OCCURN_FLG', 'POLE_OCCURN_FLG', 'TRANSFORMER_OCCURN_F

## **Priority Queuing Feature**
1. Rank based on simple customer quantity as mentioned by Eric (live rankings to be followed, numerical feature) <br>

In [73]:
# Put the outages in chronological order of creation and renumber rows 0..n-1.
ads_final.sort_values(by = ['CREATION_DATETIME'], inplace=True)
ads_final.reset_index(drop=True, inplace=True)

In [74]:
def create_groups_based_on_live_outages(live):
    """Number consecutive runs of rows into live-outage groups.

    Walking the rows in their current order, the group counter starts at 0 and
    is incremented on every row whose ``LIVE_OUTAGE`` equals 0; each row is
    assigned the counter value after that check. Rows before the first zero
    therefore belong to group 0.

    Parameters
    ----------
    live : pandas.DataFrame
        Frame with a ``LIVE_OUTAGE`` column. The row order matters; the index
        labels do not.

    Returns
    -------
    list of int
        One group number per row of ``live``, in row order.
    """
    # A running count of "LIVE_OUTAGE == 0" rows is exactly the group number,
    # and it is positional, so it does not depend on a 0..n-1 index.
    starts_new_group = live['LIVE_OUTAGE'].eq(0)
    return starts_new_group.cumsum().tolist()

# Label every outage, in the current row order, with its live-outage group number.
ads_final['Live_outage_group'] = create_groups_based_on_live_outages(ads_final)

In [75]:
# Dense rank inside each live-outage group: the largest DOWNSTREAM_CUST_QTY gets
# rank 1 and equal quantities share a rank.
ads_final['Priority_Customer_Qty'] = ads_final.groupby(['Live_outage_group'])['DOWNSTREAM_CUST_QTY'].rank(method='dense', ascending=False)

In [76]:
print(ads_final.shape)

(6973, 97)


2. Rank based on the ratio of the distance from the live-outage centroid to the downstream customer quantity (live rankings to be followed, approach #2, numerical feature) <br>

In [77]:
# Per live-outage group: coordinate sums and outage count (helper columns that
# are dropped again in the next cell) plus the group centroid.
df_v1 = ads_final.groupby(['Live_outage_group'],as_index=False).agg({'LAT' : 'sum', 'LONG' : 'sum', 'LIVE_OUTAGE' : 'count'})

# The centroid is the mean coordinate of the group. Taking the mean directly
# (instead of sum / row count) keeps the denominator consistent with the rows
# that actually have a coordinate, so missing LAT/LONG values no longer drag
# the centroid towards zero.
group_centers = ads_final.groupby('Live_outage_group')[['LAT', 'LONG']].mean()
df_v1['Center_LAT'] = df_v1['Live_outage_group'].map(group_centers['LAT'])
df_v1['Center_LONG'] = df_v1['Live_outage_group'].map(group_centers['LONG'])
df_v1.head()

Unnamed: 0,Live_outage_group,LAT,LONG,LIVE_OUTAGE,Center_LAT,Center_LONG
0,1,1194.81,-2584.93,30,39.83,-86.16
1,2,199.32,-430.6,5,39.86,-86.12
2,3,39.75,-86.19,1,39.75,-86.19
3,4,119.54,-258.48,3,39.85,-86.16
4,5,39.76,-86.05,1,39.76,-86.05


In [78]:
# Keep only the group key and its centroid, then attach the centroid to every
# outage of that group (left join on Live_outage_group).
df_v1.drop(['LAT', 'LONG', 'LIVE_OUTAGE'], axis=1, inplace=True)
ads_final_v1 = pd.merge(ads_final, df_v1, how='left', on='Live_outage_group')

In [79]:
print(ads_final_v1.shape)
# ads_final_v1.head(5)

(6973, 99)


In [80]:
def cal_distance_from_center_lat_long(lat, long, center_lat, center_long):
    """Distance in miles between a point and a centre point.

    Returns ``None`` when any of the four coordinates is NaN; otherwise the
    ``.miles`` value of ``geopy.distance.distance`` between ``[lat, long]``
    and ``[center_lat, center_long]``.
    """
    # Evaluate all four NaN checks up front, in argument order.
    nan_flags = [math.isnan(value) for value in (lat, long, center_lat, center_long)]
    if any(nan_flags):
        return None

    point = [lat, long]
    centre = [center_lat, center_long]
    return geopy.distance.distance(point, centre).miles

In [81]:
# Row by row: distance in miles from the outage location to the centroid of its
# live-outage group (None when a coordinate is NaN).
ads_final_v1['Dis_From_Live_Centriod'] = ads_final_v1.apply(lambda x: cal_distance_from_center_lat_long(x['LAT'], x['LONG'], x['Center_LAT'], x['Center_LONG']),axis=1)
# Force a numeric column; anything that cannot be parsed as a number becomes NaN.
ads_final_v1['Dis_From_Live_Centriod'] = ads_final_v1['Dis_From_Live_Centriod'].apply(pd.to_numeric, errors='coerce')

In [82]:
# Distance to the group centroid per downstream customer.
# NOTE(review): a DOWNSTREAM_CUST_QTY of 0 would give inf/NaN here - confirm it cannot occur.
ads_final_v1['Dis_From_Live_Centriod_div_Cust_qty'] = (ads_final_v1['Dis_From_Live_Centriod']) / (ads_final_v1['DOWNSTREAM_CUST_QTY'])
# Rank that ratio inside each live-outage group, smallest ratio first; tied rows
# all receive the highest rank of their tie (method='max').
ads_final_v1['Priority_Dist_Customer_Qty'] = ads_final_v1.groupby(['Live_outage_group'])['Dis_From_Live_Centriod_div_Cust_qty'].rank(method='max', ascending=True)

In [83]:
# Remove the centroid coordinates and the raw distance; only the ratio and the
# rank derived from them are kept.
ads_final_v1.drop(['Center_LAT', 'Center_LONG', 'Dis_From_Live_Centriod'], axis=1, inplace=True)

In [84]:
print(ads_final_v1.shape)

(6973, 99)


## **Add dispatch area location**

In [85]:
def cal_distance_from_dipatch_area(lat, long):
    """Find the closest of six fixed dispatch reference points.

    Returns ``(None, None)`` when ``lat`` or ``long`` is NaN. Otherwise returns
    ``(distance_in_miles, grid_number)`` for the nearest reference point, where
    ``grid_number`` is the 1-based position of that point in the list below
    (the first one wins on a tie).
    """
    nan_flags = (math.isnan(lat), math.isnan(long))
    if any(nan_flags):
        return None, None

    # Reference coordinates in grid order 1..6.
    dispatch_points = (
        [39.8802, -86.2324],
        [39.8802, -86.0854],
        [39.7880, -86.2296],
        [39.7880, -86.0868],
        [39.7003, -86.2303],
        [39.7003, -86.0834],
    )
    origin = [lat, long]
    miles = [geopy.distance.distance(origin, point).miles for point in dispatch_points]

    nearest = min(range(len(miles)), key=miles.__getitem__)
    return miles[nearest], nearest + 1

In [86]:
# The helper returns a (distance, grid number) pair per row; zip(*...) turns the
# sequence of pairs into two parallel sequences, one per new column.
ads_final_v1['Min_Distance'], ads_final_v1['Grid'] = zip(*ads_final_v1.apply(lambda row: cal_distance_from_dipatch_area(row['LAT'], row['LONG']),axis=1))

In [87]:
def map_grid_to_location(row):
    """Translate a grid number (1-6) into its dispatch location label.

    Any value that does not compare equal to one of 1..6 yields
    "NO_LOCATION".
    """
    labels = ('34th', 'ARL.', 'MILL', 'ENGLISH', 'W.I.', 'SOUTH')
    for grid_no, label in enumerate(labels, start=1):
        if row == grid_no:
            return label
    return "NO_LOCATION"

In [88]:
# Translate the grid number into its dispatch location label. Only the Grid
# column is needed, so apply on that column instead of materialising every
# full row as a Series.
ads_final_v1['Dispatch_Location'] = ads_final_v1['Grid'].apply(map_grid_to_location)

In [89]:
# The distance and grid number were only needed to derive Dispatch_Location.
ads_final_v1.drop(['Min_Distance', 'Grid'], axis=1, inplace=True)

In [90]:
ads_final_v1.head()

Unnamed: 0,OUTAGE_ID,INCIDENT_ID,STRCTUR_NO,CREATION_DATETIME,ENERGIZED_DATETIME,CIRCT_ID,DNI_EQUIP_TYPE,SUBST_ID,CALL_QTY,DOWNSTREAM_CUST_QTY,KEY_CUST_QTY,ETR_DATETIME,CUST_QTY,DOWNSTREAM_KVA_VAL,KVA_VAL,DAY_FLAG,TTR,MAJ_OTG_ID,POLE_CLUE_FLG,PART_LIGHT_CLUE_FLG,EMERGENCY_CLUE_FLG,POWER_OUT_CLUE_FLG,OPEN_DEVICE_CLUE_FLG,TREE_CLUE_FLG,WIRE_DOWN_CLUE_FLG,IVR_CLUE_FLG,EQUIPMENT_CLUE_FLG,TRANSFORMER_CLUE_FLG,OH_CAUSE_FLG,UG_CAUSE_FLG,ANIMAL_CAUSE_FLG,WEATHER_CAUSE_FLG,WEATHER_COLD_CAUSE_FLG,PUBLIC_CAUSE_FLG,WEATHER_LIGHTNING_CAUSE_FLG,WEATHER__SNOW_CAUSE_FLG,WEATHER__WIND_CAUSE_FLG,WEATHER__HEAT_CAUSE_FLG,CUST_REQUEST_CAUSE_FLG,WEATHER__FLOOD_CAUSE_FLG,STREET_CAUSE_FLG,SUBSTATION_CAUSE_FLG,TREE_CAUSE_FLG,MISCELLANEOUS_CAUSE_FLG,NO_CAUSE_FLG,PLANNED_CAUSE_FLG,NO_OUTAGE_CAUSE_FLG,FUSE_OCCURN_FLG,CUST_EQUIP_OCCURN_FLG,POLE_OCCURN_FLG,TRANSFORMER_OCCURN_FLG,METER_OCCURN_FLG,SERVICE_OCCURN_FLG,CABLE_OCCURN_FLG,ST_OCCURN_FLG,FIRE_OCCURN_FLG,FOUND_OPEN_OCCURN_FLG,PUBLIC_SAFETY_OCCURN_FLG,WIRE_OCCURN_FLG,SWITCH_OCCURN_FLG,REGULATOR_OCCURN_FLG,CUTOUT_OCCURN_FLG,CAP_BANK_OCCURN_FLG,RECLOSER_OCCURN_FLG,OH_OCCURN_FLG,PRIORITY_VAL_1.0,PRIORITY_VAL_2.0,PRIORITY_VAL_3.0,PRIORITY_VAL_5.0,CITY_NAM,LAT,LONG,ZONE,NO_OF_POWER_OUT_CLUE_PER_DAY,NO_OF_OPEN_DEVICE_CLUE_PER_DAY,NO_OF_IVR_CLUE_PER_DAY,NO_OF_ANIMAL_CAUSE_PER_DAY,NO_OF_WIRE_OCCURN_PER_DAY,Marker_Location,Hour_Sin,Hour_Cos,RANK_SUBSEQUENT_OUTAGES,LIVE_OUTAGE,Outages_in_last_1hr,Outages_in_last_2hr,Outages_in_last_3hr,Outages_in_last_4hr,Outages_in_last_5hr,Outages_in_last_6hr,Outages_in_last_7hr,Outages_in_last_8hr,Outages_in_last_9hr,Outages_in_last_10hr,Weekday,Weekend_flag,Live_outage_group,Priority_Customer_Qty,Dis_From_Live_Centriod_div_Cust_qty,Priority_Dist_Customer_Qty,Dispatch_Location
0,2002742730,2001537668,737--/72,2020-08-29 18:51:07,2020-08-29 23:18:00,1758,FUSE,175,4,7,0,2020-08-29 23:30:00,7,170.0,170.0,0,266.88,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,INDIANAPOLIS,39.68,-86.05,ZONE2,3,0,2,0,2,Marker16,-1.0,-0.0,1.0,0,0,0,0,0,0,0,0,0,0,0,Saturday,True,1,8.0,1.71,23.0,SOUTH
1,2002742736,2001537674,301-B/21,2020-08-29 19:35:29,2020-08-30 11:42:31,2807,1TBOH,280,1,1,0,2020-08-30 13:15:00,1,0.0,0.0,0,967.03,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,INDIANAPOLIS,39.86,-86.2,ZONE4,3,0,2,0,2,Marker7,-0.97,0.26,2.0,1,1,1,1,1,1,1,1,1,1,1,Saturday,True,1,13.0,3.11,27.0,34th
2,2002742741,2001537677,670-A/166,2020-08-29 19:54:06,2020-08-29 20:53:00,2203,FUSE,220,1,1,0,2020-08-30 00:30:00,1,100.0,100.0,0,58.9,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,INDIANAPOLIS,39.71,-86.12,ZONE2,3,0,2,0,2,Marker16,-0.97,0.26,3.0,2,1,2,2,2,2,2,2,2,2,2,Saturday,True,1,13.0,8.47,29.0,SOUTH
3,2002742761,2001537695,650-B/59,2020-08-29 23:59:25,2020-08-30 04:07:00,1702,FUSE,170,1,11,0,2020-08-30 03:30:00,11,50.0,50.0,0,247.58,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,INDIANAPOLIS,39.72,-86.03,ZONE2,3,0,2,0,2,Marker13,-0.26,0.97,4.0,1,0,0,0,0,2,3,3,3,3,3,Saturday,True,1,5.0,0.93,15.0,SOUTH
4,2002742762,2001537697,264-B/72,2020-08-30 03:26:42,2020-08-30 10:22:00,2809,1TBOH,280,8,8,0,2020-08-30 12:00:00,8,50.0,50.0,0,415.3,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,INDIANAPOLIS,39.88,-86.18,ZONE4,39,0,16,13,0,Marker3,0.71,0.71,1.0,2,0,0,0,1,1,1,1,3,4,4,Sunday,True,1,7.0,0.48,9.0,34th


In [91]:
# Parse the creation timestamp from ads_final_v1 itself. Reading it from
# ads_final only gave the right values while both frames happened to share the
# same index and row order.
ads_final_v1['CREATION_DATETIME'] = pd.to_datetime(ads_final_v1['CREATION_DATETIME'])
# Calendar day of creation as a 'YYYY-MM-DD' string (NaN for a missing timestamp).
ads_final_v1['Date'] = ads_final_v1['CREATION_DATETIME'].dt.strftime('%Y-%m-%d')

In [92]:
ads_final_v1['Date'].unique()

array(['2020-08-29', '2020-08-30', '2020-08-31', '2020-09-01',
       '2020-09-02', '2020-09-03', '2020-09-04', '2020-09-05',
       '2020-09-06', '2020-09-07', '2020-09-08', '2020-09-09',
       '2020-09-10', '2020-09-11', '2020-09-12', '2020-09-13',
       '2020-09-14', '2020-09-15', '2020-09-16', '2020-09-17',
       '2020-09-18', '2020-09-19', '2020-09-20', '2020-09-21',
       '2020-09-22', '2020-09-23', '2020-09-24', '2020-09-25',
       '2020-09-26', '2020-09-27', '2020-09-28', '2020-09-29',
       '2020-09-30', '2020-10-01', '2020-10-02', '2020-10-03',
       '2020-10-04', '2020-10-05', '2020-10-06', '2020-10-07',
       '2020-10-08', '2020-10-09', '2020-10-10', '2020-10-11',
       '2020-10-12', '2020-10-13', '2020-10-14', '2020-10-15',
       '2020-10-16', '2020-10-17', '2020-10-18', '2020-10-19',
       '2020-10-20', '2020-10-21', '2020-10-22', '2020-10-23',
       '2020-10-24', '2020-10-25', '2020-10-26', '2020-10-27',
       '2020-10-28', '2020-10-29', '2020-10-30', '2020-

In [93]:
# Keep only the outages created on 2020-11-15. The explicit .copy() detaches
# the result from the unfiltered frame, so the column assignments made in later
# cells write to a real frame rather than to a slice of it.
ads_final_v1 = ads_final_v1[(ads_final_v1['Date'] == '2020-11-15')].copy()
ads_final_v1.shape

(523, 101)

In [94]:
# Renumber the remaining rows 0..n-1 after the date filter.
ads_final_v1.reset_index(drop=True, inplace=True)
ads_final_v1.head()

Unnamed: 0,OUTAGE_ID,INCIDENT_ID,STRCTUR_NO,CREATION_DATETIME,ENERGIZED_DATETIME,CIRCT_ID,DNI_EQUIP_TYPE,SUBST_ID,CALL_QTY,DOWNSTREAM_CUST_QTY,KEY_CUST_QTY,ETR_DATETIME,CUST_QTY,DOWNSTREAM_KVA_VAL,KVA_VAL,DAY_FLAG,TTR,MAJ_OTG_ID,POLE_CLUE_FLG,PART_LIGHT_CLUE_FLG,EMERGENCY_CLUE_FLG,POWER_OUT_CLUE_FLG,OPEN_DEVICE_CLUE_FLG,TREE_CLUE_FLG,WIRE_DOWN_CLUE_FLG,IVR_CLUE_FLG,EQUIPMENT_CLUE_FLG,TRANSFORMER_CLUE_FLG,OH_CAUSE_FLG,UG_CAUSE_FLG,ANIMAL_CAUSE_FLG,WEATHER_CAUSE_FLG,WEATHER_COLD_CAUSE_FLG,PUBLIC_CAUSE_FLG,WEATHER_LIGHTNING_CAUSE_FLG,WEATHER__SNOW_CAUSE_FLG,WEATHER__WIND_CAUSE_FLG,WEATHER__HEAT_CAUSE_FLG,CUST_REQUEST_CAUSE_FLG,WEATHER__FLOOD_CAUSE_FLG,STREET_CAUSE_FLG,SUBSTATION_CAUSE_FLG,TREE_CAUSE_FLG,MISCELLANEOUS_CAUSE_FLG,NO_CAUSE_FLG,PLANNED_CAUSE_FLG,NO_OUTAGE_CAUSE_FLG,FUSE_OCCURN_FLG,CUST_EQUIP_OCCURN_FLG,POLE_OCCURN_FLG,TRANSFORMER_OCCURN_FLG,METER_OCCURN_FLG,SERVICE_OCCURN_FLG,CABLE_OCCURN_FLG,ST_OCCURN_FLG,FIRE_OCCURN_FLG,FOUND_OPEN_OCCURN_FLG,PUBLIC_SAFETY_OCCURN_FLG,WIRE_OCCURN_FLG,SWITCH_OCCURN_FLG,REGULATOR_OCCURN_FLG,CUTOUT_OCCURN_FLG,CAP_BANK_OCCURN_FLG,RECLOSER_OCCURN_FLG,OH_OCCURN_FLG,PRIORITY_VAL_1.0,PRIORITY_VAL_2.0,PRIORITY_VAL_3.0,PRIORITY_VAL_5.0,CITY_NAM,LAT,LONG,ZONE,NO_OF_POWER_OUT_CLUE_PER_DAY,NO_OF_OPEN_DEVICE_CLUE_PER_DAY,NO_OF_IVR_CLUE_PER_DAY,NO_OF_ANIMAL_CAUSE_PER_DAY,NO_OF_WIRE_OCCURN_PER_DAY,Marker_Location,Hour_Sin,Hour_Cos,RANK_SUBSEQUENT_OUTAGES,LIVE_OUTAGE,Outages_in_last_1hr,Outages_in_last_2hr,Outages_in_last_3hr,Outages_in_last_4hr,Outages_in_last_5hr,Outages_in_last_6hr,Outages_in_last_7hr,Outages_in_last_8hr,Outages_in_last_9hr,Outages_in_last_10hr,Weekday,Weekend_flag,Live_outage_group,Priority_Customer_Qty,Dis_From_Live_Centriod_div_Cust_qty,Priority_Dist_Customer_Qty,Dispatch_Location,Date
0,2002769033,2001558502,324-A/251,2020-11-15 00:24:21,2020-11-15 00:37:35,2304,1TPUG,230,1,7,0,2020-11-15 05:00:00,7,50.0,50.0,0,13.23,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,INDIANAPOLIS,39.86,-86.27,ZONE4,514,4,186,2,96,Marker6,0.0,1.0,1.0,0,1,2,3,3,4,15,35,40,51,64,Sunday,True,325,1.0,0.0,1.0,34th,2020-11-15
1,2002769124,2001558508,783-A/1,2020-11-15 02:58:44,2020-11-15 06:08:43,2255,SWITCH,225,69,169,0,2020-11-15 05:30:00,169,638.33,638.33,0,189.98,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,INDIANAPOLIS,39.65,-86.18,ZONE1,514,4,186,2,96,Marker15,0.5,0.87,2.0,0,0,0,1,3,3,4,4,9,26,39,Sunday,True,326,1.0,0.0,1.0,W.I.,2020-11-15
2,2002769107,2001558508,EDIT2000064927,2020-11-15 02:58:44,2020-11-15 04:21:31,2255,CUT,225,35,95,1,2020-11-15 08:00:00,95,1100.0,1100.0,0,82.78,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,INDIANAPOLIS,39.65,-86.18,ZONE1,514,4,186,2,96,Marker15,0.5,0.87,2.0,0,0,0,1,3,3,4,4,9,26,39,Sunday,True,327,1.0,0.0,1.0,W.I.,2020-11-15
3,2002769106,2001558508,EDIT2000064926,2020-11-15 02:58:44,2020-11-15 04:21:53,2255,CUT,225,33,120,1,2020-11-15 08:00:00,120,1215.0,1215.0,0,83.15,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,INDIANAPOLIS,39.65,-86.18,ZONE1,514,4,186,2,96,Marker15,0.5,0.87,2.0,0,0,0,1,3,3,4,4,9,26,39,Sunday,True,328,1.0,0.0,1.0,W.I.,2020-11-15
4,2002769105,2001558508,EDIT2000064928,2020-11-15 02:58:44,2020-11-15 04:21:11,2255,CUT,225,30,79,1,2020-11-15 08:00:00,79,825.0,825.0,0,82.45,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,INDIANAPOLIS,39.65,-86.18,ZONE1,514,4,186,2,96,Marker15,0.5,0.87,2.0,0,0,0,1,3,3,4,4,9,26,39,Sunday,True,329,1.0,0.0,1.0,W.I.,2020-11-15


## **Weather data addition**

In [95]:
try:
    from pandas import json_normalize
except ImportError:  # older pandas only exposes it under pandas.io.json
    from pandas.io.json import json_normalize


def ws_historical_data(start, lat, long, period='day', fields='all'):
    '''
    Fetch Weather Source historical weather for one latitude/longitude point
    and one timestamp.

    Input :
    start - date to query, (%Y-%m-%d) format; it is sent to the API as an
            ISO 8601 timestamp (%Y-%m-%dT%H:%M:%S)
    lat - latitude (string or number)
    long - longitude (string or number)
    period - hour, day (default=day)
    fields - value of the `fields` query parameter (default=all)

    Output : DataFrame produced by json_normalize from the JSON response

    Raises : requests.HTTPError when the API answers with an error status,
             requests.Timeout when no answer arrives within the timeout

    '''
    headers = {'User-Agent': 'Chrome/78and.0.3865.90'}
    http_proxy  = "http://10.245.5.249:8080"
    https_proxy = "https://10.245.5.249:8080"
    ftp_proxy   = "ftp://10.245.5.249:8080"

    proxyDict = {
                "http"  : http_proxy,
                "https" : https_proxy,
                "ftp"   : ftp_proxy
                 }

    # NOTE(review): the API key is hard-coded and is echoed by the print below;
    # it should come from a secret store or environment variable instead.
    key = 'e721181f854ac2268ee8'
    start = pd.to_datetime(start, format='%Y-%m-%d').strftime('%Y-%m-%dT%H:%M:%S')

    # format() also accepts numeric lat/long, not only strings.
    link = ('https://api.weathersource.com/v1/{}/points/{},{}/history.json'
            '?period={}&timestamp_eq={}&fields={}').format(key, lat, long, period, start, fields)
    print(link)

    # Bound the wait so a stalled connection cannot hang the whole download
    # loop, and fail loudly on an HTTP error instead of normalising the error
    # payload into a weather frame.
    response = requests.get(link, headers=headers, proxies=proxyDict, timeout=60)
    response.raise_for_status()
    json_obj = json.loads(response.content.decode('utf-8'))

    return json_normalize(json_obj)

In [96]:
date_list = list(ads_final_v1['Date'].unique())
print(date_list)

# latitude of the location markers
sites_latitude = {
    'Marker 1' : '39.9613','Marker 2' : '39.8971','Marker 3' : '39.9060','Marker 4' : '39.9024','Marker 5' : '39.8960','Marker 6' : '39.8339',
    'Marker 7' : '39.8412','Marker 8' : '39.8381','Marker 9' : '39.8386','Marker 10' : '39.7579','Marker 11' : '39.7621','Marker 12' : '39.7621',
    'Marker 13' : '39.7695','Marker 14' : '39.6617','Marker 15' : '39.6639','Marker 16' : '39.6702','Marker 17' : '39.6744','Marker 18' : '39.5909',
    'Marker 19' : '39.5295','Marker 20' : '39.5475'
    }

# longitude of the location markers
sites_longitude = {
    'Marker 1' : '-86.4034','Marker 2' : '-86.3045','Marker 3' : '-86.2001','Marker 4' : '-86.0738','Marker 5' : '-85.9783','Marker 6' : '-86.3155',
    'Marker 7' : '-86.2056','Marker 8' : '-86.0985','Marker 9' : '-85.9811','Marker 10' : '-86.3155','Marker 11' : '-86.2042','Marker 12' : '-86.0923',
    'Marker 13' : '-85.9708','Marker 14' : '-86.2935','Marker 15' : '-86.1823','Marker 16' : '-86.0669','Marker 17' : '-85.9557','Marker 18' : '-86.4212',
    'Marker 19' : '-86.5874','Marker 20' : '-86.2743'
    }


location_marker = ['Marker 1', 'Marker 2', 'Marker 3', 'Marker 4', 'Marker 5', 'Marker 6', 'Marker 7', 'Marker 8', 'Marker 9', 'Marker 10',
                   'Marker 11', 'Marker 12', 'Marker 13', 'Marker 14', 'Marker 15', 'Marker 16', 'Marker 17', 'Marker 18', 'Marker 19', 'Marker 20']

# One weather frame per (date, marker) request. The frames are collected in a
# single list and concatenated exactly once after all requests have finished:
# concatenating the ever-growing list inside the date loop and appending the
# result to ws_master duplicated the rows of every earlier date.
waethersourcefiles_historical = []
for j in date_list:
    print(j)
    for i in location_marker:
        time.sleep(2)  # pause between API calls (presumably to respect a rate limit)
        value1 = sites_latitude.get(i)
        value2 = sites_longitude.get(i)
        waethersource_data_historical = ws_historical_data(start=j, lat=value1, long=value2, period='day')
        waethersource_data_historical['Location'] = i
        waethersourcefiles_historical.append(waethersource_data_historical)

waethersource_df_his = pd.concat(waethersourcefiles_historical)
waethersource_df_his.reset_index(drop=True, inplace=True)

# ws_master is a separate copy so converting its timestamp does not touch
# waethersource_df_his.
ws_master = waethersource_df_his.copy()
ws_master['timestamp'] = pd.to_datetime(ws_master['timestamp']).dt.date

['2020-11-15']
2020-11-15
https://api.weathersource.com/v1/e721181f854ac2268ee8/points/39.9613,-86.4034/history.json?period=day&timestamp_eq=2020-11-15T00:00:00&fields=all
https://api.weathersource.com/v1/e721181f854ac2268ee8/points/39.8971,-86.3045/history.json?period=day&timestamp_eq=2020-11-15T00:00:00&fields=all
https://api.weathersource.com/v1/e721181f854ac2268ee8/points/39.9060,-86.2001/history.json?period=day&timestamp_eq=2020-11-15T00:00:00&fields=all
https://api.weathersource.com/v1/e721181f854ac2268ee8/points/39.9024,-86.0738/history.json?period=day&timestamp_eq=2020-11-15T00:00:00&fields=all
https://api.weathersource.com/v1/e721181f854ac2268ee8/points/39.8960,-85.9783/history.json?period=day&timestamp_eq=2020-11-15T00:00:00&fields=all
https://api.weathersource.com/v1/e721181f854ac2268ee8/points/39.8339,-86.3155/history.json?period=day&timestamp_eq=2020-11-15T00:00:00&fields=all
https://api.weathersource.com/v1/e721181f854ac2268ee8/points/39.8412,-86.2056/history.json?period=

In [97]:
print(ws_master.shape)

(20, 59)


In [98]:
ws_master.head()

Unnamed: 0,latitude,longitude,timestamp,cldCvrMin,cldCvrAvg,cldCvrMax,dewPtMin,dewPtAvg,dewPtMax,feelsLikeMin,feelsLikeAvg,feelsLikeMax,heatIndexMin,heatIndexAvg,heatIndexMax,mslPresMin,mslPresAvg,mslPresMax,precip,presTendMin,presTendAvg,presTendMax,radSolarMin,radSolarAvg,radSolarMax,radSolarTot,relHumMin,relHumAvg,relHumMax,sfcPresMin,sfcPresAvg,sfcPresMax,snowDepth,snowfall,spcHumMin,spcHumAvg,spcHumMax,tempMin,tempAvg,tempMax,windChillMin,windChillAvg,windChillMax,windDirAvg,windDir80mAvg,windDir100mAvg,windSpdMin,windSpdAvg,windSpdMax,windSpd80mMin,windSpd80mAvg,windSpd80mMax,windSpd100mMin,windSpd100mAvg,windSpd100mMax,wetBulbMin,wetBulbAvg,wetBulbMax,Location
0,39.96,-86.4,2020-11-15,0,46,100,29.6,39.7,53.1,31.7,39.4,50.8,40.1,46.7,54.7,999.5,1008.4,1018.8,0.64,-2.7,0.5,3.4,0,49.0,204.0,1175.0,65.6,77.0,100.0,963.7,973.5,983.4,0,0,3.5,5.6,8.9,40.1,46.7,54.7,31.7,39.4,50.8,247,244,244,9.4,20.0,31.7,16.3,30.0,41.5,16.0,30.6,38.8,36.1,43.6,53.6,Marker 1
1,39.9,-86.3,2020-11-15,0,48,100,29.3,39.6,53.2,33.3,41.2,52.4,41.2,47.7,56.8,999.8,1008.6,1018.9,0.57,-2.7,0.5,3.7,0,55.3,238.6,1327.8,61.0,74.3,95.6,966.7,976.3,986.2,0,0,3.5,5.6,8.9,41.2,47.7,56.8,33.3,41.2,52.4,248,245,244,7.8,17.7,29.7,14.7,28.8,40.4,13.9,29.1,36.4,36.6,44.1,53.8,Marker 2
2,39.91,-86.2,2020-11-15,0,51,100,29.1,39.7,52.7,34.3,42.3,54.5,42.1,48.3,57.9,999.7,1008.6,1018.8,0.41,-2.7,0.5,3.7,0,53.9,268.8,1293.6,60.0,73.0,93.0,968.1,977.3,987.2,0,0,3.4,5.6,8.8,42.1,48.3,57.9,34.3,42.3,54.5,246,244,244,3.8,15.9,26.3,13.6,28.3,40.2,13.2,28.2,35.3,37.2,44.5,53.7,Marker 3
3,39.9,-86.07,2020-11-15,0,52,100,29.4,40.0,52.7,35.3,43.0,55.1,42.6,48.8,58.5,999.6,1008.5,1018.8,0.54,-2.8,0.5,3.6,0,49.5,286.7,1188.6,59.1,72.1,92.9,970.3,979.6,989.6,0,0,3.5,5.6,8.7,42.6,48.8,58.5,35.3,43.0,55.1,245,243,243,4.8,15.8,25.3,12.9,28.0,39.8,12.9,27.6,35.5,37.6,44.8,53.7,Marker 4
4,39.9,-85.98,2020-11-15,0,54,100,29.3,39.9,52.5,35.0,42.7,53.5,42.3,48.7,57.5,999.6,1008.6,1018.7,0.54,-2.4,0.5,3.2,0,48.1,307.0,1153.6,60.0,72.0,93.9,969.4,978.2,988.2,0,0,3.5,5.6,8.7,42.3,48.7,57.5,35.0,42.7,53.5,244,240,241,4.3,16.5,26.3,14.2,28.4,39.8,14.5,28.1,35.9,37.4,44.7,53.2,Marker 5


In [99]:
print(list(ws_master.columns))

['latitude', 'longitude', 'timestamp', 'cldCvrMin', 'cldCvrAvg', 'cldCvrMax', 'dewPtMin', 'dewPtAvg', 'dewPtMax', 'feelsLikeMin', 'feelsLikeAvg', 'feelsLikeMax', 'heatIndexMin', 'heatIndexAvg', 'heatIndexMax', 'mslPresMin', 'mslPresAvg', 'mslPresMax', 'precip', 'presTendMin', 'presTendAvg', 'presTendMax', 'radSolarMin', 'radSolarAvg', 'radSolarMax', 'radSolarTot', 'relHumMin', 'relHumAvg', 'relHumMax', 'sfcPresMin', 'sfcPresAvg', 'sfcPresMax', 'snowDepth', 'snowfall', 'spcHumMin', 'spcHumAvg', 'spcHumMax', 'tempMin', 'tempAvg', 'tempMax', 'windChillMin', 'windChillAvg', 'windChillMax', 'windDirAvg', 'windDir80mAvg', 'windDir100mAvg', 'windSpdMin', 'windSpdAvg', 'windSpdMax', 'windSpd80mMin', 'windSpd80mAvg', 'windSpd80mMax', 'windSpd100mMin', 'windSpd100mAvg', 'windSpd100mMax', 'wetBulbMin', 'wetBulbAvg', 'wetBulbMax', 'Location']


In [100]:
# Indicator columns that are recoded from 0/1 to booleans.
columns = ['DAY_FLAG','POLE_CLUE_FLG', 'PART_LIGHT_CLUE_FLG', 'EMERGENCY_CLUE_FLG', 'POWER_OUT_CLUE_FLG', 'OPEN_DEVICE_CLUE_FLG', 'TREE_CLUE_FLG', 'WIRE_DOWN_CLUE_FLG',
           'IVR_CLUE_FLG', 'EQUIPMENT_CLUE_FLG', 'TRANSFORMER_CLUE_FLG','OH_CAUSE_FLG', 'UG_CAUSE_FLG', 'ANIMAL_CAUSE_FLG', 'WEATHER_CAUSE_FLG', 'WEATHER_COLD_CAUSE_FLG',
           'PUBLIC_CAUSE_FLG','WEATHER_LIGHTNING_CAUSE_FLG', 'WEATHER__SNOW_CAUSE_FLG', 'WEATHER__WIND_CAUSE_FLG', 'WEATHER__HEAT_CAUSE_FLG','CUST_REQUEST_CAUSE_FLG',
           'WEATHER__FLOOD_CAUSE_FLG','STREET_CAUSE_FLG', 'SUBSTATION_CAUSE_FLG','TREE_CAUSE_FLG', 'MISCELLANEOUS_CAUSE_FLG', 'NO_CAUSE_FLG', 'PLANNED_CAUSE_FLG',
           'NO_OUTAGE_CAUSE_FLG', 'FUSE_OCCURN_FLG', 'CUST_EQUIP_OCCURN_FLG', 'POLE_OCCURN_FLG','TRANSFORMER_OCCURN_FLG', 'METER_OCCURN_FLG', 'SERVICE_OCCURN_FLG',
           'CABLE_OCCURN_FLG', 'ST_OCCURN_FLG', 'FIRE_OCCURN_FLG', 'FOUND_OPEN_OCCURN_FLG', 'PUBLIC_SAFETY_OCCURN_FLG', 'WIRE_OCCURN_FLG', 'SWITCH_OCCURN_FLG',
           'REGULATOR_OCCURN_FLG', 'CUTOUT_OCCURN_FLG', 'CAP_BANK_OCCURN_FLG', 'RECLOSER_OCCURN_FLG', 'OH_OCCURN_FLG', 'PRIORITY_VAL_1.0', 'PRIORITY_VAL_2.0',
           'PRIORITY_VAL_3.0', 'PRIORITY_VAL_5.0']
# One vectorised comparison for all columns: True where the value equals 1,
# False for everything else (including missing values).
ads_final_v1[columns] = ads_final_v1[columns].eq(1)

In [101]:
# Remove the spaces from the marker names ('Marker 1' -> 'Marker1').
ws_master['Location'] = ws_master['Location'].str.replace(" ","")

In [102]:
len(ws_master), len(ws_master.drop_duplicates())

(20, 20)

In [103]:
ws_master.head(2)

Unnamed: 0,latitude,longitude,timestamp,cldCvrMin,cldCvrAvg,cldCvrMax,dewPtMin,dewPtAvg,dewPtMax,feelsLikeMin,feelsLikeAvg,feelsLikeMax,heatIndexMin,heatIndexAvg,heatIndexMax,mslPresMin,mslPresAvg,mslPresMax,precip,presTendMin,presTendAvg,presTendMax,radSolarMin,radSolarAvg,radSolarMax,radSolarTot,relHumMin,relHumAvg,relHumMax,sfcPresMin,sfcPresAvg,sfcPresMax,snowDepth,snowfall,spcHumMin,spcHumAvg,spcHumMax,tempMin,tempAvg,tempMax,windChillMin,windChillAvg,windChillMax,windDirAvg,windDir80mAvg,windDir100mAvg,windSpdMin,windSpdAvg,windSpdMax,windSpd80mMin,windSpd80mAvg,windSpd80mMax,windSpd100mMin,windSpd100mAvg,windSpd100mMax,wetBulbMin,wetBulbAvg,wetBulbMax,Location
0,39.96,-86.4,2020-11-15,0,46,100,29.6,39.7,53.1,31.7,39.4,50.8,40.1,46.7,54.7,999.5,1008.4,1018.8,0.64,-2.7,0.5,3.4,0,49.0,204.0,1175.0,65.6,77.0,100.0,963.7,973.5,983.4,0,0,3.5,5.6,8.9,40.1,46.7,54.7,31.7,39.4,50.8,247,244,244,9.4,20.0,31.7,16.3,30.0,41.5,16.0,30.6,38.8,36.1,43.6,53.6,Marker1
1,39.9,-86.3,2020-11-15,0,48,100,29.3,39.6,53.2,33.3,41.2,52.4,41.2,47.7,56.8,999.8,1008.6,1018.9,0.57,-2.7,0.5,3.7,0,55.3,238.6,1327.8,61.0,74.3,95.6,966.7,976.3,986.2,0,0,3.5,5.6,8.9,41.2,47.7,56.8,33.3,41.2,52.4,248,245,244,7.8,17.7,29.7,14.7,28.8,40.4,13.9,29.1,36.4,36.6,44.1,53.8,Marker2


## **ADDING WEATHER FEATURES**

In [104]:
# removing unwanted columns
newdf_ws = ws_master.copy(deep=True)
newdf_ws = newdf_ws.drop(['snowDepth'], axis = 1)
unwanted = newdf_ws.columns[newdf_ws.columns.str.startswith('presTend')]
newdf_ws = newdf_ws.drop(unwanted, axis=1)

In [105]:
print(list(newdf_ws.columns))

['latitude', 'longitude', 'timestamp', 'cldCvrMin', 'cldCvrAvg', 'cldCvrMax', 'dewPtMin', 'dewPtAvg', 'dewPtMax', 'feelsLikeMin', 'feelsLikeAvg', 'feelsLikeMax', 'heatIndexMin', 'heatIndexAvg', 'heatIndexMax', 'mslPresMin', 'mslPresAvg', 'mslPresMax', 'precip', 'radSolarMin', 'radSolarAvg', 'radSolarMax', 'radSolarTot', 'relHumMin', 'relHumAvg', 'relHumMax', 'sfcPresMin', 'sfcPresAvg', 'sfcPresMax', 'snowfall', 'spcHumMin', 'spcHumAvg', 'spcHumMax', 'tempMin', 'tempAvg', 'tempMax', 'windChillMin', 'windChillAvg', 'windChillMax', 'windDirAvg', 'windDir80mAvg', 'windDir100mAvg', 'windSpdMin', 'windSpdAvg', 'windSpdMax', 'windSpd80mMin', 'windSpd80mAvg', 'windSpd80mMax', 'windSpd100mMin', 'windSpd100mAvg', 'windSpd100mMax', 'wetBulbMin', 'wetBulbAvg', 'wetBulbMax', 'Location']


In [106]:
## Daily spread (max - min) for each of the weather measures below

for measure in ('temp', 'windSpd', 'sfcPres', 'cldCvr', 'relHum'):
    newdf_ws['{}Range'.format(measure)] = newdf_ws['{}Max'.format(measure)] - newdf_ws['{}Min'.format(measure)]

In [107]:
## Max-to-min ratio for relative humidity and surface pressure

newdf_ws['relHumRatio'] = newdf_ws['relHumMax'].div(newdf_ws['relHumMin'])
newdf_ws['sfcPresRatio'] = newdf_ws['sfcPresMax'].div(newdf_ws['sfcPresMin'])

In [108]:
# Treat +/-inf as missing, then list every column that has at least one missing value.
newdf_ws = newdf_ws.replace([np.inf, -np.inf], np.nan)
nulls = newdf_ws.isna().sum()

df_nulls = pd.DataFrame({'Feature': nulls.index, 'VALUES': nulls.values})
df_nulls[df_nulls['VALUES'] >= 1]

Unnamed: 0,Feature,VALUES


In [109]:
# Print the dtype of each column used as a merge key, left/right pairs in order.
for frame, key in (
    (ads_final_v1, 'Date'),
    (newdf_ws, 'timestamp'),
    (ads_final_v1, 'Marker_Location'),
    (newdf_ws, 'Location'),
):
    print(frame[key].dtype)

object
object
object
object


In [110]:
# Convert both date key columns to datetime so they compare equal in the merge.
for frame, key in ((ads_final_v1, 'Date'), (newdf_ws, 'timestamp')):
    frame[key] = pd.to_datetime(frame[key])

In [111]:
# Left join: keep every ads_final_v1 row and attach the newdf_ws row matching
# on (Date, Marker_Location) == (timestamp, Location).
ads_df = ads_final_v1.merge(
    newdf_ws,
    how="left",
    left_on=['Date', 'Marker_Location'],
    right_on=['timestamp', 'Location'],
)

In [112]:
# Preview the first five merged rows.
ads_df.head(n=5)

Unnamed: 0,OUTAGE_ID,INCIDENT_ID,STRCTUR_NO,CREATION_DATETIME,ENERGIZED_DATETIME,CIRCT_ID,DNI_EQUIP_TYPE,SUBST_ID,CALL_QTY,DOWNSTREAM_CUST_QTY,KEY_CUST_QTY,ETR_DATETIME,CUST_QTY,DOWNSTREAM_KVA_VAL,KVA_VAL,DAY_FLAG,TTR,MAJ_OTG_ID,POLE_CLUE_FLG,PART_LIGHT_CLUE_FLG,EMERGENCY_CLUE_FLG,POWER_OUT_CLUE_FLG,OPEN_DEVICE_CLUE_FLG,TREE_CLUE_FLG,WIRE_DOWN_CLUE_FLG,IVR_CLUE_FLG,EQUIPMENT_CLUE_FLG,TRANSFORMER_CLUE_FLG,OH_CAUSE_FLG,UG_CAUSE_FLG,ANIMAL_CAUSE_FLG,WEATHER_CAUSE_FLG,WEATHER_COLD_CAUSE_FLG,PUBLIC_CAUSE_FLG,WEATHER_LIGHTNING_CAUSE_FLG,WEATHER__SNOW_CAUSE_FLG,WEATHER__WIND_CAUSE_FLG,WEATHER__HEAT_CAUSE_FLG,CUST_REQUEST_CAUSE_FLG,WEATHER__FLOOD_CAUSE_FLG,STREET_CAUSE_FLG,SUBSTATION_CAUSE_FLG,TREE_CAUSE_FLG,MISCELLANEOUS_CAUSE_FLG,NO_CAUSE_FLG,PLANNED_CAUSE_FLG,NO_OUTAGE_CAUSE_FLG,FUSE_OCCURN_FLG,CUST_EQUIP_OCCURN_FLG,POLE_OCCURN_FLG,TRANSFORMER_OCCURN_FLG,METER_OCCURN_FLG,SERVICE_OCCURN_FLG,CABLE_OCCURN_FLG,ST_OCCURN_FLG,FIRE_OCCURN_FLG,FOUND_OPEN_OCCURN_FLG,PUBLIC_SAFETY_OCCURN_FLG,WIRE_OCCURN_FLG,SWITCH_OCCURN_FLG,REGULATOR_OCCURN_FLG,CUTOUT_OCCURN_FLG,CAP_BANK_OCCURN_FLG,RECLOSER_OCCURN_FLG,OH_OCCURN_FLG,PRIORITY_VAL_1.0,PRIORITY_VAL_2.0,PRIORITY_VAL_3.0,PRIORITY_VAL_5.0,CITY_NAM,LAT,LONG,ZONE,NO_OF_POWER_OUT_CLUE_PER_DAY,NO_OF_OPEN_DEVICE_CLUE_PER_DAY,NO_OF_IVR_CLUE_PER_DAY,NO_OF_ANIMAL_CAUSE_PER_DAY,NO_OF_WIRE_OCCURN_PER_DAY,Marker_Location,Hour_Sin,Hour_Cos,RANK_SUBSEQUENT_OUTAGES,LIVE_OUTAGE,Outages_in_last_1hr,Outages_in_last_2hr,Outages_in_last_3hr,Outages_in_last_4hr,Outages_in_last_5hr,Outages_in_last_6hr,Outages_in_last_7hr,Outages_in_last_8hr,Outages_in_last_9hr,Outages_in_last_10hr,Weekday,Weekend_flag,Live_outage_group,Priority_Customer_Qty,Dis_From_Live_Centriod_div_Cust_qty,Priority_Dist_Customer_Qty,Dispatch_Location,Date,latitude,longitude,timestamp,cldCvrMin,cldCvrAvg,cldCvrMax,dewPtMin,dewPtAvg,dewPtMax,feelsLikeMin,feelsLikeAvg,feelsLikeMax,heatIndexMin,heatIndexAvg,heatIndexMax,mslPresMin,mslPresAvg,mslPresMax,precip,radSolarMin,radSolarAvg,radSolar
Max,radSolarTot,relHumMin,relHumAvg,relHumMax,sfcPresMin,sfcPresAvg,sfcPresMax,snowfall,spcHumMin,spcHumAvg,spcHumMax,tempMin,tempAvg,tempMax,windChillMin,windChillAvg,windChillMax,windDirAvg,windDir80mAvg,windDir100mAvg,windSpdMin,windSpdAvg,windSpdMax,windSpd80mMin,windSpd80mAvg,windSpd80mMax,windSpd100mMin,windSpd100mAvg,windSpd100mMax,wetBulbMin,wetBulbAvg,wetBulbMax,Location,tempRange,windSpdRange,sfcPresRange,cldCvrRange,relHumRange,relHumRatio,sfcPresRatio
0,2002769033,2001558502,324-A/251,2020-11-15 00:24:21,2020-11-15 00:37:35,2304,1TPUG,230,1,7,0,2020-11-15 05:00:00,7,50.0,50.0,False,13.23,0,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,INDIANAPOLIS,39.86,-86.27,ZONE4,514,4,186,2,96,Marker6,0.0,1.0,1.0,0,1,2,3,3,4,15,35,40,51,64,Sunday,True,325,1.0,0.0,1.0,34th,2020-11-15,39.83,-86.32,2020-11-15,0,48,100,29.1,39.4,53.1,34.3,41.9,53.8,41.6,48.2,57.8,1000.0,1008.8,1019.0,0.31,0,59.5,259.3,1428.8,58.2,72.1,95.6,967.7,977.4,987.4,0,3.4,5.5,8.9,41.6,48.2,57.8,34.3,41.9,53.8,246,244,244,4.7,17.1,28.5,13.7,28.4,39.9,13.0,28.5,35.7,36.8,44.3,53.8,Marker6,16.2,23.8,19.7,100,37.4,1.64,1.02
1,2002769124,2001558508,783-A/1,2020-11-15 02:58:44,2020-11-15 06:08:43,2255,SWITCH,225,69,169,0,2020-11-15 05:30:00,169,638.33,638.33,False,189.98,0,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,True,False,False,INDIANAPOLIS,39.65,-86.18,ZONE1,514,4,186,2,96,Marker15,0.5,0.87,2.0,0,0,0,1,3,3,4,4,9,26,39,Sunday,True,326,1.0,0.0,1.0,W.I.,2020-11-15,39.66,-86.18,2020-11-15,0,44,100,30.6,40.1,54.3,36.4,43.7,57.0,42.3,49.5,59.9,1000.2,1009.1,1019.2,0.49,0,65.9,320.0,1580.9,56.8,70.3,95.4,973.2,982.9,993.0,0,3.6,5.6,9.2,42.3,49.5,59.9,36.4,43.7,57.0,238,241,243,5.7,17.6,30.1,14.8,28.1,38.5,14.1,27.1,35.5,37.7,45.3,54.8,Marker15,17.6,24.4,19.8,100,38.6,1.68,1.02
2,2002769107,2001558508,EDIT2000064927,2020-11-15 02:58:44,2020-11-15 04:21:31,2255,CUT,225,35,95,1,2020-11-15 08:00:00,95,1100.0,1100.0,False,82.78,0,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,True,False,False,INDIANAPOLIS,39.65,-86.18,ZONE1,514,4,186,2,96,Marker15,0.5,0.87,2.0,0,0,0,1,3,3,4,4,9,26,39,Sunday,True,327,1.0,0.0,1.0,W.I.,2020-11-15,39.66,-86.18,2020-11-15,0,44,100,30.6,40.1,54.3,36.4,43.7,57.0,42.3,49.5,59.9,1000.2,1009.1,1019.2,0.49,0,65.9,320.0,1580.9,56.8,70.3,95.4,973.2,982.9,993.0,0,3.6,5.6,9.2,42.3,49.5,59.9,36.4,43.7,57.0,238,241,243,5.7,17.6,30.1,14.8,28.1,38.5,14.1,27.1,35.5,37.7,45.3,54.8,Marker15,17.6,24.4,19.8,100,38.6,1.68,1.02
3,2002769106,2001558508,EDIT2000064926,2020-11-15 02:58:44,2020-11-15 04:21:53,2255,CUT,225,33,120,1,2020-11-15 08:00:00,120,1215.0,1215.0,False,83.15,0,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,True,False,False,INDIANAPOLIS,39.65,-86.18,ZONE1,514,4,186,2,96,Marker15,0.5,0.87,2.0,0,0,0,1,3,3,4,4,9,26,39,Sunday,True,328,1.0,0.0,1.0,W.I.,2020-11-15,39.66,-86.18,2020-11-15,0,44,100,30.6,40.1,54.3,36.4,43.7,57.0,42.3,49.5,59.9,1000.2,1009.1,1019.2,0.49,0,65.9,320.0,1580.9,56.8,70.3,95.4,973.2,982.9,993.0,0,3.6,5.6,9.2,42.3,49.5,59.9,36.4,43.7,57.0,238,241,243,5.7,17.6,30.1,14.8,28.1,38.5,14.1,27.1,35.5,37.7,45.3,54.8,Marker15,17.6,24.4,19.8,100,38.6,1.68,1.02
4,2002769105,2001558508,EDIT2000064928,2020-11-15 02:58:44,2020-11-15 04:21:11,2255,CUT,225,30,79,1,2020-11-15 08:00:00,79,825.0,825.0,False,82.45,0,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,True,False,False,INDIANAPOLIS,39.65,-86.18,ZONE1,514,4,186,2,96,Marker15,0.5,0.87,2.0,0,0,0,1,3,3,4,4,9,26,39,Sunday,True,329,1.0,0.0,1.0,W.I.,2020-11-15,39.66,-86.18,2020-11-15,0,44,100,30.6,40.1,54.3,36.4,43.7,57.0,42.3,49.5,59.9,1000.2,1009.1,1019.2,0.49,0,65.9,320.0,1580.9,56.8,70.3,95.4,973.2,982.9,993.0,0,3.6,5.6,9.2,42.3,49.5,59.9,36.4,43.7,57.0,238,241,243,5.7,17.6,30.1,14.8,28.1,38.5,14.1,27.1,35.5,37.7,45.3,54.8,Marker15,17.6,24.4,19.8,100,38.6,1.68,1.02


In [113]:
# Row count before and after de-duplication; equal numbers mean no duplicate rows.
print(len(ads_df))
print(ads_df.drop_duplicates().shape[0])

523
523


In [114]:
# Keep the first occurrence of each duplicated row and renumber the index from 0.
ads_df = ads_df.drop_duplicates(keep='first').reset_index(drop=True)
print(ads_df.shape)

(523, 163)


In [115]:
# Columns not kept in the final data frame.
final_drop_cols = [
    'DOWNSTREAM_KVA_VAL',
    'KVA_VAL',
    'latitude',
    'longitude',
    'timestamp',
    'radSolarMin',
    'Location',
]

ads_df = ads_df.drop(columns=final_drop_cols)

In [116]:
# De-duplicate again after the column drop and renumber the index from 0.
ads_df = ads_df.drop_duplicates().reset_index(drop=True)
print(ads_df.shape)

(523, 156)


In [117]:
# Discard rows where either coordinate (LAT or LONG) is missing.
ads_df = ads_df.dropna(axis='index', how='any', subset=['LAT', 'LONG'])

In [118]:
def _safe_slope(x, y):
    """Return the ``linregress`` slope of ``y`` on ``x``, or NaN if ``x`` is constant.

    With fewer than two points, or with all ``x`` values identical, the slope
    is undefined. Recent SciPy releases raise ``ValueError`` for that input,
    so NaN is returned instead and the caller can drop the row with ``dropna``.
    """
    x = np.asarray(x, dtype=float)
    if x.size < 2 or np.all(x == x[0]):
        return np.nan
    return linregress(x, y).slope


def feature_add(group):
    """Append outage-trend features to one group of rows.

    Every feature is computed from the ``Outages_in_last_1hr`` ...
    ``Outages_in_last_10hr`` values of the group's FIRST row (after the index
    is reset) and then broadcast to all rows of the group.

    Added columns:
        Slope_outages: slope of ``linregress(counts, [1..10])``.
        Slope_ROC_outages: slope of ``linregress(ROC, [1..9])``, where ROC is
            the absolute difference between consecutive hourly windows.
        max_ROC_outages: largest ROC value.
        max_index_ROC: 1-based position of the first occurrence of that maximum.
        weight_ROC: ``max_ROC_outages * max_index_ROC * 0.1``.

    Both slopes are NaN when their regressor is constant (for example when
    all ten counts are equal).

    Parameters:
        group: DataFrame containing the ten ``Outages_in_last_<h>hr`` columns.

    Returns:
        A copy of ``group`` with a fresh 0..n-1 index and the five columns above.
    """
    group = group.reset_index(drop=True)

    # Counts for the 1h..10h look-back windows, taken from the first row only.
    counts = [group['Outages_in_last_{}hr'.format(h)][0] for h in range(1, 11)]

    # NOTE(review): the counts are passed as x and the window index as y, so this
    # is the slope of "hour index vs. count" rather than "count vs. hour index".
    # Left unchanged to keep the feature identical to earlier runs; confirm intent.
    group['Slope_outages'] = _safe_slope(counts, list(range(1, 11)))

    # Absolute change between each pair of adjacent windows (1h-2h, ..., 9h-10h).
    ROC = [abs(near - far) for near, far in zip(counts, counts[1:])]
    group['Slope_ROC_outages'] = _safe_slope(ROC, list(range(1, 10)))

    peak = max(ROC)
    group['max_ROC_outages'] = peak
    group['max_index_ROC'] = 1 + ROC.index(peak)
    group['weight_ROC'] = group['max_ROC_outages'] * group['max_index_ROC'] * 0.1
    return group

In [119]:
# Run feature_add once per OUTAGE_ID group, then flatten the result back to a 0..n-1 index.
outage_groups = ads_df.groupby(['OUTAGE_ID'], as_index=False)
ads_df = outage_groups.apply(feature_add)
ads_df = ads_df.reset_index(drop=True)

In [120]:
# Remove rows for which either slope feature is missing.
slope_columns = ['Slope_outages', 'Slope_ROC_outages']
ads_df = ads_df.dropna(axis='index', how='any', subset=slope_columns)

In [121]:
# Missing-value count for every column of the final frame.
ads_df.isna().sum()

OUTAGE_ID                              0
INCIDENT_ID                            0
STRCTUR_NO                             0
CREATION_DATETIME                      0
ENERGIZED_DATETIME                     0
CIRCT_ID                               0
DNI_EQUIP_TYPE                         0
SUBST_ID                               0
CALL_QTY                               0
DOWNSTREAM_CUST_QTY                    0
KEY_CUST_QTY                           0
ETR_DATETIME                           0
CUST_QTY                               0
DAY_FLAG                               0
TTR                                    0
MAJ_OTG_ID                             0
POLE_CLUE_FLG                          0
PART_LIGHT_CLUE_FLG                    0
EMERGENCY_CLUE_FLG                     0
POWER_OUT_CLUE_FLG                     0
OPEN_DEVICE_CLUE_FLG                   0
TREE_CLUE_FLG                          0
WIRE_DOWN_CLUE_FLG                     0
IVR_CLUE_FLG                           0
EQUIPMENT_CLUE_F

In [122]:
# Write the full frame (before the modelling-column drop below) to the curated bucket, without the index.
master_dataset_uri = "gs://aes-datahub-0002-curated/Outage_Restoration/IPL_Master_Dataset/OMS_All_Storm.csv"
ads_df.to_csv(master_dataset_uri, index=False)

### **Drop columns that will not be used for modelling**

In [None]:
# Columns excluded from the modelling data set; dropped in place.
non_model_columns = [
    'OUTAGE_ID',
    'INCIDENT_ID',
    'STRCTUR_NO',
    'ENERGIZED_DATETIME',
    'CIRCT_ID',
    'SUBST_ID',
    'ETR_DATETIME',
    'CUST_QTY',
    'LAT',
    'LONG',
    'LIVE_OUTAGE',
    'Live_outage_group',
    'Marker_Location',
    'Dis_From_Live_Centriod_div_Cust_qty',
]
ads_df.drop(columns=non_model_columns, inplace=True)

In [None]:
# Show the frame's (rows, columns) shape.
n_rows, n_cols = ads_df.shape
print((n_rows, n_cols))

## **WRITE TO CSV**

In [None]:
# Write the modelling frame to the curated bucket, without the index.
# NOTE(review): the file name carries a fixed date stamp (09092020) - confirm it is still the intended target.
storm_dataset_uri = "gs://aes-datahub-0002-curated/Outage_Restoration/Historical_Data/Master_Dataset/OMS_STORMS_09092020.csv"
ads_df.to_csv(storm_dataset_uri, index=False)