## **Import Required packages**

In [1]:
'''
PYHTON SCRIPT TO CREATE CURATED DATASET AND FEAUTRES LIKE
OUTAGES IN LAST N HOURS, CAUSE, OCCURN, CLUE DAY FLAGS
'''

import os
import ast
import math
from datetime import datetime, timedelta
import pandas as pd
from pandas.io import gbq
import subprocess
import logging

from configparser import ConfigParser, ExtendedInterpolation
import geopy.distance

# Setup logs
logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s',
    level=logging.INFO,
    datefmt='%Y-%m-%d %H:%M:%S')
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

In [2]:
# read config file
CONFIGPARSER = ConfigParser(interpolation=ExtendedInterpolation())
CONFIGPARSER.read('/root/confignew0002.ini')
logging.info('Config File Loaded')
logging.info('Config File Sections %s', CONFIGPARSER.sections())

2020-11-27 16:50:38 INFO     Config File Loaded
2020-11-27 16:50:38 INFO     Config File Sections ['SETTINGS', 'LIVE_OMS', 'DATA_COLLATION', 'CURATED_DATA', 'LOAD_AND_PREDICT']


In [3]:
def QC_CHECK_SHAPE_AND_COLUMNS(df):
    '''
    Input - Dataframe with operations/addtion of features/columns or joins performed
    Output - Log Info using shape of dataframe and columns present
    '''
    logging.info('****QC Check****')
    logging.info('\n')
    logging.info('Shape of the DataFrame %s', df.shape)
    logging.info('\n')
    logging.info('Columns present in the DataFrame: %s', list(df.columns))
    logging.info('\n')
    return

## **Read OMS Weather Source curated dataset**

In [4]:
BUCKET_NAME = CONFIGPARSER['CURATED_DATA']['CURATED_DATA_STAGING_PATH']
DF_OMSDS = pd.read_csv(BUCKET_NAME)

DF_OMSDS = DF_OMSDS.loc[:, ~DF_OMSDS.columns.str.contains('^Unnamed')]
DF_OMSDS = DF_OMSDS.loc[:, ~DF_OMSDS.columns.str.contains('^_c0')]

DF_OMSDS['CREATION_DATETIME'] = pd.to_datetime(DF_OMSDS['CREATION_DATETIME'],
                                               errors='coerce')
DF_OMSDS['Date'] = DF_OMSDS['CREATION_DATETIME'].dt.date

logging.info('OMS LIVE Dataframe Loaded')
logging.info('\n')
QC_CHECK_SHAPE_AND_COLUMNS(DF_OMSDS)

2020-11-27 16:50:39 INFO     OMS LIVE Dataframe Loaded
2020-11-27 16:50:39 INFO     

2020-11-27 16:50:39 INFO     ****QC Check****
2020-11-27 16:50:39 INFO     

2020-11-27 16:50:39 INFO     Shape of the DataFrame (7, 139)
2020-11-27 16:50:39 INFO     

2020-11-27 16:50:39 INFO     Columns present in the DataFrame: ['INCIDENT_ID', 'STRCTUR_NO', 'CIRCT_ID', 'DNI_EQUIP_TYPE', 'CALL_QTY', 'CUST_QTY', 'KVA_VAL', 'DOWNSTREAM_KVA_VAL', 'INCIDENT_DEVICE_ID', 'CREATION_DATETIME', 'SUBST_ID', 'LOCATION_ID', 'ENERGIZED_DATETIME', 'OUTAGE_ID', 'DAY_FLAG', 'POLE_CLUE_FLG', 'PART_LIGHT_CLUE_FLG', 'EMERGENCY_CLUE_FLG', 'POWER_OUT_CLUE_FLG', 'TREE_CLUE_FLG', 'WIRE_DOWN_CLUE_FLG', 'IVR_CLUE_FLG', 'EQUIPMENT_CLUE_FLG', 'TRANSFORMER_CLUE_FLG', 'OPEN_DEVICE_CLUE_FLG', 'OH_CAUSE_FLG', 'UG_CAUSE_FLG', 'ANIMAL_CAUSE_FLG', 'WEATHER_CAUSE_FLG', 'WEATHER_COLD_CAUSE_FLG', 'WEATHER_LIGHTNING_CAUSE_FLG', 'WEATHER__SNOW_CAUSE_FLG', 'WEATHER__WIND_CAUSE_FLG', 'WEATHER__HEAT_CAUSE_FLG', 'WEATHER__FLOOD_CAUSE_FLG', 

## **Priority Queuing Feature**

In [5]:
# 1. Rank based on simple customer quantity as mentioned by Business
# (live rankings to be followed,  numerical feature)

DF_OMSDS.sort_values(by=['CREATION_DATETIME'], inplace=True)
DF_OMSDS.reset_index(drop=True, inplace=True)

DF_OMSDS['ENERGIZED_DATETIME'] = DF_OMSDS.ENERGIZED_DATETIME.fillna('1900-01-01')
DF_OMSDS['ENERGIZED_DATETIME'] = pd.to_datetime(DF_OMSDS['ENERGIZED_DATETIME'])
DF_OMSDS['ENERGIZED_DATETIME'] = DF_OMSDS['ENERGIZED_DATETIME'].apply(
    lambda row: row.strftime("%Y-%m-%d"))
DF_OMSDS = DF_OMSDS[DF_OMSDS.ENERGIZED_DATETIME == '1900-01-01']

SHAPE = DF_OMSDS.shape[0]
if SHAPE == 0:
    raise Exception('No new Outages,  All outages are already ENERGIZED for')

DF_OMSDS['Priority_Customer_Qty'] = DF_OMSDS['CUST_QTY'].rank(method='dense', ascending=False)

# 2. Rank based on the factor of distance from centroid and customer quantity
#(live rankings to be followed,approach #2, numerical feature)

DF_OMSDS['LIVE_OUTAGE'] = DF_OMSDS.OUTAGE_ID.nunique()
DF_OMSDS['Center_LAT'] = (DF_OMSDS.LAT)/(DF_OMSDS.LIVE_OUTAGE)
DF_OMSDS['Center_LONG'] = (DF_OMSDS.LONG)/(DF_OMSDS.LIVE_OUTAGE)


def cal_distance_from_center_lat_long(lat, long, center_lat, center_long):
    '''
    Takes lat, long, center_lat, center_long as argument and outputs the distance
    '''
    if math.isnan(lat)|math.isnan(long)|math.isnan(center_lat)|math.isnan(center_long):
        return None
    else:
        coords1 = [lat, long]
        coords2 = [center_lat, center_long]
        return geopy.distance.distance(coords1, coords2).miles

DF_OMSDS['Dis_From_Live_Centriod'] = DF_OMSDS.apply(lambda x: cal_distance_from_center_lat_long(
    x['LAT'], x['LONG'], x['Center_LAT'], x['Center_LONG']), axis=1)
DF_OMSDS['Dis_From_Live_Centriod'] = DF_OMSDS['Dis_From_Live_Centriod'].apply(pd.to_numeric,
                                                                              errors='coerce')
DF_OMSDS['Dis_From_Live_Centriod_div_Cust_qty'] = (
    DF_OMSDS['Dis_From_Live_Centriod'])/(DF_OMSDS['CUST_QTY'])
DF_OMSDS['Priority_Dist_Customer_Qty'] = DF_OMSDS['Dis_From_Live_Centriod_div_Cust_qty'].rank(
    method='max', ascending=True)

DF_OMSDS.drop(['Center_LAT',
               'Center_LONG', 'Dis_From_Live_Centriod', 'LIVE_OUTAGE'], axis=1, inplace=True)

logging.info('Priority Queuing Features Added')
logging.info('\n')
QC_CHECK_SHAPE_AND_COLUMNS(DF_OMSDS)

2020-11-27 16:50:41 INFO     Note: NumExpr detected 16 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2020-11-27 16:50:41 INFO     NumExpr defaulting to 8 threads.
2020-11-27 16:50:41 INFO     Priority Queuing Features Added
2020-11-27 16:50:41 INFO     

2020-11-27 16:50:41 INFO     ****QC Check****
2020-11-27 16:50:41 INFO     

2020-11-27 16:50:41 INFO     Shape of the DataFrame (2, 142)
2020-11-27 16:50:41 INFO     

2020-11-27 16:50:41 INFO     Columns present in the DataFrame: ['INCIDENT_ID', 'STRCTUR_NO', 'CIRCT_ID', 'DNI_EQUIP_TYPE', 'CALL_QTY', 'CUST_QTY', 'KVA_VAL', 'DOWNSTREAM_KVA_VAL', 'INCIDENT_DEVICE_ID', 'CREATION_DATETIME', 'SUBST_ID', 'LOCATION_ID', 'ENERGIZED_DATETIME', 'OUTAGE_ID', 'DAY_FLAG', 'POLE_CLUE_FLG', 'PART_LIGHT_CLUE_FLG', 'EMERGENCY_CLUE_FLG', 'POWER_OUT_CLUE_FLG', 'TREE_CLUE_FLG', 'WIRE_DOWN_CLUE_FLG', 'IVR_CLUE_FLG', 'EQUIPMENT_CLUE_FLG', 'TRANSFORMER_CLUE_FLG', 'OPEN_DEVICE_CLUE_FLG', 'OH_CAUSE_FLG', 'UG_CAUSE_FLG', 'ANIMAL_CAUSE

## **Read output dataset and filter for Predicted Flag**

In [6]:
DF_FIN = DF_OMSDS.copy(deep=True)

try:
    DF_PRED = CONFIGPARSER['CURATED_DATA']['IPL_PREDICTION_QUERY']
    DF_PRED = gbq.read_gbq(DF_PRED, project_id=CONFIGPARSER['SETTINGS']['PROJECT_ID'])
    PREDICTIONS = list(DF_PRED['OUTAGE_ID'].unique())
    DF_OMSDS['OUTAGE_ID'] = DF_OMSDS['OUTAGE_ID'].astype(str)
    DF_OMSDS['OUTAGE_ID'] = DF_OMSDS['OUTAGE_ID'].str.replace(' ', '')
    DF_FINAL = DF_OMSDS[~DF_OMSDS['OUTAGE_ID'].isin(PREDICTIONS)]
    DF_FINAL.reset_index(drop=True, inplace=True)
except:
    DF_FINAL = DF_OMSDS

SHAPE = DF_FINAL.shape[0]
if SHAPE == 0:
    raise Exception('No new Outages,  All outages are already predicted for')

logging.info('Filtered for Predicted Outages')
logging.info('\n')
QC_CHECK_SHAPE_AND_COLUMNS(DF_FINAL)

2020-11-27 16:50:42 INFO     Filtered for Predicted Outages
2020-11-27 16:50:42 INFO     

2020-11-27 16:50:42 INFO     ****QC Check****
2020-11-27 16:50:42 INFO     

2020-11-27 16:50:42 INFO     Shape of the DataFrame (2, 142)
2020-11-27 16:50:42 INFO     

2020-11-27 16:50:42 INFO     Columns present in the DataFrame: ['INCIDENT_ID', 'STRCTUR_NO', 'CIRCT_ID', 'DNI_EQUIP_TYPE', 'CALL_QTY', 'CUST_QTY', 'KVA_VAL', 'DOWNSTREAM_KVA_VAL', 'INCIDENT_DEVICE_ID', 'CREATION_DATETIME', 'SUBST_ID', 'LOCATION_ID', 'ENERGIZED_DATETIME', 'OUTAGE_ID', 'DAY_FLAG', 'POLE_CLUE_FLG', 'PART_LIGHT_CLUE_FLG', 'EMERGENCY_CLUE_FLG', 'POWER_OUT_CLUE_FLG', 'TREE_CLUE_FLG', 'WIRE_DOWN_CLUE_FLG', 'IVR_CLUE_FLG', 'EQUIPMENT_CLUE_FLG', 'TRANSFORMER_CLUE_FLG', 'OPEN_DEVICE_CLUE_FLG', 'OH_CAUSE_FLG', 'UG_CAUSE_FLG', 'ANIMAL_CAUSE_FLG', 'WEATHER_CAUSE_FLG', 'WEATHER_COLD_CAUSE_FLG', 'WEATHER_LIGHTNING_CAUSE_FLG', 'WEATHER__SNOW_CAUSE_FLG', 'WEATHER__WIND_CAUSE_FLG', 'WEATHER__HEAT_CAUSE_FLG', 'WEATHER__FLOOD_CAUSE_F

## **Add Number of Outages for CLUE,  CAUSE,  OCCURN**

In [7]:
# convert to datetime columns for operations
DF_FINAL['CREATION_DATETIME'] = pd.to_datetime(DF_FINAL['CREATION_DATETIME'], errors='coerce')
DF_FIN['CREATION_DATETIME'] = pd.to_datetime(DF_FIN['CREATION_DATETIME'], errors='coerce')

# extract date from datetime column
DF_FINAL['Date'] = DF_FINAL['CREATION_DATETIME'].dt.date
DF_FIN['Date'] = DF_FIN['CREATION_DATETIME'].dt.date

DF_NO_OF_OUTAGES = DF_FINAL.groupby(['Date'], as_index=False).agg({'POWER_OUT_CLUE_FLG':'sum',
                                                                   'OPEN_DEVICE_CLUE_FLG':'sum',
                                                                   'IVR_CLUE_FLG':'sum',
                                                                   'ANIMAL_CAUSE_FLG':'sum',
                                                                   'WIRE_OCCURN_FLG':'sum'})

DF_NO_OF_OUTAGES.rename(columns={'POWER_OUT_CLUE_FLG':'NO_OF_POWER_OUT_CLUE_PER_DAY',
                                 'OPEN_DEVICE_CLUE_FLG':'NO_OF_OPEN_DEVICE_CLUE_PER_DAY',
                                 'IVR_CLUE_FLG':'NO_OF_IVR_CLUE_PER_DAY',
                                 'ANIMAL_CAUSE_FLG':'NO_OF_ANIMAL_CAUSE_PER_DAY',
                                 'WIRE_OCCURN_FLG':'NO_OF_WIRE_OCCURN_PER_DAY'}, inplace=True)

try:
    DF_CLUE_COUNT = pd.read_csv(CONFIGPARSER['CURATED_DATA']['CLUE_COUNT_CSV'])
except:
    DF_NO_OF_OUTAGES.to_csv(CONFIGPARSER['CURATED_DATA']['CLUE_COUNT_CSV'], index=False)

RECORD_DATE = (datetime.today()-timedelta(days=1)).date()
logging.info('Type of the record date %s', type(RECORD_DATE))
logging.info('\n')

DF_CLUE_COUNT['Date'] = pd.to_datetime(DF_CLUE_COUNT.Date).dt.date
logging.info('Type of the clue count date columns %s', type(DF_CLUE_COUNT.Date[0]))
logging.info('\n')

DF_CLUE_COUNT_CURRENT = DF_CLUE_COUNT[DF_CLUE_COUNT.Date >= RECORD_DATE]
DF_CLUE_COUNT_CURRENT.reset_index(drop=True, inplace=True)
logging.info('Check if Clue Count Dataframe is empty: %s', DF_CLUE_COUNT_CURRENT.empty)
logging.info('\n')

DF_CLUE_COUNT = DF_CLUE_COUNT.append(DF_NO_OF_OUTAGES)
DF_CLUE_COUNT = DF_CLUE_COUNT.groupby(['Date'], as_index=False).agg({
    'NO_OF_POWER_OUT_CLUE_PER_DAY':'sum',
    'NO_OF_OPEN_DEVICE_CLUE_PER_DAY':'sum',
    'NO_OF_IVR_CLUE_PER_DAY':'sum',
    'NO_OF_ANIMAL_CAUSE_PER_DAY':'sum',
    'NO_OF_WIRE_OCCURN_PER_DAY':'sum'})

logging.info('Datatype of the date column: %s', type(DF_CLUE_COUNT.Date[0]))
logging.info('\n')

DF_CLUE_COUNT.to_csv(CONFIGPARSER['CURATED_DATA']['CLUE_COUNT_CSV'], index=False)
DF_FIN = DF_FIN.merge(DF_CLUE_COUNT, how='left', left_on=['Date'], right_on=['Date'])
DF_FIN.reset_index(drop=True, inplace=True)

logging.info('CLUE, CAUSE, OCCURN codes added')
logging.info('\n')
QC_CHECK_SHAPE_AND_COLUMNS(DF_FIN)

2020-11-27 16:50:43 INFO     Type of the record date <class 'datetime.date'>
2020-11-27 16:50:43 INFO     

2020-11-27 16:50:43 INFO     Type of the clue count date columns <class 'datetime.date'>
2020-11-27 16:50:43 INFO     

2020-11-27 16:50:43 INFO     Check if Clue Count Dataframe is empty: False
2020-11-27 16:50:43 INFO     

2020-11-27 16:50:43 INFO     Datatype of the date column: <class 'datetime.date'>
2020-11-27 16:50:43 INFO     

2020-11-27 16:50:43 INFO     CLUE, CAUSE, OCCURN codes added
2020-11-27 16:50:43 INFO     

2020-11-27 16:50:43 INFO     ****QC Check****
2020-11-27 16:50:43 INFO     

2020-11-27 16:50:43 INFO     Shape of the DataFrame (2, 147)
2020-11-27 16:50:43 INFO     

2020-11-27 16:50:43 INFO     Columns present in the DataFrame: ['INCIDENT_ID', 'STRCTUR_NO', 'CIRCT_ID', 'DNI_EQUIP_TYPE', 'CALL_QTY', 'CUST_QTY', 'KVA_VAL', 'DOWNSTREAM_KVA_VAL', 'INCIDENT_DEVICE_ID', 'CREATION_DATETIME', 'SUBST_ID', 'LOCATION_ID', 'ENERGIZED_DATETIME', 'OUTAGE_ID', 'DAY_FL

## **Change all columns to Flag values**

In [8]:
FINAL_LIST = ast.literal_eval(CONFIGPARSER.get("CURATED_DATA", "FLAG_LIST"))
FINAL_LIST = list(FINAL_LIST)
logging.info('Name of all FLAG COLUMNS LOADED: %s', FINAL_LIST)
logging.info('\n')

MAPIN = {1:'True', 0:'False'}

for i in FINAL_LIST:
    DF_FIN[i] = DF_FIN[i].map(MAPIN)

# fillna's null values using forward fill method
DF_FIN.fillna(method='ffill', inplace=True)
DF_FIN['CITY_NAM'].fillna('NO_CITY', inplace=True)

DF_FIN['CREATION_DATETIME'] = DF_FINAL['CREATION_DATETIME'].apply(
    lambda row: row.strftime("%Y-%m-%d %H:%M:%S"))
DF_FIN['CREATION_DATETIME'] = pd.to_datetime(DF_FINAL['CREATION_DATETIME'],errors='coerce')

logging.info('Changed All Flag Columns to TRUE/FALSE')
logging.info('\n')
QC_CHECK_SHAPE_AND_COLUMNS(DF_FIN)

2020-11-27 16:50:43 INFO     Name of all FLAG COLUMNS LOADED: ['POLE_CLUE_FLG', 'PART_LIGHT_CLUE_FLG', 'EMERGENCY_CLUE_FLG', 'POWER_OUT_CLUE_FLG', 'TREE_CLUE_FLG', 'WIRE_DOWN_CLUE_FLG', 'IVR_CLUE_FLG', 'EQUIPMENT_CLUE_FLG', 'TRANSFORMER_CLUE_FLG', 'OPEN_DEVICE_CLUE_FLG', 'OH_CAUSE_FLG', 'UG_CAUSE_FLG', 'ANIMAL_CAUSE_FLG', 'WEATHER_CAUSE_FLG', 'WEATHER_COLD_CAUSE_FLG', 'WEATHER_LIGHTNING_CAUSE_FLG', 'WEATHER__SNOW_CAUSE_FLG', 'WEATHER__WIND_CAUSE_FLG', 'WEATHER__HEAT_CAUSE_FLG', 'WEATHER__FLOOD_CAUSE_FLG', 'PUBLIC_CAUSE_FLG', 'STREET_CAUSE_FLG', 'SUBSTATION_CAUSE_FLG', 'TREE_CAUSE_FLG', 'MISCELLANEOUS_CAUSE_FLG', 'CUST_REQUEST_CAUSE_FLG', 'NO_CAUSE_FLG', 'PLANNED_CAUSE_FLG', 'NO_OUTAGE_CAUSE_FLG', 'FUSE_OCCURN_FLG', 'CUST_EQUIP_OCCURN_FLG', 'POLE_OCCURN_FLG', 'TRANSFORMER_OCCURN_FLG', 'METER_OCCURN_FLG', 'SERVICE_OCCURN_FLG', 'CABLE_OCCURN_FLG', 'ST_OCCURN_FLG', 'FIRE_OCCURN_FLG', 'FOUND_OPEN_OCCURN_FLG', 'PUBLIC_SAFETY_OCCURN_FLG', 'WIRE_OCCURN_FLG', 'SWITCH_OCCURN_FLG', 'CUTOUT_OCCURN

## **Add outages in last N hours feature**

In [9]:
RECORD_DATE_OUTAGE = (datetime.today()-timedelta(days=2)).date()

try:
    PRED_OUTAGES = CONFIGPARSER.get('CURATED_DATA', 'PRED_OUTAGES_QUERY')+"'"+str(RECORD_DATE_OUTAGE)+"'"
    logging.info(PRED_OUTAGES)
    DF_PRED_OUTAGES = gbq.read_gbq(PRED_OUTAGES, project_id=CONFIGPARSER.get('SETTINGS', 'PROJECT_ID'))
    DF_PRED_OUTAGES.reset_index(drop=True, inplace=True)
except:
    DF_PRED_OUTAGES = pd.DataFrame()


DF_PRED_OUTAGES['Creation_Time'] = pd.to_datetime(DF_PRED_OUTAGES['Creation_Time'],errors='coerce')
DF_PRED_OUTAGES['Creation_Time'] = DF_PRED_OUTAGES['Creation_Time'].apply(
    lambda row: row.strftime("%Y-%m-%d %H:%M:%S"))
DF_PRED_OUTAGES['CREATION_DATETIME'] = pd.to_datetime(DF_PRED_OUTAGES['Creation_Time'],errors='coerce')
DF_PRED_OUTAGES.drop(['Creation_Time'], axis=1, inplace=True)

DF_FINAL_OUTAGE_COUNT = DF_FIN[['OUTAGE_ID', 'CREATION_DATETIME']]
DF_FINAL_OUTAGE_COUNT = DF_FINAL_OUTAGE_COUNT.append(DF_PRED_OUTAGES)
DF_FINAL_OUTAGE_COUNT.drop_duplicates(subset='OUTAGE_ID',keep='last',inplace=True)
DF_FINAL_OUTAGE_COUNT.reset_index(inplace=True)

def count_outage_minutes(group):
    '''takes group as argument and outputs group with added features'''
    group = group.reset_index(drop=True)
    df_temp = DF_FINAL_OUTAGE_COUNT[['OUTAGE_ID', 'CREATION_DATETIME']]
    df_temp['minutes'] = (group['CREATION_DATETIME'][0]-DF_FINAL_OUTAGE_COUNT['CREATION_DATETIME']).dt.total_seconds().div(60)
    df_temp = df_temp[df_temp.minutes > 0]
    group['Outages_in_last_1hr'] = len(df_temp[df_temp.minutes <= 60])
    group['Outages_in_last_2hr'] = len(df_temp[df_temp.minutes <= 120])
    group['Outages_in_last_3hr'] = len(df_temp[df_temp.minutes <= 180])
    group['Outages_in_last_4hr'] = len(df_temp[df_temp.minutes <= 240])
    group['Outages_in_last_5hr'] = len(df_temp[df_temp.minutes <= 300])
    group['Outages_in_last_6hr'] = len(df_temp[df_temp.minutes <= 360])
    group['Outages_in_last_7hr'] = len(df_temp[df_temp.minutes <= 420])
    group['Outages_in_last_8hr'] = len(df_temp[df_temp.minutes <= 480])
    group['Outages_in_last_9hr'] = len(df_temp[df_temp.minutes <= 540])
    group['Outages_in_last_10hr'] = len(df_temp[df_temp.minutes <= 600])
    return group



def grouping_fn_minutes(df):
    '''takes df as input and gives liveoutage as output'''
    liveoutage = df.groupby(['OUTAGE_ID'], as_index=False).apply(count_outage_minutes)
    return liveoutage

LIVE_OUTAGES = grouping_fn_minutes(DF_FIN)
LIVE_OUTAGES.reset_index(drop=True, inplace=True)

logging.info('Added Outages in last N features to Analytical Dataset')
logging.info('\n')
QC_CHECK_SHAPE_AND_COLUMNS(LIVE_OUTAGES)

2020-11-27 16:50:45 INFO     SELECT OUTAGE_ID, Creation_Time FROM aes-analytics-0002.mds_outage_restoration.IPL_Predictions where creation_time>='2020-11-25'
2020-11-27 16:50:46 INFO     Added Outages in last N features to Analytical Dataset
2020-11-27 16:50:46 INFO     

2020-11-27 16:50:46 INFO     ****QC Check****
2020-11-27 16:50:46 INFO     

2020-11-27 16:50:46 INFO     Shape of the DataFrame (2, 157)
2020-11-27 16:50:46 INFO     

2020-11-27 16:50:46 INFO     Columns present in the DataFrame: ['INCIDENT_ID', 'STRCTUR_NO', 'CIRCT_ID', 'DNI_EQUIP_TYPE', 'CALL_QTY', 'CUST_QTY', 'KVA_VAL', 'DOWNSTREAM_KVA_VAL', 'INCIDENT_DEVICE_ID', 'CREATION_DATETIME', 'SUBST_ID', 'LOCATION_ID', 'ENERGIZED_DATETIME', 'OUTAGE_ID', 'DAY_FLAG', 'POLE_CLUE_FLG', 'PART_LIGHT_CLUE_FLG', 'EMERGENCY_CLUE_FLG', 'POWER_OUT_CLUE_FLG', 'TREE_CLUE_FLG', 'WIRE_DOWN_CLUE_FLG', 'IVR_CLUE_FLG', 'EQUIPMENT_CLUE_FLG', 'TRANSFORMER_CLUE_FLG', 'OPEN_DEVICE_CLUE_FLG', 'OH_CAUSE_FLG', 'UG_CAUSE_FLG', 'ANIMAL_CAUSE_FLG', 

## **Write curated dataset to CSV's**

In [10]:
if 'DOWNSTREAM_CUST_QTY' not in LIVE_OUTAGES:
    LIVE_OUTAGES['DOWNSTREAM_CUST_QTY'] = LIVE_OUTAGES['CUST_QTY']

LIVE_OUTAGES['KVA_VAL'] = LIVE_OUTAGES['DOWNSTREAM_KVA_VAL']
LIVE_OUTAGES.fillna(method='ffill', inplace=True)

logging.info('Path to CSV %s', CONFIGPARSER['CURATED_DATA']['LIVE_OUTAGES_BACKUP_CSV'])
LIVE_OUTAGES.to_csv(CONFIGPARSER['CURATED_DATA']['LIVE_OUTAGES_BACKUP_CSV'], index=False)

logging.info('Path to CSV %s', CONFIGPARSER['CURATED_DATA']['LIVE_OUTAGES_PATH']+'IPL_OMS_LIVE_Data_'+datetime.today().strftime('%Y%m%d%H%M')+'.csv')
LIVE_OUTAGES.to_csv(CONFIGPARSER['CURATED_DATA']['LIVE_OUTAGES_PATH']+'IPL_OMS_LIVE_Data_'+datetime.today().strftime('%Y%m%d%H%M')+'.csv', index=False)

2020-11-27 16:50:46 INFO     Path to CSV gs://aes-analytics-0002-curated/Outage_Restoration/Staging/IPL_Live_Master_Dataset.csv
2020-11-27 16:50:46 INFO     Path to CSV gs://aes-analytics-0002-curated/Outage_Restoration/Historical_Data/BQ_backup/IPL_OMS_LIVE_Data_202011271650.csv


In [11]:
logging.info('Column Names in LIVE OUTAGES: %s', list(LIVE_OUTAGES.columns))

2020-11-24 10:34:19 INFO     Column Names in LIVE OUTAGES: ['INCIDENT_ID', 'STRCTUR_NO', 'CIRCT_ID', 'DNI_EQUIP_TYPE', 'CALL_QTY', 'CUST_QTY', 'KVA_VAL', 'DOWNSTREAM_KVA_VAL', 'INCIDENT_DEVICE_ID', 'CREATION_DATETIME', 'SUBST_ID', 'LOCATION_ID', 'ENERGIZED_DATETIME', 'OUTAGE_ID', 'DAY_FLAG', 'POLE_CLUE_FLG', 'PART_LIGHT_CLUE_FLG', 'EMERGENCY_CLUE_FLG', 'POWER_OUT_CLUE_FLG', 'TREE_CLUE_FLG', 'WIRE_DOWN_CLUE_FLG', 'IVR_CLUE_FLG', 'EQUIPMENT_CLUE_FLG', 'TRANSFORMER_CLUE_FLG', 'OPEN_DEVICE_CLUE_FLG', 'OH_CAUSE_FLG', 'UG_CAUSE_FLG', 'ANIMAL_CAUSE_FLG', 'WEATHER_CAUSE_FLG', 'WEATHER_COLD_CAUSE_FLG', 'WEATHER_LIGHTNING_CAUSE_FLG', 'WEATHER__SNOW_CAUSE_FLG', 'WEATHER__WIND_CAUSE_FLG', 'WEATHER__HEAT_CAUSE_FLG', 'WEATHER__FLOOD_CAUSE_FLG', 'PUBLIC_CAUSE_FLG', 'STREET_CAUSE_FLG', 'SUBSTATION_CAUSE_FLG', 'TREE_CAUSE_FLG', 'MISCELLANEOUS_CAUSE_FLG', 'CUST_REQUEST_CAUSE_FLG', 'NO_CAUSE_FLG', 'PLANNED_CAUSE_FLG', 'NO_OUTAGE_CAUSE_FLG', 'FUSE_OCCURN_FLG', 'CUST_EQUIP_OCCURN_FLG', 'POLE_OCCURN_FLG', '

## **Write to Big Query Tables**

In [11]:
LIVE_OUTAGES.columns = LIVE_OUTAGES.columns.str.replace('.', '_')
CURATED_QUERY = 'Select * from ' + CONFIGPARSER['SETTINGS']['BQ_CURATED_DATASET']
CURATED_DATA  = gbq.read_gbq(CURATED_QUERY, project_id=CONFIGPARSER['SETTINGS']['PROJECT_ID'])
logging.info('Big Query table loaded')
logging.info('\n')

CURATED_DATA.append(LIVE_OUTAGES)
logging.info('Big query table appended')
logging.info('\n')

CURATED_DATA.drop_duplicates(['INCIDENT_ID', 'STRCTUR_NO', 'CIRCT_ID', 'DNI_EQUIP_TYPE'], keep='last', inplace=True)


logging.info('Curated Dataset Big query table name %s', CONFIGPARSER['SETTINGS']['BQ_CURATED_DATASET'])

CURATED_DATA.to_gbq(CONFIGPARSER['SETTINGS']['BQ_CURATED_DATASET'], project_id=CONFIGPARSER['SETTINGS']['PROJECT_ID'],
                    chunksize=None, reauth=False, if_exists='append', auth_local_webserver=False,
                    table_schema=None, location=None, progress_bar=True, credentials=None)
logging.info('Final Big Query table created')
logging.info('\n')

2020-11-27 16:51:05 INFO     Big Query table loaded
2020-11-27 16:51:05 INFO     

2020-11-27 16:51:05 INFO     Big query table appended
2020-11-27 16:51:05 INFO     

2020-11-27 16:51:05 INFO     Curated Dataset Big query table name mds_outage_restoration.IPL_curated_dataset
3 out of 3 rows loaded.11-27 16:51:05 INFO     
1it [00:04,  4.46s/it]
2020-11-27 16:51:10 INFO     Final Big Query table created
2020-11-27 16:51:10 INFO     



In [12]:
CURATED_DATA

Unnamed: 0,INCIDENT_ID,STRCTUR_NO,CIRCT_ID,DNI_EQUIP_TYPE,CALL_QTY,CUST_QTY,KVA_VAL,DOWNSTREAM_KVA_VAL,INCIDENT_DEVICE_ID,CREATION_DATETIME,...,Outages_in_last_2hr,Outages_in_last_3hr,Outages_in_last_4hr,Outages_in_last_5hr,Outages_in_last_6hr,Outages_in_last_7hr,Outages_in_last_8hr,Outages_in_last_9hr,Outages_in_last_10hr,DOWNSTREAM_CUST_QTY
138,2001561869,701-B/100,3359,FUSE,1,1,50,50,2002773429,2020-11-22 13:12:40+00:00,...,7,8,8,9,14,15,18,20,24,1
139,2001561868,170-A/120,3107,1TBOH,1,1,0,0,2002773427,2020-11-22 13:10:41+00:00,...,6,7,7,8,13,14,17,19,23,1
140,2001561876,242--/449,3060,1TPUG,3,2,50,50,2002773436,2020-11-22 14:26:47+00:00,...,3,6,9,9,10,14,16,19,20,2
