## **Import Required packages**

In [1]:
'''
Input - LIVE OMS PREPROCESSED DATA
This scripts collects data from Weather Source Location
ADDS it to OMS LIVE PREPROCESSED DATA
Output - LIVE OMS PREPROCESSED DATA WITH WEATHER VARIABLES
'''

import sys
import os
from datetime import date, datetime
import datetime as dt
import subprocess
import logging
import pandas as pd
import numpy as np
from configparser import ConfigParser, ExtendedInterpolation
from google.cloud import storage
from pandas.io import gbq

# Setup logs
logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s',
    level=logging.INFO,
    datefmt='%Y-%m-%d %H:%M:%S')
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

In [2]:
# read config file
CONFIGPARSER = ConfigParser(interpolation=ExtendedInterpolation())
CONFIGPARSER.read('/root/confignew0002.ini')
logging.info('Config File Loaded')
logging.info('Config File Sections %s', CONFIGPARSER.sections())

2020-11-24 09:47:59 INFO     Config File Loaded
2020-11-24 09:47:59 INFO     Config File Sections ['SETTINGS', 'LIVE_OMS', 'DATA_COLLATION', 'CURATED_DATA', 'LOAD_AND_PREDICT']


In [3]:
def QC_CHECK_SHAPE_AND_COLUMNS(df):
    '''
    Input - Dataframe with operations/addtion of features/columns or joins performed
    Output - Log Info using shape of dataframe and columns present
    '''
    logging.info('****QC Check****')
    logging.info('\n')
    logging.info('Shape of the DataFrame %s', df.shape)
    logging.info('\n')
    logging.info('Columns present in the DataFrame: %s', list(df.columns))
    logging.info('\n')
    return

## **Read curated OMS data**

In [4]:
DATA_COLLATION_STAGING_PATH = CONFIGPARSER['DATA_COLLATION']['DATA_COLLATION_STAGING_PATH']
logging.info('Data Collation Staging Path: %s', DATA_COLLATION_STAGING_PATH)
logging.info('\n')

DF_OMS_LIVE = pd.read_csv(DATA_COLLATION_STAGING_PATH)

DF_OMS_LIVE = DF_OMS_LIVE.loc[:, ~DF_OMS_LIVE.columns.str.contains('^Unnamed')]
DF_OMS_LIVE = DF_OMS_LIVE.loc[:, ~DF_OMS_LIVE.columns.str.contains('^_c0')]

logging.info('OMS Dataframe Loaded')
logging.info('\n')
QC_CHECK_SHAPE_AND_COLUMNS(DF_OMS_LIVE)

2020-11-24 10:21:08 INFO     Data Collation Staging Path: gs://aes-analytics-0002-curated/Outage_Restoration/Live_Data_Curation/OMS/OMS_Live_Data.csv
2020-11-24 10:21:08 INFO     

2020-11-24 10:21:08 INFO     OMS Dataframe Loaded
2020-11-24 10:21:08 INFO     

2020-11-24 10:21:08 INFO     ****QC Check****
2020-11-24 10:21:08 INFO     

2020-11-24 10:21:08 INFO     Shape of the DataFrame (4, 76)
2020-11-24 10:21:08 INFO     

2020-11-24 10:21:08 INFO     Columns present in the DataFrame: ['INCIDENT_ID', 'STRCTUR_NO', 'CIRCT_ID', 'DNI_EQUIP_TYPE', 'CALL_QTY', 'CUST_QTY', 'KVA_VAL', 'DOWNSTREAM_KVA_VAL', 'INCIDENT_DEVICE_ID', 'CREATION_DATETIME', 'SUBST_ID', 'LOCATION_ID', 'ENERGIZED_DATETIME', 'OUTAGE_ID', 'DAY_FLAG', 'POLE_CLUE_FLG', 'PART_LIGHT_CLUE_FLG', 'EMERGENCY_CLUE_FLG', 'POWER_OUT_CLUE_FLG', 'TREE_CLUE_FLG', 'WIRE_DOWN_CLUE_FLG', 'IVR_CLUE_FLG', 'EQUIPMENT_CLUE_FLG', 'TRANSFORMER_CLUE_FLG', 'OPEN_DEVICE_CLUE_FLG', 'OH_CAUSE_FLG', 'UG_CAUSE_FLG', 'ANIMAL_CAUSE_FLG', 'WEATHER_CAU

## **Dynamic reading of files from Weather Source Storage Location**

In [5]:
DF_OMS_LIVE['CREATION_DATETIME'] = pd.to_datetime(
    DF_OMS_LIVE['CREATION_DATETIME'], errors='coerce')
DF_OMS_LIVE['Date'] = DF_OMS_LIVE['CREATION_DATETIME'].dt.date

UNIQUE_DATES = DF_OMS_LIVE[['Date']]
UNIQUE_DATES.drop_duplicates(subset=['Date'], keep='first', inplace=True)
UNIQUE_DATES['Date'] = UNIQUE_DATES['Date'].apply(lambda x: x.strftime('%Y-%m-%d'))
UNIQUE = UNIQUE_DATES['Date'].to_list()
logging.info('Unique Dates for files: %s', UNIQUE)
logging.info('\n')

# read weather source data from big query
WSFILES = pd.DataFrame()

for i in UNIQUE:
    WS_LOCATION = CONFIGPARSER['DATA_COLLATION']['WS_QUERY'].format(i)
    WSFILES = WSFILES.append(gbq.read_gbq(WS_LOCATION, project_id=CONFIGPARSER['SETTINGS']['PROJECT_ID']))
    
WS_DF = WSFILES.drop_duplicates(['timestamp', 'Location'], keep='last')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
2020-11-24 10:21:12 INFO     Unique Dates for files: ['2020-11-23']
2020-11-24 10:21:12 INFO     



## **Weather Source Weather data cleaning**

In [6]:
WS_DF['Date'] = pd.to_datetime(WS_DF['timestamp']).dt.date
WS_DF['Location'] = WS_DF['Location'].astype(str)

logging.info('Weather Source Data Loaded')
logging.info('\n')
QC_CHECK_SHAPE_AND_COLUMNS(WS_DF)

2020-11-24 10:21:21 INFO     Weather Source Data Loaded
2020-11-24 10:21:21 INFO     

2020-11-24 10:21:21 INFO     ****QC Check****
2020-11-24 10:21:21 INFO     

2020-11-24 10:21:21 INFO     Shape of the DataFrame (20, 60)
2020-11-24 10:21:21 INFO     

2020-11-24 10:21:21 INFO     Columns present in the DataFrame: ['Job_Update_Time', 'timestamp', 'timestampInit', 'cldCvrMin', 'cldCvrAvg', 'cldCvrMax', 'dewPtMin', 'dewPtAvg', 'dewPtMax', 'feelsLikeMin', 'feelsLikeAvg', 'feelsLikeMax', 'heatIndexMin', 'heatIndexAvg', 'heatIndexMax', 'mslPresMin', 'mslPresAvg', 'mslPresMax', 'precip', 'precipProb', 'radSolarMin', 'radSolarAvg', 'radSolarMax', 'radSolarTot', 'relHumMin', 'relHumAvg', 'relHumMax', 'sfcPresMin', 'sfcPresAvg', 'sfcPresMax', 'snowfall', 'snowfallProb', 'spcHumMin', 'spcHumAvg', 'spcHumMax', 'tempMin', 'tempAvg', 'tempMax', 'windChillMin', 'windChillAvg', 'windChillMax', 'windDirAvg', 'windDir80mAvg', 'windDir100mAvg', 'windSpdMin', 'windSpdAvg', 'windSpdMax', 'windSpd80mMin

## **Add Weather Source features**

In [7]:
## Add range for columns with negative values

WS_DF['tempRange'] = WS_DF['tempMax'] - WS_DF['tempMin']
WS_DF['windSpdRange'] = WS_DF['windSpdMax'] - WS_DF['windSpdMin']
WS_DF['sfcPresRange'] = WS_DF['sfcPresMax'] - WS_DF['sfcPresMin']
WS_DF['cldCvrRange'] = WS_DF['cldCvrMax'] - WS_DF['cldCvrMin']
WS_DF['relHumRange'] = WS_DF['relHumMax'] - WS_DF['relHumMin']

## Add ratio for columns which dont have negative values

WS_DF['relHumRatio'] = WS_DF['relHumMax'] / WS_DF['relHumMin']
WS_DF['sfcPresRatio'] = WS_DF['sfcPresMax'] / WS_DF['sfcPresMin']

## data qc check for nulls

WS_DF = WS_DF.replace([np.inf, -np.inf], np.nan)
nulls = WS_DF.isnull().sum()

DF_NULLS = pd.DataFrame({'Feature': nulls.index, 'VALUES': nulls.values})
DF_NULLS[DF_NULLS.VALUES >= 1]

logging.info('Features to Weather Source Data Added')
QC_CHECK_SHAPE_AND_COLUMNS(WS_DF)

2020-11-24 10:21:26 INFO     Note: NumExpr detected 16 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2020-11-24 10:21:26 INFO     NumExpr defaulting to 8 threads.
2020-11-24 10:21:26 INFO     Features to Weather Source Data Added
2020-11-24 10:21:26 INFO     ****QC Check****
2020-11-24 10:21:26 INFO     

2020-11-24 10:21:26 INFO     Shape of the DataFrame (20, 67)
2020-11-24 10:21:26 INFO     

2020-11-24 10:21:26 INFO     Columns present in the DataFrame: ['Job_Update_Time', 'timestamp', 'timestampInit', 'cldCvrMin', 'cldCvrAvg', 'cldCvrMax', 'dewPtMin', 'dewPtAvg', 'dewPtMax', 'feelsLikeMin', 'feelsLikeAvg', 'feelsLikeMax', 'heatIndexMin', 'heatIndexAvg', 'heatIndexMax', 'mslPresMin', 'mslPresAvg', 'mslPresMax', 'precip', 'precipProb', 'radSolarMin', 'radSolarAvg', 'radSolarMax', 'radSolarTot', 'relHumMin', 'relHumAvg', 'relHumMax', 'sfcPresMin', 'sfcPresAvg', 'sfcPresMax', 'snowfall', 'snowfallProb', 'spcHumMin', 'spcHumAvg', 'spcHumMax', 'tempMin', 'tempAv

## **Weather Source OMS weather mapping**

In [8]:
# make mapping consistent 
def marker_weather_mapping(marker_name):
    '''
    Input - Marker name with IPL%
    Output - Only Maker names with no IPL
    Example i/p, o/p - IPL_Marker1, Marker1
    '''
    name = marker_name[4:]
    return name

def remove_spaces(string):
    '''
    Input - Maker name with spaces
    Output - Marker name without space
    Example i/p, o/p - Marker 1, Marker1
    '''
    return string.replace(" ", "")

WS_DF['Location'] = WS_DF.apply(lambda x: marker_weather_mapping(x['Location']), axis=1)
DF_OMS_LIVE['Marker_Location'] = DF_OMS_LIVE.apply(lambda x: remove_spaces(x['Marker_Location']), axis=1)

In [9]:
DF_OMS_LIVE['Date'] = pd.to_datetime(DF_OMS_LIVE['Date'], errors='coerce')
WS_DF['Date'] = pd.to_datetime(WS_DF['Date'], errors='coerce')

WS_DF.drop(['Latitude', 'Longitude', 'timestamp', 'timestampInit', 'Job_Update_Time'], axis=1, inplace=True)

logging.info('Merging Weather Source and OMS Live Data OLD OMS shape %s', DF_OMS_LIVE.shape)
logging.info('\n')

DF_OMS_LIVE = pd.merge(DF_OMS_LIVE, WS_DF, how='left',
                       left_on=['Date', 'Marker_Location'], right_on=['Date', 'Location'])


DF_OMS_LIVE.drop(['Date'], axis=1, inplace=True)

logging.info('Live OMS data merged with Weather Souce Weather Data')
logging.info('\n')
QC_CHECK_SHAPE_AND_COLUMNS(DF_OMS_LIVE)

2020-11-24 10:23:13 INFO     Merging Weather Source and OMS Live Data OLD OMS shape (4, 77)
2020-11-24 10:23:13 INFO     

2020-11-24 10:23:13 INFO     Live OMS data merged with Weather Souce Weather Data
2020-11-24 10:23:13 INFO     

2020-11-24 10:23:13 INFO     ****QC Check****
2020-11-24 10:23:13 INFO     

2020-11-24 10:23:13 INFO     Shape of the DataFrame (4, 137)
2020-11-24 10:23:13 INFO     

2020-11-24 10:23:13 INFO     Columns present in the DataFrame: ['INCIDENT_ID', 'STRCTUR_NO', 'CIRCT_ID', 'DNI_EQUIP_TYPE', 'CALL_QTY', 'CUST_QTY', 'KVA_VAL', 'DOWNSTREAM_KVA_VAL', 'INCIDENT_DEVICE_ID', 'CREATION_DATETIME', 'SUBST_ID', 'LOCATION_ID', 'ENERGIZED_DATETIME', 'OUTAGE_ID', 'DAY_FLAG', 'POLE_CLUE_FLG', 'PART_LIGHT_CLUE_FLG', 'EMERGENCY_CLUE_FLG', 'POWER_OUT_CLUE_FLG', 'TREE_CLUE_FLG', 'WIRE_DOWN_CLUE_FLG', 'IVR_CLUE_FLG', 'EQUIPMENT_CLUE_FLG', 'TRANSFORMER_CLUE_FLG', 'OPEN_DEVICE_CLUE_FLG', 'OH_CAUSE_FLG', 'UG_CAUSE_FLG', 'ANIMAL_CAUSE_FLG', 'WEATHER_CAUSE_FLG', 'WEATHER_COLD_CA

## **Renaming weather attributes**

In [10]:
def create_wind_direction(x_wind_direction):
    '''
    Input - Wind direction in degrees
    Output - Wind direction classes based on directions
    '''
    if(x_wind_direction >= 1) & (x_wind_direction < 45):
        direction = 'N-E-N'
    elif(x_wind_direction >= 45) & (x_wind_direction < 90):
        direction = 'N-E-E'
    elif(x_wind_direction >= 90) & (x_wind_direction < 180):
        direction = 'S-E-E'
    elif(x_wind_direction >= 135) & (x_wind_direction < 180):
        direction = 'S-E-S'
    elif(x_wind_direction >= 180) & (x_wind_direction < 225):
        direction = 'S-W-S'
    elif(x_wind_direction >= 225) & (x_wind_direction < 270):
        direction = 'S-W-W'
    elif(x_wind_direction >= 270) & (x_wind_direction < 315):
        direction = 'N-W-W'
    elif(x_wind_direction >= 315) & (x_wind_direction < 360):
        direction = 'N-W-N'
    else:
        direction = None

    return direction

DF_OMS_LIVE['WIND_DIRECTION'] = DF_OMS_LIVE['windDirAvg'].apply(create_wind_direction)

def create_weekend_flag(x_flag):
    '''
	Input - Weekday Number 1,2,3,4,5,6,7
	Output - WEEKEND 1, 0
    '''
    if x_flag >= 5:
        flag = 1
    else:
        flag = 0
    return flag

DF_OMS_LIVE['weekday'] = pd.to_datetime(DF_OMS_LIVE['CREATION_DATETIME']).dt.dayofweek
DF_OMS_LIVE['weekend_flag'] = DF_OMS_LIVE['weekday'].apply(create_weekend_flag)

DF_OMS_LIVE.drop(['weekday', 'Location'], axis=1, inplace=True)

logging.info('Final Dataset Created')
logging.info('\n')
QC_CHECK_SHAPE_AND_COLUMNS(DF_OMS_LIVE)

2020-11-24 10:23:15 INFO     Final Dataset Created
2020-11-24 10:23:15 INFO     

2020-11-24 10:23:15 INFO     ****QC Check****
2020-11-24 10:23:15 INFO     

2020-11-24 10:23:15 INFO     Shape of the DataFrame (4, 138)
2020-11-24 10:23:15 INFO     

2020-11-24 10:23:15 INFO     Columns present in the DataFrame: ['INCIDENT_ID', 'STRCTUR_NO', 'CIRCT_ID', 'DNI_EQUIP_TYPE', 'CALL_QTY', 'CUST_QTY', 'KVA_VAL', 'DOWNSTREAM_KVA_VAL', 'INCIDENT_DEVICE_ID', 'CREATION_DATETIME', 'SUBST_ID', 'LOCATION_ID', 'ENERGIZED_DATETIME', 'OUTAGE_ID', 'DAY_FLAG', 'POLE_CLUE_FLG', 'PART_LIGHT_CLUE_FLG', 'EMERGENCY_CLUE_FLG', 'POWER_OUT_CLUE_FLG', 'TREE_CLUE_FLG', 'WIRE_DOWN_CLUE_FLG', 'IVR_CLUE_FLG', 'EQUIPMENT_CLUE_FLG', 'TRANSFORMER_CLUE_FLG', 'OPEN_DEVICE_CLUE_FLG', 'OH_CAUSE_FLG', 'UG_CAUSE_FLG', 'ANIMAL_CAUSE_FLG', 'WEATHER_CAUSE_FLG', 'WEATHER_COLD_CAUSE_FLG', 'WEATHER_LIGHTNING_CAUSE_FLG', 'WEATHER__SNOW_CAUSE_FLG', 'WEATHER__WIND_CAUSE_FLG', 'WEATHER__HEAT_CAUSE_FLG', 'WEATHER__FLOOD_CAUSE_FLG', 'PUB

## **Write OMS Live and added Weather Source Dataset for Curated Processing**

In [11]:
OMS_LIVE_PATH = CONFIGPARSER['DATA_COLLATION']['OMS_LIVE_PATH']
logging.info('OMS LIVE PATH %s', OMS_LIVE_PATH)

DF_OMS_LIVE.to_csv(OMS_LIVE_PATH, index=False)

2020-11-24 10:23:43 INFO     OMS LIVE PATH gs://aes-analytics-0002-curated/Outage_Restoration/Live_Data_Curation/weather-source/OMS_weather-source_Live_Data.csv
