In [1]:

######################################################################################################################################################################################################
######################################################################### Curated Data Set creation####################################################################################
######################################################################################################################################################################################################

#!/usr/bin/env python
# coding: utf-8

# # **Import Required packages**

# In[ ]:


import os
import math
import warnings
import operator
import pandas as pd
import numpy as np
import datetime as dt
from datetime import datetime
from pandas.io import gbq
from datetime import date, timedelta
from datetime import datetime
from google.cloud import storage
from pyspark.context import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession

warnings.filterwarnings('ignore')
pd.options.mode.chained_assignment = None  # default='warn'

import logging
logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)
sc = SparkContext.getOrCreate()
spark = SparkSession(sc)


# # **Read OMS Dark-sky curated dataset**

# In[ ]:


bucket_name = 'gs://aes-analytics-0002-curated/Outage_Restoration/Live_Data_Curation/'

df_omsds=spark.read.format('CSV').option("header","true").option("inferSchema","true").option("delimiter",",").load(
    bucket_name + 'weather-source/OMS_weather-source_Live_Data.csv').toPandas()
#df_omsds = df_omsds.loc[:, ~df_omsds.columns.str.contains('^Unnamed')]


# # **Read Storm Profiles Data**

# In[ ]:


df_omsds['CREATION_DATETIME'] = pd.to_datetime(df_omsds['CREATION_DATETIME'],errors='coerce')
df_omsds['Date'] = df_omsds['CREATION_DATETIME'].dt.date

unique_dates = df_omsds[['Date']]
unique_dates.drop_duplicates(subset=['Date'], keep='first', inplace=True)
unique_dates['Date'] = unique_dates['Date'].apply(lambda x: x.strftime('%Y%m%d'))
unique = unique_dates['Date'].to_list()
print(unique)


storm_profiles_location = 'gs://aes-analytics-0002-curated/Outage_Restoration/Live_Data_Curation/Storm_Profiles_ws/'
storm_profiles_files = [] 

for i in unique:         
    filename = storm_profiles_location + 'storm_profiles_{}.csv'.format(i)         
    print(filename)         
    storm_profiles_files.append(pd.read_csv(filename))

stormprofiles_df = spark.read.format('CSV').option("header","true").option("inferSchema","true").option("delimiter",",").load(
    storm_profiles_location).toPandas()

stormprofiles_df = pd.concat(storm_profiles_files)
stormprofiles_df.reset_index(drop=True, inplace=True)
stormprofiles_df = stormprofiles_df.loc[:, ~stormprofiles_df.columns.str.contains('^Unnamed')]


# # **Storm Profiles Weather Data Cleaning**

# In[ ]:


stormprofiles_df=stormprofiles_df[['timestamp', 'Location', 'clusters']]
stormprofiles_df['Date']=pd.to_datetime(stormprofiles_df['timestamp']).dt.date
df_omsds['Date']=pd.to_datetime(df_omsds['Date'])
print(stormprofiles_df.shape)


# In[ ]:


df_omsds['Date'] = pd.to_datetime(df_omsds['Date']).dt.date
df_omsds = df_omsds.merge(stormprofiles_df,how='left',left_on=['Date','Marker_Location'],right_on=['Date','Location'])
df_omsds.drop(['timestamp_y','timestamp_x'],axis=1,inplace=True)


# ## **Read output dataset and filter for Predicted Flag**

# In[ ]:


try:    
    df_pred = 'SELECT OUTAGE_ID FROM aes-analytics-0002.mds_outage_restoration.IPL_PREDICTIONS_ws'
    df_pred = gbq.read_gbq(df_pred, project_id = "aes-analytics-0002")
    predictions=list(df_pred['OUTAGE_ID'].unique())
    df_omsds['OUTAGE_ID'] = df_omsds['OUTAGE_ID'].astype(str)
    df_omsds['OUTAGE_ID']=df_omsds['OUTAGE_ID'].str.replace(' ','')
    df_final=df_omsds[~df_omsds['OUTAGE_ID'].isin(predictions)]
    df_final.reset_index(drop=True,inplace=True)
    
except:
    df_final=df_omsds

shape = df_final.shape[0]
if (shape==0):
    raise Exception('No new Outages, All outages are already predicted for')

# # **Write curated dataset to Big query table**

# In[ ]:
if 'DOWNSTREAM_CUST_QTY' not in df_final:
    df_final['DOWNSTREAM_CUST_QTY']=df_final['CUST_QTY']


df_final['KVA_VAL']=df_final['DOWNSTREAM_KVA_VAL']

# **Change all columns to Flag values**

# In[ ]:


# flg_list = list(df_final.filter(regex='FLG').columns)
# day_flg_list = list(df_final.filter(regex='FLAG').columns)
# prior_list = list(df_final.filter(regex='PRIORITY').columns)
# final_list = flg_list + prior_list+day_flg_list
# mapin = { 1: 'TRUE', 0: 'FALSE'}
# for i in final_list:
#     df_final[i] = df_final[i].map(mapin)

# df_final.fillna(method='ffill',inplace=True)
# df_final['CITY_NAM'].fillna('NO_CITY',inplace=True)

# df_final = df_final.loc[:, ~df_final.columns.str.contains('^Unnamed')]
# In[ ]:
# df_final.to_csv("gs://aes-analytics-0002-curated/Outage_Restoration/Staging/IPL_Live_Master_Dataset_ws.csv",index=False)
# # Backup
# df_final.to_csv("gs://aes-analytics-0002-curated/Outage_Restoration/Historical_Data/BQ_backup/IPL_OMS_LIVE_Data_"+datetime.today().strftime('%Y%m%d%H%M')+".csv",index=False)

['20200923']
gs://aes-analytics-0002-curated/Outage_Restoration/Live_Data_Curation/Storm_Profiles_ws/storm_profiles_20200923.csv
(20, 4)


Exception: No new Outages, All outages are already predicted for

In [2]:
df_final=df_omsds

## **ADD NO OF OUTAGES FOR CLUE, CAUSE, OCCURN**

In [3]:
df_final['CREATION_DATETIME'] = pd.to_datetime(df_final['CREATION_DATETIME'])
df_final['Date'] = df_final['CREATION_DATETIME'].dt.date

df_no_of_outages = df_final.groupby(['Date'],as_index=False).agg({'POWER_OUT_CLUE_FLG' : 'sum', 'OPEN_DEVICE_CLUE_FLG' : 'sum', 'IVR_CLUE_FLG' : 'sum', 'ANIMAL_CAUSE_FLG' : 'sum',
                                                                'WIRE_OCCURN_FLG' : 'sum'})
df_no_of_outages.rename(columns = {'POWER_OUT_CLUE_FLG' : 'NO_OF_POWER_OUT_CLUE_PER_DAY', 'OPEN_DEVICE_CLUE_FLG' : 'NO_OF_OPEN_DEVICE_CLUE_PER_DAY',
                                   'IVR_CLUE_FLG' : 'NO_OF_IVR_CLUE_PER_DAY', 'ANIMAL_CAUSE_FLG' : 'NO_OF_ANIMAL_CAUSE_PER_DAY',
                                   'WIRE_OCCURN_FLG' : 'NO_OF_WIRE_OCCURN_PER_DAY'}, inplace=True)

df_no_of_outages.head()

Unnamed: 0,Date,NO_OF_POWER_OUT_CLUE_PER_DAY,NO_OF_OPEN_DEVICE_CLUE_PER_DAY,NO_OF_IVR_CLUE_PER_DAY,NO_OF_ANIMAL_CAUSE_PER_DAY,NO_OF_WIRE_OCCURN_PER_DAY
0,2020-09-23,0,0,0,0,0


In [4]:
try:
    df_clue_count=pd.read_csv('gs://aes-analytics-0002-curated/Outage_Restoration/Staging/OMS_Clue_Flag_Record.csv')
except:
    df_no_of_outages.to_csv('gs://aes-analytics-0002-curated/Outage_Restoration/Staging/OMS_Clue_Flag_Record.csv',index=False)

In [5]:
record_date=(datetime.today()-timedelta(days=1)).date()

In [6]:
df_clue_count['Date']=pd.to_datetime(df_clue_count.Date).dt.date
df_clue_count_current=df_clue_count[df_clue_count.Date>=record_date]

In [7]:
df_clue_count_current

Unnamed: 0,Date,NO_OF_POWER_OUT_CLUE_PER_DAY,NO_OF_OPEN_DEVICE_CLUE_PER_DAY,NO_OF_IVR_CLUE_PER_DAY,NO_OF_ANIMAL_CAUSE_PER_DAY,NO_OF_WIRE_OCCURN_PER_DAY
1,2020-09-24,1,0,0,0,0


In [8]:
df_clue_count=df_clue_count.append(df_no_of_outages)

In [9]:
df_clue_count = df_clue_count.groupby(['Date'],as_index=False).agg({'NO_OF_POWER_OUT_CLUE_PER_DAY' : 'sum', 'NO_OF_OPEN_DEVICE_CLUE_PER_DAY' : 'sum', 'NO_OF_IVR_CLUE_PER_DAY' : 'sum', 'NO_OF_ANIMAL_CAUSE_PER_DAY' : 'sum',
                                                                'NO_OF_WIRE_OCCURN_PER_DAY' : 'sum'})

In [10]:
df_clue_countto_csv('gs://aes-analytics-0002-curated/Outage_Restoration/Staging/OMS_Clue_Flag_Record.csv',index=False)

Unnamed: 0,Date,NO_OF_POWER_OUT_CLUE_PER_DAY,NO_OF_OPEN_DEVICE_CLUE_PER_DAY,NO_OF_IVR_CLUE_PER_DAY,NO_OF_ANIMAL_CAUSE_PER_DAY,NO_OF_WIRE_OCCURN_PER_DAY
0,2020-09-23,0,0,0,0,0
1,2020-09-24,1,0,0,0,0


In [11]:
df_final = df_final.merge(df_clue_count,how='left',left_on=['Date'],right_on=['Date'])

## **OUTAGE FEATURES**

In [14]:
try:    
    df_pred_outages = 'SELECT OUTAGE_ID,Creation_Time FROM aes-analytics-0002.mds_outage_restoration.IPL_PREDICTIONS_ws where creation_time>='+"'"+str(record_date)+"'"
    df_pred_outages = gbq.read_gbq(df_pred_outages, project_id = "aes-analytics-0002")
    df_pred_outages.reset_index(drop=True,inplace=True)
except:
    df_pred_outages=pd.DataFrame()

In [15]:
df_pred_outages['CREATION_DATETIME']=pd.to_datetime(df_pred_outages.Creation_Time)
df_pred_outages.drop(['Creation_Time'],axis=1,inplace=True)

In [16]:
df_final_outage_count=df_final[['OUTAGE_ID','CREATION_DATETIME']]
df_final_outage_count=df_final_outage_count.append(df_pred_outages)
df_final_outage_count.reset_index(inplace=True)

In [18]:
def count_outage_minutes(group):
    group = group.reset_index(drop = True)
    df_temp = df_final_outage_count[['OUTAGE_ID','CREATION_DATETIME']]
    df_temp['minutes'] = (group['CREATION_DATETIME'][0] - df_final_outage_count['CREATION_DATETIME']).dt.total_seconds().div(60)
    df_temp = df_temp[df_temp.minutes > 0]
    group['Outages_in_last_1hr'] = len(df_temp[df_temp.minutes <= 60])
    group['Outages_in_last_2hr'] = len(df_temp[df_temp.minutes <= 120])
    group['Outages_in_last_3hr'] = len(df_temp[df_temp.minutes <= 180])
    group['Outages_in_last_4hr'] = len(df_temp[df_temp.minutes <= 240])
    group['Outages_in_last_5hr'] = len(df_temp[df_temp.minutes <= 300])
    group['Outages_in_last_6hr'] = len(df_temp[df_temp.minutes <= 360])
    group['Outages_in_last_7hr'] = len(df_temp[df_temp.minutes <= 420])
    group['Outages_in_last_8hr'] = len(df_temp[df_temp.minutes <= 480])
    group['Outages_in_last_9hr'] = len(df_temp[df_temp.minutes <= 540])
    group['Outages_in_last_10hr'] = len(df_temp[df_temp.minutes <= 600])
    return group

def grouping_fn_minutes(df):
    liveoutage = df.groupby(['OUTAGE_ID'], as_index=False).apply(count_outage_minutes)
    return liveoutage
live_outages=grouping_fn_minutes(df_final)

2020-09-25 11:55:30,655 NumExpr defaulting to 8 threads.


In [19]:
live_outages[[ 'Outages_in_last_1hr',
 'Outages_in_last_2hr',
 'Outages_in_last_3hr',
 'Outages_in_last_4hr',
 'Outages_in_last_5hr',
 'Outages_in_last_6hr',
 'Outages_in_last_7hr',
 'Outages_in_last_8hr',
 'Outages_in_last_9hr',
 'Outages_in_last_10hr']]

Unnamed: 0,Outages_in_last_1hr,Outages_in_last_2hr,Outages_in_last_3hr,Outages_in_last_4hr,Outages_in_last_5hr,Outages_in_last_6hr,Outages_in_last_7hr,Outages_in_last_8hr,Outages_in_last_9hr,Outages_in_last_10hr
0,0,0,0,0,0,0,0,0,0,0


In [21]:
df_omsds

Unnamed: 0,INCIDENT_ID,STRCTUR_NO,CIRCT_ID,DNI_EQUIP_TYPE,CALL_QTY,CUST_QTY,KVA_VAL,DOWNSTREAM_KVA_VAL,INCIDENT_DEVICE_ID,CREATION_DATETIME,...,windSpd100mMax,wetBulbMin,wetBulbAvg,wetBulbMax,WIND_DIRECTION,SEASON,weekend_flag,Date,Location,clusters
0,2001543646,321WA/155,1207,SB_FUSE,2,3,0.0,175.0,2002750725,2020-09-23 17:17:36+00:00,...,15.4,54.7,58.2,62.3,S-W-W,SUMMER,0,2020-09-23,Marker 6,Cluster1


In [1]:
%%sh
pip uninstall google-cloud-bigquery --y
http_proxy=http://10.245.5.249:8080
export http_proxy
https_proxy=https://10.245.5.249:8080
export https_proxy
pip install google-cloud-bigquery==1.26.0
pip install google-cloud-bigquery-storage

Uninstalling google-cloud-bigquery-1.27.2:
  Successfully uninstalled google-cloud-bigquery-1.27.2
Collecting google-cloud-bigquery==1.26.0
  Downloading https://files.pythonhosted.org/packages/39/6d/6846ba302c751f72767003b0fc0bf89d43a78fe7fe4149ed6d1b635eb052/google_cloud_bigquery-1.26.0-py2.py3-none-any.whl (170kB)
Collecting google-resumable-media<0.6dev,>=0.5.0 (from google-cloud-bigquery==1.26.0)
  Downloading https://files.pythonhosted.org/packages/f2/cc/cd05c633298fcbba5d61b6b8844de598e001954281a004fc1a13c61a5121/google_resumable_media-0.5.1-py2.py3-none-any.whl
Installing collected packages: google-resumable-media, google-cloud-bigquery
  Found existing installation: google-resumable-media 1.0.0
    Uninstalling google-resumable-media-1.0.0:
      Successfully uninstalled google-resumable-media-1.0.0
Successfully installed google-cloud-bigquery-1.26.0 google-resumable-media-0.5.1
Collecting google-cloud-bigquery-storage
  Downloading https://files.pythonhosted.org/packages/42/9

ERROR: google-cloud-storage 1.31.0 has requirement google-resumable-media<2.0dev,>=1.0.0, but you'll have google-resumable-media 0.5.1 which is incompatible.
ERROR: fairing 0.5.3 has requirement tornado<6.0.0,>=5.1.1, but you'll have tornado 6.0.3 which is incompatible.


In [None]:
df_omsds[df_omsds.isnull().any(axis=1)]