# Outage Duration Prediction

**Name(s)**: Neil Sharma, Xiang Ding

**Website Link**: (your website link)

## Code

In [4]:
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

import plotly.express as px
import plotly.graph_objects as go
pd.options.plotting.backend = 'plotly'

### Framing the Problem

Prediction Problem Type: This problem type is regression, as we are trying to predict a continuous quantity, in this case how long an outage occurs.

Response Variable: Our response variable will be, in minutes, how long an outage occurs for. This value can be seen in the dataset as 'OUTAGE.DURATION'

Metric: We will use mean squared error (MSE) loss as our metric for evaluation. We decided on MSE due to its sensitivity to outliers and its ability to be easily understood as it maintains the same units as our response variable, 'OUTAGE.DURATION'.

In [43]:
##################################################
#     DATA CLEANING CODE FROM PROJECT 3          #
##################################################
df = pd.read_excel('outage.xlsx', skiprows = 5)

df = df.set_index('OBS')
df = df.iloc[1: , :]
df = df[df.columns[1:]]

df['OUTAGE.START.DATE'] = pd.to_datetime(df['OUTAGE.START.DATE'])
df['OUTAGE.START.DATE'] = df['OUTAGE.START.DATE'].dt.date

df['OUTAGE.RESTORATION.DATE'] = pd.to_datetime(df['OUTAGE.RESTORATION.DATE'])
df['OUTAGE.RESTORATION.DATE'] = df['OUTAGE.RESTORATION.DATE'].dt.date
df['CUSTOMERS.AFFECTED_MISSING'] = df['CUSTOMERS.AFFECTED'].isna().astype(int)


df = df[["YEAR", "MONTH", 'NERC.REGION', 'CLIMATE.REGION', 'ANOMALY.LEVEL', 'CLIMATE.CATEGORY', 
         'CAUSE.CATEGORY', 'OUTAGE.DURATION', 'DEMAND.LOSS.MW', 'CUSTOMERS.AFFECTED',
         'OUTAGE.START.TIME', 'OUTAGE.RESTORATION.TIME', 'POPULATION']]
df = df.dropna(subset=['ANOMALY.LEVEL', 'CLIMATE.CATEGORY', 'OUTAGE.DURATION', 'OUTAGE.START.TIME', 'OUTAGE.RESTORATION.TIME', 'CUSTOMERS.AFFECTED'])


def categorize_time(time_str):
    if pd.isna(time_str):
        return 'Unknown'  
    time = pd.to_datetime(time_str, format='%I:%M:%S %p', errors='coerce')
    if time.hour < 12:
        return 'Morning'
    else:
        return 'Afternoon/Evening'

# Apply the function to create new columns
df['OUTAGE.START.CATEGORY'] = df['OUTAGE.START.TIME'].apply(categorize_time)
df['OUTAGE.END.CATEGORY'] = df['OUTAGE.RESTORATION.TIME'].apply(categorize_time)


pd.set_option('display.max_columns', None)
df

Unnamed: 0_level_0,YEAR,MONTH,NERC.REGION,CLIMATE.REGION,ANOMALY.LEVEL,CLIMATE.CATEGORY,CAUSE.CATEGORY,OUTAGE.DURATION,DEMAND.LOSS.MW,CUSTOMERS.AFFECTED,OUTAGE.START.TIME,OUTAGE.RESTORATION.TIME,POPULATION,OUTAGE.START.CATEGORY,OUTAGE.END.CATEGORY
OBS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1.0,2011.0,7.0,MRO,East North Central,-0.3,normal,severe weather,3060,,70000.0,17:00:00,20:00:00,5348119.0,Afternoon/Evening,Afternoon/Evening
3.0,2010.0,10.0,MRO,East North Central,-1.5,cold,severe weather,3000,,70000.0,20:00:00,22:00:00,5310903.0,Afternoon/Evening,Afternoon/Evening
4.0,2012.0,6.0,MRO,East North Central,-0.1,normal,severe weather,2550,,68200.0,04:30:00,23:00:00,5380443.0,Afternoon/Evening,Afternoon/Evening
5.0,2015.0,7.0,MRO,East North Central,1.2,warm,severe weather,1740,250,250000.0,02:00:00,07:00:00,5489594.0,Afternoon/Evening,Afternoon/Evening
6.0,2010.0,11.0,MRO,East North Central,-1.4,cold,severe weather,1860,,60000.0,15:00:00,22:00:00,5310903.0,Afternoon/Evening,Afternoon/Evening
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1523.0,2004.0,6.0,WECC,Northwest,0.3,normal,system operability disruption,95,157,35000.0,17:35:00,19:10:00,1391802.0,Afternoon/Evening,Afternoon/Evening
1524.0,2011.0,1.0,WECC,Northwest,-1.3,cold,intentional attack,360,0,0.0,07:00:00,13:00:00,1584134.0,Afternoon/Evening,Afternoon/Evening
1525.0,2003.0,6.0,WECC,Northwest,-0.1,normal,public appeal,1548,0,0.0,15:12:00,17:00:00,1363380.0,Afternoon/Evening,Afternoon/Evening
1527.0,2016.0,3.0,WECC,Northwest,1.6,warm,intentional attack,0,0,0.0,00:00:00,00:00:00,1680026.0,Afternoon/Evening,Afternoon/Evening


### Baseline Model

In [46]:
X = df[['ANOMALY.LEVEL', 'CUSTOMERS.AFFECTED','OUTAGE.START.CATEGORY', 'OUTAGE.END.CATEGORY']]
y = df['OUTAGE.DURATION']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Preprocessing for numerical data: no transformation needed
# Preprocessing for categorical data: OneHotEncoder
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), ['OUTAGE.START.CATEGORY', 'OUTAGE.END.CATEGORY']),
        ('standard', StandardScaler(), ['CUSTOMERS.AFFECTED', 'ANOMALY.LEVEL'])
    ])

# Create a pipeline
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', LinearRegression())])

# Train the model
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 14099309.776195405


In [45]:
trace0 = go.Scatter(
    x=np.arange(len(y_test)),
    y=y_test,
    mode='markers',
    name='Actual Values'
)
trace1 = go.Scatter(
    x=np.arange(len(y_pred)),
    y=y_pred,
    mode='markers',
    name='Predicted Values'
)

# Create layout
layout = go.Layout(
    title='Actual vs Predicted Values',
    xaxis=dict(title='Index'),
    yaxis=dict(title='Outage Duration')
)

# Create figure and add traces
fig = go.Figure(data=[trace0, trace1], layout=layout)

# Show plot
fig.show()

### Final Model

In [None]:
# TODO

### Fairness Analysis

In [None]:
# TODO