In [127]:
import pandas as pd
from prophet import Prophet
from datetime import datetime
from pandas.api.types import CategoricalDtype
from google.cloud import bigquery
import os

In [100]:
df = pd.read_csv('IWC_Work_Orders_Extract.csv', low_memory=False)

## Clean Data

In [101]:
date_columns = ['EXECUTION_START_DATE', 'EXECUTION_FINISH_DATE', 'EQUIP_START_UP_DATE', 'EQUIP_VALID_FROM', 'EQUIP_VALID_TO']

for col in df.columns:
    col.strip()

for col in date_columns:
    df[col] = pd.to_datetime(df[col], errors='coerce')
df['START_YEAR_WEEK'] = df['EXECUTION_START_DATE'].dt.to_period('W').dt.to_timestamp()

In [102]:
df['MAINTENANCE_ACTIVITY_TYPE'].unique()

array(['Unplanned', 'Planned'], dtype=object)

## Create Aggregation DF

In [103]:
# Separate DataFrames for planned and unplanned maintenance
planned_df = df[df['MAINTENANCE_ACTIVITY_TYPE'] == 'Planned']
unplanned_df = df[df['MAINTENANCE_ACTIVITY_TYPE'] == 'Unplanned']

# Group and calculate aggregates for planned maintenance
planned_agg = planned_df.groupby(['PRODUCTION_LOCATION', 'FUNCTIONAL_AREA_NODE_2_MODIFIED', 'EQUIPMENT_ID']).agg(
    average_minutes_planned=('ACTUAL_WORK_IN_MINUTES', 'mean'),
    count_planned=('ACTUAL_WORK_IN_MINUTES', 'count')
).reset_index()

# Group and calculate aggregates for unplanned maintenance
unplanned_agg = unplanned_df.groupby(['PRODUCTION_LOCATION', 'FUNCTIONAL_AREA_NODE_2_MODIFIED', 'EQUIPMENT_ID']).agg(
    average_minutes_unplanned=('ACTUAL_WORK_IN_MINUTES', 'mean'),
    count_unplanned=('ACTUAL_WORK_IN_MINUTES', 'count')
).reset_index()

# Merge the two aggregated DataFrames on the common columns
agg_df = pd.merge(planned_agg, unplanned_agg, on=['PRODUCTION_LOCATION', 'FUNCTIONAL_AREA_NODE_2_MODIFIED', 'EQUIPMENT_ID'], how='outer')

# Calculate overall average minutes and count from the original DataFrame
overall_agg = df.groupby(['PRODUCTION_LOCATION', 'FUNCTIONAL_AREA_NODE_2_MODIFIED', 'EQUIPMENT_ID']).agg(
    average_minutes=('ACTUAL_WORK_IN_MINUTES', 'mean'),
    count=('ACTUAL_WORK_IN_MINUTES', 'count')
).reset_index()

# Merge with the overall data
agg_df = pd.merge(agg_df, overall_agg, on=['PRODUCTION_LOCATION', 'FUNCTIONAL_AREA_NODE_2_MODIFIED', 'EQUIPMENT_ID'], how='outer')

# Filter the final DataFrame based on conditions
agg_df['time_saved'] = (agg_df['average_minutes_unplanned'] - agg_df['average_minutes_planned']).fillna(0)

planned_vs_unplanned_key = agg_df[(agg_df['count_planned'] > 50) & (agg_df['count_unplanned'] > 50) & (agg_df['time_saved'] > 0)]

In [104]:
planned_vs_unplanned_key.sort_values(by='count', ascending=False).head()

Unnamed: 0,PRODUCTION_LOCATION,FUNCTIONAL_AREA_NODE_2_MODIFIED,EQUIPMENT_ID,average_minutes_planned,count_planned,average_minutes_unplanned,count_unplanned,average_minutes,count,time_saved
149,COTA,CAN LINE,300025792.0,65.223529,782.0,114.04925,3801.0,105.718089,4583,48.825721
1993,ROMA,G812 SHOP / REPAIR AREA,300017655.0,38.2493,1286.0,111.770785,2930.0,89.344639,4216,73.521485
228,COTA,CAN LINE,300115000.0,59.36736,625.0,94.478462,3120.0,88.618798,3745,35.111102
1302,MONZA,BTL_PET_LINE,300001113.0,25.338889,1944.0,160.797733,794.0,64.620964,2738,135.458844
2799,SUZUKA,L4 - FILLER_ROTARY_CAN,300009199.0,41.001697,943.0,146.870423,1704.0,109.154439,2647,105.868726


## Create Training Dataframe

In [105]:
training_df = df.groupby(['PRODUCTION_LOCATION', 'FUNCTIONAL_AREA_NODE_2_MODIFIED', 'EQUIPMENT_ID', 'START_YEAR_WEEK'])['ACTUAL_WORK_IN_MINUTES'].sum().reset_index()

In [106]:
training_df.head()

Unnamed: 0,PRODUCTION_LOCATION,FUNCTIONAL_AREA_NODE_2_MODIFIED,EQUIPMENT_ID,START_YEAR_WEEK,ACTUAL_WORK_IN_MINUTES
0,COTA,AIR SYSTEMS,300025776.0,2017-11-13,60.0
1,COTA,AIR SYSTEMS,300025776.0,2017-11-27,6.0
2,COTA,AIR SYSTEMS,300025776.0,2018-11-26,90.0
3,COTA,AIR SYSTEMS,300025776.0,2019-05-27,60.0
4,COTA,AIR SYSTEMS,300025776.0,2019-10-28,240.0


## Filter to One Line Item to Prepare For Prophet

In [107]:
prod = 'COTA'
func = 'CAN LINE'
equip = 300025792.0

In [108]:
selected_df = training_df[
    (training_df['PRODUCTION_LOCATION'] == prod) &
    (training_df['FUNCTIONAL_AREA_NODE_2_MODIFIED'] == func) &
    (training_df['EQUIPMENT_ID'] == equip)
]

In [109]:
selected_df.head()

Unnamed: 0,PRODUCTION_LOCATION,FUNCTIONAL_AREA_NODE_2_MODIFIED,EQUIPMENT_ID,START_YEAR_WEEK,ACTUAL_WORK_IN_MINUTES
13643,COTA,CAN LINE,300025792.0,2017-10-30,300.0
13644,COTA,CAN LINE,300025792.0,2017-11-06,570.0
13645,COTA,CAN LINE,300025792.0,2017-11-13,636.0
13646,COTA,CAN LINE,300025792.0,2017-11-20,234.0
13647,COTA,CAN LINE,300025792.0,2017-11-27,408.0


In [110]:
prophet_df = selected_df.groupby('START_YEAR_WEEK').agg(
    y=('ACTUAL_WORK_IN_MINUTES', 'sum')
).reset_index().rename(columns={'START_YEAR_WEEK': 'ds'})

In [111]:
prophet_df.head()

Unnamed: 0,ds,y
0,2017-10-30,300.0
1,2017-11-06,570.0
2,2017-11-13,636.0
3,2017-11-20,234.0
4,2017-11-27,408.0


In [112]:
# Define categorical dtype for days of the week
cat_type = CategoricalDtype(categories=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'], ordered=True)

def create_features(df, label=None):
    """
    Creates time series features from a datetime column.
    """
    df = df.copy()
    # Ensure 'ds' is in datetime format
    df['date'] = pd.to_datetime(df['ds'])
    
    # Create features based on the datetime column
    df['dayofweek'] = df['date'].dt.dayofweek
    df['weekday'] = df['date'].dt.day_name()
    df['weekday'] = df['weekday'].astype(cat_type)
    df['quarter'] = df['date'].dt.quarter
    df['month'] = df['date'].dt.month
    df['year'] = df['date'].dt.year
    df['dayofyear'] = df['date'].dt.dayofyear
    df['dayofmonth'] = df['date'].dt.day
    df['date_offset'] = (df['date'].dt.month * 100 + df['date'].dt.day - 320) % 1300

    # Assign season based on date_offset
    df['season'] = pd.cut(df['date_offset'], [0, 300, 602, 900, 1300], labels=['Spring', 'Summer', 'Fall', 'Winter'])
    
    # Select features
    X = df[['ds', 'dayofweek', 'quarter', 'month', 'year', 'dayofyear', 'dayofmonth', 'weekday', 'season']]
    
    # Optionally select label
    if label:
        y = df[label]
        return X, y
    return X

# Run the function
X, y = create_features(prophet_df, label='y')
features_and_target = pd.concat([X, y], axis=1)


In [113]:
prophet_df = pd.concat([X[['ds']], y], axis=1)

In [114]:
model = Prophet()
model.fit(prophet_df)

20:26:16 - cmdstanpy - INFO - Chain [1] start processing
20:26:16 - cmdstanpy - INFO - Chain [1] done processing


<prophet.forecaster.Prophet at 0x11fe6ed20>

## Forecast

In [115]:
periods = 4

future = model.make_future_dataframe(periods=periods, freq='W')
forecast = model.predict(future)

In [116]:
forecast.head()

Unnamed: 0,ds,trend,yhat_lower,yhat_upper,trend_lower,trend_upper,additive_terms,additive_terms_lower,additive_terms_upper,yearly,yearly_lower,yearly_upper,multiplicative_terms,multiplicative_terms_lower,multiplicative_terms_upper,yhat
0,2017-10-30,386.62412,-1016.352223,1212.837976,386.62412,386.62412,-288.984084,-288.984084,-288.984084,-288.984084,-288.984084,-288.984084,0.0,0.0,0.0,97.640035
1,2017-11-06,395.631754,-1061.650637,1196.580228,395.631754,395.631754,-315.15965,-315.15965,-315.15965,-315.15965,-315.15965,-315.15965,0.0,0.0,0.0,80.472104
2,2017-11-13,404.639389,-1106.914568,1198.404081,404.639389,404.639389,-428.66627,-428.66627,-428.66627,-428.66627,-428.66627,-428.66627,0.0,0.0,0.0,-24.026881
3,2017-11-20,413.647023,-1133.050753,1148.214198,413.647023,413.647023,-425.358213,-425.358213,-425.358213,-425.358213,-425.358213,-425.358213,0.0,0.0,0.0,-11.71119
4,2017-11-27,422.654658,-968.022279,1311.846243,422.654658,422.654658,-245.548922,-245.548922,-245.548922,-245.548922,-245.548922,-245.548922,0.0,0.0,0.0,177.105735


## Find Variables

In [117]:
avg_mins = planned_vs_unplanned_key[
    (planned_vs_unplanned_key['PRODUCTION_LOCATION'] == prod) & 
    (planned_vs_unplanned_key['FUNCTIONAL_AREA_NODE_2_MODIFIED'] == func) &
    (planned_vs_unplanned_key['EQUIPMENT_ID'] == equip)
]['average_minutes'].iloc[0]

avg_mins_planned = planned_vs_unplanned_key[
    (planned_vs_unplanned_key['PRODUCTION_LOCATION'] == prod) & 
    (planned_vs_unplanned_key['FUNCTIONAL_AREA_NODE_2_MODIFIED'] == func) &
    (planned_vs_unplanned_key['EQUIPMENT_ID'] == equip)
]['average_minutes_planned'].iloc[0]

avg_mins_unplanned = planned_vs_unplanned_key[
    (planned_vs_unplanned_key['PRODUCTION_LOCATION'] == prod) & 
    (planned_vs_unplanned_key['FUNCTIONAL_AREA_NODE_2_MODIFIED'] == func) &
    (planned_vs_unplanned_key['EQUIPMENT_ID'] == equip)
]['average_minutes_unplanned'].iloc[0]

In [118]:
print(avg_mins)
print(avg_mins_unplanned)
print(avg_mins_planned)

105.71808858826097
114.0492501973165
65.22352941176472


## Create Features in Prophet Output

In [119]:
forecast['count'] = forecast['yhat']/avg_mins

In [120]:
forecast.head()

Unnamed: 0,ds,trend,yhat_lower,yhat_upper,trend_lower,trend_upper,additive_terms,additive_terms_lower,additive_terms_upper,yearly,yearly_lower,yearly_upper,multiplicative_terms,multiplicative_terms_lower,multiplicative_terms_upper,yhat,count
0,2017-10-30,386.62412,-1016.352223,1212.837976,386.62412,386.62412,-288.984084,-288.984084,-288.984084,-288.984084,-288.984084,-288.984084,0.0,0.0,0.0,97.640035,0.923589
1,2017-11-06,395.631754,-1061.650637,1196.580228,395.631754,395.631754,-315.15965,-315.15965,-315.15965,-315.15965,-315.15965,-315.15965,0.0,0.0,0.0,80.472104,0.761195
2,2017-11-13,404.639389,-1106.914568,1198.404081,404.639389,404.639389,-428.66627,-428.66627,-428.66627,-428.66627,-428.66627,-428.66627,0.0,0.0,0.0,-24.026881,-0.227273
3,2017-11-20,413.647023,-1133.050753,1148.214198,413.647023,413.647023,-425.358213,-425.358213,-425.358213,-425.358213,-425.358213,-425.358213,0.0,0.0,0.0,-11.71119,-0.110778
4,2017-11-27,422.654658,-968.022279,1311.846243,422.654658,422.654658,-245.548922,-245.548922,-245.548922,-245.548922,-245.548922,-245.548922,0.0,0.0,0.0,177.105735,1.675264


In [121]:
forecast['time_planned'] = forecast['count'] * avg_mins_planned
forecast['time_unplanned'] = forecast['count'] * avg_mins_unplanned

In [122]:
forecast.head()

Unnamed: 0,ds,trend,yhat_lower,yhat_upper,trend_lower,trend_upper,additive_terms,additive_terms_lower,additive_terms_upper,yearly,yearly_lower,yearly_upper,multiplicative_terms,multiplicative_terms_lower,multiplicative_terms_upper,yhat,count,time_planned,time_unplanned
0,2017-10-30,386.62412,-1016.352223,1212.837976,386.62412,386.62412,-288.984084,-288.984084,-288.984084,-288.984084,-288.984084,-288.984084,0.0,0.0,0.0,97.640035,0.923589,60.239717,105.334602
1,2017-11-06,395.631754,-1061.650637,1196.580228,395.631754,395.631754,-315.15965,-315.15965,-315.15965,-315.15965,-315.15965,-315.15965,0.0,0.0,0.0,80.472104,0.761195,49.647839,86.813744
2,2017-11-13,404.639389,-1106.914568,1198.404081,404.639389,404.639389,-428.66627,-428.66627,-428.66627,-428.66627,-428.66627,-428.66627,0.0,0.0,0.0,-24.026881,-0.227273,-14.823556,-25.92033
3,2017-11-20,413.647023,-1133.050753,1148.214198,413.647023,413.647023,-425.358213,-425.358213,-425.358213,-425.358213,-425.358213,-425.358213,0.0,0.0,0.0,-11.71119,-0.110778,-7.225302,-12.634096
4,2017-11-27,422.654658,-968.022279,1311.846243,422.654658,422.654658,-245.548922,-245.548922,-245.548922,-245.548922,-245.548922,-245.548922,0.0,0.0,0.0,177.105735,1.675264,109.266648,191.062632


In [148]:
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "swire-capstone-2024-26aef470ddfb.json"

client = bigquery.Client()

# Set your BigQuery table ID (in the format `project.dataset.table`)
table_id = "swire-capstone-2024.swire_provided.prophet"

# Upload data to BigQuery
job = client.load_table_from_dataframe(forecast, table_id)

# Wait for the job to complete
job.result()

print(f"Data successfully uploaded to {table_id}.")

Data successfully uploaded to swire-capstone-2024.swire_provided.prophet.
