In [None]:
! pip install -q stumpy

In [None]:
from IPython.display import Image,display
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
from time import sleep
from os.path import exists
from datetime import datetime

if not exists('/content/Machine-Learning-For-Manufacturing/Data/o.csv') or not exists('/content/Machine-Learning-For-Manufacturing/Data/pi.csv'):
  ! git clone https://github.com/d0c0nn0r/Machine-Learning-For-Manufacturing
! chmod ogu+rwx /content/Machine-Learning-For-Manufacturing/data/*.*

# **Introduction**

Setup is done first, to import the dependency libraries and data sets into the notebook.

Pandas is used to handle importing CSV data sets.
Plotly is used for all graphing and outputs.
Stumpy is used for all time-series related mathematical calculations, profiling and analysis.

## **Load data**

Our predefined data sets are hosted in 3 csv files.
* o.csv: OPC Data for a specific piece of I/O
* pc.csv: Compressed data (i.e. archived values) for the corresponding PI Tag of the OPC Data captured in the previous file.
* pi.csv: Interpolated data for the corresponding PI tag of the OPC Data, measured at 1-second intervals.

The timestamp field is defined and consolidated across all datasets. Selecting and aligning the OPC and PI Data timestamps is important, as it will be required later.

In [None]:
import pandas as pd
import stumpy

import plotly
import plotly.graph_objects as go
from plotly.graph_objs import Scatter, Layout
from plotly.subplots import make_subplots

# Shared Plotly layout applied to every figure in the notebook:
# transparent backgrounds and a uniform Courier New font.
layout = Layout(
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    font_family="Courier New",
    font_size=12,
    font_color="#a5b1cd",
    title_font_family="Courier New",
    title_font_color="black",
    title_font_size=12,
    uirevision=True,
    autosize=True
)

# Load the three data sets: OPC data, compressed PI data, interpolated PI data.
o=pd.read_csv('/content/Machine-Learning-For-Manufacturing/Data/o.csv')
pc=pd.read_csv('/content/Machine-Learning-For-Manufacturing/Data/pc.csv')
pi=pd.read_csv('/content/Machine-Learning-For-Manufacturing/Data/pi.csv')

# Give every frame a datetime 'TimeStamp' column.
# 13:00 is local time!
o['TimeStamp']=pd.to_datetime(o['TimeStamp'].values)
# The compressed data set takes its timestamp from the 'LocalDateTime' column.
pc['TimeStamp']=pd.to_datetime(pc['LocalDateTime'].values)
pi['TimeStamp']=pd.to_datetime(pi['TimeStamp'].values)

# Filter out rows whose Value cannot be parsed as a number
pi = pi[pd.to_numeric(pi.Value, errors='coerce').notnull()]
# convert value column to float type
pi = pi.astype({'Value':'float64'})

# Copy of the interpolated PI data with every timestamp moved 30 seconds later;
# used further down to demonstrate the handling of clock drift.
# pd.Timedelta is used because `timedelta` is not imported in this notebook.
pi_shifted = pi.copy()
pi_shifted['TimeStamp'] = pi_shifted['TimeStamp'] + pd.Timedelta(seconds=30)

# Only the last expression of a cell is rendered automatically, so the first
# preview has to be displayed explicitly.
display(pi.head())
pi_shifted.head()

## **Analyzing Signal Similarity**

After loading the data, the [MASS Distance Profile](https://stumpy.readthedocs.io/en/latest/api.html#mass) is calculated.
This distance measure is computed by measuring the Euclidean distance between the OPC data signal and the Interpolated PI Data.

The distance profile is returned as an array of distance measures, at every position in the Dataset arrays (i.e. OPC data and PI data). 

To account for possible 'client-time-drift' between the OSI-PI Interface capturing values into PI Data Historian, and the OPC Client used to capture our data set, we use this distance profile to find the "best-fit" for overlaying the signals on top of each other.

Once the "best-fit" signal overlay has been found, we now know the exact starting index (i.e. timestamp) where overlaying the Interpolated PI Data and OPC data matches best. 
Interestingly, this also allows us to measure the 'clock-drift' between our OPC Client and the PI Interface as they acquire/read/poll/advise data from a source.


In [None]:
# Index every frame by its timestamp. Each frame is checked on its own so the
# cell can be re-run safely. pi_shifted is included as well: it was copied
# from pi before pi was indexed, and the time-shifted report further down
# reads timestamps from its index.
# DOC: the PI data is NOT resampled; the interpolated data set is used as-is.
if o.index.name != 'TimeStamp':
  o = o.set_index('TimeStamp')
if pc.index.name != 'TimeStamp':
  pc = pc.set_index('TimeStamp')
if pi.index.name != 'TimeStamp':
  pi = pi.set_index('TimeStamp')
if pi_shifted.index.name != 'TimeStamp':
  pi_shifted = pi_shifted.set_index('TimeStamp')

# Resample the OPC data, as we expect a value at every single second.
# It can occur that data does not arrive every second, so we re-sample and
# fill the gaps by linear interpolation.
# ('1s' is the non-deprecated spelling of the former '1S' alias.)
o = o.resample('1s').interpolate(method='linear')

# Resample the COMPRESSED data as well, to get a pandas "interpolated"
# value at every 1 second.
pc = pc.resample('1s').interpolate(method='linear')

# Alignment using stumpy: the OPC signal is the query that is slid along each
# PI series. The position of the smallest (non-normalized) distance is the
# best-fit start index of the OPC signal inside that series.

# DISTANCE using PI INTERPOLATED VALUES
distance_profile = stumpy.mass(o['Value'].values,
                               pi['Value'].values,
                               normalize=False
                               )
index_min = int(distance_profile.argmin())

# DISTANCE using PANDAS INTERPOLATED VALUES
distance_profile2 = stumpy.mass(o['Value'].values,
                                pc['Value'].values,
                                normalize=False
                                )
index_min2 = int(distance_profile2.argmin())

# DISTANCE using time-shifted dataframe
distance_profile3 = stumpy.mass(o['Value'].values,
                                pi_shifted['Value'].values,
                                normalize=False
                                )
index_min3 = int(distance_profile3.argmin())

# Length of the (resampled) OPC signal; used to cut the matching PI window.
n = len(o['Value'].values)

## **Visualizing Signal Similarity: Using Interpolated Data**

Next, we want to create 3 graphs to illustrate our data.

### **Graph #1: Original Data**

We print 2 signals. 
* The "pi interpolated" represents the Interpolated Data from the PI Data Historian.
* The "opc" represents the OPC Data received from the underlying device or PLC, through the OPC Server.

### **Graph #2: Distance Profile**

We print the distance profile, which measures the similarity of the signals over the entire time period.
The closer to 0 the value is, the more similar the signals are at that point in time. i.e. 0=100% matching.
This graph does not "line-up" perfectly with Graph #1.

### **Graph #3: Time-Aligned Overlay**

We print the re-aligned PI Interpolated data and OPC Data sets.
On this graph, the signals are overlaid on top of each other for the 'best-fit' time period. The best-fit time period is the one identified by the Distance Profile in Graph #2.

## **Statistics**

### Max Error

Max error computes the maximum residual error between 2 data sets: the PI Interpolated data set, and the OPC data set.
The value returned is the maximum difference between 2 corresponding values at the same index in the data sets.
For more detail, see [sklearn.metrics documentation](https://scikit-learn.org/stable/modules/model_evaluation.html#max-error)

### Explained Variance Score

The explained variance score measures the proportion of the variability in the OPC data set that is accounted for by the PI Interpolated data set.
The closer this value is to 100%, the more closely the 2 data sets agree.
For more detail, see [sklearn.metrics documentation](https://scikit-learn.org/stable/modules/model_evaluation.html#explained-variance-score)

### Mean Absolute Error

This refers to the magnitude of difference between the prediction of an observation and the true value of that observation. MAE takes the average of absolute errors for a group of predictions and observations as a measurement of the magnitude of errors for the entire group. MAE can also be referred as L1 loss function.

For more detail, see [sklearn.metrics documentation](https://scikit-learn.org/stable/modules/model_evaluation.html#mean-absolute-error)

### Mean Squared Error

The Mean Squared Error measures how close a regression line is to a set of data points. It is a risk function corresponding to the expected value of the squared error loss. A larger MSE indicates that the data points are dispersed widely around its central moment (mean), whereas a smaller MSE suggests the opposite. A smaller MSE is preferred because it indicates that your data points are dispersed closely around its central moment (mean).
For more detail, see [sklearn.metrics documentation](https://scikit-learn.org/stable/modules/model_evaluation.html#mean-squared-error)

### Median Absolute Error

The median_absolute_error is particularly interesting because it is robust to outliers. The loss is calculated by taking the median of all absolute differences between the target and the prediction.

For more detail, see [sklearn.metrics documentation](https://scikit-learn.org/stable/modules/model_evaluation.html#median-absolute-error)

In [None]:
# Signals compared in this report: the OPC values and the window of
# interpolated PI values starting at the best-fit index.
y_true = o['Value'].values
y_pred = pi['Value'].values[index_min:index_min+n]

# Three stacked panels: raw signals, distance profile, best-fit overlay.
fig = make_subplots(rows=3, cols=1, shared_xaxes=False)
fig.update_layout(layout)

panels = [
    # Panel 1: both signals plotted against their own timestamps.
    (go.Scatter(x=pi.index, y=pi['Value'], name='pi interpolated'), 1),
    (go.Scatter(x=o.index, y=o['Value'], name='opc'), 1),
    # Panel 2: distance of the OPC signal to every position in the PI series.
    (go.Scatter(y=distance_profile, name='distance_profile'), 2),
    # Panel 3: OPC signal over the best-matching PI window (sample positions).
    (go.Scatter(y=y_true, name='opc-aligned'), 3),
    (go.Scatter(y=y_pred, name='pi int.-aligned'), 3),
]
for trace, panel_row in panels:
    fig.add_trace(trace, row=panel_row, col=1)
fig.show()

# Error metrics between the OPC signal and the matched PI window.
from sklearn.metrics import *

# Timestamps at which the two overlaid signals start.
first_opc_ts = o.index[0]
matched_pic_ts = pi.index[index_min]
ts_format = "%d-%b-%y %H:%M:%S.%f"

largest_residual = max_error(y_true, y_pred)
variance_explained = explained_variance_score(y_true, y_pred)
mean_abs = mean_absolute_error(y_true, y_pred)
mean_sq = mean_squared_error(y_true, y_pred)
median_abs = median_absolute_error(y_true, y_pred)

print('max_error: ', largest_residual)
print('explained_variance_score: {:.3%}'.format(variance_explained))
print('mean_absolute_error: ', mean_abs)
print('mean_squared_error: ', mean_sq)
print('median_absolute_error ', median_abs)
print('OPC Signal Start Time: ', first_opc_ts.strftime(ts_format))
print('PI Signal Start Time: ', matched_pic_ts.strftime(ts_format))

# Time shifted

Let's see the output using the time-shifted data set.
This should showcase how clock-drift is managed by the functions.


In [None]:
# pi_shifted was copied from pi before pi was indexed by 'TimeStamp', so it may
# still carry its original integer index. Index it by timestamp here (a no-op
# when that has already happened) so that panel 1 is plotted against time and
# the matched start position can be formatted as a timestamp.
if pi_shifted.index.name != 'TimeStamp':
  pi_shifted = pi_shifted.set_index('TimeStamp')

# Three stacked panels: raw signals, distance profile, best-fit overlay.
fig = make_subplots(rows=3, cols=1, shared_xaxes=False)
fig.update_layout(layout)

# Panel 1: time-shifted PI data and OPC data against their own timestamps.
fig.add_trace(go.Scatter(x=pi_shifted.index,y=pi_shifted['Value'], name='pi interpolated'), row=1, col=1)
fig.add_trace(go.Scatter(x=o.index,y=o['Value'], name='opc'),row=1, col=1)

# Panel 2: distance of the OPC signal to every position in the shifted PI series.
fig.add_trace(go.Scatter(y=distance_profile3, name='distance_profile'),row=2, col=1)

# Panel 3: OPC signal over the best-matching window of the shifted PI series.
fig.add_trace(go.Scatter(y=o['Value'].values, name='opc-aligned'),row=3, col=1)
fig.add_trace(go.Scatter(y=pi_shifted['Value'].values[index_min3:index_min3+n], name='pi int.-aligned'),row=3, col=1)
fig.show()

# Error metrics between the OPC signal and the matched (shifted) PI window.
# NOTE(review): the wildcard import is kept because other cells may rely on
# names it provides; prefer importing the five metrics explicitly.
from sklearn.metrics import *

y_true          = o['Value'].values
y_pred          = pi_shifted['Value'].values[index_min3:index_min3+n]
# Timestamps at which the two overlaid signals start.
first_opc_ts    = o.iloc[0].name
matched_pic_ts  = pi_shifted.iloc[index_min3].name

print('max_error: ',max_error(y_true, y_pred))
print('explained_variance_score: {:.3%}'.format(explained_variance_score(y_true, y_pred)))
print('mean_absolute_error: ',mean_absolute_error(y_true, y_pred))
print('mean_squared_error: ',mean_squared_error(y_true, y_pred))
print('median_absolute_error ',median_absolute_error(y_true, y_pred))
print('OPC Signal Start Time: ', first_opc_ts.strftime("%d-%b-%y %H:%M:%S.%f"))
print('PI Signal Start Time: ', matched_pic_ts.strftime("%d-%b-%y %H:%M:%S.%f"))