In [7]:
! pip install -q stumpy

In [8]:
from IPython.display import Image,display
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
from time import sleep
from os.path import exists
from datetime import datetime

if not exists('/content/Machine-Learning-For-Manufacturing/Data/o.csv') or not exists('/content/Machine-Learning-For-Manufacturing/Data/pi.csv'):
  ! git clone https://github.com/d0c0nn0r/Machine-Learning-For-Manufacturing
! chmod ogu+rwx /content/Machine-Learning-For-Manufacturing/data/*.*

chmod: cannot access '/content/Machine-Learning-For-Manufacturing/data/*.*': No such file or directory


# **Introduction**

Setup is done first, to import the dependency libraries and data sets into the notebook.

Pandas is used to handle importing CSV data sets.
Plotly is used for all graphing and outputs.
Stumpy is used for all time-series related mathematical calculations, profiling and analysis.

## **Load data**

Our predefined data sets are hosted in 3 csv files.
* o.csv: OPC Data for a specific piece of I/O
* pc.csv: Compressed data (i.e. archived values) for the corresponding PI Tag of the OPC Data captured in the previous file.
* pi.csv: Interpolated data for the corresponding PI tag of the OPC Data, measured at 1-second intervals.

The timestamp field is defined and consolidated across all datasets. Selecting and aligning the OPC and PI Data timestamps is important, as it will be required later.

In [9]:
import pandas as pd
import stumpy

import plotly
import plotly.graph_objects as go
from plotly.graph_objs import Scatter, Layout
from plotly.subplots import make_subplots

# Shared styling applied to every figure below: transparent backgrounds and
# Courier New text (grey-blue body font, black title font).
layout = Layout(
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    font=dict(family="Courier New", size=12, color="#a5b1cd"),
    title=dict(font=dict(family="Courier New", color="black", size=12)),
    uirevision=True,
    autosize=True,
)

# Read the three CSV exports: OPC data (o), PI compressed data (pc) and
# PI interpolated data (pi).
o, pc, pi = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/Machine-Learning-For-Manufacturing/Data/o.csv',
        '/content/Machine-Learning-For-Manufacturing/Data/pc.csv',
        '/content/Machine-Learning-For-Manufacturing/Data/pi.csv',
    )
)

# Give every frame a parsed 'TimeStamp' column. The compressed export keeps its
# time in 'LocalDateTime'. Note from the original author: 13:00 is local time.
o['TimeStamp'] = pd.to_datetime(o['TimeStamp'].values)
pc['TimeStamp'] = pd.to_datetime(pc['LocalDateTime'].values)
pi['TimeStamp'] = pd.to_datetime(pi['TimeStamp'].values)

# Keep only the rows of the interpolated data whose Value parses as a number,
# then store that column as float64.
is_numeric_value = pd.to_numeric(pi['Value'], errors='coerce').notnull()
pi = pi[is_numeric_value].astype({'Value': 'float64'})

pi.head()

Unnamed: 0,TimeStamp,Value
0,2022-09-01 13:00:00,53.89225
1,2022-09-01 13:00:01,53.89875
2,2022-09-01 13:00:02,53.982708
3,2022-09-01 13:00:03,54.066666
4,2022-09-01 13:00:04,54.150623


## **Analyzing Signal Similarity**

After loading the data, the [MASS Distance Profile](https://stumpy.readthedocs.io/en/latest/api.html#mass) is calculated.
This distance measure is computed by measuring the Euclidean distance of the OPC data signal at every value-position in the PI Compressed Data.

The distance profile is returned as an array of distance measures, at every position in the PI Compressed Data set. So, to find the "best-fit", we find the index-position of the minimum value, and that identifies the exact position then in the PI Compressed Data set, which is the most similar to the OPC Data. 
Knowing this starting point, we can then overlay the data, and also compare the latency in time between the data sets.


In [15]:
# Index every data set by its timestamp. The guard keeps the cell re-runnable:
# once 'TimeStamp' is the index it is no longer available as a column.
if o.index.name!='TimeStamp':
  o.set_index('TimeStamp',inplace=True)
  pc.set_index('TimeStamp',inplace=True)
  pi.set_index('TimeStamp',inplace=True)

# The OPC data is deliberately NOT resampled (the interpolated PI data set is
# used instead). Only the compressed PI data is put onto a 1-second grid, with
# linear interpolation filling the gaps. '1s' is the lowercase alias; the
# uppercase 'S' spelling is deprecated in recent pandas releases.
pc=pc.resample('1s').interpolate(method='linear')

# Alignment using stumpy: non-normalized distance profile of the OPC signal
# against the interpolated PI data.
distance_profile = stumpy.mass(o['Value'].values,
                               pi['Value'].values,
                               normalize=False
                               )
# Position of the smallest distance, i.e. the best match (first one on ties).
index_min = int(distance_profile.argmin())

# Same computation against the resampled compressed PI data.
distance_profile2 = stumpy.mass(o['Value'].values,
                               pc['Value'].values,
                               normalize=False
                               )
index_min2 = int(distance_profile2.argmin())

# Length of the OPC signal; used later as the width of the matched window.
n=len(o['Value'].values)

## **Visualizing Signal Similarity: Using Interpolated Data**

Next, we want to create 3 graphs to illustrate our data.

### **Graph #1: Original Data**

We print 2 signals. 
* The "pi interpolated" represents the Interpolated Data from the PI Data Historian.
* The "opc" represents the OPC Data received from the underlying device or PLC, through the OPC Server.

### **Graph #2: Distance Profile**

We print the distance profile, which measures the similarity of the signals over the entire time period.
The closer to 0 the value is, the more similar the signals are at that point in time. i.e. 0=100% matching.
This graph does not "line up" perfectly with Graph #1, because it is plotted against position in the data set rather than against the timestamp.

### **Graph #3: Time-Aligned Overlay**

We print the re-aligned PI Interpolated data and OPC Data sets.
On this graph, the signals are overlaid on top of each other for the 'best-fit' time period. The best-fit time period is the one identified by Graph #2, the Distance Profile.

In [11]:
# Three stacked panels for the interpolated PI data:
# row 1 = raw signals, row 2 = distance profile, row 3 = best-fit overlay.
fig = make_subplots(rows=3, cols=1, shared_xaxes=False)
fig.update_layout(layout)

aligned_pi_values = pi['Value'].values[index_min:index_min+n]
panel_traces = [
  (1, go.Scatter(x=pi.index, y=pi['Value'], name='pi interpolated')),
  (1, go.Scatter(x=o.index, y=o['Value'], name='opc')),
  (2, go.Scatter(y=distance_profile, name='distance_profile')),
  (3, go.Scatter(y=o['Value'].values, name='opc-aligned')),
  (3, go.Scatter(y=aligned_pi_values, name='pi int.-aligned')),
]
for panel_row, panel_trace in panel_traces:
  fig.add_trace(panel_trace, row=panel_row, col=1)
fig.show()

In [16]:
# Three stacked panels for the compressed PI data:
# row 1 = raw signals, row 2 = distance profile, row 3 = best-fit overlay.
fig = make_subplots(rows=3, cols=1, shared_xaxes=False)
fig.update_layout(layout)

aligned_pc_values = pc['Value'].values[index_min2:index_min2+n]
panel_traces = [
  (1, go.Scatter(x=pc.index, y=pc['Value'], name='pi compressed')),
  (1, go.Scatter(x=o.index, y=o['Value'], name='opc')),
  (2, go.Scatter(y=distance_profile2, name='distance_profile')),
  (3, go.Scatter(y=o['Value'].values, name='opc-aligned')),
  (3, go.Scatter(y=aligned_pc_values, name='pc comp.-aligned')),
]
for panel_row, panel_trace in panel_traces:
  fig.add_trace(panel_trace, row=panel_row, col=1)
fig.show()

## **Final Results**

We print out some statistical results from the data set analysis.

### **Max Error**

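The largest absolute difference between an OPC value and its time-aligned PI value, i.e. the worst-case error; 0 means a perfect match.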

### **Explained Variance Score**

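The proportion of the variance in the OPC signal that is accounted for by the time-aligned PI signal; the best possible score is 1.0.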

### **Mean Absolute Error**

The average of the absolute differences between the OPC values and the time-aligned PI values; lower is better.

### **Mean Squared Error**

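The average of the squared differences between the OPC values and the time-aligned PI values; large deviations are penalized more heavily than small ones.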

### **Median Absolute Error**

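The median of the absolute differences between the OPC values and the time-aligned PI values; unlike the mean, it is robust to a few large outliers.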


In [12]:
# Error metrics between the OPC signal and the best-matching window of the
# interpolated PI data. The metrics are imported by name instead of with a
# wildcard, so it is clear where each function comes from.
from sklearn.metrics import (
    explained_variance_score,
    max_error,
    mean_absolute_error,
    mean_squared_error,
    median_absolute_error,
)

# y_true is the OPC signal; y_pred is the equally long PI window that starts
# at the best-match position.
y_true=o['Value'].values
y_pred=pi['Value'].values[index_min:index_min+n]
# Index labels (timestamps) of the first OPC sample and of the matched PI sample.
first_opc_ts=o.index[0]
matched_pic_ts=pi.index[index_min]

print('max_error: ',max_error(y_true, y_pred))
print('explained_variance_score: ',explained_variance_score(y_true, y_pred))
print('mean_absolute_error: ',mean_absolute_error(y_true, y_pred))
print('mean_squared_error: ',mean_squared_error(y_true, y_pred))
print('median_absolute_error ',median_absolute_error(y_true, y_pred))
print('OPC Signal Start Time: ', first_opc_ts.strftime("%d-%b-%y %H:%M:%S.%f"))
print('PI Signal Start Time: ', matched_pic_ts.strftime("%d-%b-%y %H:%M:%S.%f"))

max_error:  1.7631296899999995
explained_variance_score:  0.8196702412160409
mean_absolute_error:  0.36358510524253723
mean_squared_error:  0.3488542236328387
median_absolute_error  0.14625265499999784
OPC Signal Start Time:  01-Sep-22 13:24:57.000000
PI Signal Start Time:  01-Sep-22 13:25:20.000000
