Merge Notebooks #43

Open
wants to merge 8 commits into base: master
Changes from all commits
20,000 changes: 20,000 additions & 0 deletions datasets/anomaly/raw_data/500_UCR_Anomaly_robotDOG1_10000_19280_19360.txt

Large diffs are not rendered by default.

2,438 changes: 2,438 additions & 0 deletions datasets/anomaly/raw_data/blockchain.csv

Large diffs are not rendered by default.

2,706 changes: 0 additions & 2,706 deletions examples/Demo Notebook/TODS Official Demo Notebook.ipynb

This file was deleted.

2,127 changes: 2,127 additions & 0 deletions examples/Demo Notebook/TODSBlockchainNotebook.ipynb

Large diffs are not rendered by default.

2,028 changes: 2,028 additions & 0 deletions examples/Demo Notebook/TODSOfficialNotebook.ipynb

Large diffs are not rendered by default.

@@ -0,0 +1,78 @@
from d3m import index
from d3m.metadata.base import ArgumentType
from d3m.metadata.pipeline import Pipeline, PrimitiveStep

# Pipeline: dataset_to_dataframe -> column_parser -> extract_columns_by_semantic_types(attributes) -> statistical_maximum -> sod_primitive -> construct_predictions
#           extract_columns_by_semantic_types(targets) -> sod_primitive (as 'outputs')

# Creating pipeline
pipeline_description = Pipeline()
pipeline_description.add_input(name='inputs')

# Step 0: dataset_to_dataframe
step_0 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.data_processing.dataset_to_dataframe'))
step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0')
step_0.add_output('produce')
pipeline_description.add_step(step_0)

# Step 1: column_parser
step_1 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.data_processing.column_parser'))
step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce')
step_1.add_output('produce')
pipeline_description.add_step(step_1)

# Step 2: extract_columns_by_semantic_types(attributes)
step_2 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.data_processing.extract_columns_by_semantic_types'))
step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce')
step_2.add_output('produce')
step_2.add_hyperparameter(name='semantic_types', argument_type=ArgumentType.VALUE,
                          data=['https://metadata.datadrivendiscovery.org/types/Attribute'])
pipeline_description.add_step(step_2)

# Step 3: extract_columns_by_semantic_types(targets)
step_3 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.data_processing.extract_columns_by_semantic_types'))
step_3.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce')
step_3.add_output('produce')
step_3.add_hyperparameter(name='semantic_types', argument_type=ArgumentType.VALUE,
                          data=['https://metadata.datadrivendiscovery.org/types/TrueTarget'])
pipeline_description.add_step(step_3)

attributes = 'steps.2.produce'
targets = 'steps.3.produce'

# Step 4: processing
#step_4 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.timeseries_processing.transformation.axiswise_scaler'))
step_4 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.feature_analysis.statistical_maximum'))
#step_4 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.feature_analysis.statistical_minimum'))
step_4.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference=attributes)
step_4.add_output('produce')
pipeline_description.add_step(step_4)

# Step 5: supervised outlier detection algorithm
step_5 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.detection_algorithm.sod_primitive'))
step_5.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.4.produce')
step_5.add_argument(name='outputs', argument_type=ArgumentType.CONTAINER, data_reference=targets)
step_5.add_output('produce')
pipeline_description.add_step(step_5)

# step_5 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.detection_algorithm.pyod_knn'))
# step_5.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.4.produce')
# step_5.add_output('produce')
# pipeline_description.add_step(step_5)

# Step 6: Predictions
step_6 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.data_processing.construct_predictions'))
step_6.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.5.produce')
step_6.add_argument(name='reference', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce')
step_6.add_output('produce')
pipeline_description.add_step(step_6)

# Final Output
pipeline_description.add_output(name='output predictions', data_reference='steps.6.produce')

# Output to json
data = pipeline_description.to_json()
with open('./examples/axolotl_interface/example_pipelines/sod_pipeline.json', 'w') as f:
    f.write(data)
print(data)
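
A quick round-trip check (sketch, not part of this PR's diff): the exported JSON can be parsed back into a Pipeline object, assuming d3m's Pipeline.from_json and the output path used above.

# Sketch: reload the exported pipeline description to verify it parses again.
from d3m.metadata.pipeline import Pipeline

with open('./examples/axolotl_interface/example_pipelines/sod_pipeline.json', 'r') as f:
    reloaded = Pipeline.from_json(f.read())
print(reloaded.id)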

86 changes: 86 additions & 0 deletions examples/sk_examples/AEAutokeras.py
@@ -0,0 +1,86 @@
from autokeras.engine.block import Block
import autokeras as ak
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.python.util import nest
from numpy import asarray
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
# from sklearn.model_selection import train_test_split
from autokeras import StructuredDataClassifier
from autokeras import StructuredDataRegressor
from tods.sk_interface.detection_algorithm.AutoEncoder_skinterface import AutoEncoderSKI

# TODO: how should the Yahoo dataset be split?
# TODO: are y_true and y_pred assigned correctly?
# TODO: show the notebook error
# TODO: how to get Autokeras reports?

#dataset
dataset = pd.read_csv("./yahoo_sub_5.csv")
data = dataset.to_numpy()
labels = dataset.iloc[:,6]
value1 = dataset.iloc[:,2] # delete later
print(labels)
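
# Sketch (not part of the original script): one possible answer to the split question above.
# A chronological split avoids leaking future points into training for time series; the
# 0.7 ratio is an arbitrary assumption.
# split = int(0.7 * len(data))
# X_train, X_test = data[:split], data[split:]
# y_train, y_test = labels[:split], labels[split:]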

#tods primitive
transformer = AutoEncoderSKI()
transformer.fit(data)
tods_output = transformer.predict(data)
prediction_score = transformer.predict_score(data)
print('result from AE primitive: \n', tods_output) #sk report
print('score from AE: \n', prediction_score)

#sk report
y_true = labels
y_pred = tods_output

print('Accuracy Score: ', accuracy_score(y_true, y_pred))

print('confusion matrix: \n', confusion_matrix(y_true, y_pred))

print(classification_report(y_true, y_pred))

precision, recall, thresholds = precision_recall_curve(y_true, y_pred)
f1_scores = 2*recall*precision/(recall+precision)

print('Best threshold: ', thresholds[np.argmax(f1_scores)])
print('Best F1-Score: ', np.max(f1_scores))
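
# Note (suggestion, not part of the original run): precision_recall_curve is usually more
# informative when given the continuous anomaly scores instead of the binary labels, e.g.
#   precision, recall, thresholds = precision_recall_curve(y_true, np.ravel(prediction_score))
# The selected threshold can then be applied to prediction_score directly.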


#classifier
print('Classifier Starts here:')
search = StructuredDataClassifier(max_trials=15)
# perform the search
search.fit(x=data, y=labels, verbose=0)  # y = data label column
# evaluate the model
loss, acc = search.evaluate(data, labels, verbose=0)
print('Accuracy: %.3f' % acc)
# use the model to make a prediction
# row = [0.0200,0.0371,0.0428,0.0207,0.0954,0.0986,0.1539,0.1601,0.3109,0.2111,0.1609,0.1582,0.2238,0.0645,0.0660,0.2273,0.3100,0.2999,0.5078,0.4797,0.5783,0.5071,0.4328,0.5550,0.6711,0.6415,0.7104,0.8080,0.6791,0.3857,0.1307,0.2604,0.5121,0.7547,0.8537,0.8507,0.6692,0.6097,0.4943,0.2744,0.0510,0.2834,0.2825,0.4256,0.2641,0.1386,0.1051,0.1343,0.0383,0.0324,0.0232,0.0027,0.0065,0.0159,0.0072,0.0167,0.0180,0.0084,0.0090,0.0032]
# X_new = asarray([row]).astype('float32')
yhat = search.predict(data)
print('Predicted: %.3f' % yhat[0])
# get the best performing model
model = search.export_model()
# summarize the loaded model
model.summary()

#regressor
print('Regressor Starts here:')
search = StructuredDataRegressor(max_trials=15, loss='mean_absolute_error')
# perform the search
search.fit(x=data, y=labels, verbose=0) # y = data label
mae, _ = search.evaluate(data, labels, verbose=0)
print('MAE: %.3f' % mae)
# use the model to make a prediction
# X_new = asarray([[108]]).astype('float32')
yhat = search.predict(data)
print('Predicted: %.3f' % yhat[0])
# get the best performing model
model = search.export_model()
# summarize the loaded model
model.summary()
2 changes: 1 addition & 1 deletion examples/sk_examples/DeepLog_test.py
@@ -22,7 +22,7 @@
prediction_labels = transformer.predict(X_test)
prediction_score = transformer.predict_score(X_test)

print("Primitive: ", transformer.primitive)
# print("Primitive: ", transformer.primitive)
print("Prediction Labels\n", prediction_labels)
print("Prediction Score\n", prediction_score)

108 changes: 108 additions & 0 deletions examples/sk_examples/SubSeqAutokeras.py
@@ -0,0 +1,108 @@
import autokeras as ak
from autokeras.engine.block import Block
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.python.util import nest
from tods.sk_interface.timeseries_processing.SubsequenceSegmentation_skinterface import SubsequenceSegmentationSKI
# load dataset
dataset = pd.read_csv("./yahoo_sub_5.csv")
data = dataset.to_numpy()
labels = dataset.iloc[:,6]
print(labels)
transformer = SubsequenceSegmentationSKI()
tods_output = transformer.produce(data)
print('result from SubsequenceSegmentation primitive:', tods_output)
print(tods_output.shape)

#autoregression

class MLPInteraction(Block):
    """MLP block that can be configured with different numbers of layers, units, and
    other settings.

    # Attributes:
        units (int): The number of units in each layer of the MLP block.
        num_layers (int): The number of layers in the MLP block.
        use_batchnorm (bool): Whether to use batch normalization.
        dropout_rate (float): The dropout rate of the last MLP layer.
    """

    def __init__(self,
                 units=None,
                 num_layers=None,
                 use_batchnorm=None,
                 dropout_rate=None,
                 **kwargs):
        super().__init__(**kwargs)
        self.units = units
        self.num_layers = num_layers
        self.use_batchnorm = use_batchnorm
        self.dropout_rate = dropout_rate

    def get_state(self):
        state = super().get_state()
        state.update({
            'units': self.units,
            'num_layers': self.num_layers,
            'use_batchnorm': self.use_batchnorm,
            'dropout_rate': self.dropout_rate})
        return state

    def set_state(self, state):
        super().set_state(state)
        self.units = state['units']
        self.num_layers = state['num_layers']
        self.use_batchnorm = state['use_batchnorm']
        self.dropout_rate = state['dropout_rate']

    def build(self, hp, inputs=None):
        input_node = [tf.keras.layers.Flatten()(node) if len(node.shape) > 2 else node
                      for node in nest.flatten(inputs)]
        output_node = tf.concat(input_node, axis=1)
        num_layers = self.num_layers or hp.Choice('num_layers', [1, 2, 3], default=2)
        use_batchnorm = self.use_batchnorm
        if use_batchnorm is None:
            use_batchnorm = hp.Choice('use_batchnorm', [True, False], default=False)
        dropout_rate = self.dropout_rate or hp.Choice('dropout_rate',
                                                      [0.0, 0.25, 0.5],
                                                      default=0.0)

        for i in range(num_layers):
            units = self.units or hp.Choice(
                'units_{i}'.format(i=i),
                [16, 32, 64, 128, 256, 512, 1024],
                default=32)
            output_node = tf.keras.layers.Dense(units)(output_node)
            if use_batchnorm:
                output_node = tf.keras.layers.BatchNormalization()(output_node)
            output_node = tf.keras.layers.ReLU()(output_node)
            output_node = tf.keras.layers.Dropout(dropout_rate)(output_node)
        return output_node
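
# Note (assumption): the ak.Input shape below has to match the second dimension of
# tods_output.shape printed above; if the segmentation settings or the dataset change,
# update shape=[7,] accordingly.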
inputs = ak.Input(shape=[7,]) #important!!! depends on shape above
print(inputs.shape)
print(inputs.dtype)
mlp_input = MLPInteraction()([inputs])
mlp_output = MLPInteraction()([mlp_input])

# Step 2.3: Setup optimizer to handle the target task
output = ak.RegressionHead()(mlp_output)

# Step 3: Build the searcher, which provides search algorithm
auto_model = ak.AutoModel(inputs=inputs,  # produce
                          outputs=output,  # final MLP output
                          objective='val_mean_squared_error',
                          max_trials=5)
# Step 4: Use the searcher to search the recommender
auto_model.fit(x=[tods_output],
               y=tods_output,  # TODO: build a label column from the Yahoo dataset (first element of the next segment)
               batch_size=32,
               epochs=5)

accuracy = auto_model.evaluate(x=[tods_output],
                               y=labels)

print(accuracy)
# logger.info('Validation Accuracy (mse): {}'.format(auto_model.evaluate(x=[val_X_categorical],
# y=val_y)))
# # Step 5: Evaluate the searched model
# logger.info('Test Accuracy (mse): {}'.format(auto_model.evaluate(x=[tods_output],
# y=labels)))
5 changes: 3 additions & 2 deletions examples/sk_examples/Telemanom_test.py
@@ -1,5 +1,6 @@
import numpy as np
from tods.tods_skinterface.primitiveSKI.detection_algorithm.Telemanom_skinterface import TelemanomSKI
from tods.sk_interface.detection_algorithm.Telemanom_skinterface import TelemanomSKI
# from tods.tods_skinterface.primitiveSKI.detection_algorithm.Telemanom_skinterface import TelemanomSKI
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
@@ -25,7 +26,7 @@
prediction_labels = transformer.predict(X_test)
prediction_score = transformer.predict_score(X_test)

print("Primitive: ", transformer.primitive)
# print("Primitive: ", transformer.primitive)
print("Prediction Labels\n", prediction_labels)
print("Prediction Score\n", prediction_score)
y_true = prediction_labels_train
67 changes: 67 additions & 0 deletions examples/sk_examples/Telemanom_yahoo_test.py
@@ -0,0 +1,67 @@
import numpy as np
from tods.sk_interface.detection_algorithm.Telemanom_skinterface import TelemanomSKI
# from tods.tods_skinterface.primitiveSKI.detection_algorithm.Telemanom_skinterface import TelemanomSKI
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
from sklearn import metrics
import pandas as pd
#prepare the data

data = pd.read_csv("./yahoo_sub_5.csv").to_numpy()
# print("shape:", data.shape)
# print("datatype of data:",data.dtype)
# print("First 5 rows:\n", data[:5])

# X_train = np.expand_dims(data[:10000], axis=1)
# X_test = np.expand_dims(data[10000:], axis=1)

# print("First 5 rows train:\n", X_train[:5])
# print("First 5 rows test:\n", X_test[:5])

transformer = TelemanomSKI(l_s=2, n_predictions=1)
transformer.fit(data)
# prediction_labels_train = transformer.predict(X_train)
prediction_labels = transformer.predict(data)
prediction_score = transformer.predict_score(data)

# print("Primitive: ", transformer.primitive)
print("Prediction Labels\n", prediction_labels)
print("Prediction Score\n", prediction_score)

df1 = pd.DataFrame(prediction_labels)
df2 = pd.DataFrame(prediction_score)

# df1.to_csv(r'./labels.csv', index = False)
df2.to_csv(r'./scores.csv', index = False)
# result = pd.merge(df1, df2[[]])
# result = [prediction_labels, prediction_score]
# # result = pd.DataFrame({'label': prediction_labels, 'score': prediction_score}, columns=['label', 'score'], index=[0])
# print(result)
# pd.DataFrame(result).to_csv("./teleSKI.csv")
# y_true = prediction_labels_train
# y_pred = prediction_labels

# print('Accuracy Score: ', accuracy_score(y_true, y_pred))

# confusion_matrix(y_true, y_pred)

# print(classification_report(y_true, y_pred))

# precision, recall, thresholds = precision_recall_curve(y_true, y_pred)
# f1_scores = 2*recall*precision/(recall+precision)

# print('Best threshold: ', thresholds[np.argmax(f1_scores)])
# print('Best F1-Score: ', np.max(f1_scores))

# fpr, tpr, threshold = metrics.roc_curve(y_true, y_pred)
# roc_auc = metrics.auc(fpr, tpr)

# plt.title('ROC')
# plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
# plt.legend(loc = 'lower right')
# plt.ylabel('True Positive Rate')
# plt.xlabel('False Positive Rate')
# plt.show()
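
# Sketch (simpler alternative to the merge attempts above; assumes both arrays are 1-D
# after np.ravel):
# result = pd.DataFrame({'label': np.ravel(prediction_labels),
#                        'score': np.ravel(prediction_score)})
# result.to_csv('./teleSKI.csv', index=False)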