In [42]:
"""
The features created will be based on the research on most useful features 
extractable from time series named catch22: https://arxiv.org/pdf/1901.10200.pdf
"""

"""
We extract the time series by selecting the data as an array having the length reflect a specific time frame,
In pyspark, we perform that by collect through a window 
"""
columns = ["machineID","datetime","volt","rotate","pressure","vibration"]
columns_array = ["volt","rotate","pressure","vibration"]
def create_array_features(df, features, sequence_length = 48):
    windowval = (Window.partitionBy("machineID").orderBy("datetime")
                     .rowsBetween(Window.currentRow - sequence_length + 1, Window.currentRow))
    for i in features:
        column = f"{i}_array"
        df = df.withColumn(column, F.collect_list(i).over(windowval))
        
    for i in df.columns:
        if "array" in i:
            df = df.filter(size(col(i)) == 48)
            
    return df

In [43]:
import pycatch22 as catch22
from sktime.transformations.panel import catch22
transformer = catch22.Catch22()

In [44]:
catch22_features = [
        'DN_HistogramMode_5',
        'DN_HistogramMode_10',
        'CO_f1ecac',
        'CO_FirstMin_ac',
        'CO_HistogramAMI_even_2_5',
        'CO_trev_1_num',
        'MD_hrv_classic_pnn40',
        'SB_BinaryStats_mean_longstretch1',
        'SB_TransitionMatrix_3ac_sumdiagcov',
        'PD_PeriodicityWang_th0_01',
        'CO_Embed2_Dist_tau_d_expfit_meandiff',
        'IN_AutoMutualInfoStats_40_gaussian_fmmi',
        'FC_LocalSimple_mean1_tauresrat',
        'DN_OutlierInclude_p_001_mdrmd',
        'DN_OutlierInclude_n_001_mdrmd',
        'SP_Summaries_welch_rect_area_5_1',
        'SB_BinaryStats_diff_longstretch0',
        'SB_MotifThree_quantile_hh',
        'SC_FluctAnal_2_rsrangefit_50_1_logi_prop_r1',
        'SC_FluctAnal_2_dfa_50_1_2_logi_prop_r1',
        'SP_Summaries_welch_rect_centroid',
        'FC_LocalSimple_mean3_stderr'
    ]

In [45]:
"""
This is a helper function that make sure the output having the appropriate type
consistency 
"""
def cast_float_function(function_name,input_data):
    function = getattr(catch22,function_name)
    return float(function(input_data))

In [46]:
def catch22_array(array):
    series = pd.Series(array)
    return np.squeeze(transformer.fit_transform(series).values).tolist()

In [47]:
"""
This function used to generate all required features from catch22
"""
def catch_22_features(df,columns = None, catch22_features = catch22_features):
    features = []
    if columns:
        features = columns 
    else:
        for i in df.columns:
            if "array" in i:
                features.append(i)
    for feature in features:
        for i in range(1,len(catch22_features)+1):
            name = feature.split("_")[0]
            name = f"{name}_catch22_{i}"
            udf_func = F.udf(lambda s: cast_float_function(catch22_features[i-1],s), FloatType())
            df = df.withColumn(name,udf_func(feature))
    return df

In [48]:
telemetry = telemetry.transform(create_array_features,columns_array).transform(catch_22_features)

In [49]:
# encode model type
def get_model(model):
    dict_model = {"model1" : 1.0,
                  "model2" : 2.0,
                  "model3" : 3.0,
                  "model4" : 4.0
                 }
    return dict_model[model]
get_model_encode_udf = F.udf(lambda s: get_model(s), FloatType())
telemetry = telemetry.withColumn("model_type",get_model_encode_udf("model"))

In [50]:
non_use_columns = ["machineID","comp","failure","errorID"]
for i in telemetry.columns:
    if "_date" in i or "array" in i :
        non_use_columns.append(i)
non_use_columns.append("model")
non_use_columns.append("datetime")
non_use_columns.extend(col_errors[2:])


# Labelling

In [51]:
label = maint.withColumn("encoded_label", F.lit(1)).select(["datetime","machineID","encoded_label"])
label.show()

+-------------------+---------+-------------+
|           datetime|machineID|encoded_label|
+-------------------+---------+-------------+
|2014-06-01 06:00:00|        1|            1|
|2014-07-16 06:00:00|        1|            1|
|2014-07-31 06:00:00|        1|            1|
|2014-12-13 06:00:00|        1|            1|
|2015-01-05 06:00:00|        1|            1|
|2015-01-05 06:00:00|        1|            1|
|2015-01-20 06:00:00|        1|            1|
|2015-01-20 06:00:00|        1|            1|
|2015-02-04 06:00:00|        1|            1|
|2015-02-04 06:00:00|        1|            1|
|2015-02-19 06:00:00|        1|            1|
|2015-03-06 06:00:00|        1|            1|
|2015-03-21 06:00:00|        1|            1|
|2015-04-05 06:00:00|        1|            1|
|2015-04-20 06:00:00|        1|            1|
|2015-05-05 06:00:00|        1|            1|
|2015-05-05 06:00:00|        1|            1|
|2015-05-20 06:00:00|        1|            1|
|2015-06-04 06:00:00|        1|   

In [52]:
telemetry = telemetry.join(label, on = ["datetime","machineID"],how = "left")

In [53]:
telemetry = telemetry.na.fill({"encoded_label" : 0})
windowval = (Window.partitionBy("machineID").orderBy("datetime")
                     .rowsBetween(Window.currentRow , Window.currentRow + 47))
telemetry = telemetry.withColumn("target", max("encoded_label").over(windowval))

In [55]:
saved_column = []
for i in telemetry.columns:
    if "array" in i:
        pass
    else:
        saved_column.append(i)
telemetry.select(saved_column).write.option("header",True).csv("final_data.csv")

[Stage 396:>                                                        (0 + 1) / 1]

22/12/16 19:38:53 WARN DAGScheduler: Broadcasting large task binary with size 1087.8 KiB




22/12/16 19:39:02 WARN DAGScheduler: Broadcasting large task binary with size 1279.7 KiB


[Stage 435:>                                                        (0 + 1) / 2]

22/12/16 19:39:11 ERROR Executor: Exception in task 0.0 in stage 435.0 (TID 925)
org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/var/folders/bv/1fvy47dn1y13_qd3n3_dt7tc0000gn/T/ipykernel_18039/3846315816.py", line 16, in <lambda>
  File "/var/folders/bv/1fvy47dn1y13_qd3n3_dt7tc0000gn/T/ipykernel_18039/3332093815.py", line 6, in cast_float_function
AttributeError: module 'sktime.transformations.panel.catch22' has no attribute 'DN_HistogramMode_5'

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:559)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$2.read(PythonUDFRunner.scala:86)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$2.read(PythonUDFRunner.scala:68)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:512)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iter

Py4JJavaError: An error occurred while calling o3308.csv.
: org.apache.spark.SparkException: Job aborted.
	at org.apache.spark.sql.errors.QueryExecutionErrors$.jobAbortedError(QueryExecutionErrors.scala:651)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:278)
	at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:186)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:113)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:111)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.executeCollect(commands.scala:125)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:109)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:169)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:95)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:779)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:94)
	at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:584)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:176)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:584)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:30)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:560)
	at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:94)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:81)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:79)
	at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:116)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:860)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:390)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:363)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:239)
	at org.apache.spark.sql.DataFrameWriter.csv(DataFrameWriter.scala:851)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:750)
Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 435.0 failed 1 times, most recent failure: Lost task 0.0 in stage 435.0 (TID 925) (jeans-mbp.stusgs.rmit.edu.vn executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/var/folders/bv/1fvy47dn1y13_qd3n3_dt7tc0000gn/T/ipykernel_18039/3846315816.py", line 16, in <lambda>
  File "/var/folders/bv/1fvy47dn1y13_qd3n3_dt7tc0000gn/T/ipykernel_18039/3332093815.py", line 6, in cast_float_function
AttributeError: module 'sktime.transformations.panel.catch22' has no attribute 'DN_HistogramMode_5'

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:559)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$2.read(PythonUDFRunner.scala:86)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$2.read(PythonUDFRunner.scala:68)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:512)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:491)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:491)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:140)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
	at org.apache.spark.scheduler.Task.run(Task.scala:136)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:548)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1504)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:551)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:750)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2672)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2608)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2607)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2607)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1182)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1182)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1182)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2860)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2802)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2791)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/var/folders/bv/1fvy47dn1y13_qd3n3_dt7tc0000gn/T/ipykernel_18039/3846315816.py", line 16, in <lambda>
  File "/var/folders/bv/1fvy47dn1y13_qd3n3_dt7tc0000gn/T/ipykernel_18039/3332093815.py", line 6, in cast_float_function
AttributeError: module 'sktime.transformations.panel.catch22' has no attribute 'DN_HistogramMode_5'

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:559)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$2.read(PythonUDFRunner.scala:86)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$2.read(PythonUDFRunner.scala:68)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:512)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:491)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:491)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:140)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
	at org.apache.spark.scheduler.Task.run(Task.scala:136)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:548)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1504)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:551)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:750)


In [None]:
non_use_columns.append("encoded_label")
non_use_columns.append("target")
non_use_columns.append("model_type")

In [None]:
featureCols = list(set(telemetry.columns) - set(non_use_columns))

col_used = featureCols.copy()
col_used.append("target")

## Generate synthetic data using SMOTE: 

In [None]:
"""
Since there are no library that help perform the task properly in pyspark, I switch the dataset back to pandas Dataframe 
and use the original smote implementation from imbalance learn package: https://imbalanced-learn.org/stable/
"""

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
path_to_csv = "./final_data.csv/"
files = []
for file in os.listdir(path_to_csv):
    if file.split(".")[-1] == "csv":
        files.append(file)

In [None]:
df = pd.DataFrame()
for i in files:
    if i.split(".")[-1] == "csv":
        df1 = pd.read_csv(f"{path_to_csv}{i}")
        df = pd.concat([df,df1])

In [None]:
df.head()

In [None]:
len(df)

In [None]:
train = df[df.datetime < "2015-10-10"]
test = df[df.datetime > "2015-10-10"]
features_train = train[featureCols]
target_train = train["target"]
test = test.dropna()
features_test = test[featureCols]
target_test = test["target"]

In [None]:
features_test.isna().sum()

In [None]:
features_test[features_test.period_failure4.isna()]

In [None]:
sm = SMOTE(random_state=711)
# X_res, y_res = sm.fit_resample(features_train, target_train)

In [None]:
# data_train.to_csv("./Synthetic_data/up_sample_data.csv")

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from xgboost import XGBRFClassifier 

In [None]:
# random forrest 
clf = RandomForestClassifier()
clf.fit(X_res,y_res) 

In [None]:
# evaluate 
rf_pred = clf.predict(features_test)
print(classification_report(target_test,rf_pred))

In [None]:
#XG_boost 
xg_clf = XGBRFClassifier(objective="binary:logistic")
xg_clf.fit(features_train,target_train)

In [None]:
# evaluate 
xg_pred = xg_clf.predict(features_test)
print(classification_report(target_test,xg_pred))

In [None]:
# save model 


In [None]:
"""
To do: 
- Convert df to the right type 
- Convert function to transformer extensions class - \
(optional) using UDF to add to pipeline
- Build Discrete features (Done)
- Build features of time since last event (last error, last failures, ...etc) (Done) 
- Build features for catch22 (damn this is going to be so hard :)) (I figured it out hahaha)
- label (it will be fun)
- train model on MLlib (it will be easy)
- Config scheduler (Airflow)
- Apply for a job (:))) I hope I will get one huhu)
"""

