In [2]:
import re, os
import pandas as pd
import datetime
import numpy as np
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score
from matplotlib import pyplot as plt
# import plotData # helper function in starter code package


from sktime.transformations.panel.padder import PaddingTransformer
from sktime.classification.compose import ClassifierPipeline, ComposableTimeSeriesForestClassifier
from sktime.transformations.panel.summarize import RandomIntervalFeatureExtractor
from sklearn.tree import DecisionTreeClassifier
# only classifier in sktime that can process unequal length data
# https://github.com/sktime/sktime/issues/3649#issuecomment-1292459843
# from sktime.alignment.dtw_python import AlignerDTW   ## NOTE THAT THIS SOMEHOW AFFECT ALL PRINT OUTPUT. NOTHING WILL BE SHOWN FOR PRINT STATEMENT AFTER YOU RUN THIS
from sktime.classification.feature_based import RandomIntervalClassifier
from sktime.classification.distance_based import KNeighborsTimeSeriesClassifier
from sktime.classification.dictionary_based import IndividualBOSS, ContractableBOSS
from sktime.classification.kernel_based import RocketClassifier
from sktime.classification.hybrid import HIVECOTEV1, HIVECOTEV2
from sktime.classification.shapelet_based import ShapeletTransformClassifier
from sktime.classification.sklearn import RotationForest

from sktime.dists_kernels.compose_from_align import DistFromAligner
from sktime.utils.slope_and_trend import _slope
from sklearn.pipeline import Pipeline
# https://www.sktime.org/en/stable/api_reference/auto_generated/sktime.transformations.panel.catch22.Catch22.html
from sktime.transformations.panel.catch22 import Catch22

from sktime.classification.interval_based import CanonicalIntervalForest,DrCIF,RandomIntervalSpectralEnsemble,SupervisedTimeSeriesForest,TimeSeriesForestClassifier

# identify classifiers that support unequal length
from sktime.registry import all_estimators




# EDA

In [29]:
def count_resp_recordings(filenames):
  """Count respiration `dat.csv` files per subject, task and stream.

  Parameters
  ----------
  filenames : iterable of str
      Bare file names (no directory part).

  Returns
  -------
  dict
      subject id -> dict of the four '<task>_<stream>' counters.

  A file is considered only if its name starts with ``sub-cp<digits>`` and
  contains ``task-<name>_``. Files whose task is neither ``ils`` nor ``rest``
  are printed and not counted.
  """
  status = {}
  for file in filenames:
    subject_match = re.search(r"^sub-(cp\d+)", file)
    task_match = re.search(r"task-(\w+)_", file)
    if subject_match is None or task_match is None:
      continue
    subject = subject_match.group(1)
    rest_ils = task_match.group(1)
    # every recognised subject gets a row, even if it has no respiration files
    counts = status.setdefault(
      subject,
      {'ils_lslrespitrace':0, 'rest_lslrespitrace':0, 'ils_lslshimmerresp':0, 'rest_lslshimmerresp':0},
    )
    if 'dat.csv' not in file:
      continue
    for stream in ('lslrespitrace', 'lslshimmerresp'):
      if stream in file:
        key = rest_ils + '_' + stream
        if key in counts:
          counts[key] += 1
        else:
          # unexpected task name: report the file instead of failing silently
          print(file)
  return status


resp_test_status = count_resp_recordings(
  file
  for root, dirs, files in os.walk("../dataPackage/")
  for file in files
)

In [30]:
# transpose so that each subject is a row and each task/stream counter a column
pd.DataFrame(resp_test_status).T

Unnamed: 0,ils_lslrespitrace,rest_lslrespitrace,ils_lslshimmerresp,rest_lslshimmerresp
cp003,0,0,12,2
cp004,0,0,12,2
cp005,0,0,12,2
cp006,0,0,12,2
cp008,0,0,12,2
cp009,0,0,12,1
cp011,0,0,12,2
cp012,0,0,12,2
cp013,0,0,12,2
cp014,12,2,12,2


lslrespitrace was not recorded for all subjects (most subjects in the table above have no lslrespitrace files). Hence, we will only use lslshimmerresp for the respiratory analysis.

In [31]:
def parse_resp_filename(file):
  """Extract subject, task, level and run from a recording file name.

  Returns a dict with the keys 'subject', 'rest/ils', 'level' and 'run'.
  Raises ValueError if any of the four fields cannot be found.
  """
  patterns = {
    "subject": r"^sub-(cp\d+)",
    "rest/ils": r"task-(\w+)_",
    "level": r"level-(\d\d\w)",
    "run": r"run-(\d{3})",
  }
  fields = {}
  for key, pattern in patterns.items():
    match = re.search(pattern, file)
    if match is None:
      raise ValueError(f"cannot parse {key!r} from {file!r}")
    fields[key] = match.group(1)
  return fields


resp_test_len = {'subject':[],'rest/ils':[],'level':[],'run':[],'len':[],}
for root, dirs, files in os.walk("..\\dataPackage\\"):
  for file in files:
    if 'lslshimmerresp' in file and 'dat.csv' in file:
      try:
        row = parse_resp_filename(file)
        row["len"] = len(pd.read_csv(os.path.join(root, file)))
      except (ValueError, OSError):
        # unparsable name or unreadable/empty CSV (pandas' EmptyDataError and
        # ParserError are ValueError subclasses): report the file and skip it
        print(file)
        continue
      # append only once every field is known, so all lists stay the same
      # length and pd.DataFrame(resp_test_len) cannot fail on a partial row
      for key, value in row.items():
        resp_test_len[key].append(value)

sub-cp004_ses-20210330_task-ils_stream-lslshimmerresp_feat-chunk_level-01B_run-007_dat.csv


In [None]:
# one row per lslshimmerresp recording: subject, task, level, run and row count
df_resp_test_len = pd.DataFrame(resp_test_len)
df_resp_test_len

In [None]:
# temporarily raise the row limit so the grouped table is not truncated
with pd.option_context("display.max_rows", 1000):
  # count() gives the number of recordings for every (subject, level) pair
  display(df_resp_test_len.groupby(["subject", "level"]).count())

In [None]:
# placeholder rows with len 0 for two recordings (cp030 level 03B run 005 and
# cp031 level 01B run 012) -- presumably runs with no file, added so the
# per-run bar charts below get one bar per subject; TODO confirm
missing_addon = {'subject':['cp030', 'cp031'],'rest/ils':['ils', 'ils'],'level':['03B', '01B'],'run':['005', '012'],'len':[0,0]}
df_plt = pd.concat([df_resp_test_len, pd.DataFrame(missing_addon)], axis = 0)
df_plt

In [None]:
def plt_compare_datum_len(lvl, df, width=0.2):
    """Plot, for one difficulty level, the recording length per subject and run.

    Parameters
    ----------
    lvl : str
        Value of the "level" column to plot (e.g. "01B").
    df : pandas.DataFrame
        Needs the columns "subject", "level", "run" and "len".
    width : float, default=0.2
        Width of a single bar; bars of the same subject are placed side by side.

    Raises
    ------
    ValueError
        If ``df`` has no rows for ``lvl``.

    Works for any number of runs. A subject that lacks a run gets a bar of
    height 0 for it, so the runs no longer need to cover identical subjects.
    """
    df_level = df[df["level"] == lvl]
    if df_level.empty:
        raise ValueError(f"no recordings found for level {lvl}")

    # rows = subjects, columns = runs; "first" keeps one value if a
    # (subject, run) pair appears more than once
    lengths = df_level.pivot_table(
        index="subject", columns="run", values="len", aggfunc="first", fill_value=0
    )

    x = np.arange(len(lengths.index))
    n_runs = len(lengths.columns)
    for i, run in enumerate(lengths.columns):
        # centre the group of bars on the subject's tick
        # (offsets are -0.2, 0, +0.2 for three runs at the default width)
        offset = (i - (n_runs - 1) / 2) * width
        plt.bar(x + offset, lengths[run].to_numpy(), width, label=run)

    plt.xticks(x, lengths.index, rotation=90)
    plt.xlabel("subject")
    plt.ylabel("length")
    plt.legend(title="run")
    plt.title(f"number of datapoints collected from each subject for each run in level {lvl}")
    plt.show()

In [None]:
# sanity check for level 03B: number of recordings in each of the first three
# runs, next to the number of distinct length values in the first run
df_temp = df_plt[df_plt["level"]=="03B"]

x = np.arange(len(df_temp["len"][df_temp["run"]==df_temp["run"].unique()[0]].unique()))
y1 = df_temp["len"][df_temp["run"]==df_temp["run"].unique()[0]]
y2 = df_temp["len"][df_temp["run"]==df_temp["run"].unique()[1]]
y3 = df_temp["len"][df_temp["run"]==df_temp["run"].unique()[2]]
print(len(y1), len(y2), len(y3), len(df_temp["len"][df_temp["run"]==df_temp["run"].unique()[0]].unique()))

In [None]:
# one grouped bar chart per ILS difficulty level
for lvl in ["01B", "02B", "03B", "04B"]:
  plt_compare_datum_len(lvl, df_plt)


In [None]:
# recording length of the first rest run (level 000, run 001) per subject
df_temp = df_plt[(df_plt["level"]=="000")&(df_plt["run"]=="001")]
x = np.arange(len(df_temp["subject"].unique()))
plt.bar(x, df_temp["len"])
plt.xlabel("subject")
plt.ylabel("length")
plt.title("number of datapoints collected from each subject for first run in level 000")
# the original referenced plt.show without calling it
plt.show()

We note that:
- for CP009, the first run of each level (including the 000 rest level) has only 1 value
- for CP030, there are only 2 rounds of level 3B
- for CP031, there are 4 rounds of level 2B, but only 2 rounds of level 1B. All other subjects have 3 rounds of each level

In addition, while the length of each run is expected to differ because the time needed to land the aircraft varies, subject 0 (cp003) and subject 6 (cp009) need to be looked into to see why they are so different.

In [None]:
# cp003: inspect the spacing between consecutive samples
# (needs the `import plotData` line in the import cell to be enabled)
cp003_run001_dfresp = plotData.loadTimeSeries(r"..\dataPackage",  # raw string: "\d" is not a valid escape
                                              "sub-cp003",
                                              "ses-20210206",
                                              "task-ils",
                                              "lslshimmerresp",
                                              "level-01B_run-001")
# time_dn is a datenum in days; 719529 is the datenum of 1970-01-01, so the
# difference is "days since the Unix epoch"
cp003_run001_dfresp['time_dn'] = pd.to_datetime(cp003_run001_dfresp['time_dn']-719529, unit='D')

print(cp003_run001_dfresp.loc[1,"time_dn"]- cp003_run001_dfresp.loc[0,"time_dn"])
print(cp003_run001_dfresp.loc[2,"time_dn"]- cp003_run001_dfresp.loc[1,"time_dn"])
# 512 hz: time elapsed between sample 0 and sample 511
print(cp003_run001_dfresp.loc[511,"time_dn"]- cp003_run001_dfresp.loc[0,"time_dn"])

In [None]:
# cp004: inspect the spacing between consecutive samples
# (needs the `import plotData` line in the import cell to be enabled)
cp004_run001_dfresp = plotData.loadTimeSeries(r"..\dataPackage",  # raw string: "\d" is not a valid escape
                                              "sub-cp004",
                                              "ses-20210330",
                                              "task-ils",
                                              "lslshimmerresp",
                                              "level-01B_run-001")
# time_dn is a datenum in days; 719529 is the datenum of 1970-01-01, so the
# difference is "days since the Unix epoch"
cp004_run001_dfresp['time_dn'] = pd.to_datetime(cp004_run001_dfresp['time_dn']-719529, unit='D')

print(cp004_run001_dfresp.loc[1,"time_dn"]- cp004_run001_dfresp.loc[0,"time_dn"])
print(cp004_run001_dfresp.loc[2,"time_dn"]- cp004_run001_dfresp.loc[1,"time_dn"])
# 128 hz: time elapsed between sample 0 and sample 127
print(cp004_run001_dfresp.loc[127,"time_dn"]- cp004_run001_dfresp.loc[0,"time_dn"])

In [None]:
# cp009: inspect the spacing between consecutive samples
# (needs the `import plotData` line in the import cell to be enabled)
# NOTE THAT WE CHANGE TO 2ND RUN (run-007) for this subject
cp009_run007_dfresp = plotData.loadTimeSeries(r"..\dataPackage",  # raw string: "\d" is not a valid escape
                                              "sub-cp009",
                                              "ses-20210129",
                                              "task-ils",
                                              "lslshimmerresp",
                                              "level-01B_run-007")
# time_dn is a datenum in days; 719529 is the datenum of 1970-01-01, so the
# difference is "days since the Unix epoch"
cp009_run007_dfresp['time_dn'] = pd.to_datetime(cp009_run007_dfresp['time_dn']-719529, unit='D')

print(cp009_run007_dfresp.loc[1,"time_dn"]- cp009_run007_dfresp.loc[0,"time_dn"])
print(cp009_run007_dfresp.loc[2,"time_dn"]- cp009_run007_dfresp.loc[1,"time_dn"])
# 512 hz: time elapsed between sample 0 and sample 511
print(cp009_run007_dfresp.loc[511,"time_dn"]- cp009_run007_dfresp.loc[0,"time_dn"])

In [None]:
# cp011: inspect the spacing between consecutive samples
# (needs the `import plotData` line in the import cell to be enabled)
cp011_run001_dfresp = plotData.loadTimeSeries(r"..\dataPackage",  # raw string: "\d" is not a valid escape
                                              "sub-cp011",
                                              "ses-20210408",
                                              "task-ils",
                                              "lslshimmerresp",
                                              "level-01B_run-001")
# time_dn is a datenum in days; 719529 is the datenum of 1970-01-01, so the
# difference is "days since the Unix epoch"
cp011_run001_dfresp['time_dn'] = pd.to_datetime(cp011_run001_dfresp['time_dn']-719529, unit='D')

print(cp011_run001_dfresp.loc[1,"time_dn"]- cp011_run001_dfresp.loc[0,"time_dn"])
print(cp011_run001_dfresp.loc[2,"time_dn"]- cp011_run001_dfresp.loc[1,"time_dn"])
# 128 hz: time elapsed between sample 0 and sample 127
print(cp011_run001_dfresp.loc[127,"time_dn"]- cp011_run001_dfresp.loc[0,"time_dn"])

We have to downsample CP003 and CP009 from 512 Hz to 128 Hz to match the other subjects.

## Ignore as we will be using the cleaned signal

In [None]:
# # Downsample and convert datenum to datetime
# for root, dirs, files in os.walk("..\\dataPackage\\"):
#   for file in files:
#     try:
#       if 'lslshimmerresp' in file and 'dat.csv' in file:
#         if 'cp003' in file or 'cp009' in file:
#           df_temp = pd.read_csv(os.path.join(root, file))
#           df_temp['time_dn'] = pd.to_datetime(df_temp['time_dn']-719529, unit='D')
#           # convert to 128hz
#           df_temp = df_temp.resample(str(1/128)+'S', on="time_dn").first().set_index('time_dn')
#           df_temp.to_csv(f"..\\myCleanedData\\{file}")
#         else:
#           #just convert datenum to date and then save to new folder
#           df_temp = pd.read_csv(os.path.join(root, file))
#           df_temp['time_dn'] = pd.to_datetime(df_temp['time_dn']-719529, unit='D')
#           df_temp.to_csv(f"..\\myCleanedData\\{file}", index=False)
#     except:
#       print(file)

In [None]:
# x = pd.read_csv("..\myCleanedData\sub-cp003_ses-20210206_task-rest_stream-lslshimmerresp_feat-chunk_level-000_run-001_dat.csv")
# x["respiration_trace_mV"][(np.abs(stats.zscore(x["respiration_trace_mV"])) < 3)]
# x["respiration_trace_mV"].mean()

## Get mean of resting resp

In [None]:
def mean_without_outliers(values, z_thresh=3):
  """Mean of `values` after dropping samples with |z-score| >= z_thresh.

  NaN samples are ignored both when computing the z-scores and in the mean,
  so a single missing sample no longer turns the result into NaN.
  Returns NaN when no sample survives the filter.
  """
  arr = np.asarray(values, dtype=float)
  z = np.abs(stats.zscore(arr, nan_policy="omit"))
  keep = z < z_thresh  # NaN compares False, so missing samples are dropped
  return arr[keep].mean() if keep.any() else np.nan


# Get the mean value of all subjects resting resp
df_resting_resp_before = {'subject':[], 'mean_rest_resp':[]}
for root, dirs, files in os.walk("..\\cleanedData\\"):
  for file in files:
    if "lslshimmerresp" in file and "000" in file and "_1.csv" in file:
      subject = re.search(r"000_(cp\d+)", file).group(1)
      df_temp = pd.read_csv(os.path.join(root, file))

      df_resting_resp_before['subject'].append(subject)
      # remove any outlier that is more than 3sd away from mean
      df_resting_resp_before['mean_rest_resp'].append(
        mean_without_outliers(df_temp["respiration_trace_mV"])
      )
pd.DataFrame(df_resting_resp_before)

Ask aaron, 
- is there anything wrong in the mean data?
- does it make sense to take the difference of activity - rest?

Ask XY
- cp006 missing

## Combining Data

In [None]:
# visualise scaled data for cp003 (every 100th sample of the first 01B file)
scaler = StandardScaler()
# raw string: "\c" and "\l" are not valid escape sequences
df_temp = pd.read_csv(r"..\cleanedData\lslshimmerresp_respiration_trace_mV_01B_cp003_1.csv")
plt.bar(x =np.arange(len(df_temp['respiration_trace_mV'][::100])), height = scaler.fit_transform(df_temp[['respiration_trace_mV']][::100]).flatten());

In [None]:
# combine into single df: one row per recording, holding the whole
# standardised trace as a nested Series plus its difficulty label
combined_rows = {'normalised_resp':[], 'difficulty':[]}
for root, dirs, files in os.walk("..\\cleanedData\\"):
  for file in files:
      if "lslshimmerresp" in file:
        difficulty = re.search(r"mV_(\d\d\w)", file).group(1)
        df_temp = pd.read_csv(os.path.join(root, file))
        # We perform scaler on EACH file (a fresh scaler is fitted per
        # recording) as they are independent of each other
        scaler = StandardScaler()
        resp_series = pd.Series(scaler.fit_transform(df_temp[['respiration_trace_mV']]).flatten())

        combined_rows['normalised_resp'].append(resp_series)
        combined_rows['difficulty'].append(difficulty)
# built under its own name so df_combined is only ever a DataFrame
df_combined = pd.DataFrame(combined_rows)
df_combined

In [None]:
# save to pickle rather than csv to preserve the nested series inside the dataframe
# (pickle files should only ever be loaded back from a trusted location)
df_combined.to_pickle("..\\cleanedData\\df_combined.pkl", protocol=4)

## Combining Data (fitted to rest)

In [None]:
# generate scalers for different subject: one StandardScaler per subject,
# fitted on that subject's first resting (level 000) recording
fitted_rest_scaler = {}
for root, dirs, files in os.walk("..\\cleanedData\\"):
  for file in files:
      if "mV_000" in file and "_1.csv" in file:
        # raw string so that "\d" reaches the regex engine unchanged
        subject = re.search(r"000_(cp\d{3})_", file).group(1)
        df_temp = pd.read_csv(os.path.join(root, file))
        scaler = StandardScaler()
        fitted_rest_scaler[subject] = scaler.fit(df_temp[['respiration_trace_mV']])

In [None]:
# combine into single df: every non-rest recording, scaled with the scaler
# that was fitted on the same subject's resting recording
combined_fit_rest_rows = {'normalised_resp_rest':[], 'difficulty':[]}
subjects_without_rest = set()
for root, dirs, files in os.walk("..\\cleanedData\\"):
  for file in files:
      if "lslshimmerresp" in file and "000" not in file:
        subject = re.search(r"_(cp\d{3})_", file).group(1)
        if subject not in fitted_rest_scaler:
          # no resting recording for this subject: skip instead of raising
          # KeyError and losing the whole loop
          subjects_without_rest.add(subject)
          continue
        difficulty = re.search(r"mV_(\d\d\w)", file).group(1)
        df_temp = pd.read_csv(os.path.join(root, file))
        # We perform scaler on EACH subject as they are independent of each other
        # BASED ON THEIR RESTING RESP
        resp_series = pd.Series(fitted_rest_scaler[subject].transform(df_temp[['respiration_trace_mV']]).flatten())

        combined_fit_rest_rows['normalised_resp_rest'].append(resp_series)
        combined_fit_rest_rows['difficulty'].append(difficulty)
if subjects_without_rest:
  print(f"skipped subjects without a resting recording: {sorted(subjects_without_rest)}")
df_combined_fit_rest = pd.DataFrame(combined_fit_rest_rows)
df_combined_fit_rest

In [None]:
# save to pickle rather than csv to preserve the nested series inside the dataframe
# (pickle files should only ever be loaded back from a trusted location)
df_combined_fit_rest.to_pickle("..\\cleanedData\\df_combined_fit_rest.pkl", protocol=4)

# Modelling

In [3]:
# reload the per-recording standardised traces written by the "Combining Data"
# section; only unpickle files this notebook produced itself
df_combined = pd.read_pickle("..\\cleanedData\\df_combined.pkl")

In [4]:
# Identify rows that is 0 length
# empty_row = []
# for i in range(len(df_combined)):
#   temp = df_combined.iloc[i,0]
#   if len(temp[temp==0.0]) or len(temp[temp==0]):
#     empty_row.append(i)

# df_combined.drop(empty_row, inplace=True)

In [5]:
# default train/test split with a fixed seed for reproducibility
# NOTE(review): rows are recordings, so the split is not grouped by subject
# and the same subject can end up in both train and test -- confirm intended
X_train, X_test, y_train, y_test = train_test_split(df_combined["normalised_resp"], df_combined["difficulty"], random_state=42)

In [6]:
# wrap the nested-series column in a one-column DataFrame and cast the labels
# to pandas strings -- presumably the input format the sktime estimators
# below expect; TODO confirm
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)
y_train = y_train.astype("string")
y_test = y_test.astype("string")

In [7]:
# reload the rest-scaled traces written by the "Combining Data (fitted to rest)"
# section; only unpickle files this notebook produced itself
df_combined_fit_rest = pd.read_pickle("..\\cleanedData\\df_combined_fit_rest.pkl")

In [8]:
# Identify rows whose series contains at least one value that is exactly 0
# NOTE(review): the original comment called these "0 length" rows -- confirm
# that "contains a zero" is really the intended criterion
empty_row = [
  position
  for position, series in enumerate(df_combined_fit_rest.iloc[:, 0])
  if (series == 0).any()
]

# empty_row holds positions, drop() expects labels: translate through the
# index so this stays correct when the index is not 0..n-1 (e.g. on re-run)
df_combined_fit_rest = df_combined_fit_rest.drop(index=df_combined_fit_rest.index[empty_row])

In [9]:
# same split settings as for df_combined, applied to the rest-scaled data
# NOTE(review): not grouped by subject either -- confirm intended
X_fit_rest_train, X_fit_rest_test, y_fit_rest_train, y_fit_rest_test = train_test_split(df_combined_fit_rest["normalised_resp_rest"], df_combined_fit_rest["difficulty"], random_state=42)

In [10]:
# same reshaping as for the other data set: one-column DataFrames of nested
# series and string-typed labels
X_fit_rest_train = pd.DataFrame(X_fit_rest_train)
X_fit_rest_test = pd.DataFrame(X_fit_rest_test)
y_fit_rest_train = y_fit_rest_train.astype("string")
y_fit_rest_test = y_fit_rest_test.astype("string")

In [11]:
# running score board: log_result() appends one entry to every list per model
# re-running this cell resets it
model_result = {
  "classifier":[],
  "accuracy_score":[],
  "AUC_score":[],
  "F1_score":[]
}

def get_class(class_list, prob_list):
  """Return the entry of `class_list` at the position of the largest probability.

  When several probabilities share the maximum, the first position wins.
  """
  as_list = list(prob_list)
  highest = max(prob_list)
  winner_position = as_list.index(highest)
  return class_list[winner_position]

def log_result(classifier, class_list, y_test, y_pred_proba, f1_average='micro'):
  """Score one model, append the scores to `model_result` and display the table.

  Parameters
  ----------
  classifier : str
      Name under which the model is logged.
  class_list : sequence
      Class labels in the column order of `y_pred_proba`
      (e.g. the fitted estimator's `classes_`).
  y_test : array-like
      True labels.
  y_pred_proba : array-like of shape (n_samples, n_classes)
      Predicted class probabilities.
  f1_average : str, default='micro'
      Averaging mode passed to `f1_score`. With the default 'micro',
      single-label multiclass F1 is identical to accuracy; pass 'macro' or
      'weighted' for a score that carries extra information.
  """
  # hard prediction = class with the highest probability
  y_pred = [get_class(class_list, probs) for probs in y_pred_proba]

  acc = accuracy_score(y_test, y_pred)
  # pass the label order explicitly so the probability columns are matched to
  # the right classes even when y_test does not contain every class
  auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovr', labels=list(class_list))
  f1 = f1_score(y_test, y_pred, average=f1_average)

  model_result["classifier"].append(classifier)
  model_result["accuracy_score"].append(acc)
  model_result["AUC_score"].append(auc)
  model_result["F1_score"].append(f1)

  display(pd.DataFrame(model_result))

## KNeighborsTimeSeriesClassifier

In [None]:


# search for all classifiers which can handle unequal length data. This may give some
# UserWarnings if soft dependencies are not installed.
# the returned list is shown as the cell output
all_estimators(
    filter_tags={"capability:unequal_length": True}, estimator_types="classifier"
)

In [None]:
# aligner = AlignerDTW()
# dtw_dist = DistFromAligner(aligner)
# knclassifier = KNeighborsTimeSeriesClassifier(n_neighbors=3, distance = dtw_dist, n_jobs= -1)
# knclassifier.fit(X_train, y_train)
# y_pred = knclassifier.predict(X_test)

# log_result('KNeighborsTimeSeriesClassifier',y_test, y_pred)

# NO MEMORY

## KNeighborsTimeSeriesClassifier with padding

In [None]:
# padded_KN_pipeline = ClassifierPipeline(
#     KNeighborsTimeSeriesClassifier(n_neighbors=5, distance ="dtw", n_jobs= 1, leaf_size = 2000), 
#     [PaddingTransformer()]
# )
# padded_KN_pipeline.fit(X_train, y_train)
# y_pred = padded_KN_pipeline.predict(X_test)

# log_result('KNeighborsTimeSeriesClassifier',y_test, y_pred)

# NO MEMORY

## RandomIntervalClassifier
Extracts features from randomly chosen intervals of each series and classifies them with a Rotation Forest of 200 trees.

In [20]:
# otherwise, we have to use paddingtransformer to process the unequal length
# `transformer * classifier` composes both into one estimator that is fitted
# and queried like a plain classifier
padded_clf = PaddingTransformer() * RandomIntervalClassifier(n_intervals=5, n_jobs=-1, random_state = 42)
padded_clf.fit(X_train, y_train)
y_pred_proba = padded_clf.predict_proba(X_test)

# classes_ gives the label order of the probability columns
log_result('RandomIntervalClassifier',padded_clf.classes_, y_test, y_pred_proba)

Unnamed: 0,classifier,accuracy_score,AUC_score,F1_score
0,RandomIntervalClassifier,0.252101,0.536985,0.252101
1,IndividualBoss,0.252101,0.536985,0.252101
2,ContractableBOSS,0.294118,0.571395,0.294118
3,time_series_tree,0.310924,0.578198,0.310924
4,RandomIntervalClassifier,0.420168,0.700349,0.420168


In [21]:
# otherwise, we have to use paddingtransformer to process the unequal length
# same model as in the previous cell, trained on the rest-scaled data instead
padded_clf = PaddingTransformer() * RandomIntervalClassifier(n_intervals=5, n_jobs=-1, random_state = 42)
padded_clf.fit(X_fit_rest_train, y_fit_rest_train)
y_fit_rest_pred_proba = padded_clf.predict_proba(X_fit_rest_test)

# classes_ gives the label order of the probability columns
log_result('RandomIntervalClassifier_fit_rest',padded_clf.classes_, y_fit_rest_test, y_fit_rest_pred_proba)

Unnamed: 0,classifier,accuracy_score,AUC_score,F1_score
0,RandomIntervalClassifier,0.252101,0.536985,0.252101
1,IndividualBoss,0.252101,0.536985,0.252101
2,ContractableBOSS,0.294118,0.571395,0.294118
3,time_series_tree,0.310924,0.578198,0.310924
4,RandomIntervalClassifier,0.420168,0.700349,0.420168
5,RandomIntervalClassifier_fit_rest,0.247525,0.517295,0.247525


## Decision Trees with mean, std, slope


In [18]:
# pipeline: pad the series, summarise random intervals by mean / std / slope,
# then fit a single decision tree on those features
# NOTE(review): neither the extractor nor the tree gets a random_state here,
# so repeated runs may give different scores
steps = [
    ("padding",PaddingTransformer()),
    (
        "extract",
        RandomIntervalFeatureExtractor(
            n_intervals="sqrt", features=[np.mean, np.std, _slope]
        ),
    ),
    ("clf", DecisionTreeClassifier()),
]
time_series_tree = Pipeline(steps)

In [19]:
# fit the single-tree pipeline and log its scores on the held-out set
time_series_tree.fit(X_train, y_train)
y_pred_proba = time_series_tree.predict_proba(X_test)
log_result('time_series_tree',time_series_tree.classes_, y_test, y_pred_proba)

Unnamed: 0,classifier,accuracy_score,AUC_score,F1_score
0,RandomIntervalClassifier,0.252101,0.536985,0.252101
1,IndividualBoss,0.252101,0.536985,0.252101
2,ContractableBOSS,0.294118,0.571395,0.294118
3,time_series_tree,0.310924,0.578198,0.310924


## ComposableTimeSeriesForestClassifier
https://www.sktime.org/en/v0.8.1/examples/02_classification_univariate.html

## ON PAUSE CAUSE TAKES TOO LONG

In [None]:
# https://www.sktime.org/en/v0.8.1/examples/02_classification_univariate.html
# keep a handle on the forest so its settings can be read back after it has
# been wrapped in the padding pipeline
tsf_forest = ComposableTimeSeriesForestClassifier(
    estimator=time_series_tree,
    n_estimators=100,
    # criterion="entropy",
    bootstrap=True,
    oob_score=True,
    random_state=1,
    n_jobs=-1,
)
tsf_tst = PaddingTransformer() * tsf_forest
tsf_tst.fit(X_train, y_train.astype("string"))

if tsf_forest.oob_score:
    # the original printed `tsf.oob_score_`, but `tsf` is only created in the
    # next cell; the fitted forest is the pipeline's `classifier_`
    print(tsf_tst.classifier_.oob_score_)

y_pred_proba = tsf_tst.predict_proba(X_test)

log_result('tsf_time_series_forest',tsf_tst.classes_, y_test, y_pred_proba)

In [None]:
# forest with default settings, for comparison
# NOTE(review): no PaddingTransformer here although the series have unequal
# length, and the logged name differs from the class actually used -- confirm
tsf = ComposableTimeSeriesForestClassifier()
tsf.fit(X_train, y_train)
y_pred_proba = tsf.predict_proba(X_test)
log_result('TimeSeriesForestClassifier',tsf.classes_, y_test, y_pred_proba)

## Classification using catch22

In [None]:
# pad the series, then extract Catch22 features; the transformer is fitted on
# the training split only and re-used unchanged on the test split
catch22 = PaddingTransformer() * Catch22()
X_train_catch22 = catch22.fit_transform(X_train)
X_test_catch22 = catch22.transform(X_test)

In [None]:
# cache the extracted feature tables on disk
X_train_catch22.to_csv("..\\cleanedData\\X_train_catch22.csv")
X_test_catch22.to_csv("..\\cleanedData\\X_test_catch22.csv")

## Individual Boss


https://github.com/sktime/sktime/issues/4090

Proposed a bug fix to sktime

In [None]:
%%writefile "C:\Users\kyunomi\anaconda3\envs\cogpilot\Lib\site-packages\sktime\classification\dictionary_based\_boss.py"
# -*- coding: utf-8 -*-
"""BOSS classifiers.

Dictionary based BOSS classifiers based on SFA transform. Contains a single
BOSS and a BOSS ensemble.
"""

__author__ = ["MatthewMiddlehurst", "patrickzib"]
__all__ = ["BOSSEnsemble", "IndividualBOSS", "pairwise_distances"]

import warnings
from itertools import compress

import numpy as np
from joblib import Parallel, effective_n_jobs
from sklearn.metrics import pairwise
from sklearn.utils import check_random_state, gen_even_slices
from sklearn.utils.extmath import safe_sparse_dot
from sklearn.utils.fixes import delayed
from sklearn.utils.sparsefuncs_fast import csr_row_norms
from sklearn.utils.validation import _num_samples

from sktime.classification.base import BaseClassifier
from sktime.transformations.panel.dictionary_based import SFAFast
from sktime.utils.validation.panel import check_X_y


class BOSSEnsemble(BaseClassifier):
    """Ensemble of Bag of Symbolic Fourier Approximation Symbols (BOSS).

    Implementation of BOSS Ensemble from Schäfer (2015). [1]_

    Overview: Input *n* series of length *m* and BOSS performs a grid search over
    a set of parameter values, evaluating each with a LOOCV. It then retains
    all ensemble members within 92% of the best by default for use in the ensemble.
    There are three primary parameters:
        - *alpha*: alphabet size
        - *w*: window length
        - *l*: word length.

    For any combination, a single BOSS slides a window length *w* along the
    series. The w length window is shortened to an *l* length word through
    taking a Fourier transform and keeping the first *l/2* complex coefficients.
    These *l* coefficients are then discretized into alpha possible values,
    to form a word length *l*. A histogram of words for each
    series is formed and stored.

    Fit involves finding "n" histograms.

    Predict uses 1 nearest neighbor with a bespoke BOSS distance function.

    Parameters
    ----------
    threshold : float, default=0.92
        Threshold used to determine which classifiers to retain. All classifiers
        within percentage `threshold` of the best one are retained.
    max_ensemble_size : int or None, default=500
        Maximum number of classifiers to retain. Will limit number of retained
        classifiers even if more than `max_ensemble_size` are within threshold.
    max_win_len_prop : int or float, default=1
        Maximum window length as a proportion of the series length.
    min_window : int, default=10
        Minimum window size.
    typed_dict : bool, default="deprecated"
        Use a numba TypedDict to store word counts. May increase memory usage, but will
        be faster for larger datasets. As the Dict cannot be pickled currently, there
        will be some overhead converting it to a python dict with multiple threads and
        pickling.

        .. deprecated:: 0.13.3
            ``typed_dict`` was deprecated in version 0.13.3 and will be removed in 0.15.

    save_train_predictions : bool, default=False
        Save the ensemble member train predictions in fit for use in _get_train_probs
        leave-one-out cross-validation.
    alphabet_size : default = 4
        Number of possible letters (values) for each word.
    n_jobs : int, default=1
        The number of jobs to run in parallel for both `fit` and `predict`.
        ``-1`` means using all processors.
    use_boss_distance : boolean, default=True
        The Boss-distance is an asymmetric distance measure. It provides higher
        accuracy, yet is significantly slower to compute.
    feature_selection: {"chi2", "none", "random"}, default: none
        Sets the feature selections strategy to be used. Chi2 reduces the number
        of words significantly and is thus much faster (preferred). Random also reduces
        the number significantly. None applies no feature selection and yields large
        bag of words, e.g. much memory may be needed.
    random_state : int or None, default=None
        Seed for random, integer.

    Attributes
    ----------
    n_classes_ : int
        Number of classes. Extracted from the data.
    classes_ : list
        The classes labels.
    n_instances_ : int
        Number of instances. Extracted from the data.
    n_estimators_ : int
        The final number of classifiers used. Will be <= `max_ensemble_size` if
        `max_ensemble_size` has been specified.
    series_length_ : int
        Length of all series (assumed equal).
    estimators_ : list
       List of DecisionTree classifiers.

    See Also
    --------
    IndividualBOSS, ContractableBOSS

    Notes
    -----
    For the Java version, see
    - `Original Publication <https://github.com/patrickzib/SFA>`_.
    - `TSML <https://github.com/uea-machine-learning/tsml/blob/master/src/main/java/
    tsml/classifiers/dictionary_based/BOSS.java>`_.


    References
    ----------
    .. [1] Patrick Schäfer, "The BOSS is concerned with time series classification
       in the presence of noise", Data Mining and Knowledge Discovery, 29(6): 2015
       https://link.springer.com/article/10.1007/s10618-014-0377-7

    Examples
    --------
    >>> from sktime.classification.dictionary_based import BOSSEnsemble
    >>> from sktime.datasets import load_unit_test
    >>> X_train, y_train = load_unit_test(split="train", return_X_y=True)
    >>> X_test, y_test = load_unit_test(split="test", return_X_y=True)
    >>> clf = BOSSEnsemble(max_ensemble_size=3)
    >>> clf.fit(X_train, y_train)
    BOSSEnsemble(...)
    >>> y_pred = clf.predict(X_test)
    """

    _tags = {
        "capability:train_estimate": True,
        "capability:multithreading": True,
        "classifier_type": "dictionary",
    }

    def __init__(
        self,
        threshold=0.92,
        max_ensemble_size=500,
        max_win_len_prop=1,
        min_window=10,
        typed_dict=True,
        save_train_predictions=False,
        feature_selection="none",
        use_boss_distance=True,
        alphabet_size=4,
        n_jobs=1,
        random_state=None,
    ):
        """Store the hyper-parameters and reset the fitted-state attributes."""
        self.threshold = threshold
        self.max_ensemble_size = max_ensemble_size
        self.max_win_len_prop = max_win_len_prop
        self.min_window = min_window

        # NOTE(review): the class docstring documents default="deprecated" and
        # _fit warns whenever the value is not "deprecated", so the default
        # True used here triggers that warning on every fit -- confirm intended
        self.typed_dict = typed_dict
        self.save_train_predictions = save_train_predictions
        self.n_jobs = n_jobs
        self.random_state = random_state
        self.use_boss_distance = use_boss_distance

        # fitted state, overwritten by _fit
        self.estimators_ = []
        self.n_estimators_ = 0
        self.series_length_ = 0
        self.n_instances_ = 0
        self.feature_selection = feature_selection

        # search grid used by _fit: word lengths (longest first) and whether
        # to normalise
        self._word_lengths = [16, 14, 12, 10, 8]
        self._norm_options = [True, False]
        self.alphabet_size = alphabet_size

        super(BOSSEnsemble, self).__init__()

    def _fit(self, X, y):
        """Fit a boss ensemble on cases (X,y), where y is the target variable.

        Build an ensemble of BOSS classifiers from the training set (X,
        y), through  creating a variable size ensemble of those within a
        threshold of the best.

        Parameters
        ----------
        X : 3D np.array of shape = [n_instances, n_dimensions, series_length]
            The training data.
        y : array-like, shape = [n_instances]
            The class labels.

        Returns
        -------
        self :
            Reference to self.

        Notes
        -----
        Changes state by creating a fitted model that updates attributes
        ending in "_" and sets is_fitted flag to True.
        """
        self.n_instances_, _, self.series_length_ = X.shape

        self.estimators_ = []

        # Window length parameter space dependent on series length
        max_window_searches = self.series_length_ / 4
        max_window = int(self.series_length_ * self.max_win_len_prop)
        # step between candidate window sizes, never smaller than 1
        win_inc = max(1, int((max_window - self.min_window) / max_window_searches))

        # warn for any value other than the "deprecated" sentinel
        if self.typed_dict != "deprecated":
            warnings.warn(
                "``typed_dict`` was deprecated in version 0.13.3 and "
                "will be removed in 0.15."
            )

        if self.min_window > max_window + 1:
            raise ValueError(
                f"Error in BOSSEnsemble, min_window ="
                f"{self.min_window} is bigger"
                f" than max_window ={max_window}."
                f" Try set min_window to be smaller than series length in "
                f"the constructor, but the classifier may not work at "
                f"all with very short series"
            )
        # best accuracy seen so far and accuracy of the weakest retained member
        max_acc = -1
        min_max_acc = -1
        # grid search over normalisation option x window size
        for normalise in self._norm_options:
            for win_size in range(self.min_window, max_window + 1, win_inc):
                # max_word_len = min(self.min_window - 2, self.word_lengths[0])
                # build with the longest word length; shorter word lengths are
                # derived from this one via _shorten_bags below
                boss = IndividualBOSS(
                    win_size,
                    self._word_lengths[0],
                    normalise,
                    self.alphabet_size,
                    save_words=True,
                    use_boss_distance=self.use_boss_distance,
                    feature_selection=self.feature_selection,
                    n_jobs=self.n_jobs,
                    random_state=self.random_state,
                )
                boss.fit(X, y)

                best_classifier_for_win_size = boss
                best_acc_for_win_size = -1

                # the used word length may be shorter
                best_word_len = boss._transformer.word_length

                # keep the word length with the best train accuracy
                # (ties go to the later, i.e. shorter, word length)
                for n, word_len in enumerate(self._word_lengths):
                    if n > 0 and word_len < boss._transformer.word_length:
                        boss = boss._shorten_bags(word_len, y)

                    boss._accuracy = self._individual_train_acc(
                        boss, y, self.n_instances_, best_acc_for_win_size
                    )

                    if boss._accuracy >= best_acc_for_win_size:
                        best_acc_for_win_size = boss._accuracy
                        best_classifier_for_win_size = boss
                        best_word_len = word_len

                if self._include_in_ensemble(
                    best_acc_for_win_size,
                    max_acc,
                    min_max_acc,
                    len(self.estimators_),
                ):
                    best_classifier_for_win_size._clean()
                    best_classifier_for_win_size._set_word_len(X, y, best_word_len)
                    self.estimators_.append(best_classifier_for_win_size)

                    # a new best accuracy raises the bar: keep only members
                    # with accuracy >= max_acc * threshold
                    if best_acc_for_win_size > max_acc:
                        max_acc = best_acc_for_win_size
                        self.estimators_ = list(
                            compress(
                                self.estimators_,
                                [
                                    classifier._accuracy >= max_acc * self.threshold
                                    for c, classifier in enumerate(self.estimators_)
                                ],
                            )
                        )

                    min_max_acc, min_acc_ind = self._worst_ensemble_acc()

                    # enforce max_ensemble_size by removing the weakest member
                    if len(self.estimators_) > self.max_ensemble_size:
                        if min_acc_ind > -1:
                            del self.estimators_[min_acc_ind]
                            min_max_acc, min_acc_ind = self._worst_ensemble_acc()

        self.n_estimators_ = len(self.estimators_)

        return self

    def _predict(self, X) -> np.ndarray:
        """Predict one class label per instance in X.

        The label is the class with the highest probability reported by
        ``predict_proba``. When several classes share that highest probability,
        one of them is drawn with a random generator built from
        ``self.random_state``.

        Parameters
        ----------
        X : 3D np.array of shape = [n_instances, n_dimensions, series_length]
            The data to make predictions for.

        Returns
        -------
        y : array-like, shape = [n_instances]
            Predicted class labels.
        """
        rng = check_random_state(self.random_state)

        labels = []
        for class_probs in self.predict_proba(X):
            # indices of every class tied for the top probability
            top_classes = np.flatnonzero(class_probs == class_probs.max())
            winner = int(rng.choice(top_classes))
            labels.append(self.classes_[winner])

        return np.array(labels)

    def _predict_proba(self, X) -> np.ndarray:
        """Predict class probabilities for n instances in X.

        Parameters
        ----------
        X : 3D np.array of shape = [n_instances, n_dimensions, series_length]
            The data to make predict probabilities for.

        Returns
        -------
        y : array-like, shape = [n_instances, n_classes_]
            Predicted probabilities using the ordering in classes_.
        """
        sums = np.zeros((X.shape[0], self.n_classes_))

        for clf in self.estimators_:
            preds = clf.predict(X)
            for i in range(0, X.shape[0]):
                sums[i, self._class_dictionary[preds[i]]] += 1
        dists = sums / (np.ones(self.n_classes_) * self.n_estimators_)

        return dists

    def _include_in_ensemble(self, acc, max_acc, min_max_acc, size):
        if acc >= max_acc * self.threshold:
            if size >= self.max_ensemble_size:
                return acc > min_max_acc
            else:
                return True
        return False

    def _worst_ensemble_acc(self):
        min_acc = 1.0
        min_acc_idx = -1

        for c, classifier in enumerate(self.estimators_):
            if classifier._accuracy < min_acc:
                min_acc = classifier._accuracy
                min_acc_idx = c

        return min_acc, min_acc_idx

    def _get_train_probs(self, X, y):
        """Estimate class probabilities for the training data.

        Every ensemble member contributes one vote per training instance. The
        votes come from the predictions stored during fit when
        ``self.save_train_predictions`` is set; otherwise they are recomputed from
        each member's own transformed training data via ``_train_predict``.

        Parameters
        ----------
        X : 3D np.array of shape = [n_instances, n_dimensions, series_length]
            Must be the data the ensemble was fitted on.
        y : array-like, shape = [n_instances]
            The class labels for X.

        Returns
        -------
        results : np.ndarray of shape = [n_instances, n_classes_]
            Vote fractions per class; instances that received no vote get a
            uniform distribution over the classes.

        Raises
        ------
        ValueError
            If the number of instances or the series length differs from the
            values recorded in fit.
        """
        self.check_is_fitted()
        X, y = check_X_y(X, y, coerce_to_numpy=True, enforce_univariate=True)

        n_instances, _, series_length = X.shape

        if n_instances != self.n_instances_ or series_length != self.series_length_:
            raise ValueError(
                "n_instances, series_length mismatch. X should be "
                "the same as the training data used in fit for generating train "
                "probabilities."
            )

        results = np.zeros((n_instances, self.n_classes_))
        divisors = np.zeros(n_instances)

        if self.save_train_predictions:
            for clf in self.estimators_:
                for n, pred in enumerate(clf._train_predictions):
                    results[n][self._class_dictionary[pred]] += 1
                    divisors[n] += 1

        else:
            for clf in self.estimators_:
                # The bag of words lives on each individual classifier, not on the
                # ensemble, so the "no words left" guard must inspect `clf` (the
                # same check _individual_train_acc performs on `boss`).
                if clf._transformed_data.shape[1] > 0:
                    distance_matrix = pairwise_distances(
                        clf._transformed_data,
                        use_boss_distance=self.use_boss_distance,
                        n_jobs=self.n_jobs,
                    )

                    for n in range(n_instances):
                        pred = clf._train_predict(n, distance_matrix)
                        results[n][self._class_dictionary[pred]] += 1
                        divisors[n] += 1

        for i in range(n_instances):
            # no votes at all -> fall back to a uniform distribution
            results[i] = (
                np.ones(self.n_classes_) * (1 / self.n_classes_)
                if divisors[i] == 0
                else results[i] / (np.ones(self.n_classes_) * divisors[i])
            )

        return results

    def _individual_train_acc(self, boss, y, train_size, lowest_acc):
        correct = 0
        required_correct = int(lowest_acc * train_size)

        # there may be no words if feature selection is too aggressive
        if boss._transformed_data.shape[1] > 0:
            distance_matrix = pairwise_distances(
                boss._transformed_data,
                use_boss_distance=self.use_boss_distance,
                n_jobs=self.n_jobs,
            )

            for i in range(train_size):
                if correct + train_size - i < required_correct:
                    return -1

                c = boss._train_predict(i, distance_matrix)
                if c == y[i]:
                    correct += 1

                if self.save_train_predictions:
                    boss._train_predictions.append(c)

        return correct / train_size

    @classmethod
    def get_test_params(cls, parameter_set="default"):
        """Return testing parameter settings for the estimator.

        Parameters
        ----------
        parameter_set : str, default="default"
            Name of the set of test parameters to return, for use in tests.
            ``"results_comparison"`` selects the set used for comparing against
            previously recorded results; any other value returns the default set.

        Returns
        -------
        params : dict
            Keyword arguments that construct a valid test instance, i.e.
            `MyClass(**params)`.
        """
        # settings shared by both parameter sets
        common = {"feature_selection": "none", "use_boss_distance": False}

        if parameter_set == "results_comparison":
            return {"max_ensemble_size": 5, **common}

        return {"max_ensemble_size": 2, "save_train_predictions": True, **common}


class IndividualBOSS(BaseClassifier):
    """Single bag of Symbolic Fourier Approximation Symbols (IndividualBOSS).

    Implementation of a single BOSS model from Schäfer (2015) [1]_. This is the
    underlying classifier for each member of the BOSS ensemble.

    Overview: input "n" series of length "m" and IndividualBoss performs a SFA
    transform to form a sparse dictionary of discretised words. The resulting
    dictionary is used in a 1-nearest neighbor classifier, by default with the
    BOSS distance function.

    Fit involves finding "n" histograms.

    Predict uses 1 nearest neighbor on those histograms.

    Parameters
    ----------
    window_size : int, default = 10
        Size of the window to use in BOSS algorithm.
    word_length : int, default = 8
        Length of word to use to use in BOSS algorithm.
    norm : bool, default = False
        Whether to normalize words by dropping the first Fourier coefficient.
    alphabet_size : default = 4
        Number of possible letters (values) for each word.
    save_words : bool, default = False
        Whether to keep NumPy array of words in SFA transformation even after
        the dictionary of words is returned. If True, the array is saved, which
        can shorten the time to calculate dictionaries using a shorter
        `word_length` (since the last "n" letters can be removed).
    typed_dict : bool, default="deprecated"
        Use a numba TypedDict to store word counts. May increase memory usage, but will
        be faster for larger datasets. As the Dict cannot be pickled currently, there
        will be some overhead converting it to a python dict with multiple threads and
        pickling.

        .. deprecated:: 0.13.3
            ``typed_dict`` was deprecated in version 0.13.3 and will be removed in 0.15.
    use_boss_distance : bool, default=True
        Forwarded to the module-level ``pairwise_distances`` helper: if True the
        non-symmetric ``boss_distance`` is used between bags, otherwise that
        helper falls back to ``pairwise.pairwise_distances``.
    feature_selection : str, default="none"
        Passed unchanged to the SFA transformer; see that transformer for the
        accepted values.
    n_jobs : int, default=1
        The number of jobs to run in parallel for both `fit` and `predict`.
        ``-1`` means using all processors.
    random_state : int or None, default=None
        Seed for random, integer.

    Attributes
    ----------
    n_classes_ : int
        Number of classes. Extracted from the data.
    classes_ : list
        The classes labels.

    See Also
    --------
    BOSSEnsemble, ContractableBOSS

    Notes
    -----
    For the Java version, see
    `TSML <https://github.com/uea-machine-learning/tsml/blob/master/src/main/java/
    tsml/classifiers/dictionary_based/IndividualBOSS.java>`_.

    References
    ----------
    .. [1] Patrick Schäfer, "The BOSS is concerned with time series classification
       in the presence of noise", Data Mining and Knowledge Discovery, 29(6): 2015
       https://link.springer.com/article/10.1007/s10618-014-0377-7

    Examples
    --------
    >>> from sktime.classification.dictionary_based import IndividualBOSS
    >>> from sktime.datasets import load_unit_test
    >>> X_train, y_train = load_unit_test(split="train", return_X_y=True)
    >>> X_test, y_test = load_unit_test(split="test", return_X_y=True)
    >>> clf = IndividualBOSS()
    >>> clf.fit(X_train, y_train)
    IndividualBOSS(...)
    >>> y_pred = clf.predict(X_test)
    """

    _tags = {
        "capability:multithreading": True,
    }

    def __init__(
        self,
        window_size=10,
        word_length=8,
        norm=False,
        alphabet_size=4,
        save_words=False,
        typed_dict="deprecated",
        use_boss_distance=True,
        feature_selection="none",
        n_jobs=1,
        random_state=None,
    ):
        self.window_size = window_size
        self.word_length = word_length
        self.norm = norm
        self.alphabet_size = alphabet_size
        self.feature_selection = feature_selection
        self.use_boss_distance = use_boss_distance

        self.save_words = save_words
        self.typed_dict = typed_dict
        self.n_jobs = n_jobs
        self.random_state = random_state

        # fitted state; populated by _fit / _shorten_bags / _set_word_len
        self._transformer = None
        self._transformed_data = []
        self._class_vals = []
        self._accuracy = 0
        self._subsample = []
        self._train_predictions = []

        super().__init__()

    def _fit(self, X, y):
        """Fit a single boss classifier on n_instances cases (X,y).

        Parameters
        ----------
        X : 3D np.array of shape = [n_instances, n_dimensions, series_length]
            The training data.
        y : array-like, shape = [n_instances]
            The class labels.

        Returns
        -------
        self :
            Reference to self.

        Notes
        -----
        Changes state by creating a fitted model that updates attributes
        ending in "_" and sets is_fitted flag to True.
        """
        self._transformer = SFAFast(
            word_length=self.word_length,
            alphabet_size=self.alphabet_size,
            window_size=self.window_size,
            norm=self.norm,
            bigrams=False,
            remove_repeat_words=True,
            save_words=self.save_words,
            n_jobs=self.n_jobs,
            feature_selection=self.feature_selection,
            random_state=self.random_state,
        )

        # keep the training bags and labels for the nearest-neighbour lookup
        self._transformed_data = self._transformer.fit_transform(X, y)
        self._class_vals = y

        return self

    def _predict(self, X):
        """Predict class values of all instances in X.

        Each test bag receives the label of its nearest training bag. If the
        training bags have no columns (no words survived), every instance is
        assigned the most frequent training label instead.

        Parameters
        ----------
        X : 3D np.array of shape = [n_instances, n_dimensions, series_length]
            The data to make predictions for.

        Returns
        -------
        y : array-like, shape = [n_instances]
            Predicted class labels.
        """
        test_bags = self._transformer.transform(X)

        # Derive the output dtype from the training labels. String labels are
        # stored as "object" so that np.zeros does not create a fixed-width
        # string array that would truncate the labels assigned below.
        data_type = type(self._class_vals[0])
        if data_type == np.str_ or data_type == str:
            data_type = "object"

        classes = np.zeros(test_bags.shape[0], dtype=data_type)

        if self._transformed_data.shape[1] > 0:
            distance_matrix = pairwise_distances(
                test_bags,
                self._transformed_data,
                use_boss_distance=self.use_boss_distance,
                n_jobs=self.n_jobs,
            )

            for i in range(test_bags.shape[0]):
                min_pos = np.argmin(distance_matrix[i])
                classes[i] = self._class_vals[min_pos]
        else:
            # set to most frequent element; np.unique works for any label type
            # (np.bincount only accepts non-negative integers) and yields the
            # same label for such integers
            labels, counts = np.unique(self._class_vals, return_counts=True)
            classes[:] = labels[np.argmax(counts)]

        return classes

    def _train_predict(self, train_num, distance_matrix):
        """Return the label of the training case nearest to case ``train_num``.

        ``distance_matrix`` is indexed by row ``train_num``; the label at the
        position of the smallest value in that row is returned.
        """
        distance_vector = distance_matrix[train_num]
        min_pos = np.argmin(distance_vector)
        return self._class_vals[min_pos]

    def _shorten_bags(self, word_len, y):
        """Build a fitted copy of this classifier that uses ``word_len``.

        The copy shares this instance's transformer object and takes its bags
        from the transformer's ``_shorten_bags(word_len, y)``; class metadata is
        copied over and the copy is flagged as fitted.
        """
        new_boss = IndividualBOSS(
            self.window_size,
            word_len,
            self.norm,
            self.alphabet_size,
            save_words=self.save_words,
            use_boss_distance=self.use_boss_distance,
            feature_selection=self.feature_selection,
            n_jobs=self.n_jobs,
            random_state=self.random_state,
        )
        new_boss._transformer = self._transformer
        new_bag = new_boss._transformer._shorten_bags(word_len, y)
        new_boss._transformed_data = new_bag
        new_boss._class_vals = self._class_vals
        new_boss.n_classes_ = self.n_classes_
        new_boss.classes_ = self.classes_
        new_boss._class_dictionary = self._class_dictionary
        new_boss._is_fitted = True

        return new_boss

    def _clean(self):
        """Drop the transformer's stored words and stop it saving new ones."""
        self._transformer.words = None
        self._transformer.save_words = False

    def _set_word_len(self, X, y, word_len):
        """Switch to ``word_len`` and refit the transformer on (X, y).

        The transformer's word length is only ever reduced: it becomes the
        minimum of its current value and ``word_len``.
        """
        self.word_length = word_len

        # we have to retrain feature selection for now
        # might be optimized by remembering feature_dicts
        self._transformer.word_length = min(self._transformer.word_length, word_len)
        self._transformed_data = self._transformer.fit_transform(X, y)


def _dist_wrapper(dist_matrix, X, Y, s, XX_all=None, XY_all=None):
    """Write in-place to a slice of a distance matrix."""
    for i in range(s.start, s.stop):
        dist_matrix[i] = boss_distance(X, Y, i, XX_all, XY_all)


def pairwise_distances(X, Y=None, use_boss_distance=False, n_jobs=1):
    """Compute the distance between all pairs of bag-of-pattern models.

    With ``use_boss_distance`` the rows are filled by ``boss_distance`` (split
    across threads when more than one job is effective); otherwise the work is
    delegated to ``pairwise.pairwise_distances``. When X is compared with itself
    (``Y`` is None or the same object as ``X``) the diagonal is set to ``np.inf``.

    Parameters
    ----------
    X : sparse matrix
        Bags to measure from, one per row.
    Y : sparse matrix or None, default=None
        Bags to measure to; None means X itself.
    use_boss_distance : bool, default=False
        Whether to use ``boss_distance``.
    n_jobs : int, default=1
        Number of parallel jobs.

    Returns
    -------
    distance_matrix : np.ndarray of shape (X.shape[0], Y.shape[0])
    """
    # decided up front: the boss branch below rebinds Y to X when Y is None
    compares_to_self = Y is None or X is Y

    if not use_boss_distance:
        distance_matrix = pairwise.pairwise_distances(X, Y, n_jobs=n_jobs)
    else:
        if Y is None:
            Y = X

        # terms shared by every row, computed once
        row_norms = csr_row_norms(X)
        cross_terms = safe_sparse_dot(X, Y.T, dense_output=True)

        distance_matrix = np.zeros((X.shape[0], Y.shape[0]))

        n_workers = effective_n_jobs(n_jobs)
        if n_workers > 1:
            Parallel(n_jobs=n_jobs, backend="threading")(
                delayed(_dist_wrapper)(
                    distance_matrix, X, Y, chunk, row_norms, cross_terms
                )
                for chunk in gen_even_slices(_num_samples(X), n_workers)
            )
        else:
            for row in range(len(distance_matrix)):
                distance_matrix[row] = boss_distance(
                    X, Y, row, row_norms, cross_terms
                )

    if compares_to_self:
        np.fill_diagonal(distance_matrix, np.inf)

    return distance_matrix


# @njit(cache=True, fastmath=True)
def boss_distance(X, Y, i, XX_all=None, XY_all=None):
    """Compute the BOSS distance from histogram ``X[i]`` to the rows of ``Y``.

    The measure is non-symmetric: only the columns (words) that are non-zero in
    ``X[i]`` contribute to the norm taken over ``Y``, so dist(a, b) does not
    necessarily equal dist(b, a). Negative values in the result are clipped
    to 0.

    Parameters
    ----------
    X : sparse matrix
        Base histograms; row ``i`` is the one measured from.
    Y : sparse matrix
        Histograms to measure the distance to.
    i : int
        Index of the current row of ``X``.
    XX_all : array-like or None, default=None
        Precomputed ``csr_row_norms(X)``; entry ``i`` is used when given.
    XY_all : array-like or None, default=None
        Precomputed dense product of ``X`` and ``Y.T``; row ``i`` is used when
        given.

    Returns
    -------
    dist : np.ndarray
        ``XX - 2 * XY + YY`` with negatives replaced by 0.
    """
    # columns holding a word that actually occurs in X[i]
    present_cols = X[i].nonzero()[1]

    XX = csr_row_norms(X[i]) if XX_all is None else XX_all[i]
    XY = (
        safe_sparse_dot(X[i], Y.T, dense_output=True)
        if XY_all is None
        else XY_all[i]
    )

    # norms of Y restricted to the words present in X[i]
    YY = csr_row_norms(Y[:, present_cols])

    dist = XX - 2 * XY + YY
    np.maximum(dist, 0, out=dist)
    return dist


Overwriting C:\Users\kyunomi\anaconda3\envs\cogpilot\Lib\site-packages\sktime\classification\dictionary_based\_boss.py


In [16]:
# Baseline: a single BOSS model with default settings.
# NOTE(review): `PaddingTransformer() * clf` presumably builds a pipeline that pads
# the series to a common length before classification — confirm in the sktime docs.
iboss = PaddingTransformer() * IndividualBOSS()
iboss.fit(X_train, y_train)
y_pred_proba = iboss.predict_proba(X_test)

# NOTE(review): the metrics recorded for this run are identical to the
# RandomIntervalClassifier row in the table below — verify that this is genuine.
log_result('IndividualBoss',iboss.classes_, y_test, y_pred_proba)

Unnamed: 0,classifier,accuracy_score,AUC_score,F1_score
0,RandomIntervalClassifier,0.252101,0.536985,0.252101
1,IndividualBoss,0.252101,0.536985,0.252101


## ContractableBOSS

In [17]:
# Small, quick ContractableBOSS configuration (10 parameter samples, at most 3
# ensemble members). The seed is fixed like in the other seeded cells of this
# notebook so that the logged metrics can be reproduced on a re-run.
cboss = PaddingTransformer() * ContractableBOSS(
    n_parameter_samples=10, max_ensemble_size=3, random_state=42
)
cboss.fit(X_train, y_train)
y_pred_proba = cboss.predict_proba(X_test)

log_result('ContractableBOSS',cboss.classes_, y_test, y_pred_proba)

Unnamed: 0,classifier,accuracy_score,AUC_score,F1_score
0,RandomIntervalClassifier,0.252101,0.536985,0.252101
1,IndividualBoss,0.252101,0.536985,0.252101
2,ContractableBOSS,0.294118,0.571395,0.294118


## Random Interval Spectral Ensemble

In [23]:
# Random Interval Spectral Ensemble with 50 estimators and a fixed seed (42).
# NOTE(review): `PaddingTransformer() * clf` presumably pads the series to a common
# length before classification — confirm in the sktime docs.
rise = PaddingTransformer() * RandomIntervalSpectralEnsemble(n_estimators=50, random_state=42)
rise.fit(X_train, y_train)
y_pred_proba = rise.predict_proba(X_test)

# log_result is defined outside this section; the table below suggests it appends
# one row of accuracy / AUC / F1 per call — TODO confirm.
log_result('RISE',rise.classes_, y_test, y_pred_proba)

Unnamed: 0,classifier,accuracy_score,AUC_score,F1_score
0,RandomIntervalClassifier,0.252101,0.536985,0.252101
1,IndividualBoss,0.252101,0.536985,0.252101
2,ContractableBOSS,0.294118,0.571395,0.294118
3,time_series_tree,0.310924,0.578198,0.310924
4,RandomIntervalClassifier,0.420168,0.700349,0.420168
5,RandomIntervalClassifier_fit_rest,0.247525,0.517295,0.247525
6,RISE,0.369748,0.713467,0.369748


## Supervised Time Series Forest (STSF)


In [27]:
# Supervised Time Series Forest with 50 estimators and a fixed seed (42), applied
# after the PaddingTransformer step.
stsf = PaddingTransformer() * SupervisedTimeSeriesForest(n_estimators=50, random_state=42)
stsf.fit(X_train, y_train)
y_pred_proba = stsf.predict_proba(X_test)

# record this run under the name 'STSF' in the shared results table
log_result('STSF',stsf.classes_, y_test, y_pred_proba)

Unnamed: 0,classifier,accuracy_score,AUC_score,F1_score
0,RandomIntervalClassifier,0.252101,0.536985,0.252101
1,IndividualBoss,0.252101,0.536985,0.252101
2,ContractableBOSS,0.294118,0.571395,0.294118
3,time_series_tree,0.310924,0.578198,0.310924
4,RandomIntervalClassifier,0.420168,0.700349,0.420168
5,RandomIntervalClassifier_fit_rest,0.247525,0.517295,0.247525
6,RISE,0.369748,0.713467,0.369748
7,Dr_Cif,0.436975,0.71764,0.436975
8,STSF,0.453782,0.719416,0.453782


## Canonical Interval Forest (CIF)

In [11]:
# Canonical Interval Forest with a deliberately small configuration
# (5 estimators, attribute subsample of 10) and a fixed seed (42).
# NOTE(review): the results table recorded under this cell restarts at row 0 and
# the cell's execution count is lower than that of the cells above — re-run the
# notebook top to bottom so all rows come from the same session.
cif = PaddingTransformer() * CanonicalIntervalForest(n_estimators=5, att_subsample_size=10, random_state=42)
cif.fit(X_train, y_train)
y_pred_proba = cif.predict_proba(X_test)

log_result('CIF',cif.classes_, y_test, y_pred_proba)

Unnamed: 0,classifier,accuracy_score,AUC_score,F1_score
0,CIF,0.361345,0.664489,0.361345


## Diverse Representation Canonical Interval Forest (DrCIF)

In [24]:
# DrCIF with the same small configuration as the CIF cell
# (5 estimators, attribute subsample of 10) and a fixed seed (42).
dr_cif = PaddingTransformer() * DrCIF(n_estimators=5, att_subsample_size=10, random_state=42)
dr_cif.fit(X_train, y_train)
y_pred_proba = dr_cif.predict_proba(X_test)

# logged as 'Dr_Cif' — the label used for this run in the results table
log_result('Dr_Cif',dr_cif.classes_, y_test, y_pred_proba)

Unnamed: 0,classifier,accuracy_score,AUC_score,F1_score
0,RandomIntervalClassifier,0.252101,0.536985,0.252101
1,IndividualBoss,0.252101,0.536985,0.252101
2,ContractableBOSS,0.294118,0.571395,0.294118
3,time_series_tree,0.310924,0.578198,0.310924
4,RandomIntervalClassifier,0.420168,0.700349,0.420168
5,RandomIntervalClassifier_fit_rest,0.247525,0.517295,0.247525
6,RISE,0.369748,0.713467,0.369748
7,Dr_Cif,0.436975,0.71764,0.436975


## ShapeletTransformClassifier

In [12]:
# Shapelet Transform Classifier with a small configuration. Both the shapelet
# search and the RotationForest estimator are seeded, matching the other seeded
# cells of this notebook, so the logged metrics can be reproduced on a re-run.
stc = PaddingTransformer() * ShapeletTransformClassifier(
    estimator=RotationForest(n_estimators=3, random_state=42),
    n_shapelet_samples=100,
    max_shapelets=10,
    batch_size=20,
    random_state=42,
)
stc.fit(X_train, y_train)
y_pred_proba = stc.predict_proba(X_test)

log_result('ShapeletTransformClassifier',stc.classes_, y_test, y_pred_proba)

Unnamed: 0,classifier,accuracy_score,AUC_score,F1_score
0,CIF,0.361345,0.664489,0.361345
1,ShapeletTransformClassifier,0.243697,0.469264,0.243697


## RocketClassifier

In [13]:
# ROCKET with 500 kernels. The seed is fixed, matching the other seeded cells of
# this notebook, so the logged metrics can be reproduced on a re-run.
rocket = PaddingTransformer() * RocketClassifier(num_kernels=500, random_state=42)
rocket.fit(X_train, y_train)
y_pred_proba = rocket.predict_proba(X_test)

log_result('RocketClassifier',rocket.classes_, y_test, y_pred_proba)

Unnamed: 0,classifier,accuracy_score,AUC_score,F1_score
0,CIF,0.361345,0.664489,0.361345
1,ShapeletTransformClassifier,0.243697,0.469264,0.243697
2,RocketClassifier,0.361345,0.620612,0.361345


## HIVECOTEV1

In [12]:
# HIVE-COTE v1 with default component settings and a fixed seed, matching the
# other seeded cells of this notebook.
# NOTE(review): no output is recorded for this cell — confirm the fit finishes
# in acceptable time before relying on it.
hivecotev = PaddingTransformer() * HIVECOTEV1(random_state=42)
hivecotev.fit(X_train, y_train)
y_pred_proba = hivecotev.predict_proba(X_test)

log_result('HIVECOTEV1',hivecotev.classes_, y_test, y_pred_proba)

## Things to try next
- Group the subjects by their "resp" frequency (Hz) and train a separate model for each group.
- Run a hyperparameter search (grid search, or a genetic-algorithm search) to improve the results.