In [1]:
import re, os, time
import pandas as pd
import datetime
import numpy as np
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score
from matplotlib import pyplot as plt
# import plotData # helper function in starter code package


from sktime.transformations.panel.padder import PaddingTransformer
from sktime.classification.compose import ClassifierPipeline, ComposableTimeSeriesForestClassifier
from sktime.transformations.panel.summarize import RandomIntervalFeatureExtractor
from sklearn.tree import DecisionTreeClassifier
# only classifier in sktime that can process unequal length data
# https://github.com/sktime/sktime/issues/3649#issuecomment-1292459843
# from sktime.alignment.dtw_python import AlignerDTW   ## NOTE THAT THIS SOMEHOW AFFECT ALL PRINT OUTPUT. NOTHING WILL BE SHOWN FOR PRINT STATEMENT AFTER YOU RUN THIS
from sktime.classification.feature_based import RandomIntervalClassifier
from sktime.classification.distance_based import KNeighborsTimeSeriesClassifier
from sktime.classification.dictionary_based import IndividualBOSS, ContractableBOSS
from sktime.classification.kernel_based import RocketClassifier
from sktime.classification.hybrid import HIVECOTEV1, HIVECOTEV2
from sktime.classification.shapelet_based import ShapeletTransformClassifier
from sktime.classification.sklearn import RotationForest

from sktime.dists_kernels.compose_from_align import DistFromAligner
from sktime.utils.slope_and_trend import _slope
from sklearn.pipeline import Pipeline
# https://www.sktime.org/en/stable/api_reference/auto_generated/sktime.transformations.panel.catch22.Catch22.html
from sktime.transformations.panel.catch22 import Catch22

from sktime.classification.interval_based import CanonicalIntervalForest,DrCIF,RandomIntervalSpectralEnsemble,SupervisedTimeSeriesForest,TimeSeriesForestClassifier

# identify classifiers that support unequal length
from sktime.registry import all_estimators




## Combining Data

In [2]:
# Combine every respiration recording into a single DataFrame (one row per file).
# Columns: subject id, standardised respiration trace (nested pd.Series), difficulty label.
combined_columns = {'subject':[],'normalised_resp':[], 'difficulty':[]}
for root, dirs, files in os.walk("..\\cleanedData\\"):
  for file in files:
    if "lslshimmerresp" not in file:
      continue
    # Raw strings keep `\d` / `\w` as regex tokens instead of invalid string escapes.
    difficulty_match = re.search(r"mV_(\d\d\w)", file)
    subject_match = re.search(r"_(cp\d+)_", file)
    if difficulty_match is None or subject_match is None:
      # Fail with the offending file name instead of an opaque AttributeError on None.
      raise ValueError(f"Cannot parse subject/difficulty from file name: {file!r}")
    difficulty = difficulty_match.group(1)
    subject = subject_match.group(1)
    df_temp = pd.read_csv(os.path.join(root, file))
    # A fresh scaler is fitted for every recording file, so each trace is
    # standardised independently of all the others.
    scaler = StandardScaler()
    resp_series = pd.Series(scaler.fit_transform(df_temp[['respiration_trace_mV']]).flatten())

    combined_columns['subject'].append(subject)
    combined_columns['normalised_resp'].append(resp_series)
    combined_columns['difficulty'].append(difficulty)
df_combined_subject = pd.DataFrame(combined_columns)

In [3]:
# Pickle (not CSV) is used so the nested Series stored in each cell of
# `normalised_resp` survive the round trip unchanged.
df_combined_subject.to_pickle(
  "..\\cleanedData\\df_combined_subject.pkl",
  protocol=4,
)

# Modelling

In [4]:
# Reload the combined frame written by the previous section.
# NOTE: unpickling can execute arbitrary code; only load files produced by this notebook.
df_combined_subject = pd.read_pickle(
  filepath_or_buffer="..\\cleanedData\\df_combined_subject.pkl"
)

In [5]:
# Identify rows that is 0 length
# empty_row = []
# for i in range(len(df_combined)):
#   temp = df_combined.iloc[i,0]
#   if len(temp[temp==0.0]) or len(temp[temp==0]):
#     empty_row.append(i)

# df_combined.drop(empty_row, inplace=True)

In [6]:
%%time
# Subject-wise train/test split: every subject's recordings stay together in
# either the train or the test part of a fold.
# Five folds are produced for cross-validation; fold i is X_train[i], X_test[i], ...
from sklearn.model_selection import GroupKFold
X_train, X_test, y_train, y_test = [], [], [], []
gss = GroupKFold(n_splits=5)
fold_indices = gss.split(
  df_combined_subject["normalised_resp"],
  df_combined_subject["difficulty"],
  groups=df_combined_subject["subject"],
)
for train, test in fold_indices:
  # GroupKFold yields *positional* indices, so select with .iloc; label-based .loc
  # only gives the same rows while the frame still has a default RangeIndex.
  X_train.append(df_combined_subject.iloc[train][["normalised_resp"]])
  X_test.append(df_combined_subject.iloc[test][["normalised_resp"]])
  y_train.append(df_combined_subject["difficulty"].iloc[train].astype("string"))
  y_test.append(df_combined_subject["difficulty"].iloc[test].astype("string"))

CPU times: total: 0 ns
Wall time: 11.9 ms


In [7]:
# X_train, X_test, y_train, y_test = train_test_split(df_combined_subject["normalised_resp"], df_combined_subject["difficulty"], random_state=42)

In [8]:
# X_train = pd.DataFrame(X_train)
# X_test = pd.DataFrame(X_test)
# y_train = y_train.astype("string")
# y_test = y_test.astype("string")

In [9]:
# Accumulator for evaluation results: one list per column, one entry appended
# per evaluated classifier (filled by `log_result`).
model_result = {
  column: []
  for column in (
    "classifier",
    "accuracy_score",
    "AUC_score",
    "F1_score",
    "runtime(s)",
  )
}

def get_class(class_list, prob_list):
  """Return the entry of `class_list` at the position of the largest probability.

  Ties resolve to the first (lowest-index) maximum.
  """
  probs = list(prob_list)
  best_idx = max(range(len(probs)), key=probs.__getitem__)
  return class_list[best_idx]

def log_result(classifier_name, class_list, y_test, y_pred_proba, runtime):
  """Score one classifier run, append it to `model_result`, then display and save the table.

  Parameters:
    classifier_name: label stored in the "classifier" column.
    class_list: class labels, indexed by the column position of `y_pred_proba`.
    y_test: true labels of the evaluated samples.
    y_pred_proba: per-sample class probabilities (one row per sample).
    runtime: value stored in the "runtime(s)" column.
  """
  # Hard predictions: the most probable class of every probability row.
  y_pred = [get_class(class_list, proba_row) for proba_row in y_pred_proba]

  # NOTE(review): for single-label multiclass predictions micro-averaged F1 equals
  # accuracy, so "F1_score" duplicates "accuracy_score" - consider average='macro'.
  scores = {
    "classifier": classifier_name,
    "accuracy_score": accuracy_score(y_test, y_pred),
    "AUC_score": roc_auc_score(y_test, y_pred_proba, multi_class='ovr'),
    "F1_score": f1_score(y_test, y_pred, average='micro'),
    "runtime(s)": runtime,
  }
  for column, value in scores.items():
    model_result[column].append(value)

  # Show the running table in the notebook and persist it after every run.
  result_table = pd.DataFrame(model_result)
  display(result_table)
  result_table.to_csv("..\\cleanedData\\respiration_split_pilot_result.csv")

In [10]:
def run_model(classifier_name,classifier, padding = True,  fold = 0):
  """Fit and evaluate one classifier on a single cross-validation fold.

  Parameters:
    classifier_name: label passed on to `log_result`.
    classifier: zero-argument callable that returns a new, unfitted classifier.
    padding: when True, a `PaddingTransformer` is chained in front of the classifier.
    fold: index into the global X_train / X_test / y_train / y_test fold lists.

  The logged runtime covers fitting plus probability prediction.
  """
  start = time.time()

  # Optionally chain a padding step in front of the freshly built classifier.
  clf = (PaddingTransformer() * classifier()) if padding else classifier()

  train_X, train_y = X_train[fold], y_train[fold]
  clf.fit(train_X, train_y)
  y_pred_proba = clf.predict_proba(X_test[fold])
  elapsed = time.time() - start

  log_result(classifier_name, clf.classes_, y_test[fold], y_pred_proba, elapsed)

## KNeighborsTimeSeriesClassifier

In [None]:


# List every sktime classifier tagged as able to handle unequal-length series.
# UserWarnings may appear if soft dependencies are not installed.
unequal_length_tags = {"capability:unequal_length": True}
all_estimators(estimator_types="classifier", filter_tags=unequal_length_tags)

In [None]:
# aligner = AlignerDTW()
# dtw_dist = DistFromAligner(aligner)
# knclassifier = KNeighborsTimeSeriesClassifier(n_neighbors=3, distance = dtw_dist, n_jobs= -1)
# knclassifier.fit(X_train, y_train)
# y_pred = knclassifier.predict(X_test)

# log_result('KNeighborsTimeSeriesClassifier',y_test, y_pred)

# NO MEMORY

## KNeighborsTimeSeriesClassifier with padding

In [None]:
# padded_KN_pipeline = ClassifierPipeline(
#     KNeighborsTimeSeriesClassifier(n_neighbors=5, distance ="dtw", n_jobs= 1, leaf_size = 2000), 
#     [PaddingTransformer()]
# )
# padded_KN_pipeline.fit(X_train, y_train)
# y_pred = padded_KN_pipeline.predict(X_test)

# log_result('KNeighborsTimeSeriesClassifier',y_test, y_pred)

# NO MEMORY

## ComposableTimeSeriesForestClassifier
https://www.sktime.org/en/v0.8.1/examples/02_classification_univariate.html

## On pause because it takes too long to run

In [None]:
# https://www.sktime.org/en/v0.8.1/examples/02_classification_univariate.html
# Base estimator for each tree of the forest. It is built here so this cell does not
# depend on `time_series_tree`, which is only defined in a later section of the notebook.
# No padding step is needed inside it: the outer pipeline already pads the series.
forest_base_tree = Pipeline([
    (
        "extract",
        RandomIntervalFeatureExtractor(
            n_intervals="sqrt", features=[np.mean, np.std, _slope]
        ),
    ),
    ("clf", DecisionTreeClassifier()),
])

start = time.time()
tsf_tst = PaddingTransformer() * ComposableTimeSeriesForestClassifier(
    estimator=forest_base_tree,
    n_estimators=100,
    # criterion="entropy",
    bootstrap=True,
    oob_score=True,
    random_state=1,
    n_jobs=-1,
)
# X_train / y_train / X_test / y_test are lists of per-fold data; use fold 0 like the
# other experiments. The labels were already cast to "string" when the folds were built.
tsf_tst.fit(X_train[0], y_train[0])

# The out-of-bag score belongs to the fitted forest, not to the padding pipeline.
# NOTE(review): assumes the pipeline exposes the fitted forest as `classifier_` - confirm
# against the installed sktime version; the getattr fallbacks keep the cell from crashing.
fitted_forest = getattr(tsf_tst, "classifier_", tsf_tst)
oob_score_value = getattr(fitted_forest, "oob_score_", None)
if oob_score_value is not None:
    print(oob_score_value)

y_pred_proba = tsf_tst.predict_proba(X_test[0])
end = time.time()

log_result('tsf_time_series_forest',tsf_tst.classes_, y_test[0], y_pred_proba, end-start)

In [None]:
# Default-configured forest, evaluated through the shared helper so that it is run on a
# single fold (X_train etc. are lists of folds) and the runtime is logged as well.
# NOTE(review): padding=True matches the other experiments; set it to False if every
# series in the fold has the same length.
run_model(
  'TimeSeriesForestClassifier',
  lambda: ComposableTimeSeriesForestClassifier(),
  padding = True,
  fold = 0
  )

## Classification using catch22

In [None]:
# Pad the series, then extract catch22 features.
# X_train / X_test are lists of per-fold DataFrames, so one fold (fold 0, as in the
# other experiments) is transformed rather than the whole list.
catch22 = PaddingTransformer() * Catch22()
X_train_catch22 = catch22.fit_transform(X_train[0])
X_test_catch22 = catch22.transform(X_test[0])

In [None]:
# Persist the extracted catch22 feature tables next to the cleaned data.
catch22_exports = (
  (X_train_catch22, "..\\cleanedData\\X_train_catch22.csv"),
  (X_test_catch22, "..\\cleanedData\\X_test_catch22.csv"),
)
for feature_table, csv_path in catch22_exports:
  feature_table.to_csv(csv_path)

## RandomIntervalClassifier
Extract features at random intervals and classify them with a Rotation Forest of 200 trees.

In [13]:
# Random-interval feature classifier on fold 0, with padding.
run_model(
  classifier_name="RandomIntervalClassifier",
  classifier=lambda: RandomIntervalClassifier(n_intervals=5, n_jobs=1, random_state=42),
  padding=True,
  fold=0,
)

## Decision Trees with mean, std, slope


In [None]:
# Pipeline: pad the series -> mean / std / slope over random intervals -> decision tree.
interval_features = RandomIntervalFeatureExtractor(
    features=[np.mean, np.std, _slope],
    n_intervals="sqrt",
)
steps = [
    ("padding", PaddingTransformer()),
    ("extract", interval_features),
    ("clf", DecisionTreeClassifier()),
]
time_series_tree = Pipeline(steps=steps)

In [None]:
# Fit the interval-feature decision tree on fold 0 and log its scores.
# The timed span covers fitting plus probability prediction.
start = time.time()
y_pred_proba = time_series_tree.fit(X_train[0], y_train[0]).predict_proba(X_test[0])
end = time.time()
log_result(
  classifier_name='RandomeIntervalDecisionTree',
  class_list=time_series_tree.classes_,
  y_test=y_test[0],
  y_pred_proba=y_pred_proba,
  runtime=end-start,
)

Unnamed: 0,classifier,accuracy_score,AUC_score,F1_score
0,RandomIntervalClassifier,0.252101,0.536985,0.252101
1,IndividualBoss,0.252101,0.536985,0.252101
2,ContractableBOSS,0.294118,0.571395,0.294118
3,time_series_tree,0.310924,0.578198,0.310924


## Individual Boss


In [None]:
# The class is imported as `IndividualBOSS`; the previous spelling `IndividualBoss`
# is not defined and raised a NameError when the factory was called.
run_model(
  "IndividualBoss", 
  lambda: IndividualBOSS(),
  padding = True,
  fold = 0
  )

## ContractableBoss

In [None]:
# Contractable BOSS ensemble on fold 0, with padding.
run_model(
  classifier_name="ContractableBOSS",
  classifier=lambda: ContractableBOSS(max_ensemble_size=3, n_parameter_samples=10),
  padding=True,
  fold=0,
)

## Random Interval Spectral Ensemble

In [None]:
# Random Interval Spectral Ensemble on fold 0, with padding.
run_model(
  classifier_name="RandomIntervalSpectralEnsemble",
  classifier=lambda: RandomIntervalSpectralEnsemble(random_state=42, n_estimators=50),
  padding=True,
  fold=0,
)

## Supervised Time Series Forest (STSF)


In [None]:
# Supervised Time Series Forest on fold 0, with padding.
run_model(
  classifier_name="SupervisedTimeSeriesForest",
  classifier=lambda: SupervisedTimeSeriesForest(random_state=42, n_estimators=50),
  padding=True,
  fold=0,
)

## Canonical Interval Forest (CIF)

In [None]:
# Canonical Interval Forest on fold 0, with padding.
run_model(
  classifier_name="CanonicalIntervalForest",
  classifier=lambda: CanonicalIntervalForest(
    n_estimators=5,
    att_subsample_size=10,
    random_state=42,
  ),
  padding=True,
  fold=0,
)

## Diverse Representation Canonical Interval Forest (DrCIF)

In [None]:
# Diverse Representation Canonical Interval Forest on fold 0, with padding.
run_model(
  classifier_name="DiverseRepresentationCanonicalIntervalForest",
  classifier=lambda: DrCIF(
    n_estimators=5,
    att_subsample_size=10,
    random_state=42,
  ),
  padding=True,
  fold=0,
)

## ShapeletTransformClassifier

In [None]:
# Shapelet transform followed by a 3-tree Rotation Forest, on fold 0 with padding.
run_model(
  classifier_name="ShapeletTransformClassifier",
  classifier=lambda: ShapeletTransformClassifier(
    estimator=RotationForest(n_estimators=3),
    batch_size=20,
    max_shapelets=10,
    n_shapelet_samples=100,
  ),
  padding=True,
  fold=0,
)

## RocketClassifier

In [None]:
# ROCKET classifier with 500 kernels on fold 0, with padding.
run_model(
  classifier_name="RocketClassifier",
  classifier=lambda: RocketClassifier(num_kernels=500),
  padding=True,
  fold=0,
)

## HIVECOTEV1

In [None]:
# HIVE-COTE v1 with its default configuration on fold 0, with padding.
run_model(
  classifier_name="HIVECOTEV1",
  classifier=lambda: HIVECOTEV1(),
  padding=True,
  fold=0,
)

## Try looking
- Separate subjects by their respiration sampling rate (Hz), then train a model for each group.
- Run a hyperparameter search (e.g. a genetic-algorithm-based grid search) to improve the results.