In [None]:
import glob
import os
import zipfile

import lightgbm as lgb
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from nilearn.signal import clean
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from tsfresh import extract_features
from tsfresh.feature_extraction import EfficientFCParameters

In [None]:
# Root folder that holds the AOMIC archive and its extracted contents.
DATASET_DIR = "../dataset"

# Sub-folders of the extracted archive: time series and their noise regressors.
EXTRACTED_ZIP_DIR = DATASET_DIR + "/AOMIC"
TS_DIR = EXTRACTED_ZIP_DIR + "/TS"
NOISE_DIR = EXTRACTED_ZIP_DIR + "/Noise"

# Suffix appended to "<subject>_<task>" to form the name of a noise file.
DATASET_FILE_SUFFIX = "_acq-seq_desc-confounds_regressors_6_motion_and_derivs.txt"

# Task token found in the file names -> integer class label (position in the tuple).
LABELS_DICT = {
    task: index
    for index, task in enumerate(
        (
            "task-restingstate",
            "task-stopsignal",
            "task-workingmemory",
            "task-emomatching",
        )
    )
}

In [None]:
def create_dataset():
    """Load, clean and label every time series found under ``TS_DIR``.

    If ``EXTRACTED_ZIP_DIR`` does not exist yet, ``{DATASET_DIR}/AOMIC.zip`` is
    extracted into it first. Every ``sub-*.txt`` file in ``TS_DIR`` is then
    loaded and passed through ``nilearn.signal.clean`` with the matching file
    from ``NOISE_DIR`` as confounds (plus detrending and standardization). The
    label comes from the second ``_``-separated token of the file name, looked
    up in ``LABELS_DICT``. Finally all series are truncated to the length of
    the shortest one so they can be stacked into one array.

    Returns:
        tuple[np.ndarray, np.ndarray]: the stacked cleaned series and the
        integer labels, one label per series, in sorted file-path order.

    Raises:
        FileNotFoundError: if the archive is missing when extraction is
            needed, or if ``TS_DIR`` contains no ``sub-*.txt`` file.
        KeyError: if a file name carries a task token that is not a key of
            ``LABELS_DICT``.
    """
    zip_file_path = f"{DATASET_DIR}/AOMIC.zip"
    if not os.path.exists(EXTRACTED_ZIP_DIR):
        print("Extracting dataset...")
        # Use the standard library instead of shelling out to `unzip`: no
        # external binary, no shell quoting of the paths, and a failed
        # extraction raises instead of being silently ignored.
        with zipfile.ZipFile(zip_file_path) as archive:
            archive.extractall(EXTRACTED_ZIP_DIR)

    data_files_paths = sorted(glob.glob(f"{TS_DIR}/sub-*.txt"))
    if not data_files_paths:
        # Without this guard the failure shows up later as an opaque
        # "min() arg is an empty sequence".
        raise FileNotFoundError(
            f"No time series files matching 'sub-*.txt' found in {TS_DIR}"
        )

    dataset = []
    labels = []

    for data_file_path in data_files_paths:
        # load dataset element
        data = np.loadtxt(data_file_path)

        # basename() handles whatever path separator glob returned
        data_file_name = os.path.basename(data_file_path)
        file_name_parts = data_file_name.split("_")
        # the first two tokens identify the matching noise file
        data_id = "_".join(file_name_parts[:2])

        # clean the element (denoise, detrend, standardize)
        noise = np.loadtxt(f"{NOISE_DIR}/{data_id}{DATASET_FILE_SUFFIX}")
        cleaned_data = clean(data, confounds=noise, standardize=True, detrend=True)
        dataset.append(cleaned_data)

        # get label
        label_key = file_name_parts[1]
        labels.append(LABELS_DICT[label_key])

    # make all elements of the dataset the same length (some time series are longer than others)
    shortest_data_len = min(len(item) for item in dataset)
    dataset = [item[:shortest_data_len] for item in dataset]

    return np.array(dataset), np.array(labels)

In [None]:
# Build the cleaned time-series array and the matching integer task labels.
dataset, labels = create_dataset()

In [None]:
# Display the array layout; axes are expected to be element x time x features
dataset.shape

In [None]:
def split(dataset, labels, test_size=0.2, random_state=42, stratify=False):
    """Split the series and their labels into train and test subsets.

    Args:
        dataset: array of samples, indexed along its first axis.
        labels: one label per sample, aligned with ``dataset``.
        test_size: forwarded to ``train_test_split``; defaults to 0.2.
        random_state: seed forwarded to ``train_test_split`` so the split is
            reproducible; defaults to 42.
        stratify: when True, ``labels`` is passed as the ``stratify`` argument
            so class proportions are kept in both subsets. Defaults to False,
            which keeps the previous (non-stratified) split.

    Returns:
        list: ``[x_train, x_test, y_train, y_test]`` as returned by
        ``train_test_split``.
    """
    # NOTE(review): samples are split individually; if several files belong to
    # the same subject they may land on both sides of the split - confirm
    # whether a group-aware split is wanted.
    return train_test_split(
        dataset,
        labels,
        test_size=test_size,
        random_state=random_state,
        stratify=labels if stratify else None,
    )

In [None]:
# Partition the samples and labels into train and test subsets via split().
x_train, x_test, y_train, y_test = split(dataset, labels)

In [None]:
def create_dataframe(dataset):
    """Flatten a 3-D array of time series into one long DataFrame.

    Args:
        dataset: array-like of shape ``(n_series, n_timepoints, n_regions)``.

    Returns:
        pd.DataFrame: ``n_series * n_timepoints`` rows. The first column,
        ``"ids"``, holds the index (along axis 0) of the series each row came
        from; it is followed by ``region0`` ... ``region{n_regions - 1}``.
        Rows keep the original time order within each series.

    Raises:
        ValueError: if ``dataset`` does not have exactly three dimensions.
    """
    dataset = np.asarray(dataset)
    if dataset.ndim != 3:
        raise ValueError(
            f"expected a 3-D array (series, time, regions), got shape {dataset.shape}"
        )
    n_series, n_timepoints, n_regions = dataset.shape

    df = pd.DataFrame(
        dataset.reshape(-1, n_regions),
        columns=[f"region{i}" for i in range(n_regions)],
    )

    # reshape() lays the series out back to back, so each series id is simply
    # repeated once per time point; insert at position 0 to make it the first
    # column.
    df.insert(0, "ids", np.repeat(np.arange(n_series), n_timepoints))

    return df

In [None]:
# Flatten each split into a long frame: one row per time point, tagged with
# the id of the series it belongs to.
df_train = create_dataframe(x_train)
df_test = create_dataframe(x_test)

In [None]:
# Preview the first rows of the training frame.
df_train.head()

In [None]:
def calculate_features(df, column_value="region0"):
    """Extract tsfresh features from one value column of a long-format frame.

    Args:
        df: DataFrame with an ``"ids"`` column identifying the series each row
            belongs to, plus one column per region.
        column_value: name of the column whose values are turned into
            features. Defaults to ``"region0"``; only a single region is used
            for now.

    Returns:
        The result of ``tsfresh.extract_features`` computed with
        ``EfficientFCParameters`` (presumably one row per id - verify against
        the tsfresh documentation).
    """
    return extract_features(
        # Hand over only the columns that are actually read, so the unused
        # region columns are not carried into the extraction.
        df[["ids", column_value]],
        column_id="ids",
        column_value=column_value,
        default_fc_parameters=EfficientFCParameters(),
    )

In [None]:
# Compute the tsfresh feature matrices for the train and test frames.
train_features = calculate_features(df_train)
test_features = calculate_features(df_test)

In [None]:
# needed to make lightgbm work: replace the tsfresh feature names with plain
# integer positions (presumably because the generated names contain characters
# LightGBM rejects - TODO confirm).
# NOTE(review): this assumes both frames list the same features in the same
# order, since train and test columns are matched by position afterwards.
train_features.columns = [i for i in range(train_features.shape[1])]
test_features.columns = [i for i in range(test_features.shape[1])]

In [None]:
# Fit a LightGBM classifier with its default hyperparameters on the training
# features.
classifier = lgb.LGBMClassifier()
classifier.fit(train_features, y_train)

In [None]:
# Predict a class label for every row of the test feature matrix.
y_pred = classifier.predict(test_features)

In [None]:
# Accuracy on the test split; normalize=True asks for a fraction rather than
# a count of correct predictions.
accuracy = accuracy_score(y_test, y_pred, normalize=True)

In [None]:
# Report the accuracy formatted with two decimals.
print("LightGBM Model accuracy score: {0:0.2f}".format(accuracy))

In [None]:
def create_confusion_matrix(y_test, y_pred):
    """Build a row-normalized confusion matrix as a labelled DataFrame.

    Args:
        y_test: true integer labels (values of ``LABELS_DICT``).
        y_pred: predicted integer labels, aligned with ``y_test``.

    Returns:
        pd.DataFrame: square matrix with one row and one column per entry of
        ``LABELS_DICT``, ordered by label value. Rows (index named
        ``"types"``) are the true classes, columns the predicted classes, and
        both are named after the part of the task key following ``"-"``. Each
        row sums to 1, except rows of classes that never occur in ``y_test``,
        which are all zeros.
    """
    # Order the classes by their integer label so names and matrix rows line
    # up regardless of the dict's insertion order.
    classes = sorted(LABELS_DICT.items(), key=lambda item: item[1])
    label_names = [key.split("-")[1] for key, _ in classes]
    label_values = [value for _, value in classes]

    # Passing labels= pins the matrix to one row/column per known class even
    # when a class is missing from both y_test and y_pred; otherwise the
    # matrix shrinks and no longer matches label_names.
    cm = confusion_matrix(y_test, y_pred, labels=label_values).astype(np.float64)

    # Normalize each row by its number of true samples; rows with no samples
    # stay at zero instead of turning into NaN through a division by zero.
    row_totals = cm.sum(axis=1, keepdims=True)
    cm = np.divide(cm, row_totals, out=np.zeros_like(cm), where=row_totals != 0)

    return pd.DataFrame(
        cm,
        index=pd.Index(label_names, name="types"),
        columns=label_names,
    )

In [None]:
# Row-normalized confusion matrix of the test predictions, as a labelled frame.
cm_matrix = create_confusion_matrix(y_test, y_pred)

In [None]:
def plot_confusion_matrix(cm_matrix):
    """Show the given confusion matrix as an annotated heatmap.

    Args:
        cm_matrix: matrix to draw, passed straight to ``seaborn.heatmap``.
            The colour scale is fixed to the range 0..1 and every cell is
            annotated with three decimals.
    """
    heatmap_options = {
        "annot": True,
        "fmt": ".3f",
        "square": True,
        "cbar": False,
        "cmap": "Blues",
        "linewidths": 3,
        "vmin": 0,
        "vmax": 1,
    }

    _, ax = plt.subplots(figsize=(7, 5))
    sns.heatmap(cm_matrix, ax=ax, **heatmap_options)

    ax.set_xlabel("Predicted label", labelpad=16)
    ax.set_ylabel("True label", labelpad=12)
    # keep the class names on the y axis horizontal
    ax.tick_params(axis="y", rotation=0)
    plt.show()

In [None]:
# Render the confusion-matrix heatmap.
plot_confusion_matrix(cm_matrix)