In [1527]:
import pandas as pd
from ast import literal_eval
from sktime.datatypes import check_raise
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix

In [None]:
# df_temp = pd.read_csv("../data/01_cum_multi_samelen.csv", low_memory=False)
# df_temp

In [1528]:
def load_as_pd_multiindex(file_path):
    """
    Load a file as pd_multiindex.
    https://www.sktime.net/en/stable/examples/AA_datatypes_and_datasets.html#Section-1.2.1:-Time-series-panels---the-%22pd-multiindex%22-mtype
    - file_path: E.g., "../data/01_single.csv"
    """
    df = pd.read_csv(file_path, low_memory=False)
    # df.iloc[0].five_p_cleav_1[0]
    # Some problem in reading the data.
    # https://stackoverflow.com/questions/79413934/write-read-columns-of-list-of-numbers-integer-or-float-to-from-csv-in-python
    # https://stackoverflow.com/questions/23111990/pandas-dataframe-stored-list-as-string-how-to-convert-back-to-list/63020659#63020659
    df = df.map(literal_eval)
    # print(df.head())
    #
    # print(df.iloc[0].size, df.shape[0], len(df), df.iloc[0].iloc[0])
    # 8 827 827 [-2, 2, -2, 2, 1, -2, -2, -2, -2, 2, 1, 1, 1, -2]
    # 8: no of columns (features) an instance has
    # 827: no of instances
    # [-2, 2, -2, 2, 1, -2, -2, -2, -2, 2, 1, 1, 1, -2]: first instance's first feature (i.e., five_p_cleav_1)
    #
    for i in range(len(df)):
        for j in range(df.iloc[0].size):
            # https://stackoverflow.com/questions/19482970/get-a-list-from-pandas-dataframe-column-headers
            col = pd.DataFrame(df.iloc[i].iloc[j], columns=[df.columns.values[j]]) # E.g., five_p_cleav_1
            if j == 0:
                col_all = col
            else:
                # 【pandas数据合并一】：pd.concat()用法
                # https://blog.csdn.net/xue_11/article/details/118424380  
                col_all = pd.concat([col_all, col], axis=1)
        # https://stackoverflow.com/questions/25457920/convert-row-names-into-a-column-in-pandas
        col_all.index.name = 'time points'
        col_all.reset_index(inplace=True)
        # https://stackoverflow.com/questions/29517072/add-column-to-dataframe-with-constant-value
        col_all.insert(0, 'instances', i)
        if i == 0:
            rows = col_all
        else:
            rows = pd.concat([rows, col_all], axis=0)
    rows = rows.set_index(["instances", "time points"])
    return rows

# Prepare binary classification data (3p, 5p)

In [1529]:
ts_panel = load_as_pd_multiindex("../data/01_cum_multi_samelen.csv")

In [1530]:
check_raise(ts_panel, mtype="pd-multiindex")

True

The 1st instance's 1st feature (i.e., five_p_cleav_1) in time series representation is shown below.

In [1531]:
ts_panel.loc[0, "five_p_cleav_1"]

time points
0     0
1     0
2     1
3     1
4     2
5     1
6     1
7     1
8     1
9     1
10    2
11    1
12    0
13   -1
14   -1
Name: five_p_cleav_1, dtype: int64

In [1532]:
# # https://stackoverflow.com/questions/57417520/selecting-and-renaming-columns-at-the-same-time
# # https://www.geeksforgeeks.org/how-to-rename-columns-in-pandas-dataframe/
# # https://stackoverflow.com/questions/45590866/python-pandas-concat-dataframes-with-different-columns-ignoring-column-names
# pos_instances = ts_panel[["five_p_cleav_1"]]
# pos_instances= pos_instances.rename(columns={'five_p_cleav_1': 'ts_1'})
# # pos_instances
# neg_instances = ts_panel[["five_p_non_cleav_1"]]
# neg_instances = neg_instances.rename(columns={'five_p_non_cleav_1': 'ts_1'})
# # https://stackoverflow.com/questions/79445936/shift-change-the-index-of-a-dataframe
# neg_instances.index = neg_instances.index.map(lambda idx: (idx[0] + 827, idx[1]))
# # neg_instances

In [1533]:
# # https://stackoverflow.com/questions/57417520/selecting-and-renaming-columns-at-the-same-time
# # https://www.geeksforgeeks.org/how-to-rename-columns-in-pandas-dataframe/
# # https://stackoverflow.com/questions/45590866/python-pandas-concat-dataframes-with-different-columns-ignoring-column-names
# pos_instances = ts_panel[["five_p_cleav_1", "five_p_cleav_compl_1"]]
# pos_instances= pos_instances.rename(columns={'five_p_cleav_1': 'ts_1', 'five_p_cleav_compl_1': 'ts_2'})
# # pos_instances
# neg_instances = ts_panel[["five_p_non_cleav_1", "five_p_non_cleav_compl_1"]]
# neg_instances = neg_instances.rename(columns={'five_p_non_cleav_1': 'ts_1', 'five_p_non_cleav_compl_1': 'ts_2'})
# # https://stackoverflow.com/questions/79445936/shift-change-the-index-of-a-dataframe
# neg_instances.index = neg_instances.index.map(lambda idx: (idx[0] + 827, idx[1]))
# # neg_instances

In [1534]:
# https://stackoverflow.com/questions/57417520/selecting-and-renaming-columns-at-the-same-time
# https://www.geeksforgeeks.org/how-to-rename-columns-in-pandas-dataframe/
# https://stackoverflow.com/questions/45590866/python-pandas-concat-dataframes-with-different-columns-ignoring-column-names
pos_instances = ts_panel[["five_p_cleav_1", "five_p_cleav_2"]]
pos_instances= pos_instances.rename(columns={'five_p_cleav_1': 'ts_1', 'five_p_cleav_2': 'ts_2'})
# pos_instances
neg_instances = ts_panel[["five_p_non_cleav_1", "five_p_non_cleav_2"]]
neg_instances = neg_instances.rename(columns={'five_p_non_cleav_1': 'ts_1', 'five_p_non_cleav_2': 'ts_2'})
# https://stackoverflow.com/questions/79445936/shift-change-the-index-of-a-dataframe
neg_instances.index = neg_instances.index.map(lambda idx: (idx[0] + 827, idx[1]))
# neg_instances

In [1535]:
# # https://stackoverflow.com/questions/57417520/selecting-and-renaming-columns-at-the-same-time
# # https://www.geeksforgeeks.org/how-to-rename-columns-in-pandas-dataframe/
# # https://stackoverflow.com/questions/45590866/python-pandas-concat-dataframes-with-different-columns-ignoring-column-names
# pos_instances = ts_panel[["five_p_cleav_1", "five_p_cleav_compl_1", "three_p_cleav_1", "three_p_cleav_compl_1"]]
# pos_instances= pos_instances.rename(columns={'five_p_cleav_1': 'ts_1', 'five_p_cleav_compl_1': 'ts_2', 'three_p_cleav_1': 'ts_3', 'three_p_cleav_compl_1': 'ts_4'})
# # pos_instances
# neg_instances = ts_panel[["five_p_non_cleav_1", "five_p_non_cleav_compl_1", "three_p_non_cleav_1", "three_p_non_cleav_compl_1"]]
# neg_instances = neg_instances.rename(columns={'five_p_non_cleav_1': 'ts_1', 'five_p_non_cleav_compl_1': 'ts_2', 'three_p_non_cleav_1': 'ts_3', 'three_p_non_cleav_compl_1': 'ts_4'})
# # https://stackoverflow.com/questions/79445936/shift-change-the-index-of-a-dataframe
# neg_instances.index = neg_instances.index.map(lambda idx: (idx[0] + 827, idx[1]))
# # neg_instances

In [1536]:
X = pd.concat([pos_instances, neg_instances], axis=0)
X

Unnamed: 0_level_0,Unnamed: 1_level_0,ts_1,ts_2
instances,time points,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,0,0
0,1,0,-1
0,2,1,-1
0,3,1,-2
0,4,2,-2
...,...,...,...
1653,10,2,4
1653,11,2,5
1653,12,1,5
1653,13,1,6


In [1537]:
# https://stackoverflow.com/questions/31270971/how-to-create-a-numpy-array-of-n-numbers-of-the-same-value
y = np.concatenate((np.full((1, 827), 'cleav'), np.full((1, 827), 'non_cleav')), axis=None)

In [1538]:
# Split data into train and test sets
np.random.seed(18)

# Split into train & test set
# https://gist.github.com/shaypal5/3e34e85bd89d65d4ac118daa9a42b174
X_train_ix, X_test_ix, y_train, y_test = train_test_split(X.index.get_level_values(0).unique(), y, test_size=0.2)
X_train = X.loc[X_train_ix]
X_test = X.loc[X_test_ix]

In [1539]:
X_train.index.get_level_values(0).unique()

Index([ 427,  501, 1541, 1070, 1561, 1576,  308, 1611,  986,   25,
       ...
        913,  738,  264,  578,  242, 1198,  837, 1144,  275, 1322],
      dtype='int64', name='instances', length=1323)

In [1540]:
X_train_ix[:10]

Index([427, 501, 1541, 1070, 1561, 1576, 308, 1611, 986, 25], dtype='int64', name='instances')

In [1541]:
y_train[:10]

array(['cleav', 'cleav', 'non_cleav', 'non_cleav', 'non_cleav',
       'non_cleav', 'cleav', 'non_cleav', 'non_cleav', 'cleav'],
      dtype='<U9')

In [1542]:
len(y_train), len(y_test)

(1323, 331)

# Prepare multiclass classification data

# Rocket

## Multivariate Time Series

In [1543]:
X_train

Unnamed: 0_level_0,Unnamed: 1_level_0,ts_1,ts_2
instances,time points,Unnamed: 2_level_1,Unnamed: 3_level_1
427,0,0,0
427,1,0,-1
427,2,0,-2
427,3,-1,-2
427,4,-1,-3
...,...,...,...
1322,10,-2,-2
1322,11,-1,-2
1322,12,-1,-3
1322,13,-1,-4


In [1544]:
num_instances = X_train.index.get_level_values('instances').nunique()
print(num_instances)

1323


In [1545]:
from sktime.transformations.panel.rocket import Rocket
rocket = Rocket()
rocket.fit(X_train)
X_train_transform = rocket.transform(X_train)

In [1546]:
import numpy as np
from sklearn.linear_model import RidgeClassifierCV
classifier = RidgeClassifierCV(alphas=np.logspace(-3, 3, 10))
classifier.fit(X_train_transform, y_train)

In [1547]:
type(y_train)

numpy.ndarray

In [1548]:
# X_test, y_test = load_basic_motions(split="test", return_type="pd-multiindex", return_X_y=True)
X_test_transform = rocket.transform(X_test)

In [1549]:
classifier.score(X_test_transform, y_test)

0.6314199395770392

In [1550]:
y_pred = classifier.predict(X_test_transform)

In [1551]:
y_true = y_test

In [1552]:
len(y_pred), len(y_true)

(331, 331)

In [1553]:
# https://stackoverflow.com/questions/33275461/specificity-in-scikit-learn
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
accuracy = (tp + tn) / (tp + tn + fp + fn)
specificity = tn / (tn+fp)
sensitivity = tp / (tp+fn)
f1_score = 2 * tp / (2 * tp + fp + fn)
from sklearn.metrics import matthews_corrcoef
mcc = matthews_corrcoef(y_true, y_pred)


In [1554]:
accuracy, specificity, sensitivity, f1_score, mcc

(0.6314199395770392,
 0.6071428571428571,
 0.656441717791411,
 0.6369047619047619,
 0.2638255423615275)