In [None]:
import pandas as pd
from ast import literal_eval
from sktime.datatypes import check_raise

In [None]:
def load_as_pd_multiindex(file_path):
    """
    Load a CSV of list-valued cells as an sktime "pd-multiindex" panel.

    https://www.sktime.net/en/stable/examples/AA_datatypes_and_datasets.html#Section-1.2.1:-Time-series-panels---the-%22pd-multiindex%22-mtype

    Every CSV row is treated as one instance and every CSV column as one
    variable. Each cell must hold the string form of a Python list (one value
    per time point); it is parsed back with ``ast.literal_eval``.

    Parameters
    ----------
    file_path : str or path-like
        E.g., "../data/01_single.csv"

    Returns
    -------
    pandas.DataFrame
        One column per CSV column, indexed by the two-level MultiIndex
        ("instances", "time points"). "instances" is the 0-based CSV row
        number and "time points" the 0-based position inside the parsed list.
        A CSV with a header but no data rows gives an empty frame with the
        same index levels.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when a cell is not a valid
        Python literal.
    """
    index_names = ["instances", "time points"]
    raw = pd.read_csv(file_path, low_memory=False)
    names = list(raw.columns)

    # read_csv hands list-valued cells back as plain strings, hence literal_eval below.
    # https://stackoverflow.com/questions/79413934/write-read-columns-of-list-of-numbers-integer-or-float-to-from-csv-in-python
    # https://stackoverflow.com/questions/23111990/pandas-dataframe-stored-list-as-string-how-to-convert-back-to-list/63020659#63020659
    frames = []
    for i in range(len(raw)):
        record = raw.iloc[i]
        # One single-column frame per variable; concat(axis=1) lines them up on
        # the positional index, which becomes the "time points" level.
        variables = [
            pd.DataFrame(literal_eval(record.iloc[j]), columns=[names[j]])
            for j in range(len(names))
        ]
        instance = pd.concat(variables, axis=1)
        instance.index.name = "time points"
        instance = instance.reset_index()
        instance.insert(0, "instances", i)
        frames.append(instance)

    if not frames:
        # pd.concat refuses an empty list; return an empty panel of the same shape instead.
        return pd.DataFrame(columns=index_names + names).set_index(index_names)

    # Concatenate once at the end rather than growing a frame inside the loop.
    return pd.concat(frames, axis=0).set_index(index_names)

In [None]:
# Run the loader on the 01_single.csv data file; the return value is not assigned to a name here.
load_as_pd_multiindex("../data/01_single.csv")

In [None]:
# Build `t_all`, the "pd-multiindex" panel: one group of rows per CSV row
# (instance), one row per time point, one column per CSV column.
#
# The raw frame is loaded in this cell so that it runs on a fresh kernel; no
# other cell defines a global `df`. Cells hold lists stored as strings, hence
# literal_eval.
df = pd.read_csv("../data/01_single.csv", low_memory=False).map(literal_eval)

# Column names as annotated in the original version of this cell:
# five_p_cleav_1, five_p_cleav_compl_1, five_p_non_cleav_1, five_p_non_cleav_compl_1,
# three_p_cleav_1, three_p_cleav_compl_1, three_p_non_cleav_1, three_p_non_cleav_compl_1
# https://stackoverflow.com/questions/19482970/get-a-list-from-pandas-dataframe-column-headers
column_names = df.columns.values

per_instance = []
for i in range(len(df)):
    # [pandas data merging, part 1]: how to use pd.concat()
    # https://blog.csdn.net/xue_11/article/details/118424380
    # One single-column frame per variable, joined side by side on the positional index.
    t = pd.concat(
        [
            pd.DataFrame(df.iloc[i].iloc[j], columns=[column_names[j]])
            for j in range(len(column_names))
        ],
        axis=1,
    )
    # https://stackoverflow.com/questions/25457920/convert-row-names-into-a-column-in-pandas
    t.index.name = 'time points'
    t = t.reset_index()
    # https://stackoverflow.com/questions/29517072/add-column-to-dataframe-with-constant-value
    t.insert(0, 'instances', i)
    per_instance.append(t)

# Concatenate once at the end instead of re-copying the growing frame on every iteration.
t_all = pd.concat(per_instance, axis=0).set_index(["instances", "time points"])

In [None]:

# Check t_all against sktime's "pd-multiindex" mtype (the recorded run returned True).
check_raise(t_all, mtype="pd-multiindex")

True

In [8]:
# Show the five_p_cleav_1 values of instance 0, indexed by time point.
t_all.loc[0, "five_p_cleav_1"]

time points
0    -2
1     2
2    -2
3     2
4     1
5    -2
6    -2
7    -2
8    -2
9     2
10    1
11    1
12    1
13   -2
Name: five_p_cleav_1, dtype: int64

# Prepare binary classification data (3p, 5p)

In [None]:
# t1 = pd.DataFrame(df.iloc[i].iloc[0], columns=[df.columns.values[0]]) # five_p_cleav_1

In [9]:
# Take the two five_p_cleav columns and give them generic names, so that a frame
# built from other columns can later be stacked underneath with matching headers.
# https://stackoverflow.com/questions/57417520/selecting-and-renaming-columns-at-the-same-time
# https://www.geeksforgeeks.org/how-to-rename-columns-in-pandas-dataframe/
# https://stackoverflow.com/questions/45590866/python-pandas-concat-dataframes-with-different-columns-ignoring-column-names
X = (
    t_all[["five_p_cleav_1", "five_p_cleav_compl_1"]]
    .rename(columns={"five_p_cleav_1": "strand", "five_p_cleav_compl_1": "strand_compl"})
)
X

Unnamed: 0_level_0,Unnamed: 1_level_0,strand,strand_compl
instances,time points,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,-2,2
0,1,2,-2
0,2,-2,2
0,3,2,-2
0,4,1,-1
...,...,...,...
826,9,2,0
826,10,-1,0
826,11,-2,0
826,12,1,-1


In [10]:
# Same two-column layout, this time taken from the five_p_non_cleav columns.
temp = t_all[["five_p_non_cleav_1", "five_p_non_cleav_compl_1"]]
temp = temp.rename(columns={'five_p_non_cleav_1': 'strand', 'five_p_non_cleav_compl_1': 'strand_compl'})
# Shift the instance ids past the largest id in t_all so they cannot collide with
# the ids of the rows already taken from t_all. The offset is derived from the
# data rather than hard-coded (it was the literal 827).
# https://stackoverflow.com/questions/79445936/shift-change-the-index-of-a-dataframe
instance_offset = int(t_all.index.get_level_values("instances").max()) + 1
temp.index = temp.index.map(lambda idx: (idx[0] + instance_offset, idx[1]))
temp


Unnamed: 0_level_0,Unnamed: 1_level_0,strand,strand_compl
instances,time points,Unnamed: 2_level_1,Unnamed: 3_level_1
827,0,2,-2
827,1,1,-1
827,2,1,-2
827,3,-2,2
827,4,-2,2
...,...,...,...
1653,9,-1,1
1653,10,-1,0
1653,11,1,0
1653,12,-1,1


In [11]:
# Stack the two frames into one panel.
# X appears on both sides of the assignment, so running this cell a second time
# would append `temp` again; the uniqueness check turns that into an immediate,
# visible error and leaves X untouched.
stacked = pd.concat([X, temp], axis=0)
assert stacked.index.is_unique, (
    "duplicate (instances, time points) rows after concat - was this cell run twice?"
)
X = stacked
X

Unnamed: 0_level_0,Unnamed: 1_level_0,strand,strand_compl
instances,time points,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,-2,2
0,1,2,-2
0,2,-2,2
0,3,2,-2
0,4,1,-1
...,...,...,...
1653,9,-1,1
1653,10,-1,0
1653,11,1,0
1653,12,-1,1


In [13]:
# https://stackoverflow.com/questions/31270971/how-to-create-a-numpy-array-of-n-numbers-of-the-same-value
import numpy as np

# One label per instance, in instance-id order: first the instances taken from
# t_all as-is, then the id-shifted copies. Both groups have as many instances as
# t_all, so the count is read from t_all instead of being hard-coded (it was 827).
# NOTE(review): the second label reads 'five_p_cleav_compl', but the rows it tags
# were built from the five_p_non_cleav columns - confirm the intended class name.
n_per_class = t_all.index.get_level_values("instances").nunique()
y = np.concatenate(
    (np.full((1, n_per_class), 'five_p_cleav'), np.full((1, n_per_class), 'five_p_cleav_compl')),
    axis=None,
)
assert len(y) == X.index.get_level_values("instances").nunique(), (
    "y must hold exactly one label per instance in X"
)
y

array(['five_p_cleav', 'five_p_cleav', 'five_p_cleav', ...,
       'five_p_cleav_compl', 'five_p_cleav_compl', 'five_p_cleav_compl'],
      dtype='<U18')

In [14]:
from sklearn.model_selection import train_test_split, cross_val_score

# Seed used for the split; also applied to NumPy's global RNG as before.
SPLIT_SEED = 18
np.random.seed(SPLIT_SEED)

# Split into train & test set.
# The split is done on the unique instance ids (index level 0), not on rows, so
# all time points of an instance land on the same side; y has one label per id.
# random_state is passed explicitly so the split does not depend on whatever
# else has touched the global RNG.
# https://gist.github.com/shaypal5/3e34e85bd89d65d4ac118daa9a42b174
X_train_ix, X_test_ix, y_train, y_test = train_test_split(
    X.index.get_level_values(0).unique(),
    y,
    test_size=0.2,
    random_state=SPLIT_SEED,
)

In [15]:
# Select the rows of X by instance id: .loc with the id sets returned by
# train_test_split picks every time point of each listed instance.
X_train = X.loc[X_train_ix]
X_test = X.loc[X_test_ix]

In [16]:
# Unique instance ids present in X_train.
X_train.index.get_level_values(0).unique()

Index([ 427,  501, 1541, 1070, 1561, 1576,  308, 1611,  986,   25,
       ...
        913,  738,  264,  578,  242, 1198,  837, 1144,  275, 1322],
      dtype='int64', name='instances', length=1323)

In [17]:
# First ten training instance ids as returned by train_test_split.
X_train_ix[:10]

Index([427, 501, 1541, 1070, 1561, 1576, 308, 1611, 986, 25], dtype='int64', name='instances')

In [18]:
# First ten training labels.
y_train[:10]

array(['five_p_cleav', 'five_p_cleav', 'five_p_cleav_compl',
       'five_p_cleav_compl', 'five_p_cleav_compl', 'five_p_cleav_compl',
       'five_p_cleav', 'five_p_cleav_compl', 'five_p_cleav_compl',
       'five_p_cleav'], dtype='<U18')

In [19]:
# Number of labels on the train and test side of the split.
len(y_train), len(y_test)

(1323, 331)

# Prepare multiclass classification data

# Rocket

## Multivariate Time Series

In [20]:
# Preview the training panel.
X_train

Unnamed: 0_level_0,Unnamed: 1_level_0,strand,strand_compl
instances,time points,Unnamed: 2_level_1,Unnamed: 3_level_1
427,0,-2,2
427,1,-2,1
427,2,1,-2
427,3,-2,2
427,4,1,-1
...,...,...,...
1322,9,1,-1
1322,10,2,-2
1322,11,-2,2
1322,12,-2,2


In [21]:
# Count the distinct ids on the 'instances' level of the training panel.
train_instance_ids = X_train.index.get_level_values('instances')
num_instances = train_instance_ids.nunique()
print(num_instances)

1323


In [22]:
from sktime.transformations.panel.rocket import Rocket

# ROCKET draws its convolution kernels at random; fixing random_state makes the
# transformed features, and every number computed from them, repeat across runs.
rocket = Rocket(random_state=18)
rocket.fit(X_train)
X_train_transform = rocket.transform(X_train)

In [23]:
import numpy as np
from sklearn.linear_model import RidgeClassifierCV

# Candidate regularisation strengths: 10 values log-spaced between 1e-3 and 1e3.
ridge_alphas = np.logspace(-3, 3, 10)
classifier = RidgeClassifierCV(alphas=ridge_alphas)
# Fit on the transformed training features; fit() is the last expression, so its result is displayed.
classifier.fit(X_train_transform, y_train)

In [24]:
# Confirm the container type of the training labels.
type(y_train)

numpy.ndarray

In [25]:
# X_test, y_test = load_basic_motions(split="test", return_type="pd-multiindex", return_X_y=True)
# Transform the test panel with the `rocket` object fitted earlier; only transform() is called here, not fit().
X_test_transform = rocket.transform(X_test)

In [26]:
# Score the fitted classifier on the transformed test panel
# (for scikit-learn classifiers, score() reports mean accuracy).
classifier.score(X_test_transform, y_test)

0.7613293051359517

In [31]:
# Predicted labels for the transformed test panel.
y_pred = classifier.predict(X_test_transform)

In [32]:
# Alias the test labels under the name used by the metric calls.
y_true = y_test

In [34]:
# Check that predictions and ground-truth labels have the same length.
len(y_pred), len(y_true)

(331, 331)

In [35]:
# https://stackoverflow.com/questions/33275461/specificity-in-scikit-learn
from sklearn.metrics import confusion_matrix


In [41]:

from sklearn.metrics import matthews_corrcoef

# Pin the label order explicitly: 'five_p_cleav' is the negative class and
# 'five_p_cleav_compl' the positive one. This is the order confusion_matrix
# would infer by sorting the labels; stating it also guarantees a 2x2 matrix,
# so the four-way unpacking works even if one class is missing from the data.
tn, fp, fn, tp = confusion_matrix(
    y_true, y_pred, labels=['five_p_cleav', 'five_p_cleav_compl']
).ravel()
accuracy = (tp + tn) / (tp + tn + fp + fn)
specificity = tn / (tn+fp)   # true-negative rate
sensitivity = tp / (tp+fn)   # true-positive rate (recall)
f1_score = 2 * tp / (2 * tp + fp + fn)
mcc = matthews_corrcoef(y_true, y_pred)

# Show the results; previously the cell computed them without displaying anything.
pd.Series(
    {
        "accuracy": accuracy,
        "specificity": specificity,
        "sensitivity": sensitivity,
        "f1_score": f1_score,
        "mcc": mcc,
    },
    name="rocket_binary",
)
