In [18]:
import random
from collections import defaultdict
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.decomposition import NMF
from sklearn.feature_selection import SelectPercentile, VarianceThreshold, f_regression
from sklearn.manifold import TSNE
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import LeaveOneGroupOut, LeaveOneOut,KFold
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.svm import LinearSVR, SVC

In [19]:
# Select matplotlib's Qt backend for this kernel, so figures are drawn by the Qt GUI
# rather than embedded inline in the notebook output.
%matplotlib qt

In [20]:
def group_by_ratio(np_x, np_y, **args):
    """Reorder samples so that rows sharing the same target ratio are contiguous.

    Rows are emitted ratio by ratio in the fixed order 0, 0.1, 0.5, 0.9, 1;
    inside one ratio the original row order is kept. Rows whose target is not
    exactly equal to one of those five values are dropped.

    Parameters
    ----------
    np_x : numpy array indexed as ``np_x[row_mask, :]`` (one row per sample).
    np_y : numpy array of targets, aligned with the rows of ``np_x``.
    **args : accepted for call compatibility and ignored.

    Returns
    -------
    tuple
        ``(grouped_x, grouped_y)`` - the selected rows of ``np_x`` and the
        matching entries of ``np_y``, in the grouped order described above.
    """
    ratio_order = (0, 0.1, 0.5, 0.9, 1)
    # One boolean row mask per ratio, built once and shared by x and y.
    row_masks = [np_y == ratio for ratio in ratio_order]
    grouped_x = np.concatenate([np_x[mask, :] for mask in row_masks])
    grouped_y = np.concatenate([np_y[mask] for mask in row_masks])
    return grouped_x, grouped_y



In [21]:
# ---- Load every dataset and stack the samples column-wise ------------------
# Each dataset is read from data/test_<dataset>_<ms_level>/ and must provide:
#   input.csv          - no header; `resolution` rows, one column per sample
#   output.csv         - no header; flattened into one 1-D target vector
#   metadata.csv       - first line skipped; a dataset label is added as column 4
#   mass_features.csv  - no header; flattened into a 1-D axis
# NOTE(review): an earlier comment here asked for the outcome column to be
# labeled 'target', but all CSVs are read with header=None, so no column label
# is consulted by this cell.
resolution = 23000
datasets = ['G2S1(6)','G2S2(6)','G2FS1(6)','G2FS2(6)','G2S2(3)','G2FS2(3)']
ms_level = 'MS2'
data_root = Path('data')  # pathlib keeps the paths valid on non-Windows systems too

# Collect the per-dataset pieces and concatenate once after the loop. The empty
# seed arrays keep the original float dtype and enforce the `resolution` row count.
input_parts = [np.empty(shape=[resolution, 0])]
output_parts = [np.empty(shape=[0])]
metadata_parts = []

for dataset in datasets:
    dataset_dir = data_root / f'test_{dataset}_{ms_level}'
    try:
        df_input = pd.read_csv(dataset_dir / 'input.csv', header=None)
        df_output = pd.read_csv(dataset_dir / 'output.csv', header=None)
        df_currmeta = pd.read_csv(dataset_dir / 'metadata.csv', header=None, skiprows=1)
    except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError) as err:
        # Still best-effort (an unreadable dataset is skipped), but the skip is
        # now reported and unrelated errors / KeyboardInterrupt are not swallowed.
        print(f'Skipping dataset {dataset!r}: {err}')
        continue

    df_currmeta[4] = dataset
    metadata_parts.append(df_currmeta)

    # Overwritten on every iteration: the axis of the last loaded dataset is kept.
    df_mass_features_ms5 = pd.read_csv(dataset_dir / 'mass_features.csv', header=None)
    np_mass_features_ms5 = df_mass_features_ms5.to_numpy().reshape(-1)
    np_str_mass_features_ms5 = ['%.3f' % x for x in np_mass_features_ms5]

    np_input_curr = df_input.to_numpy()
    # Transpose before flattening, i.e. read output.csv column by column.
    np_output_curr = df_output.to_numpy().T.reshape(-1)
    input_parts.append(np_input_curr)
    output_parts.append(np_output_curr)

if not metadata_parts:
    raise FileNotFoundError(
        f'None of the requested datasets could be loaded from {data_root}; '
        'check the data directory and the dataset names.'
    )

np_input = np.concatenate(input_parts, axis=1)
np_output = np.concatenate(output_parts)
df_metadata = pd.concat(metadata_parts)
# Dataset label of every sample.
np_group = df_metadata[4]


In [22]:
# Display the concatenated metadata of all loaded datasets
# (column 4 is the dataset label added while loading).
df_metadata

Unnamed: 0,0,1,2,3,4
0,A,CID,30,G2S16_MS2,G2S1(6)
1,A,CID,35,G2S16_MS2,G2S1(6)
2,A,CID,40,G2S16_MS2,G2S1(6)
3,A,CID,45,G2S16_MS2,G2S1(6)
4,A,CID,50,G2S16_MS2,G2S1(6)
...,...,...,...,...,...
10,C,CID,30,G2FS23_MS2,G2FS2(3)
11,C,CID,35,G2FS23_MS2,G2FS2(3)
12,C,CID,40,G2FS23_MS2,G2FS2(3)
13,C,CID,45,G2FS23_MS2,G2FS2(3)


In [23]:
def customize_score(true_value, predict):
    """Return the negated RMSE between targets and predictions clipped to [0, 1].

    Parameters
    ----------
    true_value : array-like of ground-truth targets.
    predict : array-like of raw predictions; values are clipped to the
        interval [0, 1] before the error is computed.

    Returns
    -------
    float
        ``-sqrt(mean_squared_error(true_value, clipped_predictions))`` so that
        a larger (less negative) value means a better fit.
    """
    clipped = np.clip(predict, 0, 1)
    # Take the square root explicitly instead of passing `squared=False`:
    # that keyword was deprecated in scikit-learn 1.4 and removed in 1.6,
    # while np.sqrt(MSE) gives the same value for 1-D targets on any version.
    return -np.sqrt(mean_squared_error(true_value, clipped))


# The score is already negated, hence greater_is_better=True.
my_scorer = make_scorer(customize_score, greater_is_better=True)

In [24]:
# ---- Keep only samples that a 2-component NMF reconstructs well, then shuffle ----
# NOTE: this cell overwrites np_input / np_output / df_metadata / np_group.
# Re-running it without re-running the loading cell filters already-filtered data.
MIN_RECONSTRUCTION_SIMILARITY = 0.8  # cosine-similarity cut-off for keeping a sample

model = NMF(n_components=2, random_state=42, init ='nndsvd', max_iter=10000)
W = model.fit_transform(np_input.T)
H = model.components_
est = model.inverse_transform(W)

# Diagonal of the pairwise matrix = similarity of each sample with its own reconstruction.
cs = cosine_similarity(est, np_input.T)
weight = cs.diagonal()
idx = np.flatnonzero(weight > MIN_RECONSTRUCTION_SIMILARITY)

# Seeded generator: the sample order (and everything downstream) is now
# reproducible across kernel restarts, like the seeded estimators.
filter_rng = np.random.default_rng(42)
filter_rng.shuffle(idx)

# Apply the same selection and order to every sample-aligned container.
np_input = np_input[:,idx]
np_output = np_output[idx]
df_metadata = df_metadata.iloc[idx]
np_group = np_group.iloc[idx]

In [25]:
# Display the metadata again after the NMF-based filtering and shuffling in the previous cell.
df_metadata

Unnamed: 0,0,1,2,3,4
12,C,CID,40,G2S26_MS2,G2S2(6)
11,C,CID,35,G2FS16_MS2,G2FS1(6)
9,B,CID,50,G2S23_MS2,G2S2(3)
13,C,CID,45,G2S26_MS2,G2S2(6)
0,A,CID,30,G2S26_MS2,G2S2(6)
...,...,...,...,...,...
5,B,CID,30,G2S26_MS2,G2S2(6)
10,C,CID,30,G2S16_MS2,G2S1(6)
3,A,CID,45,G2S23_MS2,G2S2(3)
2,A,CID,40,G2FS23_MS2,G2FS2(3)


In [26]:
# Classification pipeline: log1p -> per-sample normalisation -> NMF -> standardisation -> SVC.
# The step names are part of the notebook's interface: the parameter grid uses
# "NMF__..." / "Norm__..." keys and the plotting cells read named_steps['NMF'].
pipeline_steps = [
    ('logT', FunctionTransformer(np.log1p)),
    ('Norm', Normalizer()),
    ('NMF', NMF(random_state=42, max_iter=10000, init='nndsvd')),
    ('Standardlize', StandardScaler()),
    # Final estimator; the step is named 'LR' but holds a support-vector classifier.
    ('LR', SVC(probability=False)),
]
exported_pipeline = Pipeline(steps=pipeline_steps)

In [27]:
# Hyper-parameter grid for the grid search; keys follow scikit-learn's
# "<pipeline step name>__<parameter>" convention.
param_grid = dict(
    NMF__n_components=[2, 4, 8],  # number of NMF basis spectra to try
    Norm__norm=['l1'],            # per-sample normalisation used by the 'Norm' step
)

In [33]:
# Optional sample filter. As written it is meant to keep every sample:
# only rows whose metadata column 0 equals 'Z' would be removed.
# Change the condition on `keep_rows` to filter the data.
np_group = np_group[:len(np_output)]
keep_rows = df_metadata[0] != 'Z'
idx_filtered = np.where(keep_rows)[0]

# Apply the same positional selection to every sample-aligned container.
np_input_filtered = np_input[:, idx_filtered]
np_output_filtered = np_output[idx_filtered]
np_group_filtered = np_group.iloc[idx_filtered]
df_metadata_filtered = df_metadata.iloc[idx_filtered]

In [34]:

# ---- Shuffle the samples and build the custom cross-validation splits ----
# A dedicated, seeded generator makes the shuffles (and therefore the CV
# results) reproducible across kernel restarts.
split_rng = random.Random(42)

idxs = list(range(len(np_output_filtered)))
split_rng.shuffle(idxs)
np_input_shuffle = np_input_filtered[:, idxs]
np_output_shuffle = np_output_filtered[idxs]
np_group_shuffle = np_group_filtered.iloc[idxs]
df_metadata_shuffle = df_metadata_filtered.iloc[idxs]
print(df_metadata_shuffle)

datasets_3 = ['G2S2(3)','G2FS2(3)']
datasets_6 = ['G2S1(6)','G2S2(6)','G2FS1(6)','G2FS2(6)']

# Loop-invariant lookups, computed once.
dataset_labels = df_metadata_shuffle[4].to_numpy()
in_datasets_3 = np.isin(dataset_labels, datasets_3)
in_datasets_6 = np.isin(dataset_labels, datasets_6)

# Every fold holds out one "(3)" dataset together with one "(6)" dataset and
# trains on the remaining datasets of both families.
# my_cv  : (train, test) index pairs, positions into the *_shuffle arrays
# my_cv2 : the same folds with train and test swapped
my_cv = []
my_cv2 = []
for dataset_3 in datasets_3:
    is_held_out_3 = dataset_labels == dataset_3
    for dataset_6 in datasets_6:
        is_held_out_6 = dataset_labels == dataset_6
        idx_train = (list(np.flatnonzero(in_datasets_3 & ~is_held_out_3))
                     + list(np.flatnonzero(in_datasets_6 & ~is_held_out_6)))
        idx_test = (list(np.flatnonzero(is_held_out_3))
                    + list(np.flatnonzero(is_held_out_6)))
        split_rng.shuffle(idx_train)
        split_rng.shuffle(idx_test)
        my_cv.append((idx_train, idx_test))
        my_cv2.append((idx_test, idx_train))

    0    1   2           3         4
7   B  CID  40   G2S23_MS2   G2S2(3)
11  C  CID  35  G2FS16_MS2  G2FS1(6)
4   A  CID  50   G2S23_MS2   G2S2(3)
9   B  CID  50  G2FS16_MS2  G2FS1(6)
10  C  CID  30   G2S16_MS2   G2S1(6)
.. ..  ...  ..         ...       ...
1   A  CID  35   G2S26_MS2   G2S2(6)
13  C  CID  45   G2S23_MS2   G2S2(3)
0   A  CID  30  G2FS26_MS2  G2FS2(6)
7   B  CID  40  G2FS23_MS2  G2FS2(3)
7   B  CID  40   G2S16_MS2   G2S1(6)

[79 rows x 5 columns]


In [35]:
# Sanity check of the input matrix: (resolution rows, one column per retained sample).
np_input.shape

(23000, 79)

In [36]:
# NOTE(review): `mask` is not assigned by any executable statement visible in this
# notebook, so on a fresh kernel (Restart & Run All) this cell raises NameError.
# It looks like a leftover debugging peek - define `mask` in a live cell or delete this cell.
mask

array([    0,     1,     2, ..., 22997, 22998, 22999], dtype=int64)

In [37]:
# Optional ablation (disabled): zero out +/-0.5 windows around selected peaks
# before fitting.
# peaks = [370.36, 376.37, 450.40]
# for p in peaks:
#     mask = np.where((np_mass_features_ms5 > p - 0.5) & (np_mass_features_ms5 < p + 0.5))[0]
#     np_input_shuffle[mask, :] = 0

# Created here but not passed to the search below, which uses the predefined
# folds in `my_cv`.
logo = LeaveOneGroupOut()

# Grid-search the pipeline over `param_grid`, scored by classification accuracy.
# Alternative scoring that was considered: 'neg_root_mean_squared_error'.
search = GridSearchCV(
    estimator=exported_pipeline,
    param_grid=param_grid,
    cv=my_cv,
    scoring='accuracy',
    verbose=1,
)
search.fit(np_input_shuffle.T, np_output_shuffle, groups=np_group_shuffle)

# From here on `exported_pipeline` is the best pipeline found by the search.
exported_pipeline = search.best_estimator_

Fitting 8 folds for each of 3 candidates, totalling 24 fits
Best parameter (CV score=0.992):
{'NMF__n_components': 2, 'Norm__norm': 'l1'}


In [None]:
# Report the best mean cross-validated score and the parameters that achieved it.
best_score_line = "Best parameter (CV score=%0.3f):" % search.best_score_
print(best_score_line)
print(search.best_params_)

Best parameter (CV score=0.992):
{'NMF__n_components': 2, 'Norm__norm': 'l1'}


In [41]:
# ---- t-SNE view of the NMF activations, one marker style per dataset ----
steps = search.best_estimator_.named_steps
nmf = steps['NMF']

# The NMF step was fitted on the output of the pipeline steps that precede it,
# so the raw spectra must pass through those same steps before nmf.transform.
# Feeding np_input.T straight into the NMF (as before) evaluates it on data in
# a different representation than it was trained on.
W = np_input.T
for step_name, transformer in search.best_estimator_.steps:
    W = transformer.transform(W)
    if step_name == 'NMF':
        break

# NOTE(review): recent scikit-learn releases rename TSNE's `n_iter` to `max_iter`;
# confirm against the installed version when upgrading.
tsne = TSNE(n_components=2, perplexity=30, random_state=42, n_iter=5000)
W_2d = tsne.fit_transform(W)

size = 50
alpha = 0.7
dataset_labels = df_metadata[4].to_numpy()  # rows of df_metadata align with rows of W_2d

fig, ax = plt.subplots(figsize=(8,4))
for dataset, m in zip(datasets, ["1","2","3","4","v","^"]):
    curr_idx = dataset_labels == dataset
    ax.scatter(W_2d[curr_idx,0], W_2d[curr_idx,1], marker=m, alpha=alpha, s=size, label = dataset)
ax.set_xlabel('t-SNE 1')
ax.set_ylabel('t-SNE 2')
ax.legend()
# ax.set_title("T-SNE plot for NIST Standards in MS2")
plt.show()



In [42]:
# ---- Re-run the CV folds and collect predictions per acquisition setting ----
# pred_dict / gt_dict map "<metadata column 0><metadata column 2>" (e.g. "A30")
# to the list of predictions / ground-truth labels gathered over all folds.
pred_dict = defaultdict(list)
gt_dict = defaultdict(list)
for trainidx, testidx in my_cv:
    train_x = np_input_shuffle[:, trainidx]
    train_y = np_output_shuffle[trainidx]
    test_x = np_input_shuffle[:, testidx]
    test_y = np_output_shuffle[testidx]
    df_test = df_metadata_shuffle.iloc[testidx]

    # Fit a fresh copy for each fold. `exported_pipeline` is the very same object
    # as `search.best_estimator_`, so fitting it in place would overwrite the
    # model refit by the grid search, and later cells reading
    # search.best_estimator_ would silently see the last fold's fit instead.
    fold_model = clone(exported_pipeline)
    fold_model.fit(train_x.T, train_y)
    pred = fold_model.predict(test_x.T)

    fold_rows = zip(df_test[0].to_numpy(), df_test[2].to_numpy(), pred, test_y)
    for window, energy, predicted, actual in fold_rows:
        key = window + str(energy)
        pred_dict[key].append(predicted)
        gt_dict[key].append(actual)

In [43]:
# Accuracy and F1 per acquisition configuration (isolation window x energy
# level), followed by the overall figures pooled over all configurations.
isolation_window = ['A', 'B', 'C']
energy_level = [30 , 35, 40, 45, 50]
all_pred = []
all_gt = []
for iw in isolation_window:
    for el in energy_level:
        key = iw + str(el)
        # scikit-learn metrics take (y_true, y_pred), in that order. The previous
        # swapped order happened to give the same accuracy / binary F1, but it
        # would mislead any asymmetric metric added here later.
        accuracy = accuracy_score(gt_dict[key], pred_dict[key])
        f1 = f1_score(gt_dict[key], pred_dict[key])
        print(f'{iw}{el} | accuracy: {"%.3f" % accuracy} | f1: {"%.3f" % f1}')
        all_pred.extend(pred_dict[key])
        all_gt.extend(gt_dict[key])

accuracy = accuracy_score(all_gt, all_pred)
f1 = f1_score(all_gt, all_pred)
print(f'all | accuracy: {"%.3f" % accuracy} | f1: {"%.3f" % f1}')

A30 | accuracy: 1.000 | f1: 1.000
A35 | accuracy: 1.000 | f1: 1.000
A40 | accuracy: 1.000 | f1: 1.000
A45 | accuracy: 1.000 | f1: 1.000
A50 | accuracy: 0.929 | f1: 0.933
B30 | accuracy: 1.000 | f1: 1.000
B35 | accuracy: 1.000 | f1: 1.000
B40 | accuracy: 1.000 | f1: 1.000
B45 | accuracy: 1.000 | f1: 1.000
B50 | accuracy: 0.938 | f1: 0.933
C30 | accuracy: 1.000 | f1: 1.000
C35 | accuracy: 1.000 | f1: 1.000
C40 | accuracy: 1.000 | f1: 1.000
C45 | accuracy: 1.000 | f1: 1.000
C50 | accuracy: 1.000 | f1: 1.000
all | accuracy: 0.991 | f1: 0.992


In [44]:
# Plot every NMF basis spectrum (row of H) of the selected pipeline against the
# mass-feature axis, one stacked subplot per component.
steps = search.best_estimator_.named_steps
nmf = steps['NMF']
H = nmf.components_

n_basis = H.shape[0]
plt.figure(figsize=(16,9))
for panel, basis in enumerate(H, start=1):
    ax = plt.subplot(n_basis, 1, panel)
    ax.set_title(f'H{panel}', fontsize = 15)
    ax.plot(np_mass_features_ms5, basis)
plt.suptitle('Basis Spectrum', fontsize = 20)
plt.tight_layout()
plt.show()