In [68]:
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
from hyperparams import get_train_test_split, create_param_grid, clfs
import seaborn as sns
import shap
import importlib

from sklearn.model_selection import ParameterSampler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_selection import SelectFromModel, SelectKBest, f_regression, r_regression, mutual_info_regression
from sklearn.decomposition import PCA, FactorAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, StackingClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, PolynomialFeatures, SplineTransformer, KBinsDiscretizer, \
     StandardScaler, OrdinalEncoder, LabelEncoder
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, r2_score, get_scorer_names, accuracy_score, f1_score, precision_score, \
     confusion_matrix, balanced_accuracy_score, ConfusionMatrixDisplay, classification_report
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, KFold, train_test_split, cross_val_score, GridSearchCV, cross_validate
from sklearn.svm import SVC
from xgboost.sklearn import XGBClassifier
from lightgbm.sklearn import LGBMClassifier
import warnings 
warnings.filterwarnings("ignore")
# Number of random trials
NUM_TRIALS = 1

# Load the dataset
df = pd.read_csv('rapamycinTrain.tsv', sep='\t', low_memory=True)
df_ = pd.read_csv('allTrain.tsv', sep='\t', low_memory=True)
label = "Rapamycin_response"
# X_train, X_test, y_train, y_test, features, labels = get_train_test_split(df)

In [69]:
df_.type

0       Breast
1       Breast
2       Breast
3       Breast
4       Breast
5     BrainCNS
6     BrainCNS
7     BrainCNS
8     BrainCNS
9     BrainCNS
10       Bowel
11       Bowel
12       Bowel
13       Bowel
14       Bowel
15        Lung
16        Lung
17        Lung
18        Lung
19        Lung
20        Lung
21        Lung
22       Blood
23       Blood
24       Blood
25       Blood
26        Skin
27        Skin
28        Skin
29        Skin
30        Skin
31        Skin
32       Ovary
33       Ovary
34       Ovary
35       Ovary
36       Ovary
37       Ovary
38    Prostate
39    Prostate
40      Kidney
41      Kidney
42      Kidney
43      Kidney
44      Kidney
45      Kidney
46      Kidney
47      Kidney
Name: type, dtype: object

In [70]:
df.type

Breast_SQ68           Breast
Breast_EI813          Breast
Breast_PV9            Breast
Breast_KY76           Breast
Breast_EPQV2          Breast
BrainCNS_BXX24      BrainCNS
BrainCNS_JYV1       BrainCNS
BrainCNS_NZ6        BrainCNS
BrainCNS_MKT49      BrainCNS
BrainCNS_QMQT2      BrainCNS
Bowel_PCEM5            Bowel
Bowel_WPJG932          Bowel
Bowel_OY8              Bowel
Bowel_ZMEK961          Bowel
Bowel_KRW49            Bowel
Lung_ND782              Lung
Lung_KWV9               Lung
Lung_WSJM17             Lung
Lung_UJGS7              Lung
Lung_XKM274             Lung
Lung_ZK56               Lung
Lung_MN34               Lung
Blood_ZS5              Blood
Blood_VQU9             Blood
Blood_TKZ1             Blood
Blood_FV5              Blood
Skin_GMZV17             Skin
Skin_ONW428             Skin
Skin_IV21               Skin
Skin_EJT91              Skin
Skin_BZDY16             Skin
Skin_FIXK19             Skin
Ovary_PM7              Ovary
Ovary_KK2              Ovary
Ovary_HZ923   

In [2]:
ful_res = pd.read_csv('outputs/cv_results_Fulvestrant_response.csv')

In [196]:
temp2 = ful_res.sort_values('accuracy', ascending=False).reset_index()

In [73]:
ful_res.isna().sum()

seed            0
feat_sel        0
models          0
model_params    0
accuracy        0
f1              0
dtype: int64

In [227]:
def group_results(cv_results):
    cv_results_all = cv_results.groupby(['seed','feat_sel','models'])[['accuracy', 'f1']].mean()
    cv_results_all[['accuracy_std', 'f1_std']] = cv_results.groupby(['seed','feat_sel','models'])[['accuracy', 'f1']].std()
    cv_results_all = cv_results_all.fillna(0)
    cv_results_all = cv_results_all.sort_values(['accuracy', 'accuracy_std'],ascending=False)

    cv_results_ = cv_results.groupby(['feat_sel','models'])[['accuracy', 'f1']].mean()#.sort_values(['accuracy', 'f1'],ascending=False)
    cv_results_[['accuracy_std', 'f1_std']] = ful_res.groupby(['feat_sel','models'])[['accuracy', 'f1']].std()#.sort_values(['accuracy', 'f1'],ascending=False)
    cv_results_ = cv_results_.fillna(0)
    cv_results_ = cv_results_.sort_values(['accuracy', 'accuracy_std'],ascending=False)
    return cv_results_all, cv_results_

In [229]:
def filter_results(cv_results, thresh=0.7):
    cv_results_all, _ = group_results(cv_results)
    filtered_res = cv_results_all[cv_results_all['accuracy'] >= thresh].sort_values('accuracy_std').reset_index()
    true_res = cv_results.sort_values('accuracy', ascending=False).reset_index()
    merged_df = pd.merge(true_res, filtered_res, on=['seed', 'feat_sel', 'models'], how='inner')[['feat_sel', 'models', 'model_params']]
    return merged_df

In [225]:
def calc_shap(pipe, X_train, y_train, X_test):
    pipe.fit(X_train, y_train)
    model = pipe.named_steps['clf']
    feat_bool = pipe.named_steps['feats'].get_support()
    selected_feats = X_train.iloc[:,feat_bool]
    print(f"------{ pipe['clf'].__class__.__name__ }-----")
    explainer = shap.KernelExplainer(model.predict, selected_feats)
    shap_values = explainer.shap_values(X_test.iloc[:,feat_bool])
    return shap_values, X_test.iloc[:,feat_bool]

In [176]:
def get_model(model_str):
    model_str = model_str.split("'")[1]
    model_ls = model_str.rsplit('.', 1)
    mod = ".".join(model_ls[:-1])
    module = importlib.import_module(mod)
    model_class = getattr(module, model_ls[-1])
    return model_class

def merge_dictionaries(dict1, dict2):
    from collections import Counter
    counter1 = Counter(dict1)
    counter2 = Counter(dict2)

    merged_counter = counter1 + counter2
    merged_dict = dict(merged_counter)

    return merged_dict

In [223]:
label = "Rapamycin_response"
def get_feature_imps(cv_results, X_train, y_train, X_test, label):
    import ast
    shap_importance = {}
    for fsm_str, model_str, mp_str in cv_results[["feat_sel", "models", "model_params"]].values:
        fsm = get_model(fsm_str)
        model = get_model(model_str)
        mp = ast.literal_eval(mp_str)
        p = Pipeline([("feats", SelectFromModel(fsm(), max_features=200)), ("clf", model(**mp))])
        sv, xt = calc_shap(p, X_train, y_train[label], X_test)
        svdf = pd.DataFrame(sv, columns=xt.columns.tolist())
        vals = np.abs(svdf.values).mean(0)
        temp_dict = dict(zip(xt.columns.tolist(), vals))
        shap_importance = merge_dictionaries(shap_importance, temp_dict)

    return shap_importance
        


In [131]:
ful_res_all, ful_res_ = group_results(ful_res)

In [191]:
temp = temp.reset_index()

In [246]:
temp2

Unnamed: 0,index,seed,feat_sel,models,model_params,accuracy,f1
0,38,1,<class 'lightgbm.sklearn.LGBMClassifier'>,<class 'sklearn.ensemble._gb.GradientBoostingC...,"{'ccp_alpha': 0.0, 'criterion': 'friedman_mse'...",1.0000,1.000000
1,4,0,<class 'lightgbm.sklearn.LGBMClassifier'>,<class 'sklearn.linear_model._logistic.Logisti...,"{'C': 0.78, 'class_weight': 'balanced', 'dual'...",1.0000,1.000000
2,94,0,<class 'lightgbm.sklearn.LGBMClassifier'>,<class 'sklearn.neighbors._classification.KNei...,"{'algorithm': 'ball_tree', 'leaf_size': 5, 'me...",1.0000,1.000000
3,64,0,<class 'xgboost.sklearn.XGBClassifier'>,<class 'sklearn.svm._classes.SVC'>,"{'C': 0.8477, 'break_ties': False, 'cache_size...",0.9375,0.903704
4,73,2,<class 'sklearn.linear_model._logistic.Logisti...,<class 'sklearn.svm._classes.SVC'>,"{'C': 0.3242, 'break_ties': False, 'cache_size...",0.9375,0.903704
...,...,...,...,...,...,...,...
145,14,2,<class 'lightgbm.sklearn.LGBMClassifier'>,<class 'sklearn.linear_model._logistic.Logisti...,"{'C': 0.67, 'class_weight': 'balanced', 'dual'...",0.3750,0.711111
146,115,2,<class 'sklearn.ensemble._forest.RandomForestC...,<class 'sklearn.ensemble._weight_boosting.AdaB...,"{'algorithm': 'SAMME.R', 'base_estimator': 'de...",0.3750,0.600000
147,92,0,<class 'lightgbm.sklearn.LGBMClassifier'>,<class 'sklearn.neighbors._classification.KNei...,"{'algorithm': 'kd_tree', 'leaf_size': 4, 'metr...",0.3750,0.600000
148,81,1,<class 'sklearn.linear_model._logistic.Logisti...,<class 'lightgbm.sklearn.LGBMClassifier'>,"{'boosting_type': 'gbdt', 'class_weight': 'bal...",0.3750,0.600000


In [247]:
pd.merge(temp2, temp, on=['seed', 'feat_sel', 'models'], how='inner')

Unnamed: 0,index,seed,feat_sel,models,model_params,accuracy_x,f1_x,accuracy_y,f1_y,accuracy_std,f1_std
0,38,1,<class 'lightgbm.sklearn.LGBMClassifier'>,<class 'sklearn.ensemble._gb.GradientBoostingC...,"{'ccp_alpha': 0.0, 'criterion': 'friedman_mse'...",1.0,1.0,0.75,0.855556,0.353553,0.204275
1,36,1,<class 'lightgbm.sklearn.LGBMClassifier'>,<class 'sklearn.ensemble._gb.GradientBoostingC...,"{'ccp_alpha': 0.0, 'criterion': 'friedman_mse'...",0.5,0.711111,0.75,0.855556,0.353553,0.204275
2,4,0,<class 'lightgbm.sklearn.LGBMClassifier'>,<class 'sklearn.linear_model._logistic.Logisti...,"{'C': 0.78, 'class_weight': 'balanced', 'dual'...",1.0,1.0,0.78125,0.819048,0.309359,0.255905
3,2,0,<class 'lightgbm.sklearn.LGBMClassifier'>,<class 'sklearn.linear_model._logistic.Logisti...,"{'C': 0.45, 'class_weight': 'balanced', 'dual'...",0.5625,0.638095,0.78125,0.819048,0.309359,0.255905
4,73,2,<class 'sklearn.linear_model._logistic.Logisti...,<class 'sklearn.svm._classes.SVC'>,"{'C': 0.3242, 'break_ties': False, 'cache_size...",0.9375,0.903704,0.9375,0.903704,0.0,0.0
5,1,0,<class 'sklearn.ensemble._forest.RandomForestC...,<class 'sklearn.linear_model._logistic.Logisti...,"{'C': 0.23, 'class_weight': 'balanced', 'dual'...",0.9375,0.906667,0.9375,0.906667,0.0,0.0
6,76,0,<class 'lightgbm.sklearn.LGBMClassifier'>,<class 'lightgbm.sklearn.LGBMClassifier'>,"{'boosting_type': 'gbdt', 'class_weight': 'bal...",0.875,0.819048,0.875,0.819048,0.0,0.0
7,41,2,<class 'xgboost.sklearn.XGBClassifier'>,<class 'sklearn.ensemble._gb.GradientBoostingC...,"{'ccp_alpha': 0.0, 'criterion': 'friedman_mse'...",0.75,0.886275,0.75,0.886275,0.0,0.0
8,35,1,<class 'sklearn.linear_model._logistic.Logisti...,<class 'sklearn.ensemble._gb.GradientBoostingC...,"{'ccp_alpha': 0.0, 'criterion': 'squared_error...",0.75,0.633333,0.75,0.633333,0.0,0.0


In [248]:
merged_df = pd.merge(temp2, temp, on=['seed', 'feat_sel', 'models'], how='inner')[['feat_sel', 'models', 'model_params', 'accuracy_x']]

In [249]:
merged_df

Unnamed: 0,feat_sel,models,model_params,accuracy_x
0,<class 'lightgbm.sklearn.LGBMClassifier'>,<class 'sklearn.ensemble._gb.GradientBoostingC...,"{'ccp_alpha': 0.0, 'criterion': 'friedman_mse'...",1.0
1,<class 'lightgbm.sklearn.LGBMClassifier'>,<class 'sklearn.ensemble._gb.GradientBoostingC...,"{'ccp_alpha': 0.0, 'criterion': 'friedman_mse'...",0.5
2,<class 'lightgbm.sklearn.LGBMClassifier'>,<class 'sklearn.linear_model._logistic.Logisti...,"{'C': 0.78, 'class_weight': 'balanced', 'dual'...",1.0
3,<class 'lightgbm.sklearn.LGBMClassifier'>,<class 'sklearn.linear_model._logistic.Logisti...,"{'C': 0.45, 'class_weight': 'balanced', 'dual'...",0.5625
4,<class 'sklearn.linear_model._logistic.Logisti...,<class 'sklearn.svm._classes.SVC'>,"{'C': 0.3242, 'break_ties': False, 'cache_size...",0.9375
5,<class 'sklearn.ensemble._forest.RandomForestC...,<class 'sklearn.linear_model._logistic.Logisti...,"{'C': 0.23, 'class_weight': 'balanced', 'dual'...",0.9375
6,<class 'lightgbm.sklearn.LGBMClassifier'>,<class 'lightgbm.sklearn.LGBMClassifier'>,"{'boosting_type': 'gbdt', 'class_weight': 'bal...",0.875
7,<class 'xgboost.sklearn.XGBClassifier'>,<class 'sklearn.ensemble._gb.GradientBoostingC...,"{'ccp_alpha': 0.0, 'criterion': 'friedman_mse'...",0.75
8,<class 'sklearn.linear_model._logistic.Logisti...,<class 'sklearn.ensemble._gb.GradientBoostingC...,"{'ccp_alpha': 0.0, 'criterion': 'squared_error...",0.75


In [152]:
ful_res_all.head(15)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,accuracy,f1,accuracy_std,f1_std
seed,feat_sel,models,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,<class 'sklearn.ensemble._forest.RandomForestClassifier'>,<class 'sklearn.linear_model._logistic.LogisticRegression'>,0.9375,0.906667,0.0,0.0
2,<class 'sklearn.linear_model._logistic.LogisticRegression'>,<class 'sklearn.svm._classes.SVC'>,0.9375,0.903704,0.0,0.0
0,<class 'lightgbm.sklearn.LGBMClassifier'>,<class 'lightgbm.sklearn.LGBMClassifier'>,0.875,0.819048,0.0,0.0
0,<class 'lightgbm.sklearn.LGBMClassifier'>,<class 'sklearn.linear_model._logistic.LogisticRegression'>,0.78125,0.819048,0.309359,0.255905
1,<class 'lightgbm.sklearn.LGBMClassifier'>,<class 'sklearn.ensemble._gb.GradientBoostingClassifier'>,0.75,0.855556,0.353553,0.204275
1,<class 'sklearn.linear_model._logistic.LogisticRegression'>,<class 'sklearn.ensemble._gb.GradientBoostingClassifier'>,0.75,0.633333,0.0,0.0
2,<class 'xgboost.sklearn.XGBClassifier'>,<class 'sklearn.ensemble._gb.GradientBoostingClassifier'>,0.75,0.886275,0.0,0.0
0,<class 'xgboost.sklearn.XGBClassifier'>,<class 'sklearn.svm._classes.SVC'>,0.6875,0.781264,0.353553,0.173156
2,<class 'lightgbm.sklearn.LGBMClassifier'>,<class 'lightgbm.sklearn.LGBMClassifier'>,0.6875,0.840741,0.353553,0.089043
0,<class 'sklearn.linear_model._logistic.LogisticRegression'>,<class 'sklearn.ensemble._weight_boosting.AdaBoostClassifier'>,0.6875,0.8,0.0,0.0


In [185]:
temp = ful_res_all[ful_res_all['accuracy'] >= 0.70].sort_values('accuracy_std')

In [226]:
get_feature_imps(merged_df, X_train, y_train, X_test, "Fulvestrant_response")

------GradientBoostingClassifier-----


  0%|          | 0/14 [00:00<?, ?it/s]

------GradientBoostingClassifier-----


  0%|          | 0/14 [00:00<?, ?it/s]

------LogisticRegression-----


  0%|          | 0/14 [00:00<?, ?it/s]

------LogisticRegression-----


  0%|          | 0/14 [00:00<?, ?it/s]

------SVC-----


  0%|          | 0/14 [00:00<?, ?it/s]

------LogisticRegression-----


  0%|          | 0/14 [00:00<?, ?it/s]

------LGBMClassifier-----


  0%|          | 0/14 [00:00<?, ?it/s]

------GradientBoostingClassifier-----


  0%|          | 0/14 [00:00<?, ?it/s]

------GradientBoostingClassifier-----


  0%|          | 0/14 [00:00<?, ?it/s]

{'type': 0.49260193387196655,
 'mut_C1orf222': 0.005656903224477074,
 'mut_CAMTA1': 0.004732119344602808,
 'mut_H6PD': 0.04694867589468317,
 'mut_SPEN': 0.006583723484787072,
 'mut_HSPG2': 0.007333240255374085,
 'mut_ZSCAN20': 0.004846320735786162,
 'mut_CSMD2': 0.0037335766841414047,
 'mut_MACF1': 0.004476504759462121,
 'mut_CDCP2': 0.010748626240413952,
 'mut_LRRC7': 0.09875688251555945,
 'mut_ADGRL2': 0.004886003100545071,
 'mut_COL11A1': 0.0035973431070915928,
 'mut_CHI3L2': 0.25258756477735417,
 'mut_NOTCH2': 0.0028585718945741976,
 'mut_PDE4DIP': 0.13597260620132695,
 'mut_OTUD7B': 0.005299627534799436,
 'mut_RHBG': 0.006410000345569172,
 'mut_SPTA1': 0.10215868265314466,
 'mut_ASPM': 0.03363770122121409,
 'mut_CACNA1S': 0.04259023649780553,
 'mut_USH2A': 0.005424817527335616,
 'mut_APOB': 0.022378462042446465,
 'mut_LTBP1': 0.0029994229634712102,
 'mut_PRKD3': 0.004980057694181541,
 'mut_PREPL': 0.005063819959349862,
 'mut_SPTBN1': 0.005774856988152499,
 'mut_USP34': 0.004558128

In [222]:
ful_res_.head(15)

Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,f1,accuracy_std,f1_std
feat_sel,models,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
<class 'lightgbm.sklearn.LGBMClassifier'>,<class 'lightgbm.sklearn.LGBMClassifier'>,0.6875,0.80291,0.255155,0.080578
<class 'sklearn.linear_model._logistic.LogisticRegression'>,<class 'sklearn.svm._classes.SVC'>,0.645833,0.817139,0.252591,0.09776
<class 'sklearn.ensemble._forest.RandomForestClassifier'>,<class 'sklearn.linear_model._logistic.LogisticRegression'>,0.640625,0.800523,0.224624,0.104319
<class 'sklearn.linear_model._logistic.LogisticRegression'>,<class 'sklearn.ensemble._gb.GradientBoostingClassifier'>,0.614583,0.70864,0.169635,0.06346
<class 'lightgbm.sklearn.LGBMClassifier'>,<class 'sklearn.ensemble._gb.GradientBoostingClassifier'>,0.59375,0.816667,0.272431,0.126198
<class 'xgboost.sklearn.XGBClassifier'>,<class 'sklearn.svm._classes.SVC'>,0.5625,0.749782,0.25,0.116937
<class 'sklearn.linear_model._logistic.LogisticRegression'>,<class 'sklearn.ensemble._weight_boosting.AdaBoostClassifier'>,0.5625,0.782571,0.108253,0.064535
<class 'lightgbm.sklearn.LGBMClassifier'>,<class 'sklearn.neighbors._classification.KNeighborsClassifier'>,0.553571,0.772362,0.202293,0.129767
<class 'sklearn.ensemble._forest.RandomForestClassifier'>,<class 'lightgbm.sklearn.LGBMClassifier'>,0.546875,0.773203,0.138585,0.106054
<class 'sklearn.linear_model._logistic.LogisticRegression'>,<class 'xgboost.sklearn.XGBClassifier'>,0.546875,0.773203,0.138585,0.106054


probable best models `svc`, `Logistic`, `LGBM`, `GBC`

seems like `xgb` is not a good feature selector here

In [16]:
gef_res = pd.read_csv('outputs/cv_results_Gefitinib_response.csv')

In [74]:
gef_res.isna().sum()

seed            0
feat_sel        0
models          0
model_params    0
accuracy        0
f1              0
dtype: int64

In [134]:
gef_res_all, gef_res_ = group_results(gef_res)

In [135]:
gef_res_all

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,accuracy,f1,accuracy_std,f1_std
seed,feat_sel,models,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,<class 'sklearn.ensemble._forest.RandomForestClassifier'>,<class 'sklearn.ensemble._gb.GradientBoostingClassifier'>,0.833333,0.882051,0.000000,0.000000
1,<class 'lightgbm.sklearn.LGBMClassifier'>,<class 'sklearn.ensemble._weight_boosting.AdaBoostClassifier'>,0.833333,0.882051,0.000000,0.000000
2,<class 'lightgbm.sklearn.LGBMClassifier'>,<class 'sklearn.ensemble._bagging.BaggingClassifier'>,0.761905,0.800000,0.000000,0.000000
2,<class 'sklearn.linear_model._logistic.LogisticRegression'>,<class 'xgboost.sklearn.XGBClassifier'>,0.750000,0.827917,0.117851,0.092513
0,<class 'xgboost.sklearn.XGBClassifier'>,<class 'sklearn.linear_model._logistic.LogisticRegression'>,0.750000,0.666667,0.000000,0.000000
...,...,...,...,...,...,...
2,<class 'lightgbm.sklearn.LGBMClassifier'>,<class 'sklearn.svm._classes.SVC'>,0.354167,0.402165,0.265165,0.225907
2,<class 'sklearn.linear_model._logistic.LogisticRegression'>,<class 'lightgbm.sklearn.LGBMClassifier'>,0.333333,0.350000,0.000000,0.000000
1,<class 'xgboost.sklearn.XGBClassifier'>,<class 'sklearn.svm._classes.SVC'>,0.285714,0.400000,0.000000,0.000000
2,<class 'sklearn.ensemble._forest.RandomForestClassifier'>,<class 'xgboost.sklearn.XGBClassifier'>,0.250000,0.333333,0.000000,0.000000


In [172]:
gef_res_all[gef_res_all['accuracy'] >= 0.75].sort_values('accuracy_std')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,accuracy,f1,accuracy_std,f1_std
seed,feat_sel,models,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,<class 'sklearn.ensemble._forest.RandomForestClassifier'>,<class 'sklearn.ensemble._gb.GradientBoostingClassifier'>,0.833333,0.882051,0.0,0.0
1,<class 'lightgbm.sklearn.LGBMClassifier'>,<class 'sklearn.ensemble._weight_boosting.AdaBoostClassifier'>,0.833333,0.882051,0.0,0.0
2,<class 'lightgbm.sklearn.LGBMClassifier'>,<class 'sklearn.ensemble._bagging.BaggingClassifier'>,0.761905,0.8,0.0,0.0
0,<class 'xgboost.sklearn.XGBClassifier'>,<class 'sklearn.linear_model._logistic.LogisticRegression'>,0.75,0.666667,0.0,0.0
2,<class 'sklearn.linear_model._logistic.LogisticRegression'>,<class 'xgboost.sklearn.XGBClassifier'>,0.75,0.827917,0.117851,0.092513


In [250]:
weights = merged_df['accuracy_x']/merged_df['accuracy_x'].iloc[0]
weights

0    1.0000
1    0.5000
2    1.0000
3    0.5625
4    0.9375
5    0.9375
6    0.8750
7    0.7500
8    0.7500
Name: accuracy_x, dtype: float64

In [49]:
import pandas as pd

# Example DataFrame
data = {
    'A': [1, 3, 5],
    'B': [4, 2, 6],
    'C': [7, 1, 8]
}

df = pd.DataFrame(data)

# Find the column name with the highest value for each row
max_column = df.idxmax(axis=1)

# Add a new column to the DataFrame with the column name having the highest value
df['max_column'] = max_column

# Display the result
print(df)


   A  B  C max_column
0  1  4  7          C
1  3  2  1          A
2  5  6  8          C


In [53]:
{k:[v] for k,v in max_column.to_dict().items()}

{0: ['C'], 1: ['A'], 2: ['C']}

In [55]:
from collections import Counter

data = {
    'A': [1, 3, 3, 5],
    'B': [4, 2, 6, 6],
    'C': [7, 1, 1, 8]
}



def find_max_freq_element(data):
    result = {}
    for key, values in data.items():
        # Use Counter to count the frequency of each element in the list
        counter = Counter(values)
        
        # Find the element with the highest frequency
        most_common_element, frequency = counter.most_common(1)[0]
        
        # Store the result in the dictionary
        result[key] = most_common_element
    return result

print(result)


{'A': 3, 'B': 6, 'C': 1}


In [254]:
pd.concat([pd.DataFrame(), df], axis=1)

Unnamed: 0,A,B,C,max_column
0,1,4,7,C
1,3,2,1,A
2,5,6,8,C


In [252]:
df

Unnamed: 0,A,B,C,max_column
0,1,4,7,C
1,3,2,1,A
2,5,6,8,C


In [240]:
gef_res_.reset_index()

Unnamed: 0,feat_sel,models,accuracy,f1,accuracy_std,f1_std
0,<class 'sklearn.linear_model._logistic.Logisti...,<class 'xgboost.sklearn.XGBClassifier'>,0.666667,0.701944,0.138585,0.106054
1,<class 'lightgbm.sklearn.LGBMClassifier'>,<class 'sklearn.linear_model._logistic.Logisti...,0.654762,0.663278,0.239629,0.145565
2,<class 'lightgbm.sklearn.LGBMClassifier'>,<class 'sklearn.ensemble._weight_boosting.AdaB...,0.630952,0.703526,0.091998,0.11928
3,<class 'lightgbm.sklearn.LGBMClassifier'>,<class 'sklearn.neighbors._classification.KNei...,0.619048,0.616667,0.202293,0.129767
4,<class 'xgboost.sklearn.XGBClassifier'>,<class 'sklearn.neighbors._classification.KNei...,0.610714,0.65191,0.036084,0.051281
5,<class 'sklearn.ensemble._forest.RandomForestC...,<class 'sklearn.ensemble._gb.GradientBoostingC...,0.595238,0.615643,0.0,0.0
6,<class 'sklearn.linear_model._logistic.Logisti...,<class 'sklearn.ensemble._gb.GradientBoostingC...,0.592262,0.610806,0.169635,0.06346
7,<class 'xgboost.sklearn.XGBClassifier'>,<class 'sklearn.ensemble._gb.GradientBoostingC...,0.587302,0.651068,0.135497,0.085406
8,<class 'sklearn.ensemble._forest.RandomForestC...,<class 'sklearn.neighbors._classification.KNei...,0.583333,0.618271,0.088388,0.078567
9,<class 'lightgbm.sklearn.LGBMClassifier'>,<class 'sklearn.ensemble._forest.RandomForestC...,0.583333,0.647917,0.0,0.072452


In [35]:
mit_res = pd.read_csv('outputs/cv_results_Mitomycin_response.csv')

In [75]:
mit_res.isna().sum()

seed            0
feat_sel        0
models          0
model_params    0
accuracy        0
f1              0
dtype: int64

In [140]:
mit_res_all, mit_res_ = group_results(mit_res)

In [141]:
mit_res_all

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,accuracy,f1,accuracy_std,f1_std
seed,feat_sel,models,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,<class 'lightgbm.sklearn.LGBMClassifier'>,<class 'lightgbm.sklearn.LGBMClassifier'>,0.875,0.886003,0.0,0.0
0,<class 'lightgbm.sklearn.LGBMClassifier'>,<class 'sklearn.ensemble._weight_boosting.AdaBoostClassifier'>,0.800,0.791667,0.0,0.0
0,<class 'sklearn.linear_model._logistic.LogisticRegression'>,<class 'sklearn.linear_model._logistic.LogisticRegression'>,0.800,0.791667,0.0,0.0
0,<class 'xgboost.sklearn.XGBClassifier'>,<class 'lightgbm.sklearn.LGBMClassifier'>,0.800,0.800000,0.0,0.0
0,<class 'xgboost.sklearn.XGBClassifier'>,<class 'sklearn.svm._classes.SVC'>,0.800,0.800000,0.0,0.0
0,...,...,...,...,...,...
0,<class 'sklearn.ensemble._forest.RandomForestClassifier'>,<class 'lightgbm.sklearn.LGBMClassifier'>,0.375,0.400000,0.0,0.0
1,<class 'lightgbm.sklearn.LGBMClassifier'>,<class 'sklearn.ensemble._weight_boosting.AdaBoostClassifier'>,0.325,0.333333,0.0,0.0
2,<class 'lightgbm.sklearn.LGBMClassifier'>,<class 'sklearn.ensemble._weight_boosting.AdaBoostClassifier'>,0.325,0.333333,0.0,0.0
2,<class 'sklearn.linear_model._logistic.LogisticRegression'>,<class 'sklearn.ensemble._forest.RandomForestClassifier'>,0.300,0.277778,0.0,0.0


In [169]:
mit_res_all[mit_res_all['accuracy'] >= 0.70].sort_values('accuracy_std')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,accuracy,f1,accuracy_std,f1_std
seed,feat_sel,models,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,<class 'lightgbm.sklearn.LGBMClassifier'>,<class 'lightgbm.sklearn.LGBMClassifier'>,0.875,0.886003,0.0,0.0
2,<class 'sklearn.ensemble._forest.RandomForestClassifier'>,<class 'xgboost.sklearn.XGBClassifier'>,0.708333,0.70303,0.0,0.0
1,<class 'sklearn.linear_model._logistic.LogisticRegression'>,<class 'sklearn.ensemble._gb.GradientBoostingClassifier'>,0.708333,0.70303,0.0,0.0
1,<class 'sklearn.ensemble._forest.RandomForestClassifier'>,<class 'sklearn.ensemble._gb.GradientBoostingClassifier'>,0.75,0.690909,0.0,0.0
2,<class 'lightgbm.sklearn.LGBMClassifier'>,<class 'sklearn.ensemble._bagging.BaggingClassifier'>,0.775,0.777778,0.0,0.0
1,<class 'lightgbm.sklearn.LGBMClassifier'>,<class 'sklearn.ensemble._forest.ExtraTreesClassifier'>,0.791667,0.8,0.0,0.0
0,<class 'sklearn.linear_model._logistic.LogisticRegression'>,<class 'sklearn.ensemble._gb.GradientBoostingClassifier'>,0.75,0.759259,0.0,0.0
0,<class 'xgboost.sklearn.XGBClassifier'>,<class 'sklearn.svm._classes.SVC'>,0.8,0.8,0.0,0.0
0,<class 'xgboost.sklearn.XGBClassifier'>,<class 'lightgbm.sklearn.LGBMClassifier'>,0.8,0.8,0.0,0.0
0,<class 'sklearn.linear_model._logistic.LogisticRegression'>,<class 'sklearn.linear_model._logistic.LogisticRegression'>,0.8,0.791667,0.0,0.0


In [142]:
mit_res_.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,f1,accuracy_std,f1_std
feat_sel,models,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
<class 'xgboost.sklearn.XGBClassifier'>,<class 'sklearn.svm._classes.SVC'>,0.8,0.8,0.25,0.116937
<class 'sklearn.linear_model._logistic.LogisticRegression'>,<class 'sklearn.ensemble._gb.GradientBoostingClassifier'>,0.729167,0.731145,0.169635,0.06346
<class 'lightgbm.sklearn.LGBMClassifier'>,<class 'sklearn.linear_model._logistic.LogisticRegression'>,0.7125,0.720779,0.239629,0.145565
<class 'xgboost.sklearn.XGBClassifier'>,<class 'xgboost.sklearn.XGBClassifier'>,0.7125,0.698397,0.0,0.072452
<class 'lightgbm.sklearn.LGBMClassifier'>,<class 'lightgbm.sklearn.LGBMClassifier'>,0.7,0.702261,0.255155,0.080578
<class 'sklearn.linear_model._logistic.LogisticRegression'>,<class 'sklearn.linear_model._logistic.LogisticRegression'>,0.7,0.694437,0.036084,0.03849
<class 'sklearn.ensemble._forest.RandomForestClassifier'>,<class 'xgboost.sklearn.XGBClassifier'>,0.681667,0.663359,0.072169,0.118373
<class 'sklearn.ensemble._forest.RandomForestClassifier'>,<class 'sklearn.linear_model._logistic.LogisticRegression'>,0.666667,0.658761,0.224624,0.104319
<class 'lightgbm.sklearn.LGBMClassifier'>,<class 'sklearn.ensemble._forest.ExtraTreesClassifier'>,0.663889,0.65846,0.036084,0.077536
<class 'xgboost.sklearn.XGBClassifier'>,<class 'sklearn.linear_model._logistic.LogisticRegression'>,0.658333,0.674059,0.088388,0.0


probable best models `xgb`, `lr`

`lr` with `gbr` is a good model

In [58]:
rap_res = pd.read_csv('outputs/cv_results_Rapamycin_response.csv')

In [79]:
sum(rap_res['accuracy'] == np.nan)

0

In [143]:
rap_res_all, rap_res_ = group_results(rap_res)

In [144]:
rap_res_all

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,accuracy,f1,accuracy_std,f1_std
seed,feat_sel,models,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,<class 'sklearn.linear_model._logistic.LogisticRegression'>,<class 'sklearn.neighbors._classification.KNeighborsClassifier'>,0.900000,0.898990,0.000000,0.000000
2,<class 'sklearn.ensemble._forest.RandomForestClassifier'>,<class 'sklearn.svm._classes.SVC'>,0.900000,0.898990,0.000000,0.000000
2,<class 'sklearn.linear_model._logistic.LogisticRegression'>,<class 'sklearn.ensemble._bagging.BaggingClassifier'>,0.900000,0.888889,0.000000,0.000000
0,<class 'sklearn.linear_model._logistic.LogisticRegression'>,<class 'sklearn.linear_model._logistic.LogisticRegression'>,0.850000,0.840278,0.070711,0.068746
0,<class 'sklearn.linear_model._logistic.LogisticRegression'>,<class 'sklearn.ensemble._gb.GradientBoostingClassifier'>,0.800000,0.791667,0.000000,0.000000
...,...,...,...,...,...,...
2,<class 'lightgbm.sklearn.LGBMClassifier'>,<class 'sklearn.ensemble._bagging.BaggingClassifier'>,0.400000,0.400000,0.000000,0.000000
0,<class 'lightgbm.sklearn.LGBMClassifier'>,<class 'lightgbm.sklearn.LGBMClassifier'>,0.366667,0.334499,0.152753,0.164266
2,<class 'xgboost.sklearn.XGBClassifier'>,<class 'sklearn.svm._classes.SVC'>,0.325000,0.333333,0.000000,0.000000
2,<class 'sklearn.ensemble._forest.RandomForestClassifier'>,<class 'xgboost.sklearn.XGBClassifier'>,0.300000,0.292929,0.000000,0.000000


In [170]:
rap_res_all[rap_res_all['accuracy'] >= 0.70].sort_values('accuracy_std')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,accuracy,f1,accuracy_std,f1_std
seed,feat_sel,models,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,<class 'sklearn.linear_model._logistic.LogisticRegression'>,<class 'sklearn.neighbors._classification.KNeighborsClassifier'>,0.9,0.89899,0.0,0.0
2,<class 'sklearn.ensemble._forest.RandomForestClassifier'>,<class 'sklearn.svm._classes.SVC'>,0.9,0.89899,0.0,0.0
2,<class 'sklearn.linear_model._logistic.LogisticRegression'>,<class 'sklearn.ensemble._bagging.BaggingClassifier'>,0.9,0.888889,0.0,0.0
0,<class 'sklearn.linear_model._logistic.LogisticRegression'>,<class 'sklearn.ensemble._gb.GradientBoostingClassifier'>,0.8,0.791667,0.0,0.0
1,<class 'sklearn.ensemble._forest.RandomForestClassifier'>,<class 'sklearn.linear_model._logistic.LogisticRegression'>,0.8,0.791667,0.0,0.0
1,<class 'xgboost.sklearn.XGBClassifier'>,<class 'sklearn.ensemble._weight_boosting.AdaBoostClassifier'>,0.8,0.791667,0.0,0.0
0,<class 'sklearn.ensemble._forest.RandomForestClassifier'>,<class 'sklearn.ensemble._weight_boosting.AdaBoostClassifier'>,0.7,0.67033,0.0,0.0
1,<class 'lightgbm.sklearn.LGBMClassifier'>,<class 'lightgbm.sklearn.LGBMClassifier'>,0.7,0.67033,0.0,0.0
1,<class 'lightgbm.sklearn.LGBMClassifier'>,<class 'sklearn.ensemble._weight_boosting.AdaBoostClassifier'>,0.7,0.69697,0.0,0.0
1,<class 'sklearn.ensemble._forest.RandomForestClassifier'>,<class 'sklearn.neighbors._classification.KNeighborsClassifier'>,0.7,0.69697,0.0,0.0


In [145]:
rap_res_.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,f1,accuracy_std,f1_std
feat_sel,models,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
<class 'sklearn.ensemble._forest.RandomForestClassifier'>,<class 'sklearn.linear_model._logistic.LogisticRegression'>,0.8,0.791667,0.224624,0.104319
<class 'sklearn.linear_model._logistic.LogisticRegression'>,<class 'sklearn.ensemble._bagging.BaggingClassifier'>,0.75,0.736111,0.0,0.088735
<class 'sklearn.linear_model._logistic.LogisticRegression'>,<class 'sklearn.linear_model._logistic.LogisticRegression'>,0.721429,0.709993,0.036084,0.03849
<class 'sklearn.ensemble._forest.RandomForestClassifier'>,<class 'sklearn.svm._classes.SVC'>,0.705,0.697702,0.036084,0.062786
<class 'sklearn.linear_model._logistic.LogisticRegression'>,<class 'sklearn.ensemble._gb.GradientBoostingClassifier'>,0.7,0.695833,0.169635,0.06346
<class 'sklearn.linear_model._logistic.LogisticRegression'>,<class 'sklearn.neighbors._classification.KNeighborsClassifier'>,0.685714,0.677369,0.0,0.0
<class 'sklearn.linear_model._logistic.LogisticRegression'>,<class 'sklearn.svm._classes.SVC'>,0.658333,0.650379,0.252591,0.09776
<class 'sklearn.ensemble._forest.RandomForestClassifier'>,<class 'sklearn.ensemble._bagging.BaggingClassifier'>,0.658333,0.644873,0.0,0.072452
<class 'xgboost.sklearn.XGBClassifier'>,<class 'sklearn.ensemble._weight_boosting.AdaBoostClassifier'>,0.655,0.651263,0.044194,0.041595
<class 'sklearn.ensemble._forest.RandomForestClassifier'>,<class 'sklearn.ensemble._weight_boosting.AdaBoostClassifier'>,0.6125,0.594098,0.0625,0.096681


In [82]:
mitful_res = pd.read_csv('outputs/cv_results_Mitomycin-Fulvestrant_response.csv')

In [83]:
sum(mitful_res['accuracy'] == np.nan)

0

In [146]:
mitful_res_all, mitful_res_ = group_results(mitful_res)

In [147]:
mitful_res_all

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,accuracy,f1,accuracy_std,f1_std
seed,feat_sel,models,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,<class 'sklearn.ensemble._forest.RandomForestClassifier'>,<class 'sklearn.linear_model._logistic.LogisticRegression'>,0.875000,0.886003,0.0,0.0
0,<class 'xgboost.sklearn.XGBClassifier'>,<class 'lightgbm.sklearn.LGBMClassifier'>,0.875000,0.896703,0.0,0.0
2,<class 'sklearn.ensemble._forest.RandomForestClassifier'>,<class 'lightgbm.sklearn.LGBMClassifier'>,0.875000,0.896703,0.0,0.0
0,<class 'sklearn.ensemble._forest.RandomForestClassifier'>,<class 'xgboost.sklearn.XGBClassifier'>,0.750000,0.759259,0.0,0.0
0,<class 'sklearn.linear_model._logistic.LogisticRegression'>,<class 'sklearn.ensemble._gb.GradientBoostingClassifier'>,0.750000,0.759259,0.0,0.0
...,...,...,...,...,...,...
2,<class 'sklearn.ensemble._forest.RandomForestClassifier'>,<class 'sklearn.neighbors._classification.KNeighborsClassifier'>,0.300000,0.277778,0.0,0.0
2,<class 'sklearn.linear_model._logistic.LogisticRegression'>,<class 'sklearn.ensemble._bagging.BaggingClassifier'>,0.300000,0.277778,0.0,0.0
2,<class 'sklearn.ensemble._forest.RandomForestClassifier'>,<class 'sklearn.svm._classes.SVC'>,0.200000,0.202020,0.0,0.0
2,<class 'xgboost.sklearn.XGBClassifier'>,<class 'sklearn.linear_model._logistic.LogisticRegression'>,0.200000,0.166667,0.0,0.0


In [166]:
mitful_res_all[mitful_res_all['accuracy'] >= 0.70].sort_values('accuracy_std')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,accuracy,f1,accuracy_std,f1_std
seed,feat_sel,models,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,<class 'sklearn.ensemble._forest.RandomForestClassifier'>,<class 'sklearn.linear_model._logistic.LogisticRegression'>,0.875,0.886003,0.0,0.0
0,<class 'xgboost.sklearn.XGBClassifier'>,<class 'lightgbm.sklearn.LGBMClassifier'>,0.875,0.896703,0.0,0.0
2,<class 'sklearn.ensemble._forest.RandomForestClassifier'>,<class 'lightgbm.sklearn.LGBMClassifier'>,0.875,0.896703,0.0,0.0
0,<class 'sklearn.ensemble._forest.RandomForestClassifier'>,<class 'xgboost.sklearn.XGBClassifier'>,0.75,0.759259,0.0,0.0
0,<class 'sklearn.linear_model._logistic.LogisticRegression'>,<class 'sklearn.ensemble._gb.GradientBoostingClassifier'>,0.75,0.759259,0.0,0.0
2,<class 'lightgbm.sklearn.LGBMClassifier'>,<class 'sklearn.svm._classes.SVC'>,0.75,0.780952,0.0,0.0
0,<class 'lightgbm.sklearn.LGBMClassifier'>,<class 'sklearn.linear_model._logistic.LogisticRegression'>,0.7,0.67033,0.0,0.0
1,<class 'sklearn.ensemble._forest.RandomForestClassifier'>,<class 'sklearn.ensemble._bagging.BaggingClassifier'>,0.7,0.69697,0.0,0.0
1,<class 'sklearn.ensemble._forest.RandomForestClassifier'>,<class 'sklearn.ensemble._forest.RandomForestClassifier'>,0.7,0.69697,0.0,0.0
0,<class 'lightgbm.sklearn.LGBMClassifier'>,<class 'sklearn.ensemble._forest.ExtraTreesClassifier'>,0.708333,0.723954,0.235702,0.229172


In [148]:
mitful_res_.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,f1,accuracy_std,f1_std
feat_sel,models,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
<class 'sklearn.ensemble._forest.RandomForestClassifier'>,<class 'sklearn.linear_model._logistic.LogisticRegression'>,0.705556,0.714959,0.224624,0.104319
<class 'lightgbm.sklearn.LGBMClassifier'>,<class 'sklearn.linear_model._logistic.LogisticRegression'>,0.683333,0.68022,0.239629,0.145565
<class 'lightgbm.sklearn.LGBMClassifier'>,<class 'sklearn.neighbors._classification.KNeighborsClassifier'>,0.627778,0.631913,0.202293,0.129767
<class 'lightgbm.sklearn.LGBMClassifier'>,<class 'sklearn.svm._classes.SVC'>,0.625,0.615476,0.068465,0.02468
<class 'sklearn.ensemble._forest.RandomForestClassifier'>,<class 'sklearn.ensemble._forest.RandomForestClassifier'>,0.55,0.438452,0.0,0.0
<class 'sklearn.linear_model._logistic.LogisticRegression'>,<class 'sklearn.ensemble._gb.GradientBoostingClassifier'>,0.54375,0.52883,0.169635,0.06346
<class 'lightgbm.sklearn.LGBMClassifier'>,<class 'sklearn.ensemble._bagging.BaggingClassifier'>,0.54375,0.54925,0.0,0.064803
<class 'sklearn.linear_model._logistic.LogisticRegression'>,<class 'sklearn.ensemble._forest.ExtraTreesClassifier'>,0.541667,0.561905,0.023623,0.069029
<class 'xgboost.sklearn.XGBClassifier'>,<class 'sklearn.ensemble._forest.ExtraTreesClassifier'>,0.540278,0.544076,0.0,0.0
<class 'lightgbm.sklearn.LGBMClassifier'>,<class 'lightgbm.sklearn.LGBMClassifier'>,0.536111,0.521757,0.255155,0.080578


probable best models `xgb`, `lr`

`lr` with `gbr` is a good model

In [118]:
rapgef_res = pd.read_csv('outputs/cv_results_Rapamycin-Gefitinib_response.csv')

In [119]:
sum(rapgef_res['accuracy'] == np.nan)

0

In [149]:
rapgef_res_all, rapgef_res_ = group_results(rapgef_res)

In [150]:
rapgef_res_all

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,accuracy,f1,accuracy_std,f1_std
seed,feat_sel,models,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,<class 'lightgbm.sklearn.LGBMClassifier'>,<class 'sklearn.linear_model._logistic.LogisticRegression'>,0.8750,0.896703,0.000000,0.000000
0,<class 'sklearn.linear_model._logistic.LogisticRegression'>,<class 'sklearn.svm._classes.SVC'>,0.8000,0.772222,0.000000,0.000000
1,<class 'xgboost.sklearn.XGBClassifier'>,<class 'xgboost.sklearn.XGBClassifier'>,0.8000,0.791667,0.000000,0.000000
0,<class 'sklearn.ensemble._forest.RandomForestClassifier'>,<class 'sklearn.neighbors._classification.KNeighborsClassifier'>,0.7625,0.764444,0.194454,0.175991
2,<class 'sklearn.linear_model._logistic.LogisticRegression'>,<class 'sklearn.ensemble._weight_boosting.AdaBoostClassifier'>,0.7500,0.780952,0.000000,0.000000
...,...,...,...,...,...,...
1,<class 'sklearn.linear_model._logistic.LogisticRegression'>,<class 'lightgbm.sklearn.LGBMClassifier'>,0.3000,0.277778,0.000000,0.000000
2,<class 'lightgbm.sklearn.LGBMClassifier'>,<class 'xgboost.sklearn.XGBClassifier'>,0.3000,0.292929,0.000000,0.000000
2,<class 'sklearn.ensemble._forest.RandomForestClassifier'>,<class 'sklearn.neighbors._classification.KNeighborsClassifier'>,0.2000,0.200000,0.000000,0.000000
2,<class 'sklearn.ensemble._forest.RandomForestClassifier'>,<class 'sklearn.svm._classes.SVC'>,0.2000,0.200000,0.000000,0.000000


In [168]:
rapgef_res_all[rapgef_res_all['accuracy'] >= 0.75].sort_values('accuracy_std')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,accuracy,f1,accuracy_std,f1_std
seed,feat_sel,models,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,<class 'lightgbm.sklearn.LGBMClassifier'>,<class 'sklearn.linear_model._logistic.LogisticRegression'>,0.875,0.896703,0.0,0.0
0,<class 'sklearn.linear_model._logistic.LogisticRegression'>,<class 'sklearn.svm._classes.SVC'>,0.8,0.772222,0.0,0.0
1,<class 'xgboost.sklearn.XGBClassifier'>,<class 'xgboost.sklearn.XGBClassifier'>,0.8,0.791667,0.0,0.0
2,<class 'sklearn.linear_model._logistic.LogisticRegression'>,<class 'sklearn.ensemble._weight_boosting.AdaBoostClassifier'>,0.75,0.780952,0.0,0.0
0,<class 'sklearn.ensemble._forest.RandomForestClassifier'>,<class 'sklearn.neighbors._classification.KNeighborsClassifier'>,0.7625,0.764444,0.194454,0.175991


In [151]:
rapgef_res_.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,f1,accuracy_std,f1_std
feat_sel,models,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
<class 'xgboost.sklearn.XGBClassifier'>,<class 'xgboost.sklearn.XGBClassifier'>,0.725,0.724838,0.0,0.072452
<class 'sklearn.linear_model._logistic.LogisticRegression'>,<class 'sklearn.svm._classes.SVC'>,0.65,0.584524,0.252591,0.09776
<class 'sklearn.ensemble._forest.RandomForestClassifier'>,<class 'lightgbm.sklearn.LGBMClassifier'>,0.629167,0.613725,0.138585,0.106054
<class 'lightgbm.sklearn.LGBMClassifier'>,<class 'sklearn.ensemble._forest.ExtraTreesClassifier'>,0.625,0.64,0.036084,0.077536
<class 'sklearn.ensemble._forest.RandomForestClassifier'>,<class 'sklearn.ensemble._gb.GradientBoostingClassifier'>,0.581944,0.585532,0.0,0.0
<class 'xgboost.sklearn.XGBClassifier'>,<class 'sklearn.svm._classes.SVC'>,0.575,0.54701,0.25,0.116937
<class 'lightgbm.sklearn.LGBMClassifier'>,<class 'sklearn.ensemble._gb.GradientBoostingClassifier'>,0.55,0.502671,0.272431,0.126198
<class 'xgboost.sklearn.XGBClassifier'>,<class 'sklearn.neighbors._classification.KNeighborsClassifier'>,0.548611,0.537038,0.036084,0.051281
<class 'lightgbm.sklearn.LGBMClassifier'>,<class 'sklearn.linear_model._logistic.LogisticRegression'>,0.548333,0.533677,0.239629,0.145565
<class 'xgboost.sklearn.XGBClassifier'>,<class 'sklearn.linear_model._logistic.LogisticRegression'>,0.545,0.525685,0.088388,0.0


`lr` with `gbr` is a good model

In [174]:
from collections import Counter

def merge_dictionaries(dict1, dict2):
    counter1 = Counter(dict1)
    counter2 = Counter(dict2)

    merged_counter = counter1 + counter2
    merged_dict = dict(merged_counter)

    return merged_dict

# Example usage:
dict1 = {'a': 1, 'b': 2, 'c': 3}
dict2 = {'b': 5, 'c': 7, 'd': 9}

merged_dict = merge_dictionaries(dict1, dict2)
print(merged_dict)


{'a': 1, 'b': 7, 'c': 10, 'd': 9}


In [183]:
import pandas as pd

# Sample DataFrames (replace these with your actual DataFrames)
df1 = pd.DataFrame({
    'seed': [38, 4, 94, 64, 73],
    'feat_sel': [1, 0, 0, 0, 2],
    'models': ['LGBM', 'LGBM', 'KNeighbors', 'XGB', 'Logistic'],
    'accuracy': [1.0, 1.0, 1.0, 0.9375, 0.9375],
    'f1': [1.0, 1.0, 1.0, 0.903704, 0.903704]
})

df2 = pd.DataFrame({
    'seed': [0, 2, 0, 1, 2, 0, 1],
    'feat_sel': ['RandomForest', 'Logistic', 'LGBM', 'Logistic', 'XGB', 'LGBM', 'LGBM'],
    'models': ['Logistic', 'SVC', 'LGBM', 'GradientBoosting', 'GradientBoosting', 'Logistic', 'GradientBoosting'],
    'accuracy': [0.9375, 0.9375, 0.875, 0.75, 0.75, 0.78125, 0.75],
    'f1': [0.906667, 0.903704, 0.819048, 0.633333, 0.886275, 0.819048, 0.855556],
    'accuracy_std': [0, 0, 0, 0, 0, 0.309359, 0.353553],
    'f1_std': [0, 0, 0, 0, 0, 0.255905, 0.204275]
})

# Merge and filter
merged_df = pd.concat([df1, df2], on=['seed', 'feat_sel', 'models', 'model_params'], join='inner')

# Display the result
print(merged_df)


TypeError: concat() got an unexpected keyword argument 'on'

In [180]:
df1

Unnamed: 0,ID,Name,Score
0,1,Alice,85
1,2,Bob,92
2,3,Charlie,78
3,4,David,95
4,5,Eve,88


In [181]:
df2

Unnamed: 0,ID,Name
0,2,Bob
1,4,David


In [236]:
pd.DataFrame.from_dict({"adefwc":23, "cdcdacdw":34, "adecwfwc":232, "cdcdacdwcw":354, "adewcfwc":623, "cdcdacdw":314}, index=0)

TypeError: DataFrame.from_dict() got an unexpected keyword argument 'index'

In [238]:
def dict_to_dataframe(input_dict):
    df = pd.DataFrame(list(input_dict.items()), columns=['features', 'feature_importance'])
    return df

In [239]:
dict_to_dataframe({"adefwc":23, "cdcdacdw":34, "adecwfwc":232, "cdcdacdwcw":354, "adewcfwc":623, "cdcdacdw":314})

Unnamed: 0,features,feature_importance
0,adefwc,23
1,cdcdacdw,314
2,adecwfwc,232
3,cdcdacdwcw,354
4,adewcfwc,623


In [44]:
from utils import prep_test
import pandas as pd

traindf = pd.read_csv("allTrain.tsv", sep="\t", low_memory=True)
testdf = pd.read_csv("testData.tsv", sep="\t", low_memory=True, index_col=0)
_, _, _, _, X_train, y_train = get_train_test_split(traindf)
X_test = prep_test(testdf)

In [5]:
X_test.shape

(9, 15989)

In [6]:
X_train.shape

(48, 15989)

In [47]:
np.all(X_train.columns == X_test.columns)

True

In [46]:
diff_cols = X_train.columns != X_test.columns
test_cols = X_test.iloc[:, diff_cols].columns.to_list()
train_cols = X_train.iloc[:, diff_cols].columns.tolist()
rename_cols = dict(zip(test_cols, train_cols))
X_test = X_test.rename(rename_cols, axis=1)

In [19]:
cols = X_test.iloc[:,(X_train.columns!=X_test.columns)].columns.to_list()
newcols = [f'{col.split(".")[0]}-{col.split(".")[1]}' for col in cols]

In [37]:
cols = X_test.iloc[:,(X_train.columns!=X_test.columns)].columns.to_list()
cols

['rna_CTB-178M22',
 'rna_CTC-338M12',
 'rna_GS1-124K5',
 'rna_DTX2P1-UPK3BP1',
 'rna_GS1-259H13',
 'rna_STAG3L5P-PVRIG2P',
 'rna_CH17-360D5',
 'rna_THRA1-BTR',
 'rna_CH17-340M24']

In [35]:
rename_cols = dict(zip(cols,newcols))
X_test = X_test.rename(rename_cols, axis=1)

In [38]:
train_cols = X_train.iloc[:,(X_train.columns!=X_test.columns)].columns.tolist()

In [39]:
train_cols

['rna_CTB-178M22.2',
 'rna_CTC-338M12.4',
 'rna_GS1-124K5.11',
 'rna_DTX2P1-UPK3BP1-PMS2P11',
 'rna_GS1-259H13.2',
 'rna_STAG3L5P-PVRIG2P-PILRB',
 'rna_CH17-360D5.1',
 'rna_THRA1/BTR',
 'rna_CH17-340M24.3']