In [2]:
import pandas as pd
import wandb

In [3]:
# Open a W&B public-API client and request the runs stored under
# "<entity>/<project>".
api = wandb.Api()
entity = "pratik24111991"
project = "TFBS_Finetuned_Models_Work"
runs = api.runs(f"{entity}/{project}")

In [4]:
# Walk the runs once, capturing (summary, config, name, tags) for each run.
run_rows = []
for run in runs:
    # Output metrics of the run; `_json_dict` is used (rather than the summary
    # object itself) to omit large files.
    run_summary = run.summary._json_dict

    # Hyperparameters of the run, minus the special keys that start with "_".
    run_config = {
        key: value for key, value in run.config.items() if not key.startswith("_")
    }

    # `run.name` is the human-readable name of the run.
    run_rows.append((run_summary, run_config, run.name, run.tags))

# Split the collected rows into one list per field, in the same order as above.
summary_list, config_list, name_list, tag_list = (
    [row[position] for row in run_rows] for position in range(4)
)

# One row per run; `summary` and `config` hold dicts, `tags` holds a list.
runs_df = pd.DataFrame(
    {
        "summary": summary_list,
        "config": config_list,
        "name": name_list,
        "tags": tag_list,
    }
)

In [5]:
# Display the per-run table (summary, config, name, tags) built from the W&B runs.
runs_df

Unnamed: 0,summary,config,name,tags
0,{},"{'epochs': 10, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_March_12th_2024_00_02_05_logstep=...,"[H3K79me1, TFBS_NonTFBS, e-5]"
1,{},"{'epochs': 10, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_March_11th_2024_22_12_45_logstep=...,"[H3K79me1, TFBS_NonTFBS, e-4]"
2,{},"{'epochs': 10, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_March_11th_2024_20_39_45_logstep=...,"[H3K79me1, TFBS_NonTFBS, e-3]"
3,{},"{'epochs': 10, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_March_11th_2024_18_52_35_logstep=...,"[H3K79me1, TFBS_NonTFBS, e-6]"
4,"{'_wandb': {'runtime': 35114}, 'eval_f1': 0.76...","{'epochs': 10, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_March_11th_2024_14_16_40_logstep=...,"[H3K79me1, TFBS_NonTFBS, e-5]"
...,...,...,...,...
2355,"{'eval_f1': 0.33504827683416727, 'eval_precisi...","{'epochs': 10, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_February_27th_2024_19_05_22_logst...,"[SAP30, TFBS_NonTFBS, e-3]"
2356,"{'_step': 324, '_wandb': {'runtime': 693}, 'ev...","{'epochs': 10, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_February_27th_2024_18_56_57_logst...,"[SAP30, TFBS_NonTFBS, e-6]"
2357,"{'_timestamp': 1709078760.263911, 'train_loss'...","{'epochs': 10, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_February_27th_2024_18_55_50_logst...,"[SAP30, TFBS_NonTFBS, e-5]"
2358,"{'eval_mcc': 0.7767198869762932, 'eval_precisi...","{'epochs': 10, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_February_27th_2024_18_55_00_logst...,"[SAP30, TFBS_NonTFBS, e-4]"


In [6]:
# Helper that strips unwanted entries from a run's tag list.
def filter_tags(tags):
    """Return the tags that contain neither '-' nor 'TFBS'.

    Args:
        tags: Iterable of tag strings attached to a run.

    Returns:
        A new list with the surviving tags, in their original order.

    Note:
        The '-' test is a plain substring check, so any tag containing a
        hyphen is removed.
    """
    kept = []
    for tag in tags:
        # Skip the tag as soon as either marker substring is present.
        if '-' in tag or 'TFBS' in tag:
            continue
        kept.append(tag)
    return kept

In [7]:
# Replace each run's tag list with the subset that filter_tags keeps.
filtered_tags = runs_df['tags'].apply(filter_tags)
runs_df['tags'] = filtered_tags

In [8]:
# Display the table again to inspect the `tags` column after filtering.
runs_df

Unnamed: 0,summary,config,name,tags
0,{},"{'epochs': 10, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_March_12th_2024_00_02_05_logstep=...,[H3K79me1]
1,{},"{'epochs': 10, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_March_11th_2024_22_12_45_logstep=...,[H3K79me1]
2,{},"{'epochs': 10, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_March_11th_2024_20_39_45_logstep=...,[H3K79me1]
3,{},"{'epochs': 10, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_March_11th_2024_18_52_35_logstep=...,[H3K79me1]
4,"{'_wandb': {'runtime': 35114}, 'eval_f1': 0.76...","{'epochs': 10, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_March_11th_2024_14_16_40_logstep=...,[H3K79me1]
...,...,...,...,...
2355,"{'eval_f1': 0.33504827683416727, 'eval_precisi...","{'epochs': 10, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_February_27th_2024_19_05_22_logst...,[SAP30]
2356,"{'_step': 324, '_wandb': {'runtime': 693}, 'ev...","{'epochs': 10, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_February_27th_2024_18_56_57_logst...,[SAP30]
2357,"{'_timestamp': 1709078760.263911, 'train_loss'...","{'epochs': 10, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_February_27th_2024_18_55_50_logst...,[SAP30]
2358,"{'eval_mcc': 0.7767198869762932, 'eval_precisi...","{'epochs': 10, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_February_27th_2024_18_55_00_logst...,[SAP30]


In [9]:
# Flatten every run's summary dict into its own columns
# (nested keys become dotted column names such as `_wandb.runtime`).
summaries_df = pd.json_normalize(runs_df['summary'])

# Put the flattened summary columns next to the remaining run columns,
# leaving out the original dict-valued `summary` column.
runs_without_summary = runs_df.drop(columns='summary')
expanded_runs_df = pd.concat([runs_without_summary, summaries_df], axis=1)

# Collapse each tag list to its first element; an empty list becomes None.
expanded_runs_df['tags'] = expanded_runs_df['tags'].apply(
    lambda tag_list: tag_list[0] if tag_list else None
)

In [10]:
# Display the frame with the summary metrics expanded into separate columns.
expanded_runs_df

Unnamed: 0,config,name,tags,eval_f1,eval_mcc,eval_recall,learning_rate,_step,eval_precision,eval_auc,...,eval_Validation loss,eval_acc,_wandb.runtime,Confusion Matrix._type,Confusion Matrix.width,Confusion Matrix.format,Confusion Matrix.height,Confusion Matrix.sha256,Confusion Matrix.path,Confusion Matrix.size
0,"{'epochs': 10, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_March_12th_2024_00_02_05_logstep=...,H3K79me1,,,,,,,,...,,,,,,,,,,
1,"{'epochs': 10, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_March_11th_2024_22_12_45_logstep=...,H3K79me1,,,,,,,,...,,,,,,,,,,
2,"{'epochs': 10, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_March_11th_2024_20_39_45_logstep=...,H3K79me1,,,,,,,,...,,,,,,,,,,
3,"{'epochs': 10, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_March_11th_2024_18_52_35_logstep=...,H3K79me1,,,,,,,,...,,,,,,,,,,
4,"{'epochs': 10, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_March_11th_2024_14_16_40_logstep=...,H3K79me1,0.763926,0.532850,0.764757,1.114784e-06,16329.0,0.768103,0.834585,...,0.488250,0.764634,35114.0,image-file,1200.0,png,1200.0,581d69cdd53dde6db39ee0b73f5649cdabc20e61220942...,media/images/Confusion Matrix_10886_581d69cdd5...,61277.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2355,"{'epochs': 10, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_February_27th_2024_19_05_22_logst...,SAP30,0.335048,0.000000,0.500000,3.873874e-04,327.0,0.251934,0.505400,...,0.693131,0.503869,692.0,image-file,1200.0,png,1200.0,8bb7708fa5de7feeb1d06e439d1d08483f9f17af9479c6...,media/images/Confusion Matrix_218_8bb7708fa5de...,49619.0
2356,"{'epochs': 10, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_February_27th_2024_18_56_57_logst...,SAP30,0.797994,0.597406,0.798311,1.381381e-07,324.0,0.799096,0.879813,...,0.537199,0.798092,693.0,image-file,1200.0,png,1200.0,5a78816cab0e1fa1c6a6f02ac5e67d8fb0dd137cfec7c4...,media/images/Confusion Matrix_216_5a78816cab0e...,57947.0
2357,"{'epochs': 10, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_February_27th_2024_18_55_50_logst...,SAP30,0.899477,0.799286,0.899412,1.381381e-06,324.0,0.899874,0.962615,...,0.245715,0.899523,700.0,image-file,1200.0,png,1200.0,265f5743eafafb5593d85b3f5b25a8bde9f25d156c0678...,media/images/Confusion Matrix_216_265f5743eafa...,59356.0
2358,"{'epochs': 10, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_February_27th_2024_18_55_00_logst...,SAP30,0.887354,0.776720,0.887280,1.381381e-05,324.0,0.889443,0.951327,...,0.341513,0.887546,696.0,image-file,1200.0,png,1200.0,7375fb8417576ec4fc3d46996dd222308076be3b318ac5...,media/images/Confusion Matrix_216_7375fb841757...,58933.0


In [11]:
# Remove rows with any NaN values and keep each row's original label in a new
# `index` column.
#
# NOTE: dropna() without `subset` also discards runs that are missing any
# single column (for example one of the `Confusion Matrix.*` fields), not only
# runs without metrics.
#
# The guard makes this cell safe to re-run. The cell overwrites
# `expanded_runs_df` with its own output, so an unguarded second execution
# would call reset_index() on the already-processed frame, inserting a
# spurious `level_0` column, and a third execution would raise ValueError.
# The presence of the `index` column marks the step as already applied.
if "index" not in expanded_runs_df.columns:
    expanded_runs_df = expanded_runs_df.dropna().reset_index()
expanded_runs_df

Unnamed: 0,index,config,name,tags,eval_f1,eval_mcc,eval_recall,learning_rate,_step,eval_precision,...,eval_Validation loss,eval_acc,_wandb.runtime,Confusion Matrix._type,Confusion Matrix.width,Confusion Matrix.format,Confusion Matrix.height,Confusion Matrix.sha256,Confusion Matrix.path,Confusion Matrix.size
0,4,"{'epochs': 10, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_March_11th_2024_14_16_40_logstep=...,H3K79me1,0.763926,0.532850,0.764757,1.114784e-06,16329.0,0.768103,...,0.488250,0.764634,35114.0,image-file,1200.0,png,1200.0,581d69cdd53dde6db39ee0b73f5649cdabc20e61220942...,media/images/Confusion Matrix_10886_581d69cdd5...,61277.0
1,5,"{'epochs': 10, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_March_11th_2024_14_02_14_logstep=...,ZEB1,0.834396,0.674062,0.834274,3.787879e-06,390.0,0.839811,...,0.383239,0.835277,856.0,image-file,1200.0,png,1200.0,feb43d53ed3ba438f719cd1cb72616505e348922b9a060...,media/images/Confusion Matrix_260_feb43d53ed3b...,55995.0
2,6,"{'epochs': 10, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_March_11th_2024_13_47_46_logstep=...,ZEB1,0.830014,0.663142,0.829843,1.262626e-06,390.0,0.833308,...,0.389353,0.830630,855.0,image-file,1200.0,png,1200.0,741d5e1c7589a93cfc1cdbafbd42cc5ad60c64a834096d...,media/images/Confusion Matrix_260_741d5e1c7589...,59693.0
3,7,"{'epochs': 10, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_March_11th_2024_13_33_19_logstep=...,ZEB1,0.833570,0.672659,0.833462,3.787879e-06,390.0,0.839222,...,0.384626,0.834488,854.0,image-file,1200.0,png,1200.0,8c4c8962d28f9d8b77e05b0483b0b5026bba83e93398cf...,media/images/Confusion Matrix_260_8c4c8962d28f...,56735.0
4,8,"{'epochs': 10, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_March_11th_2024_13_18_53_logstep=...,ZEB1,0.829924,0.662975,0.829754,1.262626e-06,390.0,0.833230,...,0.389356,0.830543,856.0,image-file,1200.0,png,1200.0,7ce8762ff6041058de4616b26a350952983759cf1e667d...,media/images/Confusion Matrix_260_7ce8762ff604...,59942.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2350,2355,"{'epochs': 10, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_February_27th_2024_19_05_22_logst...,SAP30,0.335048,0.000000,0.500000,3.873874e-04,327.0,0.251934,...,0.693131,0.503869,692.0,image-file,1200.0,png,1200.0,8bb7708fa5de7feeb1d06e439d1d08483f9f17af9479c6...,media/images/Confusion Matrix_218_8bb7708fa5de...,49619.0
2351,2356,"{'epochs': 10, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_February_27th_2024_18_56_57_logst...,SAP30,0.797994,0.597406,0.798311,1.381381e-07,324.0,0.799096,...,0.537199,0.798092,693.0,image-file,1200.0,png,1200.0,5a78816cab0e1fa1c6a6f02ac5e67d8fb0dd137cfec7c4...,media/images/Confusion Matrix_216_5a78816cab0e...,57947.0
2352,2357,"{'epochs': 10, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_February_27th_2024_18_55_50_logst...,SAP30,0.899477,0.799286,0.899412,1.381381e-06,324.0,0.899874,...,0.245715,0.899523,700.0,image-file,1200.0,png,1200.0,265f5743eafafb5593d85b3f5b25a8bde9f25d156c0678...,media/images/Confusion Matrix_216_265f5743eafa...,59356.0
2353,2358,"{'epochs': 10, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_February_27th_2024_18_55_00_logst...,SAP30,0.887354,0.776720,0.887280,1.381381e-05,324.0,0.889443,...,0.341513,0.887546,696.0,image-file,1200.0,png,1200.0,7375fb8417576ec4fc3d46996dd222308076be3b318ac5...,media/images/Confusion Matrix_216_7375fb841757...,58933.0


In [12]:
# For every tag, find the row label of the run with the highest `eval_acc`.
acc_by_tag = expanded_runs_df.groupby('tags')['eval_acc']
idx = acc_by_tag.idxmax()

# Pull those rows out of expanded_runs_df and renumber them from 0.
best_acc_df = expanded_runs_df.loc[idx]
best_acc_df = best_acc_df.reset_index(drop=True)
best_acc_df

Unnamed: 0,index,config,name,tags,eval_f1,eval_mcc,eval_recall,learning_rate,_step,eval_precision,...,eval_Validation loss,eval_acc,_wandb.runtime,Confusion Matrix._type,Confusion Matrix.width,Confusion Matrix.format,Confusion Matrix.height,Confusion Matrix.sha256,Confusion Matrix.path,Confusion Matrix.size
0,119,"{'epochs': 10, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_March_11th_2024_07_48_56_logstep=...,AGO2,0.895784,0.794619,0.895822,0.000004,633.0,0.898803,...,0.298919,0.895999,1362.0,image-file,1200.0,png,1200.0,958cd42b536bc3994165310ae2cbd90fbd514c41e93d89...,media/images/Confusion Matrix_422_958cd42b536b...,60413.0
1,1070,"{'epochs': 10, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_March_03th_2024_07_58_34_logstep=...,ARID3A,0.763331,0.533582,0.764785,0.000036,795.0,0.768812,...,0.765750,0.764071,1699.0,image-file,1200.0,png,1200.0,9790b8d093b46b5410432abe0ae5c05698c8f6338f0c86...,media/images/Confusion Matrix_530_9790b8d093b4...,59950.0
2,307,"{'epochs': 10, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_March_10th_2024_18_51_55_logstep=...,ARNT,0.750420,0.501429,0.750620,0.000121,873.0,0.750809,...,0.593425,0.750443,1831.0,image-file,1200.0,png,1200.0,4654e6e9f4b65a33e5232fe1c4e4f3ad0d152246fc91aa...,media/images/Confusion Matrix_582_4654e6e9f4b6...,58394.0
3,1720,"{'epochs': 10, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_February_29th_2024_20_07_02_logst...,BLC3,0.908875,0.818516,0.908947,0.000034,3555.0,0.909569,...,0.361423,0.908906,7580.0,image-file,1200.0,png,1200.0,62feb30361869d99e534374c07accb544ecca05e54c11e...,media/images/Confusion Matrix_2370_62feb303618...,62161.0
4,1295,"{'epochs': 10, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_March_02th_2024_07_41_10_logstep=...,BRD9,0.738001,0.478249,0.738164,0.000004,243.0,0.740089,...,0.512739,0.738697,534.0,image-file,1200.0,png,1200.0,0b36807bba0d5ceda023762edbaf45d0b463b29bbec878...,media/images/Confusion Matrix_162_0b36807bba0d...,58948.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,2295,"{'epochs': 10, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_February_27th_2024_22_06_01_logst...,ZNF652,0.851404,0.702968,0.851480,0.000137,228.0,0.851488,...,0.373125,0.851404,490.0,image-file,1200.0,png,1200.0,1ac2070351ebd1bceb2e5d4f27380f1ea6667dc82f6400...,media/images/Confusion Matrix_152_1ac2070351eb...,55152.0
95,160,"{'epochs': 10, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_March_11th_2024_06_11_19_logstep=...,ZNF692,0.831956,0.664753,0.831936,0.000129,486.0,0.832817,...,0.425019,0.832101,1025.0,image-file,1200.0,png,1200.0,d0d6ce66dba2ae38fd584c6e040cfc876601c15f1f4a5a...,media/images/Confusion Matrix_324_d0d6ce66dba2...,55171.0
96,621,"{'epochs': 10, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_March_09th_2024_03_13_54_logstep=...,ZNF792,0.808163,0.621292,0.808337,0.000126,408.0,0.812972,...,0.530071,0.809028,862.0,image-file,1200.0,png,1200.0,0a705a0c192a9c707044695c28c4b400f4158812f32126...,media/images/Confusion Matrix_272_0a705a0c192a...,59776.0
97,1693,"{'epochs': 10, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_March_01th_2024_02_09_02_logstep=...,ZSCAN9,0.812409,0.625169,0.812420,0.000132,282.0,0.812749,...,0.483263,0.812469,601.0,image-file,1200.0,png,1200.0,e5c3db86ea9e90d0c522f3bbd8795316ce7d735f6f33ba...,media/images/Confusion Matrix_188_e5c3db86ea9e...,54036.0


In [13]:
# Rank the per-tag best runs from highest to lowest accuracy, showing only the
# tag and its score.
ranking_columns = ['tags', 'eval_acc']
best_acc_df[ranking_columns].sort_values(by="eval_acc", ascending=False)

Unnamed: 0,tags,eval_acc
48,RBM14,0.968908
57,SPI1,0.957358
63,THAP1,0.937964
12,E2F4,0.918800
45,RAD212,0.915177
...,...,...
36,MYBL2,0.739527
4,BRD9,0.738697
80,ZNF318,0.731597
28,ILF3,0.715873
