In [1]:
import pandas as pd
import wandb

In [2]:
# Client for the W&B public API.
api = wandb.Api()

# The project is addressed as "<entity>/<project>".
entity = "pratik24111991"
project = "TFBS_Finetuned_Models_Work"
runs = api.runs(f"{entity}/{project}")

In [8]:
# Parallel lists: position i in each list describes the i-th run.
summary_list = []
config_list = []
name_list = []
tag_list = []

for run in runs:
    # Summary holds the run's output keys/values (metrics such as accuracy).
    # Reading ._json_dict omits large files.
    summary_list.append(run.summary._json_dict)

    # Config holds the hyperparameters; special keys starting with "_" are skipped.
    public_config = {}
    for key, value in run.config.items():
        if key.startswith("_"):
            continue
        public_config[key] = value
    config_list.append(public_config)

    # Human-readable run name, and the tags attached to the run.
    name_list.append(run.name)
    tag_list.append(run.tags)

# One row per run; 'summary' and 'config' are kept as whole objects in single columns.
run_columns = {
    "summary": summary_list,
    "config": config_list,
    "name": name_list,
    "tags": tag_list,
}
runs_df = pd.DataFrame(run_columns)

In [9]:
# Display the collected runs (columns: summary, config, name, tags).
runs_df

Unnamed: 0,summary,config,name,tags
0,"{'_step': 2151, 'eval_f1': 0.33346292593170573...","{'epochs': 10, 'Dropout': 0.1, 'batch_size': 4...",TFBS_NonTFBS_April_10th_2024_21_32_52_logstep=...,"[H2AFZ, TFBS_NonTFBS, e-3]"
1,"{'_runtime': 41269.81987476349, '_timestamp': ...","{'epochs': 10, 'Dropout': 0.1, 'batch_size': 4...",TFBS_NonTFBS_April_10th_2024_08_31_51_logstep=...,"[H2AFZ, TFBS_NonTFBS, e-3]"
2,"{'eval_acc': 0.5002916400379234, 'eval_auc': 0...","{'epochs': 10, 'Dropout': 0.1, 'batch_size': 4...",TFBS_NonTFBS_April_09th_2024_20_13_20_logstep=...,"[H2AFZ, TFBS_NonTFBS, e-3]"
3,"{'eval_Validation loss': 0.6931685654233757, '...","{'epochs': 10, 'Dropout': 0.1, 'batch_size': 4...",TFBS_NonTFBS_April_09th_2024_07_06_31_logstep=...,"[H2AFZ, TFBS_NonTFBS, e-3]"
4,"{'_wandb': {'runtime': 72}, '_runtime': 68.505...","{'epochs': 10, 'Dropout': 0.1, 'batch_size': 4...",TFBS_NonTFBS_April_09th_2024_07_05_07_logstep=...,"[TFBS_NonTFBS, ZNF830, e-3]"
...,...,...,...,...
5346,"{'eval_f1': 0.33504827683416727, '_timestamp':...","{'epochs': 10, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_February_27th_2024_19_05_22_logst...,"[SAP30, TFBS_NonTFBS, e-3]"
5347,"{'_wandb': {'runtime': 693}, '_timestamp': 170...","{'epochs': 10, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_February_27th_2024_18_56_57_logst...,"[SAP30, TFBS_NonTFBS, e-6]"
5348,"{'_step': 324, 'train_loss': 0.242748406336263...","{'epochs': 10, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_February_27th_2024_18_55_50_logst...,"[SAP30, TFBS_NonTFBS, e-5]"
5349,"{'_step': 324, 'eval_recall': 0.88727966698592...","{'epochs': 10, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_February_27th_2024_18_55_00_logst...,"[SAP30, TFBS_NonTFBS, e-4]"


In [10]:
# Tag filter used to strip bookkeeping tags from each run's tag list.
def filter_tags(tags):
    """Return the tags that contain neither a hyphen nor the substring 'TFBS'.

    Args:
        tags: Iterable of tag strings attached to a run.

    Returns:
        A new list holding the surviving tags in their original order.

    NOTE(review): every tag containing '-' is dropped, so a hyphenated
    name would be removed as well - confirm no such tags are expected.
    """
    excluded_markers = ('-', 'TFBS')
    kept = []
    for tag in tags:
        # Skip the tag as soon as any excluded marker occurs inside it.
        if any(marker in tag for marker in excluded_markers):
            continue
        kept.append(tag)
    return kept

In [11]:
# Replace each run's tag list with its filtered version (see filter_tags).
cleaned_tags = runs_df['tags'].apply(filter_tags)
runs_df['tags'] = cleaned_tags

In [12]:
# Display the runs again to check the result of the tag filtering.
runs_df

Unnamed: 0,summary,config,name,tags
0,"{'_step': 2151, 'eval_f1': 0.33346292593170573...","{'epochs': 10, 'Dropout': 0.1, 'batch_size': 4...",TFBS_NonTFBS_April_10th_2024_21_32_52_logstep=...,[H2AFZ]
1,"{'_runtime': 41269.81987476349, '_timestamp': ...","{'epochs': 10, 'Dropout': 0.1, 'batch_size': 4...",TFBS_NonTFBS_April_10th_2024_08_31_51_logstep=...,[H2AFZ]
2,"{'eval_acc': 0.5002916400379234, 'eval_auc': 0...","{'epochs': 10, 'Dropout': 0.1, 'batch_size': 4...",TFBS_NonTFBS_April_09th_2024_20_13_20_logstep=...,[H2AFZ]
3,"{'eval_Validation loss': 0.6931685654233757, '...","{'epochs': 10, 'Dropout': 0.1, 'batch_size': 4...",TFBS_NonTFBS_April_09th_2024_07_06_31_logstep=...,[H2AFZ]
4,"{'_wandb': {'runtime': 72}, '_runtime': 68.505...","{'epochs': 10, 'Dropout': 0.1, 'batch_size': 4...",TFBS_NonTFBS_April_09th_2024_07_05_07_logstep=...,[ZNF830]
...,...,...,...,...
5346,"{'eval_f1': 0.33504827683416727, '_timestamp':...","{'epochs': 10, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_February_27th_2024_19_05_22_logst...,[SAP30]
5347,"{'_wandb': {'runtime': 693}, '_timestamp': 170...","{'epochs': 10, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_February_27th_2024_18_56_57_logst...,[SAP30]
5348,"{'_step': 324, 'train_loss': 0.242748406336263...","{'epochs': 10, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_February_27th_2024_18_55_50_logst...,[SAP30]
5349,"{'_step': 324, 'eval_recall': 0.88727966698592...","{'epochs': 10, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_February_27th_2024_18_55_00_logst...,[SAP30]


In [13]:
# Flatten every run's summary dict into columns; nested keys become dotted
# column names (e.g. '_wandb.runtime').
summaries_df = pd.json_normalize(runs_df['summary'])

# Put the flattened summary columns side by side with the remaining run
# columns, leaving out the original nested 'summary' column.
runs_without_summary = runs_df.drop(columns='summary')
expanded_runs_df = pd.concat([runs_without_summary, summaries_df], axis=1)

# Collapse each tag list to its first entry; an empty list becomes None.
expanded_runs_df['tags'] = expanded_runs_df['tags'].apply(
    lambda tags: tags[0] if tags else None
)

In [14]:
# Display the expanded frame: run columns plus one column per flattened summary key.
expanded_runs_df

Unnamed: 0,config,name,tags,_step,eval_f1,_runtime,eval_acc,train_loss,eval_precision,eval_auc,...,learning_rate,eval_Validation loss,Confusion Matrix.size,Confusion Matrix._type,Confusion Matrix.width,Confusion Matrix.format,Confusion Matrix.height,Confusion Matrix.sha256,Confusion Matrix.path,_wandb.runtime
0,"{'epochs': 10, 'Dropout': 0.1, 'batch_size': 4...",TFBS_NonTFBS_April_10th_2024_21_32_52_logstep=...,H2AFZ,2151.0,0.333463,8522.936523,0.500292,0.633849,0.250146,0.500162,...,2.833682e-03,0.693147,54721.0,image-file,1200.0,png,1200.0,736401ff65e7992b9d14637f01dd77c572e67e0f28a9ca...,media/images/Confusion Matrix_0_736401ff65e799...,
1,"{'epochs': 10, 'Dropout': 0.1, 'batch_size': 4...",TFBS_NonTFBS_April_10th_2024_08_31_51_logstep=...,H2AFZ,12906.0,0.333463,41269.819875,0.500292,0.693174,0.250146,0.499568,...,1.118080e-04,0.693166,54721.0,image-file,1200.0,png,1200.0,736401ff65e7992b9d14637f01dd77c572e67e0f28a9ca...,media/images/Confusion Matrix_10755_736401ff65...,46848.0
2,"{'epochs': 10, 'Dropout': 0.1, 'batch_size': 4...",TFBS_NonTFBS_April_09th_2024_20_13_20_logstep=...,H2AFZ,12906.0,0.333463,41078.283773,0.500292,0.693245,0.250146,0.499269,...,3.354239e-04,0.693165,54721.0,image-file,1200.0,png,1200.0,736401ff65e7992b9d14637f01dd77c572e67e0f28a9ca...,media/images/Confusion Matrix_10755_736401ff65...,44296.0
3,"{'epochs': 10, 'Dropout': 0.1, 'batch_size': 4...",TFBS_NonTFBS_April_09th_2024_07_06_31_logstep=...,H2AFZ,12906.0,0.333463,41565.112284,0.500292,0.693176,0.250146,0.500000,...,1.118080e-04,0.693169,54721.0,image-file,1200.0,png,1200.0,736401ff65e7992b9d14637f01dd77c572e67e0f28a9ca...,media/images/Confusion Matrix_10755_736401ff65...,47193.0
4,"{'epochs': 10, 'Dropout': 0.1, 'batch_size': 4...",TFBS_NonTFBS_April_09th_2024_07_05_07_logstep=...,ZNF830,20.0,0.338577,68.505405,0.511892,1.507313,0.255946,0.541137,...,0.000000e+00,1.382536,49393.0,image-file,1200.0,png,1200.0,af24858423cc896b0236c1ba6394a82524545e7a9f73d6...,media/images/Confusion Matrix_19_af24858423cc8...,72.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5346,"{'epochs': 10, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_February_27th_2024_19_05_22_logst...,SAP30,327.0,0.335048,606.565690,0.503869,0.697234,0.251934,0.505400,...,3.873874e-04,0.693131,49619.0,image-file,1200.0,png,1200.0,8bb7708fa5de7feeb1d06e439d1d08483f9f17af9479c6...,media/images/Confusion Matrix_218_8bb7708fa5de...,692.0
5347,"{'epochs': 10, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_February_27th_2024_18_56_57_logst...,SAP30,324.0,0.797994,603.007237,0.798092,0.547333,0.799096,0.879813,...,1.381381e-07,0.537199,57947.0,image-file,1200.0,png,1200.0,5a78816cab0e1fa1c6a6f02ac5e67d8fb0dd137cfec7c4...,media/images/Confusion Matrix_216_5a78816cab0e...,693.0
5348,"{'epochs': 10, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_February_27th_2024_18_55_50_logst...,SAP30,324.0,0.899477,608.870517,0.899523,0.242748,0.899874,0.962615,...,1.381381e-06,0.245715,59356.0,image-file,1200.0,png,1200.0,265f5743eafafb5593d85b3f5b25a8bde9f25d156c0678...,media/images/Confusion Matrix_216_265f5743eafa...,700.0
5349,"{'epochs': 10, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_February_27th_2024_18_55_00_logst...,SAP30,324.0,0.887354,604.127964,0.887546,0.171405,0.889443,0.951327,...,1.381381e-05,0.341513,58933.0,image-file,1200.0,png,1200.0,7375fb8417576ec4fc3d46996dd222308076be3b318ac5...,media/images/Confusion Matrix_216_7375fb841757...,696.0


In [15]:
# Keep only the runs that have no missing value in ANY column - a NaN in a
# metric or in a metadata column alike removes the whole row.
complete_runs = expanded_runs_df.dropna(axis=0, how='any')

# reset_index() without drop=True moves each surviving row's previous label
# into an 'index' column and renumbers the rows from 0.
# NOTE: this rebinds expanded_runs_df, so the cell is not idempotent -
# re-running it on the already-reset frame inserts a further index column.
expanded_runs_df = complete_runs.reset_index(drop=False)
expanded_runs_df

Unnamed: 0,index,config,name,tags,_step,eval_f1,_runtime,eval_acc,train_loss,eval_precision,...,learning_rate,eval_Validation loss,Confusion Matrix.size,Confusion Matrix._type,Confusion Matrix.width,Confusion Matrix.format,Confusion Matrix.height,Confusion Matrix.sha256,Confusion Matrix.path,_wandb.runtime
0,1,"{'epochs': 10, 'Dropout': 0.1, 'batch_size': 4...",TFBS_NonTFBS_April_10th_2024_08_31_51_logstep=...,H2AFZ,12906.0,0.333463,41269.819875,0.500292,0.693174,0.250146,...,1.118080e-04,0.693166,54721.0,image-file,1200.0,png,1200.0,736401ff65e7992b9d14637f01dd77c572e67e0f28a9ca...,media/images/Confusion Matrix_10755_736401ff65...,46848.0
1,2,"{'epochs': 10, 'Dropout': 0.1, 'batch_size': 4...",TFBS_NonTFBS_April_09th_2024_20_13_20_logstep=...,H2AFZ,12906.0,0.333463,41078.283773,0.500292,0.693245,0.250146,...,3.354239e-04,0.693165,54721.0,image-file,1200.0,png,1200.0,736401ff65e7992b9d14637f01dd77c572e67e0f28a9ca...,media/images/Confusion Matrix_10755_736401ff65...,44296.0
2,3,"{'epochs': 10, 'Dropout': 0.1, 'batch_size': 4...",TFBS_NonTFBS_April_09th_2024_07_06_31_logstep=...,H2AFZ,12906.0,0.333463,41565.112284,0.500292,0.693176,0.250146,...,1.118080e-04,0.693169,54721.0,image-file,1200.0,png,1200.0,736401ff65e7992b9d14637f01dd77c572e67e0f28a9ca...,media/images/Confusion Matrix_10755_736401ff65...,47193.0
3,4,"{'epochs': 10, 'Dropout': 0.1, 'batch_size': 4...",TFBS_NonTFBS_April_09th_2024_07_05_07_logstep=...,ZNF830,20.0,0.338577,68.505405,0.511892,1.507313,0.255946,...,0.000000e+00,1.382536,49393.0,image-file,1200.0,png,1200.0,af24858423cc896b0236c1ba6394a82524545e7a9f73d6...,media/images/Confusion Matrix_19_af24858423cc8...,72.0
4,5,"{'epochs': 10, 'Dropout': 0.1, 'batch_size': 4...",TFBS_NonTFBS_April_09th_2024_07_03_43_logstep=...,ZNF830,20.0,0.338577,68.498296,0.511892,0.697799,0.255946,...,0.000000e+00,0.693117,49393.0,image-file,1200.0,png,1200.0,af24858423cc896b0236c1ba6394a82524545e7a9f73d6...,media/images/Confusion Matrix_19_af24858423cc8...,72.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4901,5346,"{'epochs': 10, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_February_27th_2024_19_05_22_logst...,SAP30,327.0,0.335048,606.565690,0.503869,0.697234,0.251934,...,3.873874e-04,0.693131,49619.0,image-file,1200.0,png,1200.0,8bb7708fa5de7feeb1d06e439d1d08483f9f17af9479c6...,media/images/Confusion Matrix_218_8bb7708fa5de...,692.0
4902,5347,"{'epochs': 10, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_February_27th_2024_18_56_57_logst...,SAP30,324.0,0.797994,603.007237,0.798092,0.547333,0.799096,...,1.381381e-07,0.537199,57947.0,image-file,1200.0,png,1200.0,5a78816cab0e1fa1c6a6f02ac5e67d8fb0dd137cfec7c4...,media/images/Confusion Matrix_216_5a78816cab0e...,693.0
4903,5348,"{'epochs': 10, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_February_27th_2024_18_55_50_logst...,SAP30,324.0,0.899477,608.870517,0.899523,0.242748,0.899874,...,1.381381e-06,0.245715,59356.0,image-file,1200.0,png,1200.0,265f5743eafafb5593d85b3f5b25a8bde9f25d156c0678...,media/images/Confusion Matrix_216_265f5743eafa...,700.0
4904,5349,"{'epochs': 10, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_February_27th_2024_18_55_00_logst...,SAP30,324.0,0.887354,604.127964,0.887546,0.171405,0.889443,...,1.381381e-05,0.341513,58933.0,image-file,1200.0,png,1200.0,7375fb8417576ec4fc3d46996dd222308076be3b318ac5...,media/images/Confusion Matrix_216_7375fb841757...,696.0


In [16]:
# For every tag, find the row label of the run with the highest 'eval_acc'.
accuracy_by_tag = expanded_runs_df.groupby('tags')['eval_acc']
idx = accuracy_by_tag.idxmax()

# Pull those rows out by label (one row per tag) and renumber them from 0.
best_rows = expanded_runs_df.loc[idx]
best_acc_df = best_rows.reset_index(drop=True)
best_acc_df

Unnamed: 0,index,config,name,tags,_step,eval_f1,_runtime,eval_acc,train_loss,eval_precision,...,learning_rate,eval_Validation loss,Confusion Matrix.size,Confusion Matrix._type,Confusion Matrix.width,Confusion Matrix.format,Confusion Matrix.height,Confusion Matrix.sha256,Confusion Matrix.path,_wandb.runtime
0,1290,"{'epochs': 10, 'Dropout': 0.1, 'batch_size': 4...",TFBS_NonTFBS_April_02th_2024_01_59_24_logstep=...,ADNP,84.0,0.697605,195.097749,0.697791,0.491164,0.698848,...,0.000074,0.556304,59260.0,image-file,1200.0,png,1200.0,4cd68014477cf9a253946fb3a8e5731064dca356908b5e...,media/images/Confusion Matrix_72_4cd68014477cf...,210.0
1,1510,"{'epochs': 10, 'Dropout': 0.1, 'batch_size': 4...",TFBS_NonTFBS_April_01th_2024_00_56_04_logstep=...,AFF1,112.0,0.771229,252.408261,0.771503,0.412298,0.772159,...,0.000074,0.489762,56987.0,image-file,1200.0,png,1200.0,57538da06cb056c9ea737cee9856bb0904daea3344d1ba...,media/images/Confusion Matrix_96_57538da06cb05...,272.0
2,166,"{'epochs': 10, 'Dropout': 0.1, 'batch_size': 4...",TFBS_NonTFBS_April_08th_2024_09_51_03_logstep=...,AGO1,91.0,0.934558,214.870189,0.934578,0.173443,0.934964,...,0.000100,0.217270,54813.0,image-file,1200.0,png,1200.0,eb9f9661213cd7aa22f6c54b1df7910ca13684deb9f6da...,media/images/Confusion Matrix_78_eb9f9661213cd...,237.0
3,3110,"{'epochs': 10, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_March_11th_2024_07_48_56_logstep=...,AGO2,633.0,0.895784,1204.465479,0.895999,0.264199,0.898803,...,0.000004,0.298919,60413.0,image-file,1200.0,png,1200.0,958cd42b536bc3994165310ae2cbd90fbd514c41e93d89...,media/images/Confusion Matrix_422_958cd42b536b...,1362.0
4,2474,"{'epochs': 10, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_March_22th_2024_03_02_22_logstep=...,ARHGAP35,66.0,0.821313,125.135130,0.821501,0.333016,0.823799,...,0.000019,0.402552,57200.0,image-file,1200.0,png,1200.0,05bda9ec50bc339493dca11b5d394511f04b407205e0ee...,media/images/Confusion Matrix_44_05bda9ec50bc3...,154.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
419,1554,"{'epochs': 10, 'Dropout': 0.1, 'batch_size': 4...",TFBS_NonTFBS_March_29th_2024_22_13_39_logstep=...,ZSCAN4,105.0,0.872177,241.394078,0.872801,0.235353,0.879305,...,0.000051,0.378305,55101.0,image-file,1200.0,png,1200.0,8c84502b2c48ab7e127586b4b2de9ef4de9a25c6b57cda...,media/images/Confusion Matrix_90_8c84502b2c48a...,255.0
420,1496,"{'epochs': 10, 'Dropout': 0.1, 'batch_size': 4...",TFBS_NonTFBS_April_01th_2024_01_31_02_logstep=...,ZSCAN5A,40.0,0.868224,94.731095,0.868922,0.300181,0.874865,...,0.000000,0.311428,58513.0,image-file,1200.0,png,1200.0,15e3928e27a81126a8756b7a2874f005f41fb2d662555a...,media/images/Confusion Matrix_36_15e3928e27a81...,98.0
421,1862,"{'epochs': 10, 'Dropout': 0.1, 'batch_size': 4...",TFBS_NonTFBS_March_28th_2024_00_35_12_logstep=...,ZSCAN5C,80.0,0.781919,187.623710,0.781928,0.422188,0.782058,...,0.000000,0.471170,55333.0,image-file,1200.0,png,1200.0,93e14e1228775373425ea1ed716add7bee037c49401919...,media/images/Confusion Matrix_70_93e14e1228775...,192.0
422,4684,"{'epochs': 10, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_March_01th_2024_02_09_02_logstep=...,ZSCAN9,282.0,0.812409,525.080328,0.812469,0.307224,0.812749,...,0.000132,0.483263,54036.0,image-file,1200.0,png,1200.0,e5c3db86ea9e90d0c522f3bbd8795316ce7d735f6f33ba...,media/images/Confusion Matrix_188_e5c3db86ea9e...,601.0


In [17]:
# Rank the per-tag best runs by accuracy (highest first) and write every
# column, including the frame's index, to a tab-separated file.
ranked_best_df = best_acc_df.sort_values(by="eval_acc", ascending=False)
ranked_best_df.to_csv("All_TFBS_Models_stat.tsv", sep="\t")

In [18]:
# Show tag and accuracy only, ordered from the highest accuracy to the lowest.
accuracy_ranking = best_acc_df.sort_values(by="eval_acc", ascending=False)
accuracy_ranking[['tags', 'eval_acc']]

Unnamed: 0,tags,eval_acc
218,RBM14,0.968908
231,SAFB,0.968090
266,TAF15,0.959936
255,SPI1,0.957358
291,USF1,0.951848
...,...,...
82,H2AFZ,0.500292
95,H3K9me3,0.499966
89,H3K27ac,0.499856
86,H3F3A,0.499651


In [15]:
# Write the tag/accuracy ranking (highest accuracy first, index included)
# to a tab-separated file.
accuracy_table = best_acc_df.sort_values(by="eval_acc", ascending=False)
accuracy_table = accuracy_table[['tags', 'eval_acc']]
accuracy_table.to_csv("TFBS_accuracy_Stat.tsv", sep="\t")

In [23]:
# Select the per-tag best runs whose accuracy is at least 0.85, keeping only
# the tag, its config and the accuracy.
high_accuracy_mask = best_acc_df['eval_acc'] >= 0.85
top_models_df = best_acc_df.loc[high_accuracy_mask, ['tags', 'config', 'eval_acc']]

# Order from the highest accuracy down and save without the index column.
top_models_df = top_models_df.sort_values(by="eval_acc", ascending=False)
top_models_df.to_csv("top_TFBS_models.tsv", sep="\t", index=False)