In [1]:
import pandas as pd
import wandb

In [2]:
# Open a W&B public-API client and list the runs of the target project.
api = wandb.Api()
entity = "pratik24111991"
project = "TFBS_Finetuned_Models_300bp_balanced_Work"
runs = api.runs(f"{entity}/{project}")

In [3]:
# Collect, for every run, the four pieces of information used downstream.
summary_list, config_list, name_list, tag_list = [], [], [], []
for run in runs:
    # Output keys/values such as accuracy; ._json_dict is used to omit large files.
    summary_list.append(run.summary._json_dict)

    # Hyperparameters, leaving out the special entries whose key starts with "_".
    public_config = {
        key: value for key, value in run.config.items() if not key.startswith("_")
    }
    config_list.append(public_config)

    # Human-readable run name and the tags attached to the run.
    name_list.append(run.name)
    tag_list.append(run.tags)

# One row per run, one column per collected list.
runs_df = pd.DataFrame(
    dict(summary=summary_list, config=config_list, name=name_list, tags=tag_list)
)

In [4]:
# Display the runs table: one row per run with its summary, config, name and tags.
runs_df

Unnamed: 0,summary,config,name,tags
0,"{'eval_acc': 0.9300666666666668, '_timestamp':...","{'epochs': 15, 'Dropout': 0.1, 'batch_size': 8...",TFBS_NonTFBS_June_19th_2024_08_10_59_logstep=1...,"[SCRT2, TFBS_NonTFBS, e-6]"
1,"{'_wandb': {'runtime': 13694}, 'eval_f1': 0.89...","{'epochs': 15, 'Dropout': 0.1, 'batch_size': 8...",TFBS_NonTFBS_June_19th_2024_04_22_27_logstep=1...,"[SCRT2, TFBS_NonTFBS, e-6]"
2,"{'_runtime': 13608.572161197662, 'eval_acc': 0...","{'epochs': 15, 'Dropout': 0.1, 'batch_size': 8...",TFBS_NonTFBS_June_19th_2024_00_33_46_logstep=1...,"[SCRT2, TFBS_NonTFBS, e-6]"
3,"{'eval_f1': 0.89983610822139, 'eval_mcc': 0.80...","{'epochs': 15, 'Dropout': 0.1, 'batch_size': 8...",TFBS_NonTFBS_June_18th_2024_20_43_22_logstep=1...,"[SCRT2, TFBS_NonTFBS, e-6]"
4,"{'eval_recall': 0.9301429061214423, 'learning_...","{'epochs': 15, 'Dropout': 0.1, 'batch_size': 8...",TFBS_NonTFBS_June_18th_2024_16_49_31_logstep=1...,"[SCRT2, TFBS_NonTFBS, e-6]"
...,...,...,...,...
1156,{'_wandb': {'runtime': 32}},"{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_May_23th_2024_15_17_30_logstep=12...,"[CEBPA, TFBS_NonTFBS, e-4]"
1157,"{'_step': 1210, '_timestamp': 1716491787.54384...","{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_May_23th_2024_14_25_44_logstep=12...,"[CEBPA, TFBS_NonTFBS, e-4]"
1158,"{'eval_f1': 0.9104051651882584, 'eval_acc': 0....","{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_May_23th_2024_13_41_27_logstep=12...,"[CEBPA, TFBS_NonTFBS, e-4]"
1159,"{'_wandb': {'runtime': 2349}, '_timestamp': 17...","{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_May_23th_2024_13_02_04_logstep=12...,"[CEBPA, TFBS_NonTFBS, e-4]"


In [5]:
# Tag filter used to strip bookkeeping tags from each run's tag list.
def filter_tags(tags):
    """Return the tags that contain neither a hyphen nor the text 'TFBS'.

    With the tags seen in this project, e.g. ['SCRT2', 'TFBS_NonTFBS', 'e-6'],
    only 'SCRT2' is kept. NOTE(review): any tag containing '-' is dropped,
    not just 'e-N' style tags - confirm that is intended.

    Args:
        tags: iterable of tag strings.

    Returns:
        A new list with the surviving tags in their original order.
    """
    kept = []
    for label in tags:
        if '-' in label or 'TFBS' in label:
            continue
        kept.append(label)
    return kept

In [6]:
# Replace every run's tag list with the subset that passes filter_tags.
runs_df['tags'] = runs_df['tags'].map(filter_tags)

In [7]:
# Display the runs table again now that the 'tags' column has been filtered.
runs_df

Unnamed: 0,summary,config,name,tags
0,"{'eval_acc': 0.9300666666666668, '_timestamp':...","{'epochs': 15, 'Dropout': 0.1, 'batch_size': 8...",TFBS_NonTFBS_June_19th_2024_08_10_59_logstep=1...,[SCRT2]
1,"{'_wandb': {'runtime': 13694}, 'eval_f1': 0.89...","{'epochs': 15, 'Dropout': 0.1, 'batch_size': 8...",TFBS_NonTFBS_June_19th_2024_04_22_27_logstep=1...,[SCRT2]
2,"{'_runtime': 13608.572161197662, 'eval_acc': 0...","{'epochs': 15, 'Dropout': 0.1, 'batch_size': 8...",TFBS_NonTFBS_June_19th_2024_00_33_46_logstep=1...,[SCRT2]
3,"{'eval_f1': 0.89983610822139, 'eval_mcc': 0.80...","{'epochs': 15, 'Dropout': 0.1, 'batch_size': 8...",TFBS_NonTFBS_June_18th_2024_20_43_22_logstep=1...,[SCRT2]
4,"{'eval_recall': 0.9301429061214423, 'learning_...","{'epochs': 15, 'Dropout': 0.1, 'batch_size': 8...",TFBS_NonTFBS_June_18th_2024_16_49_31_logstep=1...,[SCRT2]
...,...,...,...,...
1156,{'_wandb': {'runtime': 32}},"{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_May_23th_2024_15_17_30_logstep=12...,[CEBPA]
1157,"{'_step': 1210, '_timestamp': 1716491787.54384...","{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_May_23th_2024_14_25_44_logstep=12...,[CEBPA]
1158,"{'eval_f1': 0.9104051651882584, 'eval_acc': 0....","{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_May_23th_2024_13_41_27_logstep=12...,[CEBPA]
1159,"{'_wandb': {'runtime': 2349}, '_timestamp': 17...","{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_May_23th_2024_13_02_04_logstep=12...,[CEBPA]


In [8]:
# Flatten each run's summary dict into its own columns; nested entries get
# dotted column names (e.g. '_wandb.runtime').
summaries_df = pd.json_normalize(runs_df['summary'])

# Put the flattened summary columns next to the remaining run columns,
# leaving out the original dict-valued 'summary' column.
expanded_runs_df = pd.concat(
    [runs_df.drop(columns='summary'), summaries_df],
    axis='columns',
)
# Reduce each tag list to its first entry; an empty list becomes None.
expanded_runs_df['tags'] = expanded_runs_df['tags'].apply(
    lambda run_tags: None if not run_tags else run_tags[0]
)

In [9]:
# Display the expanded table, where the summary entries are individual columns.
expanded_runs_df

Unnamed: 0,config,name,tags,eval_acc,_timestamp,train_loss,eval_recall,learning_rate,_step,_runtime,...,eval_mcc,_wandb.runtime,Confusion Matrix.path,Confusion Matrix.size,Confusion Matrix._type,Confusion Matrix.width,Confusion Matrix.format,Confusion Matrix.height,Confusion Matrix.sha256,eval_auc
0,"{'epochs': 15, 'Dropout': 0.1, 'batch_size': 8...",TFBS_NonTFBS_June_19th_2024_08_10_59_logstep=1...,SCRT2,0.930067,1.718813e+09,0.174585,0.930143,3.135670e-09,15930.0,13577.849416,...,0.860312,13676.0,media/images/Confusion Matrix_14337_58edeeb833...,56315.0,image-file,1200.0,png,1200.0,58edeeb83310816c653e5c169acebd84134eed71b5f882...,
1,"{'epochs': 15, 'Dropout': 0.1, 'batch_size': 8...",TFBS_NonTFBS_June_19th_2024_04_22_27_logstep=1...,SCRT2,0.899867,1.718799e+09,0.251097,0.900073,1.045223e-09,15930.0,13596.038440,...,0.800865,13694.0,media/images/Confusion Matrix_14337_ac55e7b8a8...,55230.0,image-file,1200.0,png,1200.0,ac55e7b8a8663037e00e98c46d9b8d879fc3a63523f3b8...,
2,"{'epochs': 15, 'Dropout': 0.1, 'batch_size': 8...",TFBS_NonTFBS_June_19th_2024_00_33_46_logstep=1...,SCRT2,0.930133,1.718785e+09,0.174591,0.930209,3.135670e-09,15930.0,13608.572161,...,0.860443,13705.0,media/images/Confusion Matrix_14337_12272ff236...,55664.0,image-file,1200.0,png,1200.0,12272ff236a53e5de41b1ab740d4fc725a5eea17ff0b84...,
3,"{'epochs': 15, 'Dropout': 0.1, 'batch_size': 8...",TFBS_NonTFBS_June_18th_2024_20_43_22_logstep=1...,SCRT2,0.899867,1.718772e+09,0.251097,0.900073,1.045223e-09,15930.0,13709.952305,...,0.800865,13807.0,media/images/Confusion Matrix_14337_ac55e7b8a8...,55230.0,image-file,1200.0,png,1200.0,ac55e7b8a8663037e00e98c46d9b8d879fc3a63523f3b8...,
4,"{'epochs': 15, 'Dropout': 0.1, 'batch_size': 8...",TFBS_NonTFBS_June_18th_2024_16_49_31_logstep=1...,SCRT2,0.930067,1.718758e+09,0.174585,0.930143,3.135670e-09,15930.0,13917.680234,...,0.860312,14015.0,media/images/Confusion Matrix_14337_58edeeb833...,56315.0,image-file,1200.0,png,1200.0,58edeeb83310816c653e5c169acebd84134eed71b5f882...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1156,"{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_May_23th_2024_15_17_30_logstep=12...,CEBPA,,,,,,,,...,,32.0,,,,,,,,
1157,"{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_May_23th_2024_14_25_44_logstep=12...,CEBPA,0.919467,1.716492e+09,0.009189,0.919688,5.420054e-06,1210.0,3042.108180,...,0.839686,3092.0,media/images/Confusion Matrix_1089_75b86fca852...,56594.0,image-file,1200.0,png,1200.0,75b86fca852c54fef06a8df9ab8ebd1b4491d9b20d2a1c...,0.970946
1158,"{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_May_23th_2024_13_41_27_logstep=12...,CEBPA,0.910467,1.716489e+09,0.054939,0.910881,1.806685e-06,1210.0,2593.578342,...,0.823382,2643.0,media/images/Confusion Matrix_1089_f6bb2fbf43e...,57386.0,image-file,1200.0,png,1200.0,f6bb2fbf43edc55602f863f690a0207d203b13ae6c95eb...,0.971116
1159,"{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_May_23th_2024_13_02_04_logstep=12...,CEBPA,0.918733,1.716486e+09,0.009411,0.919015,5.420054e-06,1210.0,2300.019405,...,0.838657,2349.0,media/images/Confusion Matrix_1089_58f1d4377b2...,56913.0,image-file,1200.0,png,1200.0,58f1d4377b2d3356df720b6e18140f5c4068c5642edb66...,0.970520


In [10]:
# # Disabled: remove rows with any NaN values; reset_index() moves the original index into an 'index' column
# expanded_runs_df = expanded_runs_df.dropna().reset_index()
# expanded_runs_df

In [11]:
# Pick, for every tag, the single run with the highest 'eval_acc'.
# Runs without an 'eval_acc' value (e.g. runs whose summary only contains
# '_wandb.runtime') are excluded before grouping. If every run of a tag lacked
# the metric, idxmax() would produce NaN for that group (or raise, depending on
# the pandas version) and the .loc lookup below would fail.
idx = (
    expanded_runs_df
    .dropna(subset=['eval_acc'])
    .groupby('tags')['eval_acc']
    .idxmax()
)

# The labels in idx come from expanded_runs_df's own index, so they select the
# winning rows from the full table; the index is then renumbered from 0.
best_acc_df = expanded_runs_df.loc[idx].reset_index(drop=True)
best_acc_df

Unnamed: 0,config,name,tags,eval_acc,_timestamp,train_loss,eval_recall,learning_rate,_step,_runtime,...,eval_mcc,_wandb.runtime,Confusion Matrix.path,Confusion Matrix.size,Confusion Matrix._type,Confusion Matrix.width,Confusion Matrix.format,Confusion Matrix.height,Confusion Matrix.sha256,eval_auc
0,"{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_May_23th_2024_15_18_16_logstep=12...,CEBPA,0.919933,1716494000.0,0.008737,0.920247,5.420054e-06,1210.0,2570.453699,...,0.841326,2620.0,media/images/Confusion Matrix_1089_df9d1926090...,58399.0,image-file,1200.0,png,1200.0,df9d19260906878f257e28ae35dc418f57d5c808230699...,0.971719
1,"{'epochs': 15, 'Dropout': 0.1, 'batch_size': 8...",TFBS_NonTFBS_May_24th_2024_00_13_39_logstep=15...,CEBPB,0.916867,1716538000.0,0.006428,0.916864,1.045223e-07,15930.0,14336.096091,...,0.834459,14435.0,media/images/Confusion Matrix_14337_666c48561c...,55775.0,image-file,1200.0,png,1200.0,666c48561c8add0cc86e4b7a20d8d7540d7fe207fb8b54...,
2,"{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_May_24th_2024_18_45_40_logstep=12...,CTCF,0.8176,1716593000.0,0.352588,0.817715,5.420054e-07,1210.0,2258.373427,...,0.636641,2308.0,media/images/Confusion Matrix_1089_2bd8a632194...,55737.0,image-file,1200.0,png,1200.0,2bd8a6321949b95609b06ccf683718b02d70ffd08ebc9d...,0.891568
3,"{'epochs': 15, 'Dropout': 0.1, 'batch_size': 8...",TFBS_NonTFBS_May_29th_2024_17_51_42_logstep=15...,EGR2,0.9568,1717034000.0,0.003934,0.956793,1.045223e-07,15930.0,14154.192645,...,0.914148,14254.0,media/images/Confusion Matrix_14337_323246492d...,55531.0,image-file,1200.0,png,1200.0,323246492de768ce518270c7e3dddfe6ff1aa29bf35441...,
4,"{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_May_25th_2024_06_21_28_logstep=12...,FOS,0.914533,1716635000.0,0.010088,0.914539,5.420054e-06,1210.0,2260.462936,...,0.829075,2310.0,media/images/Confusion Matrix_1089_7fe13c815e4...,57062.0,image-file,1200.0,png,1200.0,7fe13c815e46b28953e37ea7141e4f192bc63ae6174076...,0.967287
5,"{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_May_26th_2024_21_20_26_logstep=12...,H2AFZ,0.715067,1716775000.0,0.534687,0.715091,1.806685e-07,1210.0,2253.556387,...,0.430295,2303.0,media/images/Confusion Matrix_1089_71ee44a3767...,56301.0,image-file,1200.0,png,1200.0,71ee44a376744cfef29b53679c34e7d96713c4733b1a28...,0.788969
6,"{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_May_27th_2024_20_33_10_logstep=12...,H2AK5ac,0.847467,1716859000.0,0.329701,0.848155,1.806685e-07,1210.0,2258.157024,...,0.697268,2308.0,media/images/Confusion Matrix_1089_a12145b661e...,57890.0,image-file,1200.0,png,1200.0,a12145b661e67115163861a58bf769050626f4236683b3...,0.917015
7,"{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_May_28th_2024_16_03_05_logstep=12...,H2BK12ac,0.820533,1716929000.0,0.39151,0.82071,1.806685e-07,1210.0,2253.843985,...,0.642041,2303.0,media/images/Confusion Matrix_1089_6f49a1ca92a...,55287.0,image-file,1200.0,png,1200.0,6f49a1ca92ab7c374b1d74e06c80b67899e1c73f8eba8f...,0.893559
8,"{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_May_30th_2024_09_30_41_logstep=12...,H2BK5ac,0.7834,1717082000.0,0.456696,0.783443,1.806685e-07,1210.0,5748.704899,...,0.567118,5898.0,media/images/Confusion Matrix_1089_829b457bd29...,59924.0,image-file,1200.0,png,1200.0,829b457bd294de6a7e6b96b7c4d74c951be8a032750c8a...,0.856095
9,"{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_May_31th_2024_09_33_20_logstep=12...,H3F3A,0.638467,1717165000.0,0.612258,0.638405,1.806685e-07,1210.0,2256.423315,...,0.276894,2306.0,media/images/Confusion Matrix_1089_7d836b9e8b2...,55122.0,image-file,1200.0,png,1200.0,7d836b9e8b220ddf93a1eeac62d2b7511bee3fddd80ce8...,0.696395


In [19]:
def _lr_exponent(config):
    """Return the negated power-of-ten exponent of a run's learning rate.

    The rate is rendered in scientific notation with one decimal ('.1e') and
    the exponent part is parsed as an integer, so 1e-4 gives '4' and 3e-5
    gives '5'. Parsing the exponent (instead of splitting on 'e-') also
    handles rates of 1 or more: 1.0 gives '0' and 10.0 gives '-1'.

    Args:
        config: mapping with a numeric 'learning_rate' entry.

    Returns:
        The exponent as a string. Values that render without an exponent
        (nan, inf) are returned as their formatted text.

    Raises:
        KeyError: if config has no 'learning_rate' entry.
    """
    rendered = format(config['learning_rate'], '.1e')
    _, marker, exponent = rendered.partition('e')
    if not marker:
        # 'nan' / 'inf' carry no exponent part; pass the text through unchanged.
        return rendered
    return str(-int(exponent))


# Add the learning-rate exponent of each selected run as a string column 'lr'.
best_acc_df['lr'] = best_acc_df['config'].apply(_lr_exponent)
best_acc_df

Unnamed: 0,config,name,tags,eval_acc,_timestamp,train_loss,eval_recall,learning_rate,_step,_runtime,...,_wandb.runtime,Confusion Matrix.path,Confusion Matrix.size,Confusion Matrix._type,Confusion Matrix.width,Confusion Matrix.format,Confusion Matrix.height,Confusion Matrix.sha256,eval_auc,lr
0,"{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_May_23th_2024_15_18_16_logstep=12...,CEBPA,0.919933,1716494000.0,0.008737,0.920247,5.420054e-06,1210.0,2570.453699,...,2620.0,media/images/Confusion Matrix_1089_df9d1926090...,58399.0,image-file,1200.0,png,1200.0,df9d19260906878f257e28ae35dc418f57d5c808230699...,0.971719,4
1,"{'epochs': 15, 'Dropout': 0.1, 'batch_size': 8...",TFBS_NonTFBS_May_24th_2024_00_13_39_logstep=15...,CEBPB,0.916867,1716538000.0,0.006428,0.916864,1.045223e-07,15930.0,14336.096091,...,14435.0,media/images/Confusion Matrix_14337_666c48561c...,55775.0,image-file,1200.0,png,1200.0,666c48561c8add0cc86e4b7a20d8d7540d7fe207fb8b54...,,4
2,"{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_May_24th_2024_18_45_40_logstep=12...,CTCF,0.8176,1716593000.0,0.352588,0.817715,5.420054e-07,1210.0,2258.373427,...,2308.0,media/images/Confusion Matrix_1089_2bd8a632194...,55737.0,image-file,1200.0,png,1200.0,2bd8a6321949b95609b06ccf683718b02d70ffd08ebc9d...,0.891568,5
3,"{'epochs': 15, 'Dropout': 0.1, 'batch_size': 8...",TFBS_NonTFBS_May_29th_2024_17_51_42_logstep=15...,EGR2,0.9568,1717034000.0,0.003934,0.956793,1.045223e-07,15930.0,14154.192645,...,14254.0,media/images/Confusion Matrix_14337_323246492d...,55531.0,image-file,1200.0,png,1200.0,323246492de768ce518270c7e3dddfe6ff1aa29bf35441...,,4
4,"{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_May_25th_2024_06_21_28_logstep=12...,FOS,0.914533,1716635000.0,0.010088,0.914539,5.420054e-06,1210.0,2260.462936,...,2310.0,media/images/Confusion Matrix_1089_7fe13c815e4...,57062.0,image-file,1200.0,png,1200.0,7fe13c815e46b28953e37ea7141e4f192bc63ae6174076...,0.967287,4
5,"{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_May_26th_2024_21_20_26_logstep=12...,H2AFZ,0.715067,1716775000.0,0.534687,0.715091,1.806685e-07,1210.0,2253.556387,...,2303.0,media/images/Confusion Matrix_1089_71ee44a3767...,56301.0,image-file,1200.0,png,1200.0,71ee44a376744cfef29b53679c34e7d96713c4733b1a28...,0.788969,5
6,"{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_May_27th_2024_20_33_10_logstep=12...,H2AK5ac,0.847467,1716859000.0,0.329701,0.848155,1.806685e-07,1210.0,2258.157024,...,2308.0,media/images/Confusion Matrix_1089_a12145b661e...,57890.0,image-file,1200.0,png,1200.0,a12145b661e67115163861a58bf769050626f4236683b3...,0.917015,5
7,"{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_May_28th_2024_16_03_05_logstep=12...,H2BK12ac,0.820533,1716929000.0,0.39151,0.82071,1.806685e-07,1210.0,2253.843985,...,2303.0,media/images/Confusion Matrix_1089_6f49a1ca92a...,55287.0,image-file,1200.0,png,1200.0,6f49a1ca92ab7c374b1d74e06c80b67899e1c73f8eba8f...,0.893559,5
8,"{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_May_30th_2024_09_30_41_logstep=12...,H2BK5ac,0.7834,1717082000.0,0.456696,0.783443,1.806685e-07,1210.0,5748.704899,...,5898.0,media/images/Confusion Matrix_1089_829b457bd29...,59924.0,image-file,1200.0,png,1200.0,829b457bd294de6a7e6b96b7c4d74c951be8a032750c8a...,0.856095,5
9,"{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_May_31th_2024_09_33_20_logstep=12...,H3F3A,0.638467,1717165000.0,0.612258,0.638405,1.806685e-07,1210.0,2256.423315,...,2306.0,media/images/Confusion Matrix_1089_7d836b9e8b2...,55122.0,image-file,1200.0,png,1200.0,7d836b9e8b220ddf93a1eeac62d2b7511bee3fddd80ce8...,0.696395,5


In [20]:
# Write the whole best-run table, highest eval_acc first, as a tab-separated file.
(
    best_acc_df
    .sort_values(by="eval_acc", ascending=False)
    .to_csv("300BP_TFBS_Models_stat.tsv", sep="\t")
)

In [21]:
# List the column names available on best_acc_df.
best_acc_df.columns

Index(['config', 'name', 'tags', 'eval_acc', '_timestamp', 'train_loss',
       'eval_recall', 'learning_rate', '_step', '_runtime', 'eval_precision',
       'eval_Validation loss', 'eval_f1', 'eval_mcc', '_wandb.runtime',
       'Confusion Matrix.path', 'Confusion Matrix.size',
       'Confusion Matrix._type', 'Confusion Matrix.width',
       'Confusion Matrix.format', 'Confusion Matrix.height',
       'Confusion Matrix.sha256', 'eval_auc', 'lr'],
      dtype='object')

In [22]:
# Compact view: tag, accuracy and learning-rate exponent, best accuracy first.
best_acc_df[['tags', 'eval_acc', 'lr']].sort_values(by="eval_acc", ascending=False)

Unnamed: 0,tags,eval_acc,lr
3,EGR2,0.9568,4
33,ZFHX2,0.947333,4
25,NFE2,0.9422,4
34,ZNF121,0.936398,4
30,SCRT2,0.935867,4
0,CEBPA,0.919933,4
1,CEBPB,0.916867,4
29,RUNX3,0.915267,4
4,FOS,0.914533,4
26,PRDM1,0.9062,5


In [28]:
# Write tag and accuracy only, highest eval_acc first, as a tab-separated file.
(
    best_acc_df[['tags', 'eval_acc']]
    .sort_values(by="eval_acc", ascending=False)
    .to_csv("300bp_TFBS_accuracy_Stat.tsv", sep="\t")
)

In [29]:
# Write the runs with eval_acc of at least 0.85 (tag, config, accuracy),
# highest accuracy first, as a tab-separated file without the index column.
(
    best_acc_df
    .loc[best_acc_df['eval_acc'] >= 0.85, ['tags', 'config', 'eval_acc']]
    .sort_values(by="eval_acc", ascending=False)
    .to_csv("top_300BP_TFBS_models.tsv", sep="\t", index=False)
)