In [15]:
import pandas as pd
import wandb

In [16]:
# Create a W&B API client and look up the runs stored under entity/project.
api = wandb.Api()
entity = "pratik24111991"
project = "TFBS_Finetuned_Models_300bp_balanced_Work"
runs = api.runs(f"{entity}/{project}")

In [17]:
# Walk over the runs once and collect, per run, its summary, its
# hyperparameters, its name and its tags into four parallel lists.
summary_list = []
config_list = []
name_list = []
tag_list = []

for run in runs:
    # Output metrics (e.g. accuracy) are kept in .summary; ._json_dict is
    # used so that large files are omitted.
    summary_list.append(run.summary._json_dict)

    # Hyperparameters are kept in .config; entries whose key starts with an
    # underscore are special values and are skipped.
    config_list.append(
        {key: value for key, value in run.config.items() if not key.startswith("_")}
    )

    # Human-readable run name and the list of tags attached to the run.
    name_list.append(run.name)
    tag_list.append(run.tags)

# One row per run; 'summary' and 'config' hold dicts, 'tags' holds lists.
runs_df = pd.DataFrame(
    {
        "summary": summary_list,
        "config": config_list,
        "name": name_list,
        "tags": tag_list,
    }
)

In [18]:
# Show the per-run table (columns: summary, config, name, tags).
runs_df

Unnamed: 0,summary,config,name,tags
0,"{'train_loss': 0.6270707417125544, 'eval_preci...","{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_June_05th_2024_23_55_10_logstep=1...,"[H3K36me3, TFBS_NonTFBS, e-6]"
1,"{'eval_mcc': 0.1820258751685492, '_wandb': {'r...","{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_June_05th_2024_23_16_16_logstep=1...,"[H3K36me3, TFBS_NonTFBS, e-6]"
2,"{'learning_rate': 5.4200542005420054e-08, 'Con...","{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_June_05th_2024_22_37_33_logstep=1...,"[H3K36me3, TFBS_NonTFBS, e-6]"
3,"{'_step': 10620, 'eval_mcc': 0, 'train_loss': ...","{'epochs': 15, 'Dropout': 0.1, 'batch_size': 6...",TFBS_NonTFBS_June_05th_2024_22_14_07_logstep=2...,"[RAD21, TFBS_NonTFBS, e-3]"
4,"{'train_loss': 0.6949255400233798, 'eval_preci...","{'epochs': 15, 'Dropout': 0.1, 'batch_size': 6...",TFBS_NonTFBS_June_05th_2024_22_05_03_logstep=2...,"[H3K9me3, TFBS_NonTFBS, e-3]"
...,...,...,...,...
903,{'_wandb': {'runtime': 32}},"{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_May_23th_2024_15_17_30_logstep=12...,"[CEBPA, TFBS_NonTFBS, e-4]"
904,"{'train_loss': 0.00918852477642315, 'eval_prec...","{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_May_23th_2024_14_25_44_logstep=12...,"[CEBPA, TFBS_NonTFBS, e-4]"
905,"{'eval_mcc': 0.8233823837004726, '_wandb': {'r...","{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_May_23th_2024_13_41_27_logstep=12...,"[CEBPA, TFBS_NonTFBS, e-4]"
906,"{'_step': 1210, '_wandb': {'runtime': 2349}, '...","{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_May_23th_2024_13_02_04_logstep=12...,"[CEBPA, TFBS_NonTFBS, e-4]"


In [19]:
# Tag filter used to strip auxiliary tags from each run's tag list
def filter_tags(tags):
    """Return the tags that contain neither a hyphen nor the text 'TFBS'.

    Args:
        tags: iterable of tag strings,
            e.g. ['H3K36me3', 'TFBS_NonTFBS', 'e-6'].

    Returns:
        A new list with the surviving tags in their original order,
        e.g. ['H3K36me3'] for the example above.
    """
    kept = []
    for tag in tags:
        # Discard a tag as soon as either marker is present.
        if '-' in tag or 'TFBS' in tag:
            continue
        kept.append(tag)
    return kept

In [20]:
# Replace every run's tag list with its filtered version (see filter_tags).
runs_df['tags'] = runs_df['tags'].map(filter_tags)

In [21]:
# Show the table again, now with the filtered 'tags' column.
runs_df

Unnamed: 0,summary,config,name,tags
0,"{'train_loss': 0.6270707417125544, 'eval_preci...","{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_June_05th_2024_23_55_10_logstep=1...,[H3K36me3]
1,"{'eval_mcc': 0.1820258751685492, '_wandb': {'r...","{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_June_05th_2024_23_16_16_logstep=1...,[H3K36me3]
2,"{'learning_rate': 5.4200542005420054e-08, 'Con...","{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_June_05th_2024_22_37_33_logstep=1...,[H3K36me3]
3,"{'_step': 10620, 'eval_mcc': 0, 'train_loss': ...","{'epochs': 15, 'Dropout': 0.1, 'batch_size': 6...",TFBS_NonTFBS_June_05th_2024_22_14_07_logstep=2...,[RAD21]
4,"{'train_loss': 0.6949255400233798, 'eval_preci...","{'epochs': 15, 'Dropout': 0.1, 'batch_size': 6...",TFBS_NonTFBS_June_05th_2024_22_05_03_logstep=2...,[H3K9me3]
...,...,...,...,...
903,{'_wandb': {'runtime': 32}},"{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_May_23th_2024_15_17_30_logstep=12...,[CEBPA]
904,"{'train_loss': 0.00918852477642315, 'eval_prec...","{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_May_23th_2024_14_25_44_logstep=12...,[CEBPA]
905,"{'eval_mcc': 0.8233823837004726, '_wandb': {'r...","{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_May_23th_2024_13_41_27_logstep=12...,[CEBPA]
906,"{'_step': 1210, '_wandb': {'runtime': 2349}, '...","{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_May_23th_2024_13_02_04_logstep=12...,[CEBPA]


In [22]:
# Flatten every run's summary dict into its own columns; nested entries end
# up as dotted column names (e.g. "_wandb.runtime").
summaries_df = pd.json_normalize(runs_df['summary'])

# Put the flattened summary columns next to the remaining run columns,
# leaving out the raw 'summary' dict column.
expanded_runs_df = pd.concat(
    [runs_df.drop(columns='summary'), summaries_df],
    axis="columns",
)

# Reduce each tag list to its first entry; an empty list becomes None.
expanded_runs_df['tags'] = expanded_runs_df['tags'].apply(
    lambda remaining: remaining[0] if remaining else None
)

In [23]:
# Show the widened table: one row per run, one column per flattened summary key.
expanded_runs_df

Unnamed: 0,config,name,tags,train_loss,eval_precision,eval_Validation loss,_step,_runtime,eval_mcc,_timestamp,...,eval_acc,eval_auc,Confusion Matrix.path,Confusion Matrix.size,Confusion Matrix._type,Confusion Matrix.width,Confusion Matrix.format,Confusion Matrix.height,Confusion Matrix.sha256,_wandb.runtime
0,"{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_June_05th_2024_23_55_10_logstep=1...,H3K36me3,0.627071,0.640725,0.624322,968.0,1808.485339,0.278635,1.717648e+09,...,0.638333,0.698923,media/images/Confusion Matrix_847_a4c3c1db8bcd...,57708.0,image-file,1200.0,png,1200.0,a4c3c1db8bcd6fac12f7a87bb9e7db091e7ec783b18c9f...,
1,"{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_June_05th_2024_23_16_16_logstep=1...,H3K36me3,0.662153,0.592252,0.664515,1210.0,2267.969909,0.182026,1.717646e+09,...,0.590267,0.629178,media/images/Confusion Matrix_1089_177ac8b8f72...,58537.0,image-file,1200.0,png,1200.0,177ac8b8f72f4afbc1bdf7fb875881b835621b6441e295...,2318.0
2,"{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_June_05th_2024_22_37_33_logstep=1...,H3K36me3,0.625209,0.642480,0.623269,1210.0,2261.715991,0.282269,1.717644e+09,...,0.640200,0.700248,media/images/Confusion Matrix_1089_e7f89ed1d5d...,57685.0,image-file,1200.0,png,1200.0,e7f89ed1d5df6c51249da093ff5f564761f0b36db02ec0...,2312.0
3,"{'epochs': 15, 'Dropout': 0.1, 'batch_size': 6...",TFBS_NonTFBS_June_05th_2024_22_14_07_logstep=2...,RAD21,0.694753,0.249767,0.694093,10620.0,6614.299117,0.000000,1.717647e+09,...,0.499533,,media/images/Confusion Matrix_8496_4be120346ff...,49731.0,image-file,1200.0,png,1200.0,4be120346ff340d4ff2ee35c5773eb89746a06f15acd5b...,
4,"{'epochs': 15, 'Dropout': 0.1, 'batch_size': 6...",TFBS_NonTFBS_June_05th_2024_22_05_03_logstep=2...,H3K9me3,0.694926,0.249933,0.695659,10620.0,6605.493064,0.000000,1.717646e+09,...,0.499867,,media/images/Confusion Matrix_8496_7e23f820ece...,49818.0,image-file,1200.0,png,1200.0,7e23f820eceecdddea4ede69e17e24ed7876eeee61425b...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
903,"{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_May_23th_2024_15_17_30_logstep=12...,CEBPA,,,,,,,,...,,,,,,,,,,32.0
904,"{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_May_23th_2024_14_25_44_logstep=12...,CEBPA,0.009189,0.919998,0.472767,1210.0,3042.108180,0.839686,1.716492e+09,...,0.919467,0.970946,media/images/Confusion Matrix_1089_75b86fca852...,56594.0,image-file,1200.0,png,1200.0,75b86fca852c54fef06a8df9ab8ebd1b4491d9b20d2a1c...,3092.0
905,"{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_May_23th_2024_13_41_27_logstep=12...,CEBPA,0.054939,0.912503,0.354305,1210.0,2593.578342,0.823382,1.716489e+09,...,0.910467,0.971116,media/images/Confusion Matrix_1089_f6bb2fbf43e...,57386.0,image-file,1200.0,png,1200.0,f6bb2fbf43edc55602f863f690a0207d203b13ae6c95eb...,2643.0
906,"{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_May_23th_2024_13_02_04_logstep=12...,CEBPA,0.009411,0.919642,0.472342,1210.0,2300.019405,0.838657,1.716486e+09,...,0.918733,0.970520,media/images/Confusion Matrix_1089_58f1d4377b2...,56913.0,image-file,1200.0,png,1200.0,58f1d4377b2d3356df720b6e18140f5c4068c5642edb66...,2349.0


In [24]:
# # Remove rows with any NaN values and keep the original index
# expanded_runs_df = expanded_runs_df.dropna().reset_index()
# expanded_runs_df

In [25]:
# For each tag, find the index label of the run with the highest 'eval_acc'.
# Runs that never logged 'eval_acc' (NaN, e.g. runs that stopped early) are
# dropped before grouping: a tag whose runs are all NaN would otherwise make
# idxmax() produce NaN (or raise, depending on the pandas version), and the
# .loc lookup below would then fail. dropna() keeps the original index labels.
idx = (
    expanded_runs_df
    .dropna(subset=['eval_acc'])
    .groupby('tags')['eval_acc']
    .idxmax()
)

# idx holds labels of expanded_runs_df's index, so select the winning rows
# from the full frame and renumber them 0..n-1.
best_acc_df = expanded_runs_df.loc[idx].reset_index(drop=True)
best_acc_df

Unnamed: 0,config,name,tags,train_loss,eval_precision,eval_Validation loss,_step,_runtime,eval_mcc,_timestamp,...,eval_acc,eval_auc,Confusion Matrix.path,Confusion Matrix.size,Confusion Matrix._type,Confusion Matrix.width,Confusion Matrix.format,Confusion Matrix.height,Confusion Matrix.sha256,_wandb.runtime
0,"{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_May_23th_2024_15_18_16_logstep=12...,CEBPA,0.008737,0.921079,0.449242,1210.0,2570.453699,0.841326,1716494000.0,...,0.919933,0.971719,media/images/Confusion Matrix_1089_df9d1926090...,58399.0,image-file,1200.0,png,1200.0,df9d19260906878f257e28ae35dc418f57d5c808230699...,2620.0
1,"{'epochs': 15, 'Dropout': 0.1, 'batch_size': 8...",TFBS_NonTFBS_May_24th_2024_00_13_39_logstep=15...,CEBPB,0.006428,0.917596,0.644254,15930.0,14336.096091,0.834459,1716538000.0,...,0.916867,,media/images/Confusion Matrix_14337_666c48561c...,55775.0,image-file,1200.0,png,1200.0,666c48561c8add0cc86e4b7a20d8d7540d7fe207fb8b54...,14435.0
2,"{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_May_24th_2024_18_45_40_logstep=12...,CTCF,0.352588,0.818927,0.431534,1210.0,2258.373427,0.636641,1716593000.0,...,0.8176,0.891568,media/images/Confusion Matrix_1089_2bd8a632194...,55737.0,image-file,1200.0,png,1200.0,2bd8a6321949b95609b06ccf683718b02d70ffd08ebc9d...,2308.0
3,"{'epochs': 15, 'Dropout': 0.1, 'batch_size': 8...",TFBS_NonTFBS_May_29th_2024_17_51_42_logstep=15...,EGR2,0.003934,0.957355,0.321884,15930.0,14154.192645,0.914148,1717034000.0,...,0.9568,,media/images/Confusion Matrix_14337_323246492d...,55531.0,image-file,1200.0,png,1200.0,323246492de768ce518270c7e3dddfe6ff1aa29bf35441...,14254.0
4,"{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_May_25th_2024_06_21_28_logstep=12...,FOS,0.010088,0.914536,0.478346,1210.0,2260.462936,0.829075,1716635000.0,...,0.914533,0.967287,media/images/Confusion Matrix_1089_7fe13c815e4...,57062.0,image-file,1200.0,png,1200.0,7fe13c815e46b28953e37ea7141e4f192bc63ae6174076...,2310.0
5,"{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_May_26th_2024_21_20_26_logstep=12...,H2AFZ,0.534687,0.715204,0.558386,1210.0,2253.556387,0.430295,1716775000.0,...,0.715067,0.788969,media/images/Confusion Matrix_1089_71ee44a3767...,56301.0,image-file,1200.0,png,1200.0,71ee44a376744cfef29b53679c34e7d96713c4733b1a28...,2303.0
6,"{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_May_27th_2024_20_33_10_logstep=12...,H2AK5ac,0.329701,0.849114,0.354976,1210.0,2258.157024,0.697268,1716859000.0,...,0.847467,0.917015,media/images/Confusion Matrix_1089_a12145b661e...,57890.0,image-file,1200.0,png,1200.0,a12145b661e67115163861a58bf769050626f4236683b3...,2308.0
7,"{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_May_28th_2024_16_03_05_logstep=12...,H2BK12ac,0.39151,0.821331,0.40937,1210.0,2253.843985,0.642041,1716929000.0,...,0.820533,0.893559,media/images/Confusion Matrix_1089_6f49a1ca92a...,55287.0,image-file,1200.0,png,1200.0,6f49a1ca92ab7c374b1d74e06c80b67899e1c73f8eba8f...,2303.0
8,"{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_May_30th_2024_09_30_41_logstep=12...,H2BK5ac,0.456696,0.783676,0.477877,1210.0,5748.704899,0.567118,1717082000.0,...,0.7834,0.856095,media/images/Confusion Matrix_1089_829b457bd29...,59924.0,image-file,1200.0,png,1200.0,829b457bd294de6a7e6b96b7c4d74c951be8a032750c8a...,5898.0
9,"{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_May_31th_2024_09_33_20_logstep=12...,H3F3A,0.612258,0.63849,0.626708,1210.0,2256.423315,0.276894,1717165000.0,...,0.638467,0.696395,media/images/Confusion Matrix_1089_7d836b9e8b2...,55122.0,image-file,1200.0,png,1200.0,7d836b9e8b220ddf93a1eeac62d2b7511bee3fddd80ce8...,2306.0


In [26]:
# Write the best-run-per-tag table, highest eval_acc first, as a
# tab-separated file (the frame's index is written as the first column).
(
    best_acc_df
    .sort_values(by="eval_acc", ascending=False)
    .to_csv("300BP_TFBS_Models_stat.tsv", sep="\t")
)

In [27]:
# Ranking of tags by the eval_acc of their best run, highest first.
(
    best_acc_df
    .sort_values(by="eval_acc", ascending=False)
    [['tags', 'eval_acc']]
)

Unnamed: 0,tags,eval_acc
3,EGR2,0.9568
27,ZFHX2,0.947333
22,NFE2,0.9422
28,ZNF121,0.936398
0,CEBPA,0.919933
1,CEBPB,0.916867
24,RUNX3,0.915267
4,FOS,0.914533
29,ZNF143,0.8984
25,TRIM22,0.877667


In [28]:
# Write the tag / eval_acc ranking, highest first, as a tab-separated file
# (the frame's index is written as the first column).
(
    best_acc_df
    .sort_values(by="eval_acc", ascending=False)
    [['tags', 'eval_acc']]
    .to_csv("300bp_TFBS_accuracy_Stat.tsv", sep="\t")
)

In [29]:
# Keep the tags whose best run reaches eval_acc >= 0.85 and write their tag,
# config and accuracy, highest first, as a tab-separated file without the index.
(
    best_acc_df
    .loc[best_acc_df['eval_acc'] >= 0.85, ['tags', 'config', 'eval_acc']]
    .sort_values(by="eval_acc", ascending=False)
    .to_csv("top_300BP_TFBS_models.tsv", sep="\t", index=False)
)