In [7]:
import pandas as pd
import wandb
import os, shutil

In [8]:
# Open a W&B API client and fetch the run collection for one entity/project pair.
api = wandb.Api()
entity = "pratik24111991"
project = "TFBS_Finetuned_Models_300bp_balanced_Work"
runs = api.runs(f"{entity}/{project}")

In [9]:
# Walk the run collection once and capture the four per-run fields as a tuple.
run_records = []
for run in runs:
    run_records.append(
        (
            # .summary holds the logged output metrics (accuracy etc.);
            # ._json_dict is used to omit large files.
            run.summary._json_dict,
            # .config holds the hyperparameters; special keys starting with "_" are removed.
            {key: value for key, value in run.config.items() if not key.startswith("_")},
            # .name is the human-readable run name.
            run.name,
            run.tags,
        )
    )

# Transpose the per-run tuples into four parallel column lists.
if run_records:
    summary_list, config_list, name_list, tag_list = map(list, zip(*run_records))
else:
    summary_list, config_list, name_list, tag_list = [], [], [], []

# One row per run; the summary and config columns keep the raw dicts.
runs_df = pd.DataFrame(
    dict(summary=summary_list, config=config_list, name=name_list, tags=tag_list)
)

In [10]:
# Inspect the assembled frame: one row per run with the raw summary/config dicts, run name and tags.
runs_df

Unnamed: 0,summary,config,name,tags
0,"{'eval_recall': 0.9301429061214423, 'learning_...","{'epochs': 15, 'Dropout': 0.1, 'batch_size': 8...",TFBS_NonTFBS_June_19th_2024_08_10_59_logstep=1...,"[SCRT2, TFBS_NonTFBS, e-6]"
1,"{'_timestamp': 1718798945.8791134, '_step': 15...","{'epochs': 15, 'Dropout': 0.1, 'batch_size': 8...",TFBS_NonTFBS_June_19th_2024_04_22_27_logstep=1...,"[SCRT2, TFBS_NonTFBS, e-6]"
2,"{'_runtime': 13608.572161197662, 'eval_acc': 0...","{'epochs': 15, 'Dropout': 0.1, 'batch_size': 8...",TFBS_NonTFBS_June_19th_2024_00_33_46_logstep=1...,"[SCRT2, TFBS_NonTFBS, e-6]"
3,"{'eval_acc': 0.8998666666666667, 'eval_mcc': 0...","{'epochs': 15, 'Dropout': 0.1, 'batch_size': 8...",TFBS_NonTFBS_June_18th_2024_20_43_22_logstep=1...,"[SCRT2, TFBS_NonTFBS, e-6]"
4,"{'eval_acc': 0.9300666666666668, 'eval_recall'...","{'epochs': 15, 'Dropout': 0.1, 'batch_size': 8...",TFBS_NonTFBS_June_18th_2024_16_49_31_logstep=1...,"[SCRT2, TFBS_NonTFBS, e-6]"
...,...,...,...,...
1156,{'_wandb': {'runtime': 32}},"{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_May_23th_2024_15_17_30_logstep=12...,"[CEBPA, TFBS_NonTFBS, e-4]"
1157,"{'_runtime': 3042.1081800460815, '_timestamp':...","{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_May_23th_2024_14_25_44_logstep=12...,"[CEBPA, TFBS_NonTFBS, e-4]"
1158,"{'Confusion Matrix': {'_type': 'image-file', '...","{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_May_23th_2024_13_41_27_logstep=12...,"[CEBPA, TFBS_NonTFBS, e-4]"
1159,"{'learning_rate': 5.420054200542004e-06, 'eval...","{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_May_23th_2024_13_02_04_logstep=12...,"[CEBPA, TFBS_NonTFBS, e-4]"


In [11]:
# Define a function to filter tags
def filter_tags(tags):
    """Return the tags that contain neither a hyphen nor the substring 'TFBS'.

    Args:
        tags: Iterable of tag strings.

    Returns:
        A new list with the surviving tags in their original order.
    """
    kept = []
    for tag in tags:
        # Skip tags containing a hyphen (e.g. "e-6") and those mentioning TFBS.
        if '-' in tag or 'TFBS' in tag:
            continue
        kept.append(tag)
    return kept

In [12]:
# Replace each run's tag list with its filtered version (element-wise over the column).
runs_df['tags'] = runs_df['tags'].map(filter_tags)

In [13]:
# Re-display runs_df to check the 'tags' column after filter_tags was applied.
runs_df

Unnamed: 0,summary,config,name,tags
0,"{'eval_recall': 0.9301429061214423, 'learning_...","{'epochs': 15, 'Dropout': 0.1, 'batch_size': 8...",TFBS_NonTFBS_June_19th_2024_08_10_59_logstep=1...,[SCRT2]
1,"{'_timestamp': 1718798945.8791134, '_step': 15...","{'epochs': 15, 'Dropout': 0.1, 'batch_size': 8...",TFBS_NonTFBS_June_19th_2024_04_22_27_logstep=1...,[SCRT2]
2,"{'_runtime': 13608.572161197662, 'eval_acc': 0...","{'epochs': 15, 'Dropout': 0.1, 'batch_size': 8...",TFBS_NonTFBS_June_19th_2024_00_33_46_logstep=1...,[SCRT2]
3,"{'eval_acc': 0.8998666666666667, 'eval_mcc': 0...","{'epochs': 15, 'Dropout': 0.1, 'batch_size': 8...",TFBS_NonTFBS_June_18th_2024_20_43_22_logstep=1...,[SCRT2]
4,"{'eval_acc': 0.9300666666666668, 'eval_recall'...","{'epochs': 15, 'Dropout': 0.1, 'batch_size': 8...",TFBS_NonTFBS_June_18th_2024_16_49_31_logstep=1...,[SCRT2]
...,...,...,...,...
1156,{'_wandb': {'runtime': 32}},"{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_May_23th_2024_15_17_30_logstep=12...,[CEBPA]
1157,"{'_runtime': 3042.1081800460815, '_timestamp':...","{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_May_23th_2024_14_25_44_logstep=12...,[CEBPA]
1158,"{'Confusion Matrix': {'_type': 'image-file', '...","{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_May_23th_2024_13_41_27_logstep=12...,[CEBPA]
1159,"{'learning_rate': 5.420054200542004e-06, 'eval...","{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_May_23th_2024_13_02_04_logstep=12...,[CEBPA]


In [14]:
def _first_tag(tag_list):
    """Return the first entry of ``tag_list``, or None when the list is empty."""
    return tag_list[0] if tag_list else None


# Flatten each run's summary dict into its own columns (nested keys become dotted names).
summaries_df = pd.json_normalize(runs_df['summary'])

# Place the flattened metrics next to the remaining run columns; the raw 'summary' dict is dropped.
expanded_runs_df = pd.concat(
    [runs_df.drop(columns=['summary']), summaries_df],
    axis=1,
)

# Collapse every tag list to a single scalar tag.
expanded_runs_df['tags'] = expanded_runs_df['tags'].apply(_first_tag)

In [15]:
# Inspect the widened frame: config/name/tags plus one column per flattened summary key.
expanded_runs_df

Unnamed: 0,config,name,tags,eval_recall,learning_rate,eval_acc,_timestamp,eval_mcc,train_loss,eval_precision,...,_runtime,Confusion Matrix.width,Confusion Matrix.format,Confusion Matrix.height,Confusion Matrix.sha256,Confusion Matrix.path,Confusion Matrix.size,Confusion Matrix._type,_wandb.runtime,eval_auc
0,"{'epochs': 15, 'Dropout': 0.1, 'batch_size': 8...",TFBS_NonTFBS_June_19th_2024_08_10_59_logstep=1...,SCRT2,0.930143,3.135670e-09,0.930067,1.718813e+09,0.860312,0.174585,0.930169,...,13577.849416,1200.0,png,1200.0,58edeeb83310816c653e5c169acebd84134eed71b5f882...,media/images/Confusion Matrix_14337_58edeeb833...,56315.0,image-file,13676.0,
1,"{'epochs': 15, 'Dropout': 0.1, 'batch_size': 8...",TFBS_NonTFBS_June_19th_2024_04_22_27_logstep=1...,SCRT2,0.900073,1.045223e-09,0.899867,1.718799e+09,0.800865,0.251097,0.900793,...,13596.038440,1200.0,png,1200.0,ac55e7b8a8663037e00e98c46d9b8d879fc3a63523f3b8...,media/images/Confusion Matrix_14337_ac55e7b8a8...,55230.0,image-file,13694.0,
2,"{'epochs': 15, 'Dropout': 0.1, 'batch_size': 8...",TFBS_NonTFBS_June_19th_2024_00_33_46_logstep=1...,SCRT2,0.930209,3.135670e-09,0.930133,1.718785e+09,0.860443,0.174591,0.930234,...,13608.572161,1200.0,png,1200.0,12272ff236a53e5de41b1ab740d4fc725a5eea17ff0b84...,media/images/Confusion Matrix_14337_12272ff236...,55664.0,image-file,13705.0,
3,"{'epochs': 15, 'Dropout': 0.1, 'batch_size': 8...",TFBS_NonTFBS_June_18th_2024_20_43_22_logstep=1...,SCRT2,0.900073,1.045223e-09,0.899867,1.718772e+09,0.800865,0.251097,0.900793,...,13709.952305,1200.0,png,1200.0,ac55e7b8a8663037e00e98c46d9b8d879fc3a63523f3b8...,media/images/Confusion Matrix_14337_ac55e7b8a8...,55230.0,image-file,13807.0,
4,"{'epochs': 15, 'Dropout': 0.1, 'batch_size': 8...",TFBS_NonTFBS_June_18th_2024_16_49_31_logstep=1...,SCRT2,0.930143,3.135670e-09,0.930067,1.718758e+09,0.860312,0.174585,0.930169,...,13917.680234,1200.0,png,1200.0,58edeeb83310816c653e5c169acebd84134eed71b5f882...,media/images/Confusion Matrix_14337_58edeeb833...,56315.0,image-file,14015.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1156,"{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_May_23th_2024_15_17_30_logstep=12...,CEBPA,,,,,,,,...,,,,,,,,,32.0,
1157,"{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_May_23th_2024_14_25_44_logstep=12...,CEBPA,0.919688,5.420054e-06,0.919467,1.716492e+09,0.839686,0.009189,0.919998,...,3042.108180,1200.0,png,1200.0,75b86fca852c54fef06a8df9ab8ebd1b4491d9b20d2a1c...,media/images/Confusion Matrix_1089_75b86fca852...,56594.0,image-file,3092.0,0.970946
1158,"{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_May_23th_2024_13_41_27_logstep=12...,CEBPA,0.910881,1.806685e-06,0.910467,1.716489e+09,0.823382,0.054939,0.912503,...,2593.578342,1200.0,png,1200.0,f6bb2fbf43edc55602f863f690a0207d203b13ae6c95eb...,media/images/Confusion Matrix_1089_f6bb2fbf43e...,57386.0,image-file,2643.0,0.971116
1159,"{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_May_23th_2024_13_02_04_logstep=12...,CEBPA,0.919015,5.420054e-06,0.918733,1.716486e+09,0.838657,0.009411,0.919642,...,2300.019405,1200.0,png,1200.0,58f1d4377b2d3356df720b6e18140f5c4068c5642edb66...,media/images/Confusion Matrix_1089_58f1d4377b2...,56913.0,image-file,2349.0,0.970520


In [22]:
# NOTE(review): this cell is disabled (fully commented out). As written, an unqualified
# dropna() would discard every run with a NaN in *any* column -- the displayed frame shows
# eval_auc missing for many otherwise complete runs -- and reset_index() without drop=True
# would add an extra 'index' column. Presumably kept only as a scratch note; confirm before
# re-enabling.
# # Remove rows with any NaN values and keep the original index
# expanded_runs_df = expanded_runs_df.dropna().reset_index()
# expanded_runs_df

In [16]:
# Pick, for every tag, the run with the highest eval_acc.
#
# Runs that never logged eval_acc (e.g. crashed runs whose summary only holds
# '_wandb.runtime') and runs without a tag are excluded *before* grouping. Without this,
# a tag whose runs are all unscored makes idxmax() return NaN (or raise, depending on the
# pandas version) and the .loc lookup below fails.
scored_runs_df = expanded_runs_df.dropna(subset=['eval_acc', 'tags'])

# Index label of the best-scoring row within each tag group.
idx = scored_runs_df.groupby('tags')['eval_acc'].idxmax()

# Use the labels to select the full rows from the original DataFrame.
best_acc_df = expanded_runs_df.loc[idx].reset_index(drop=True)
best_acc_df

Unnamed: 0,config,name,tags,eval_recall,learning_rate,eval_acc,_timestamp,eval_mcc,train_loss,eval_precision,...,_runtime,Confusion Matrix.width,Confusion Matrix.format,Confusion Matrix.height,Confusion Matrix.sha256,Confusion Matrix.path,Confusion Matrix.size,Confusion Matrix._type,_wandb.runtime,eval_auc
0,"{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_May_23th_2024_15_18_16_logstep=12...,CEBPA,0.920247,5.420054e-06,0.919933,1716494000.0,0.841326,0.008737,0.921079,...,2570.453699,1200.0,png,1200.0,df9d19260906878f257e28ae35dc418f57d5c808230699...,media/images/Confusion Matrix_1089_df9d1926090...,58399.0,image-file,2620.0,0.971719
1,"{'epochs': 15, 'Dropout': 0.1, 'batch_size': 8...",TFBS_NonTFBS_May_24th_2024_00_13_39_logstep=15...,CEBPB,0.916864,1.045223e-07,0.916867,1716538000.0,0.834459,0.006428,0.917596,...,14336.096091,1200.0,png,1200.0,666c48561c8add0cc86e4b7a20d8d7540d7fe207fb8b54...,media/images/Confusion Matrix_14337_666c48561c...,55775.0,image-file,14435.0,
2,"{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_May_24th_2024_18_45_40_logstep=12...,CTCF,0.817715,5.420054e-07,0.8176,1716593000.0,0.636641,0.352588,0.818927,...,2258.373427,1200.0,png,1200.0,2bd8a6321949b95609b06ccf683718b02d70ffd08ebc9d...,media/images/Confusion Matrix_1089_2bd8a632194...,55737.0,image-file,2308.0,0.891568
3,"{'epochs': 15, 'Dropout': 0.1, 'batch_size': 8...",TFBS_NonTFBS_May_29th_2024_17_51_42_logstep=15...,EGR2,0.956793,1.045223e-07,0.9568,1717034000.0,0.914148,0.003934,0.957355,...,14154.192645,1200.0,png,1200.0,323246492de768ce518270c7e3dddfe6ff1aa29bf35441...,media/images/Confusion Matrix_14337_323246492d...,55531.0,image-file,14254.0,
4,"{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_May_25th_2024_06_21_28_logstep=12...,FOS,0.914539,5.420054e-06,0.914533,1716635000.0,0.829075,0.010088,0.914536,...,2260.462936,1200.0,png,1200.0,7fe13c815e46b28953e37ea7141e4f192bc63ae6174076...,media/images/Confusion Matrix_1089_7fe13c815e4...,57062.0,image-file,2310.0,0.967287
5,"{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_May_26th_2024_21_20_26_logstep=12...,H2AFZ,0.715091,1.806685e-07,0.715067,1716775000.0,0.430295,0.534687,0.715204,...,2253.556387,1200.0,png,1200.0,71ee44a376744cfef29b53679c34e7d96713c4733b1a28...,media/images/Confusion Matrix_1089_71ee44a3767...,56301.0,image-file,2303.0,0.788969
6,"{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_May_27th_2024_20_33_10_logstep=12...,H2AK5ac,0.848155,1.806685e-07,0.847467,1716859000.0,0.697268,0.329701,0.849114,...,2258.157024,1200.0,png,1200.0,a12145b661e67115163861a58bf769050626f4236683b3...,media/images/Confusion Matrix_1089_a12145b661e...,57890.0,image-file,2308.0,0.917015
7,"{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_May_28th_2024_16_03_05_logstep=12...,H2BK12ac,0.82071,1.806685e-07,0.820533,1716929000.0,0.642041,0.39151,0.821331,...,2253.843985,1200.0,png,1200.0,6f49a1ca92ab7c374b1d74e06c80b67899e1c73f8eba8f...,media/images/Confusion Matrix_1089_6f49a1ca92a...,55287.0,image-file,2303.0,0.893559
8,"{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_May_30th_2024_09_30_41_logstep=12...,H2BK5ac,0.783443,1.806685e-07,0.7834,1717082000.0,0.567118,0.456696,0.783676,...,5748.704899,1200.0,png,1200.0,829b457bd294de6a7e6b96b7c4d74c951be8a032750c8a...,media/images/Confusion Matrix_1089_829b457bd29...,59924.0,image-file,5898.0,0.856095
9,"{'epochs': 15, 'Dropout': 0.1, 'batch_size': 1...",TFBS_NonTFBS_May_31th_2024_09_33_20_logstep=12...,H3F3A,0.638405,1.806685e-07,0.638467,1717165000.0,0.276894,0.612258,0.63849,...,2256.423315,1200.0,png,1200.0,7d836b9e8b220ddf93a1eeac62d2b7511bee3fddd80ce8...,media/images/Confusion Matrix_1089_7d836b9e8b2...,55122.0,image-file,2306.0,0.696395


In [17]:
# Peek at the hyperparameter dict stored for the second row of the best-run table.
best_acc_df['config'].iloc[1]

{'epochs': 15,
 'Dropout': 0.1,
 'batch_size': 80,
 'Weight Decay': 0.005,
 'architecture': 'TFBS_Finetuned_Models_300bp_balanced',
 'learning_rate': 0.0001,
 'warm up percentage': 0.1,
 'Maximum sequence length': 300}

In [20]:
# Root directory of the fine-tuned model outputs. The cleanup loop joins
# <tag>/<"e-N" learning-rate folder> onto this path, so it must contain one
# sub-directory per tag.
base_path= "/data/private/pdutta_new/DNABERT_output/TFBS_300bp_balanced/Finetuned_models"

In [21]:
def _remove_path(path):
    """Delete ``path`` from disk, whether it is a directory tree or a single file/symlink.

    shutil.rmtree() raises on plain files and on symlinks, so those are removed
    with os.remove() instead.
    """
    if os.path.isdir(path) and not os.path.islink(path):
        shutil.rmtree(path)
    else:
        os.remove(path)


# WARNING: this cell permanently deletes data under base_path.
# For every tag in best_acc_df it keeps only the directory of the best run and removes
#   1. every other entry inside the best run's learning-rate folder, and
#   2. every other learning-rate folder under the tag.
# If the best run's directory cannot be located, nothing is deleted for that tag.
for _, row in best_acc_df.iterrows():
    config = row['config']
    tag = row['tags']

    # A missing or non-string tag cannot be mapped to a directory; skip instead of crashing.
    if not isinstance(tag, str) or not tag:
        continue

    # Learning-rate folders are named "e-<exponent>", e.g. 0.0001 -> '1.0e-04' -> "e-4".
    learning_rate_str = format(config['learning_rate'], '.1e').split('e-')[-1].lstrip('0')
    learning_rate_folder = f"e-{learning_rate_str}"

    # Substring that identifies the best run's directory by its hyperparameters.
    # The leading "_" keeps e.g. "_bs=40" from matching "_bs=1040".
    best_model_dir_pattern = (
        f"_bs={config['batch_size']}"
        f"_lr={config['learning_rate']}"
        f"_wp={config['warm up percentage']}"
        f"_dp={config['Dropout']}"
        f"_wd={config['Weight Decay']}"
        f"_len={config['Maximum sequence length']}"
        f"_epoch={config['epochs']}.0"
    )

    tag_directory = os.path.join(base_path, tag)
    learning_rate_directory = os.path.join(tag_directory, learning_rate_folder)

    # Nothing to clean up if the expected learning-rate folder is not on disk.
    if not os.path.exists(learning_rate_directory):
        continue

    # Locate the best model directory. The listing is sorted so the choice is
    # deterministic when several directories match the pattern.
    best_model_path = next(
        (
            os.path.join(learning_rate_directory, model_dir)
            for model_dir in sorted(os.listdir(learning_rate_directory))
            if best_model_dir_pattern in model_dir
        ),
        "",
    )

    if not best_model_path:
        # Without a confirmed best model, deleting anything for this tag would be
        # irreversible guesswork, so the whole tag is left untouched.
        print(
            "No best model match found in the learning rate directory: "
            f"{learning_rate_directory} (skipping cleanup for {tag})"
        )
        continue

    # Delete the other models in the learning-rate directory.
    for model_dir in os.listdir(learning_rate_directory):
        full_model_dir = os.path.join(learning_rate_directory, model_dir)
        if full_model_dir != best_model_path:
            print(f"Deleting model: {full_model_dir}")
            _remove_path(full_model_dir)

    # Delete the other learning-rate directories under the tag.
    for lr_dir in os.listdir(tag_directory):
        full_lr_dir_path = os.path.join(tag_directory, lr_dir)
        if full_lr_dir_path != learning_rate_directory:
            print(f"Deleting learning rate directory: {full_lr_dir_path}")
            _remove_path(full_lr_dir_path)

Deleting model: /data/private/pdutta_new/DNABERT_output/TFBS_300bp_balanced/Finetuned_models/CEBPA/e-4/.ipynb_checkpoints
Deleting model: /data/private/pdutta_new/DNABERT_output/TFBS_300bp_balanced/Finetuned_models/CEBPA/e-4/logstep=121_bs=1040_lr=0.0001_wp=0.1_dp=0.1_wd=0.001_len=300_epoch=15.0
Deleting model: /data/private/pdutta_new/DNABERT_output/TFBS_300bp_balanced/Finetuned_models/CEBPA/e-4/logstep=121_bs=1040_lr=0.0003_wp=0.1_dp=0.1_wd=0.001_len=300_epoch=15.0
Deleting model: /data/private/pdutta_new/DNABERT_output/TFBS_300bp_balanced/Finetuned_models/CEBPA/e-4/logstep=121_bs=1040_lr=0.0003_wp=0.1_dp=0.1_wd=0.005_len=300_epoch=15.0
Deleting model: /data/private/pdutta_new/DNABERT_output/TFBS_300bp_balanced/Finetuned_models/CEBPA/e-4/logstep=121_bs=1040_lr=0.0001_wp=0.1_dp=0.1_wd=0.0005_len=300_epoch=15.0
Deleting model: /data/private/pdutta_new/DNABERT_output/TFBS_300bp_balanced/Finetuned_models/CEBPA/e-4/logstep=121_bs=1040_lr=0.0001_wp=0.1_dp=0.1_wd=0.005_len=300_epoch=15.0
De

In [19]:
# Rank the tags by their best eval_acc, highest first, showing only the two relevant columns.
best_acc_df[['tags', 'eval_acc']].sort_values(by="eval_acc", ascending=False)

Unnamed: 0,tags,eval_acc
218,RBM14,0.968908
231,SAFB,0.968090
266,TAF15,0.959936
255,SPI1,0.957358
291,USF1,0.951848
...,...,...
82,H2AFZ,0.500292
95,H3K9me3,0.499966
89,H3K27ac,0.499856
86,H3F3A,0.499651


In [20]:
# Write the per-tag accuracy ranking (highest first) to a tab-separated file; the row index is kept.
accuracy_ranking = best_acc_df[['tags', 'eval_acc']].sort_values(by="eval_acc", ascending=False)
accuracy_ranking.to_csv("TFBS_accuracy_Stat.tsv", sep="\t")

In [None]:
# Display the per-tag best-run table (one row per tag, chosen by highest eval_acc).
best_acc_df