In [1]:
import os
import time
import pandas as pd


In [2]:
# Names of the structure-prediction tools to expand each variant over.
# Each name is used three ways by the cell that builds validation_df:
#   * as a column label looked up in fasta_variant.csv (status per tool),
#   * as a sub-directory name under PDB_PATH,
#   * as the value written to the "model" column of the output.
# NOTE(review): "foldX" does not match the "fold_x" column shown in the
# fasta_variant.csv preview, so the exact-name column lookup never succeeds
# for this entry and it always receives the "File not found" sentinel.
# Confirm the intended spelling (and the on-disk directory name) before
# renaming, since the value is also emitted into validation.csv.
MODELS = [
    "swiss_model",
    "colab_alphafold2",
    "modeller",
    "phyre2",
    "i_tasser",
    "rosetta",
    "alphafold3",
    "foldX"
]

# Both paths are relative to the notebook's working directory and end with a
# trailing "/" because the CSV path is concatenated directly with file names.
CSV_PATH = "../../data/csv/"  # Directory holding the input/output CSV files
PDB_PATH = "../../data/pdb/"  # Root directory; one sub-directory per model name

In [3]:
# Load the semicolon-separated variant table that drives the rest of the notebook.
variant_csv = f'{CSV_PATH}fasta_variant.csv'
variant_df = pd.read_csv(variant_csv, sep=';')

# Preview the first rows as the cell output.
variant_df.head()

Unnamed: 0,gene,identifier,variant,fasta,swiss_model,phyre2,colab_alphafold2,i_tasser,modeller,rosetta,alphafold3,fold_x
0,atpE,Rv1305,atpE_p.Ala63Pro,MDPTIAAGALIGGGLIMAGGAIGAGIGDGVAGNALISGVARQPEAQ...,concluded,concluded,concluded,not_concluded,concluded,https://yanglab.qd.sdu.edu.cn/trRosetta/output...,concluded,not_completed
1,atpE,Rv1305,atpE_p.Asp28Ala,MDPTIAAGALIGGGLIMAGGAIGAGIGAGVAGNALISGVARQPEAQ...,concluded,concluded,concluded,not_concluded,concluded,https://yanglab.qd.sdu.edu.cn/trRosetta/output...,concluded,not_completed
2,atpE,Rv1305,atpE_p.Asp28Gly,MDPTIAAGALIGGGLIMAGGAIGAGIGGGVAGNALISGVARQPEAQ...,concluded,concluded,concluded,not_concluded,concluded,https://yanglab.qd.sdu.edu.cn/trRosetta/output...,concluded,not_completed
3,atpE,Rv1305,atpE_p.Asp28Val,MDPTIAAGALIGGGLIMAGGAIGAGIGVGVAGNALISGVARQPEAQ...,concluded,concluded,concluded,not_concluded,concluded,https://yanglab.qd.sdu.edu.cn/trRosetta/output...,concluded,not_completed
4,atpE,Rv1305,atpE_p.Glu61Asp,MDPTIAAGALIGGGLIMAGGAIGAGIGDGVAGNALISGVARQPEAQ...,concluded,concluded,concluded,not_concluded,concluded,https://yanglab.qd.sdu.edu.cn/trRosetta/output...,concluded,not_completed


In [4]:
# Expand the wide variant table (one status column per model) into a long
# table with one row per (variant, model) pair and a "pdb" column.
#
# The "pdb" value is derived from the status column only; the filesystem is
# never consulted here. A model gets a path when its column exists (exact
# name match) and the row's status is exactly "concluded"; every other case
# receives the "File not found" sentinel.
columns = variant_df.columns
data = []

for _, record in variant_df.iterrows():
    # Fields shared by every model row of this variant; "_p." in the variant
    # name is shortened to "_" so it matches the PDB file naming.
    shared = [
        record["gene"],
        record["identifier"],
        record["variant"].replace('_p.', '_'),
        record["fasta"],
    ]
    variant_name = shared[2]

    for model_name in MODELS:
        is_concluded = model_name in columns and record[model_name] == "concluded"
        pdb_entry = (
            os.path.join(PDB_PATH, model_name, f"{variant_name}.pdb")
            if is_concluded
            else "File not found"
        )
        data.append([*shared, model_name, pdb_entry])

validation_df = pd.DataFrame(
    data,
    columns=["gene", "identifier", "variant", "fasta", "model", "pdb"],
)
validation_df.head()

Unnamed: 0,gene,identifier,variant,fasta,model,pdb
0,atpE,Rv1305,atpE_Ala63Pro,MDPTIAAGALIGGGLIMAGGAIGAGIGDGVAGNALISGVARQPEAQ...,swiss_model,../../data/pdb/swiss_model/atpE_Ala63Pro.pdb
1,atpE,Rv1305,atpE_Ala63Pro,MDPTIAAGALIGGGLIMAGGAIGAGIGDGVAGNALISGVARQPEAQ...,colab_alphafold2,../../data/pdb/colab_alphafold2/atpE_Ala63Pro.pdb
2,atpE,Rv1305,atpE_Ala63Pro,MDPTIAAGALIGGGLIMAGGAIGAGIGDGVAGNALISGVARQPEAQ...,modeller,../../data/pdb/modeller/atpE_Ala63Pro.pdb
3,atpE,Rv1305,atpE_Ala63Pro,MDPTIAAGALIGGGLIMAGGAIGAGIGDGVAGNALISGVARQPEAQ...,phyre2,../../data/pdb/phyre2/atpE_Ala63Pro.pdb
4,atpE,Rv1305,atpE_Ala63Pro,MDPTIAAGALIGGGLIMAGGAIGAGIGDGVAGNALISGVARQPEAQ...,i_tasser,File not found


In [None]:
# Write validation.csv only if it does not exist yet; an existing file is
# left untouched (it is not refreshed from the current validation_df).
validation_csv = f"{CSV_PATH}validation.csv"
if not os.path.exists(validation_csv):
    validation_df.to_csv(validation_csv, index=False, sep=';')