In [2]:
import os
import pandas as pd

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import StaleElementReferenceException

import time


In [3]:
# Data directories, relative to the notebook's working directory.
CSV_PATH = "../../data/csv/"      # CSV tables
PDB_PATH = "../../data/pdb/"      # PDB structure files
SAVES_PATH = "../../data/saves/"  # files saved from the SAVES runs

# Chrome launch flags, added in the order listed.
options = webdriver.ChromeOptions()
for chrome_flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
    options.add_argument(chrome_flag)


In [40]:
def _click_when_ready(driver, wait, element_id):
    """Wait until the element with id ``element_id`` is clickable, scroll it into view and click it."""
    element = wait.until(EC.element_to_be_clickable((By.ID, element_id)))
    driver.execute_script("arguments[0].scrollIntoView(true);", element)
    element.click()


def _summarize_verify_text(raw_value):
    """Join the first, second-to-last and last space-separated tokens of ``raw_value``.

    Raises ``ValueError`` when the text has fewer than two tokens, so the caller
    sees the offending text instead of a bare ``IndexError``.
    """
    tokens = raw_value.split(' ')
    if len(tokens) < 2:
        raise ValueError(f"Unexpected verify result text: {raw_value!r}")
    return f"{tokens[0]} {tokens[-2]} {tokens[-1]}"


def process_file(driver, row, timeout=12600000):
    """Upload one PDB file to https://saves.mbi.ucla.edu/ and scrape two result values.

    The function loads the page, fills the ``pdbfile`` input with the absolute
    path of the file, clicks the ``startjob``, ``errat`` and ``verify`` elements,
    waits for the result markup, saves a screenshot to
    ``{SAVES_PATH}{model}/{variant}.png`` and reads two text values from the page.

    Parameters
    ----------
    driver : selenium WebDriver
        Browser session used for every step.
    row : mapping
        Must provide the keys ``'model'``, ``'variant'`` and ``'pdb'``
        (path of the file to upload).
    timeout : float, optional
        Seconds each explicit wait may block. The default is the value that used
        to be hard-coded (12 600 000 s), which is effectively unbounded; pass a
        smaller value to make a stuck job fail instead of hanging.

    Returns
    -------
    tuple of (str, str)
        ``(errat_value, verify_value)``: the text of the ``<h1>`` in the first
        result cell, and a three-token summary of the text in the second result
        cell (values stored in validation.csv look like ``"29.63% >= 0.1"``).

    Raises
    ------
    FileNotFoundError
        If ``row['pdb']`` does not point to an existing file.
    ValueError
        If the second result text has fewer than two space-separated tokens.

    Selenium exceptions (e.g. when a wait expires) propagate unchanged.
    """
    model = row['model']
    variant = row['variant']
    file_path = os.path.abspath(row['pdb'])

    # Fail before touching the browser: a file input cannot be given a path
    # that does not exist (e.g. a placeholder value in the 'pdb' column).
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"PDB file not found: {file_path}")

    wait = WebDriverWait(driver, timeout)
    driver.get('https://saves.mbi.ucla.edu/')
    file_field = wait.until(EC.presence_of_element_located((By.ID, "pdbfile")))
    file_field.send_keys(file_path)

    # Start the job, then trigger the two checks, in the original order.
    for element_id in ('startjob', 'errat', 'verify'):
        _click_when_ready(driver, wait, element_id)

    # Block until both result panels have rendered.
    wait.until(EC.presence_of_element_located(
        (By.XPATH, "//u[text()='Overall Quality Factor']")))
    wait.until(EC.presence_of_element_located(
        (By.XPATH, "/html/body/table/tbody/tr[1]/td[2]/div[1]/div/span")))

    # Make sure the per-model folder exists so the screenshot is actually written.
    screenshot_path = f'{SAVES_PATH}{model}/{variant}.png'
    os.makedirs(os.path.dirname(screenshot_path), exist_ok=True)
    driver.save_screenshot(screenshot_path)

    # Read the two values from the result table.
    errat_xpath = '/html/body/table/tbody/tr[1]/td[1]/div[1]/div/center/center/h1'
    errat_value = wait.until(EC.presence_of_element_located((By.XPATH, errat_xpath))).text

    verify_xpath = '/html/body/table/tbody/tr[1]/td[2]/div[1]/div/center/div[1]'
    raw_value = wait.until(EC.presence_of_element_located((By.XPATH, verify_xpath))).text
    verify_value = _summarize_verify_text(raw_value)
    return errat_value, verify_value


In [51]:
# Load the semicolon-separated validation table and preview its first rows.
validation_df = pd.read_csv(CSV_PATH + 'validation.csv', sep=';')
validation_df.head(5)

Unnamed: 0,gene,identifier,variant,fasta,model,pdb,errat,verify
0,atpE,Rv1305,atpE_Ala63Pro,MDPTIAAGALIGGGLIMAGGAIGAGIGDGVAGNALISGVARQPEAQ...,swiss_model,../../data/pdb/swiss_model/atpE_Ala63Pro.pdb,100.0,29.63% >= 0.1
1,atpE,Rv1305,atpE_Ala63Pro,MDPTIAAGALIGGGLIMAGGAIGAGIGDGVAGNALISGVARQPEAQ...,colab_alphafold2,../../data/pdb/colab_alphafold2/atpE_Ala63Pro.pdb,100.0,48.15% >= 0.1
2,atpE,Rv1305,atpE_Ala63Pro,MDPTIAAGALIGGGLIMAGGAIGAGIGDGVAGNALISGVARQPEAQ...,modeller,../../data/pdb/modeller/atpE_Ala63Pro.pdb,100.0,28.40% >= 0.1
3,atpE,Rv1305,atpE_Ala63Pro,MDPTIAAGALIGGGLIMAGGAIGAGIGDGVAGNALISGVARQPEAQ...,phyre2,../../data/pdb/phyre2/atpE_Ala63Pro.pdb,100.0,24.69% >= 0.1
4,atpE,Rv1305,atpE_Ala63Pro,MDPTIAAGALIGGGLIMAGGAIGAGIGDGVAGNALISGVARQPEAQ...,i_tasser,File not found,,


In [52]:
# Number of rows in the validation table.
validation_df.shape[0]

3072

In [62]:
# Progress summary: a row counts as "todo" while its 'errat' cell is missing.
todo = int(validation_df["errat"].isnull().sum())
total = len(validation_df)
done = total - todo
print(f"Total: {total}\n Done: {done}\n Todo: {todo}")
      

Total: 3072
 Done: 1813
 Todo: 1259


In [63]:
# Rows where either result column holds the "Error" marker.
errat_failed = validation_df["errat"].eq("Error")
verify_failed = validation_df["verify"].eq("Error")
error_df = validation_df.loc[errat_failed | verify_failed]
error_df.head(5)

Unnamed: 0,gene,identifier,variant,fasta,model,pdb,errat,verify
2182,rpoB,Rv0667,rpoB_Arg448Lys,LADSRQSKTAASPSPSRPQSSSNNSVPGAPNRVSFAKLREPLEVPG...,alphafold3,../../data/pdb/alphafold3/rpoB_Arg448Lys.pdb,,Error
2190,rpoB,Rv0667,rpoB_Asn437Asp,LADSRQSKTAASPSPSRPQSSSNNSVPGAPNRVSFAKLREPLEVPG...,alphafold3,../../data/pdb/alphafold3/rpoB_Asn437Asp.pdb,,Error
2198,rpoB,Rv0667,rpoB_Asn437His,LADSRQSKTAASPSPSRPQSSSNNSVPGAPNRVSFAKLREPLEVPG...,alphafold3,../../data/pdb/alphafold3/rpoB_Asn437His.pdb,,Error
2206,rpoB,Rv0667,rpoB_Asn437Ile,LADSRQSKTAASPSPSRPQSSSNNSVPGAPNRVSFAKLREPLEVPG...,alphafold3,../../data/pdb/alphafold3/rpoB_Asn437Ile.pdb,,Error
2214,rpoB,Rv0667,rpoB_Asn437Ser,LADSRQSKTAASPSPSRPQSSSNNSVPGAPNRVSFAKLREPLEVPG...,alphafold3,../../data/pdb/alphafold3/rpoB_Asn437Ser.pdb,,Error


In [65]:
# Rows still lacking at least one of the two result values.
errat_missing = validation_df['errat'].isnull()
verify_missing = validation_df['verify'].isnull()
not_concluded_df = validation_df.loc[errat_missing | verify_missing]
not_concluded_df.head(5)

Unnamed: 0,gene,identifier,variant,fasta,model,pdb,errat,verify
4,atpE,Rv1305,atpE_Ala63Pro,MDPTIAAGALIGGGLIMAGGAIGAGIGDGVAGNALISGVARQPEAQ...,i_tasser,File not found,,
5,atpE,Rv1305,atpE_Ala63Pro,MDPTIAAGALIGGGLIMAGGAIGAGIGDGVAGNALISGVARQPEAQ...,rosetta,File not found,,
7,atpE,Rv1305,atpE_Ala63Pro,MDPTIAAGALIGGGLIMAGGAIGAGIGDGVAGNALISGVARQPEAQ...,foldX,File not found,,
12,atpE,Rv1305,atpE_Asp28Ala,MDPTIAAGALIGGGLIMAGGAIGAGIGAGVAGNALISGVARQPEAQ...,i_tasser,File not found,,
13,atpE,Rv1305,atpE_Asp28Ala,MDPTIAAGALIGGGLIMAGGAIGAGIGAGVAGNALISGVARQPEAQ...,rosetta,File not found,,


In [66]:
# Restrict both working tables to the models listed here.
models_to_keep = ["alphafold3", "colab_alphafold2", "swiss_model", "modeller", "phyre2"]
not_concluded_df, error_df = (
    frame.loc[frame['model'].isin(models_to_keep)]
    for frame in (not_concluded_df, error_df)
)

In [67]:
# How many rows carry the "Error" marker after filtering.
print(f"Error: {error_df.shape[0]}")

Error: 107


In [68]:
# How many rows are still pending after filtering.
print(f"ToDo: {not_concluded_df.shape[0]}")

ToDo: 107


In [47]:
# Debug aid: uncomment to restrict the run to the first pending row.
# not_concluded_df = not_concluded_df.head(1)

In [None]:
def _evaluate_row(driver, row, max_attempts=2):
    """Run ``process_file`` for one row and return ``(errat, verify)`` or ``("Error", "Error")``.

    A ``StaleElementReferenceException`` triggers a retry until ``max_attempts``
    calls have been made; any other exception, a stale error on the last attempt,
    or a missing/"ERROR" result yields the ``"Error"`` marker for both values.
    Only ``Exception`` subclasses are handled, so a kernel interrupt propagates.
    """
    label = f"{row['model']} {row['variant']}"
    for attempt in range(1, max_attempts + 1):
        try:
            errat_value, verify_value = process_file(driver, row)
        except StaleElementReferenceException as e:
            if attempt == max_attempts:
                print(f"Error processing model {label}: {e}")
                break
            print(f"Stale element reference error processing model {label}: {e}")
            print(f"Model {label} retrying")
        except Exception as e:
            print(f"Error processing model {label}: {e}")
            break
        else:
            # Sanity check on the returned values themselves. pd.isna covers
            # None and NaN, which an `== None` comparison would not.
            if any(pd.isna(value) or value == "ERROR" for value in (errat_value, verify_value)):
                print(f"Error processing model {label}: missing or invalid result")
                break
            print(f"Model {label} processed")
            return errat_value, verify_value
    return "Error", "Error"


# The loop stores strings (scraped text or "Error") in these columns. Casting to
# object first avoids relying on pandas silently upcasting a float64 column.
validation_df = validation_df.astype({'errat': object, 'verify': object})

driver = webdriver.Chrome(options=options)
# driver = webdriver.Chrome()  # alternative without `options`
try:
    n_pending = len(not_concluded_df)
    for i, (index, row) in enumerate(not_concluded_df.iterrows()):
        print(f"Evaluating model {i+1} from {n_pending}")
        try:
            errat_value, verify_value = _evaluate_row(driver, row)
            validation_df.loc[index, 'errat'] = errat_value
            validation_df.loc[index, 'verify'] = verify_value
        finally:
            # Checkpoint after every row, including when the run is interrupted.
            # An interrupted row stays empty, so it is picked up again next time.
            validation_df.to_csv(f"{CSV_PATH}validation.csv", index=False, sep=';')
finally:
    # Always release the browser, whatever happened above.
    driver.quit()

Evaluating model 1 from 107


In [49]:
# Preview the first rows of the validation table.
validation_df.head(5)

Unnamed: 0,gene,identifier,variant,fasta,model,pdb,errat,verify
0,atpE,Rv1305,atpE_Ala63Pro,MDPTIAAGALIGGGLIMAGGAIGAGIGDGVAGNALISGVARQPEAQ...,swiss_model,../../data/pdb/swiss_model/atpE_Ala63Pro.pdb,100.0,29.63% >= 0.1
1,atpE,Rv1305,atpE_Ala63Pro,MDPTIAAGALIGGGLIMAGGAIGAGIGDGVAGNALISGVARQPEAQ...,colab_alphafold2,../../data/pdb/colab_alphafold2/atpE_Ala63Pro.pdb,100.0,48.15% >= 0.1
2,atpE,Rv1305,atpE_Ala63Pro,MDPTIAAGALIGGGLIMAGGAIGAGIGDGVAGNALISGVARQPEAQ...,modeller,../../data/pdb/modeller/atpE_Ala63Pro.pdb,100.0,28.40% >= 0.1
3,atpE,Rv1305,atpE_Ala63Pro,MDPTIAAGALIGGGLIMAGGAIGAGIGDGVAGNALISGVARQPEAQ...,phyre2,../../data/pdb/phyre2/atpE_Ala63Pro.pdb,100.0,24.69% >= 0.1
4,atpE,Rv1305,atpE_Ala63Pro,MDPTIAAGALIGGGLIMAGGAIGAGIGDGVAGNALISGVARQPEAQ...,i_tasser,File not found,,


In [23]:
# Write the table back to the semicolon-separated CSV, without the index.
validation_df.to_csv(CSV_PATH + "validation.csv", sep=';', index=False)