In [34]:
import os
import pandas as pd

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.alert import Alert
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import StaleElementReferenceException

from pandarallel import pandarallel

import time
import multiprocessing
from tqdm import tqdm
import csv



In [35]:
# Data locations, relative to the notebook's working directory.
CSV_PATH = "../../data/csv/"  # CSV tables (validation.csv is read from here)
PDB_PATH = "../../data/pdb/"  # PDB structure files
MOLPROBITY_PATH = "../../data/molprobity/"  # process_file saves its screenshots here

# Chrome flags shared by every browser instance this notebook starts.
options = webdriver.ChromeOptions()
for chrome_flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
    options.add_argument(chrome_flag)

# Set the start method to 'spawn'
# multiprocessing.set_start_method('spawn', force=True)

# Start pandarallel with 12 workers and its progress bar switched on.
pandarallel.initialize(nb_workers=12, progress_bar=True)


INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [36]:
# import pandas as pd

# def update_alphafold3_rows(df):
#     columns_to_update = [
#         'poor_rotamers',
#         'poor_rotamers_percentage',
#         'favored_rotamers',
#         'favored_rotamers_percentage',
#         'ramachandran_outliers',
#         'ramachandran_outliers_percentage',
#         'ramachandran_favored',
#         'ramachandran_favored_percentage',
#         'rama_distribution_z_score',
#         'molprobity',
#         'cb_deviations',
#         'cb_deviations_percentage',
#         'bad_bonds',
#         'bad_bonds_percentage',
#         'bad_angles',
#         'bad_angles_percentage',
#         'errat',
#         'verify'
#     ]
    
#     # Update the specified columns with None for rows with 'alphafold3' in the 'model' column
#     df.loc[df['model'] == 'alphafold3', columns_to_update] = None
#     return df

# validation_df = pd.read_csv(f'{CSV_PATH}validation.csv', sep=';')  # Read the CSV file into a DataFrame
# validation_df = update_alphafold3_rows(validation_df)
# validation_df.head(10) 


In [38]:
def click_button(xpath, wait):
    """Click the element located by ``xpath`` once it is clickable.

    Args:
        xpath: XPath expression identifying the element to click.
        wait: Object exposing ``until(condition)`` (a ``WebDriverWait`` in
            this notebook) used to poll for the element.

    Returns:
        None. Any exception raised by ``wait.until`` (for example on
        timeout) or by ``click`` propagates to the caller.
    """
    # Being present in the DOM does not mean the element can be clicked yet;
    # waiting for "clickable" (visible and enabled) avoids clicking too early.
    button = wait.until(EC.element_to_be_clickable((By.XPATH, xpath)))
    button.click()

def get_value(xpath, wait):
    """Return the ``text`` of the element found at ``xpath``.

    ``wait.until`` is polled with a presence-in-DOM condition for the XPath;
    the ``text`` attribute of whatever it returns is handed back.
    """
    locator = (By.XPATH, xpath)
    element = wait.until(EC.presence_of_element_located(locator))
    return element.text

# XPath of the results summary table body on the MolProbity results page.
_SUMMARY_TABLE_XPATH = '/html/body/table/tbody/tr[2]/td/div/p[1]/table/tbody'

# Metric name -> (row, column) of the summary-table cell holding its value.
# Both the success and the failure result are built from these keys, so the
# two can never expose different columns.
_METRIC_CELLS = {
    'poor_rotamers': (3, 3),
    'poor_rotamers_percentage': (3, 4),
    'favored_rotamers': (4, 2),
    'favored_rotamers_percentage': (4, 3),
    'ramachandran_outliers': (5, 2),
    'ramachandran_outliers_percentage': (5, 3),
    'ramachandran_favored': (6, 2),
    'ramachandran_favored_percentage': (6, 3),
    'rama_distribution_z_score': (7, 2),
    'molprobity': (8, 2),
    'cb_deviations': (9, 2),
    'cb_deviations_percentage': (9, 3),
    'bad_bonds': (10, 2),
    'bad_bonds_percentage': (10, 3),
    'bad_angles': (11, 2),
    'bad_angles_percentage': (11, 3),
}


def _empty_result():
    """Return a result dict with every metric set to None."""
    return dict.fromkeys(_METRIC_CELLS)


def process_file(driver, row, timeout=12600000):
    """Submit one PDB file to the MolProbity web site and scrape its summary.

    Args:
        driver: Selenium WebDriver used to drive the browser.
        row: Mapping-like record providing 'model', 'variant' and 'pdb'
            (path of the PDB file to upload).
        timeout: Seconds each page wait may last. The default is the value
            the notebook has always used and is effectively unbounded
            (roughly 146 days), so a page that never shows the expected
            element blocks the run; pass a smaller value to fail fast.

    Returns:
        dict keyed by the names in ``_METRIC_CELLS``. Values are the cell
        texts read from the results table, or all None when the PDB file is
        missing or any step raises an ``Exception``.

    Side effects:
        Saves a screenshot to ``{MOLPROBITY_PATH}{model}/{variant}.png`` and
        prints a message when the row is skipped or fails.
    """
    label = 'unknown row'
    try:
        model = row['model']
        variant = row['variant']
        label = f'{model}/{variant}'

        pdb_file = row['pdb']
        file_path = os.path.abspath(pdb_file)

        # The 'pdb' column can hold placeholders such as "File not found";
        # detect that here instead of loading the site only to fail on upload.
        if not os.path.isfile(file_path):
            print(f"Skipping {label}: PDB file not found ({pdb_file})")
            return _empty_result()

        wait = WebDriverWait(driver, timeout)
        driver.get('http://molprobity.biochem.duke.edu/')

        # Fill the file-upload field with the absolute path of the PDB file.
        xpath = '/html/body/table/tbody/tr[2]/td[2]/div[1]/form/div/table/tbody/tr[3]/td[1]/input'
        file_field = wait.until(EC.presence_of_element_located((By.XPATH, xpath)))
        file_field.send_keys(file_path)

        # First sequence of form buttons/links, clicked in page order.
        # NOTE(review): what each control does is not visible from this code;
        # the positional XPaths depend on the site's current layout.
        click_button('/html/body/table/tbody/tr[2]/td[2]/div[1]/form/div/table/tbody/tr[3]/td[3]/input', wait)
        click_button('/html/body/table/tbody/tr[2]/td/div/form/input[3]', wait)
        click_button('/html/body/table/tbody/tr[2]/td[2]/div[1]/div/table/tbody/tr/td[1]/table/tbody/tr[1]/td[2]/a', wait)
        click_button('/html/body/table/tbody/tr[2]/td/div/form/p[4]/table/tbody/tr/td[1]/input', wait)

        # Clicking the element named 'cmd' raises a browser alert; wait up to
        # 10 seconds for it and dismiss it before continuing.
        button = wait.until(EC.presence_of_element_located((By.NAME, 'cmd')))
        button.click()
        WebDriverWait(driver, 10).until(EC.alert_is_present())
        alert = Alert(driver)
        alert.dismiss()

        # Second sequence of buttons/links leading to the results page.
        click_button('/html/body/table/tbody/tr[2]/td/div/form/input[3]', wait)
        click_button('/html/body/table/tbody/tr[2]/td[2]/div[1]/div/table/tbody/tr/td[1]/table/tbody/tr[1]/td[2]/a', wait)
        click_button('/html/body/table/tbody/tr[2]/td/div/form/p[2]/table/tbody/tr/td[1]/input', wait)

        # Block until the summary table is present on the page.
        wait.until(EC.presence_of_element_located((By.XPATH, _SUMMARY_TABLE_XPATH)))

        # save_screenshot reports an I/O error by returning False rather than
        # raising, so make sure the per-model directory exists beforehand.
        screenshot_path = f'{MOLPROBITY_PATH}{model}/{variant}.png'
        os.makedirs(os.path.dirname(screenshot_path), exist_ok=True)
        driver.save_screenshot(screenshot_path)

        # Read every metric cell, in the order the keys are declared.
        return {
            metric: get_value(f'{_SUMMARY_TABLE_XPATH}/tr[{table_row}]/td[{table_col}]', wait)
            for metric, (table_row, table_col) in _METRIC_CELLS.items()
        }
    except Exception as e:
        # Best effort: report which row failed and return an all-None result
        # so the caller can carry on with the remaining rows.
        print(f"MolProbity failed for {label}: {type(e).__name__}: {e}")
        return _empty_result()

# global_results_df = pd.DataFrame()

def process_row(row):
    """Run ``process_file`` for one row in a fresh Chrome instance.

    Args:
        row: Record passed through unchanged to ``process_file``.

    Returns:
        Whatever ``process_file`` returns for this row.

    A new browser is started with the module-level ``options`` and is always
    shut down afterwards, including when ``process_file`` raises (for example
    on a keyboard interrupt during a long wait), so no Chrome process is
    left behind.
    """
    driver = webdriver.Chrome(options=options)
    try:
        return process_file(driver, row)
    finally:
        driver.quit()
    
    # # Merge the result with the row, replacing NaN values
    # for key, value in result.items():
    #     row[key] = value
    
    # # Save the result to a CSV file
    # file_exists = os.path.isfile('results.csv')
    # with open('results.csv', mode='a', newline='') as file:
    #     writer = csv.writer(file)
    #     if not file_exists:
    #         # Write the header
    #         header = list(row.index)
    #         writer.writerow(header)
        
    #     # Write the row data
    #     row_data = list(row.values)
    #     writer.writerow(row_data)

    
# def process_row(row):
#     driver = webdriver.Chrome(options=options)
#     result = process_file(driver, row)
#     driver.quit()
#     for key, value in result.items():
#         row[key] = value

#     return row
    

In [49]:
# Load the semicolon-separated validation table and preview its first rows.
validation_csv = f'{CSV_PATH}validation.csv'
validation_df = pd.read_csv(validation_csv, sep=';')
validation_df.head()

Unnamed: 0,gene,identifier,variant,fasta,model,pdb,errat,verify,poor_rotamers,poor_rotamers_percentage,...,ramachandran_favored,ramachandran_favored_percentage,rama_distribution_z_score,molprobity,cb_deviations,cb_deviations_percentage,bad_bonds,bad_bonds_percentage,bad_angles,bad_angles_percentage
0,atpE,Rv1305,atpE_Ala63Pro,MDPTIAAGALIGGGLIMAGGAIGAGIGDGVAGNALISGVARQPEAQ...,swiss_model,../../data/pdb/swiss_model/atpE_Ala63Pro.pdb,100.0,29.63% >= 0.1,0.0,0.00%,...,77.0,97.47%,3.15 ± 0.90,0.61,0.0,0.00%,0 / 582,0.00%,8 / 791,1.01%
1,atpE,Rv1305,atpE_Ala63Pro,MDPTIAAGALIGGGLIMAGGAIGAGIGDGVAGNALISGVARQPEAQ...,colab_alphafold2,../../data/pdb/colab_alphafold2/atpE_Ala63Pro.pdb,100.0,48.15% >= 0.1,2.0,3.77%,...,75.0,94.94%,1.17 ± 0.93,2.66,0.0,0.00%,20 / 581,3.44%,3 / 789,0.38%
2,atpE,Rv1305,atpE_Ala63Pro,MDPTIAAGALIGGGLIMAGGAIGAGIGDGVAGNALISGVARQPEAQ...,modeller,../../data/pdb/modeller/atpE_Ala63Pro.pdb,100.0,28.40% >= 0.1,1.0,1.89%,...,78.0,98.73%,4.56 ± 0.91,2.31,1.0,1.49%,0 / 582,0.00%,12 / 791,1.52%
3,atpE,Rv1305,atpE_Ala63Pro,MDPTIAAGALIGGGLIMAGGAIGAGIGDGVAGNALISGVARQPEAQ...,phyre2,../../data/pdb/phyre2/atpE_Ala63Pro.pdb,100.0,24.69% >= 0.1,0.0,0.00%,...,74.0,93.67%,-1.94 ± 0.86,2.18,0.0,0.00%,3 / 581,0.52%,0 / 789,0.00%
4,atpE,Rv1305,atpE_Ala63Pro,MDPTIAAGALIGGGLIMAGGAIGAGIGDGVAGNALISGVARQPEAQ...,i_tasser,File not found,,,,,...,,,,,,,,,,


In [50]:
# Number of rows in the validation table.
len(validation_df)

3072

In [51]:
# Progress summary: a row counts as done once 'poor_rotamers' is filled in.
total = validation_df.shape[0]
done = int(validation_df["poor_rotamers"].notna().sum())
todo = total - done
print(f"Total: {total}\n Done: {done}\n Todo: {todo}")
      

Total: 3072
 Done: 1558
 Todo: 1514


In [52]:
# Keep only the rows whose 'poor_rotamers' value is still missing.
is_pending = validation_df['poor_rotamers'].isna()
not_concluded_df = validation_df.loc[is_pending]
not_concluded_df.head()

Unnamed: 0,gene,identifier,variant,fasta,model,pdb,errat,verify,poor_rotamers,poor_rotamers_percentage,...,ramachandran_favored,ramachandran_favored_percentage,rama_distribution_z_score,molprobity,cb_deviations,cb_deviations_percentage,bad_bonds,bad_bonds_percentage,bad_angles,bad_angles_percentage
4,atpE,Rv1305,atpE_Ala63Pro,MDPTIAAGALIGGGLIMAGGAIGAGIGDGVAGNALISGVARQPEAQ...,i_tasser,File not found,,,,,...,,,,,,,,,,
5,atpE,Rv1305,atpE_Ala63Pro,MDPTIAAGALIGGGLIMAGGAIGAGIGDGVAGNALISGVARQPEAQ...,rosetta,File not found,,,,,...,,,,,,,,,,
7,atpE,Rv1305,atpE_Ala63Pro,MDPTIAAGALIGGGLIMAGGAIGAGIGDGVAGNALISGVARQPEAQ...,foldX,File not found,,,,,...,,,,,,,,,,
12,atpE,Rv1305,atpE_Asp28Ala,MDPTIAAGALIGGGLIMAGGAIGAGIGAGVAGNALISGVARQPEAQ...,i_tasser,File not found,,,,,...,,,,,,,,,,
13,atpE,Rv1305,atpE_Asp28Ala,MDPTIAAGALIGGGLIMAGGAIGAGIGAGVAGNALISGVARQPEAQ...,rosetta,File not found,,,,,...,,,,,,,,,,


In [53]:
# Restrict the pending rows to the models listed here.
models_to_keep = [
    "alphafold3",
    "colab_alphafold2",
    "swiss_model",
    "modeller",
    "phyre2",
]
has_kept_model = not_concluded_df['model'].isin(models_to_keep)
not_concluded_df = not_concluded_df.loc[has_kept_model]

In [54]:
# Count the pending rows left after the model filter.
# The count is computed outside the f-string: reusing double quotes inside a
# double-quoted f-string is a SyntaxError on Python versions before 3.12.
pending_count = int(not_concluded_df['poor_rotamers'].isna().sum())
print(f"ToDo filtered models: {pending_count}")

ToDo filtered models: 362


In [55]:
# Cast every column of the pending rows to str before they are processed.
# NOTE(review): how missing values are rendered by this cast (e.g. as the
# text 'nan') depends on the pandas version — confirm if any code reads them.
not_concluded_df = not_concluded_df.astype(str)

In [None]:
# # driver = webdriver.Chrome()
# Process the pending rows one at a time and copy each returned metric into
# the matching row of validation_df (matched by index label).
pending_total = len(not_concluded_df)
for position, (index, row) in enumerate(not_concluded_df.iterrows(), start=1):
    print(f"Processing {position}/{pending_total}")
    for key, value in process_row(row).items():
        validation_df.loc[index, key] = value
# # driver.quit()


# not_concluded_df = not_concluded_df.head(100)
# not_concluded_df.parallel_apply(process_row, axis=1)


Processing 1/362


The chromedriver version (132.0.6834.83) detected in PATH at /usr/local/bin/chromedriver might not be compatible with the detected chrome version (133.0.6943.54); currently, chromedriver 133.0.6943.53 is recommended for chrome 133.*, so it is advised to delete the driver in PATH and retry


In [None]:

# def update_validation_df_from_csv(validation_df, csv_file='results.csv'):
#     with open(csv_file, mode='r') as file:
#         reader = csv.DictReader(file)
#         for row in reader:
#             index = int(row['index'])  # Assuming 'index' is the name of the index column
#             for key, value in row.items():
#                 if key != 'index':
#                     validation_df.loc[index, key] = value
#     return validation_df
# # Update the validation_df with the updated rows
# validation_df = update_validation_df_from_csv(validation_df)

In [47]:
# Progress summary: a row counts as done once 'poor_rotamers' is filled in.
total = validation_df.shape[0]
done = int(validation_df["poor_rotamers"].notna().sum())
todo = total - done
print(f"Total: {total}\n Done: {done}\n Todo: {todo}")
      

Total: 3072
 Done: 1558
 Todo: 1514


In [48]:
# Write the table to validation.csv under CSV_PATH (semicolon-separated,
# without the index column).
output_csv = f"{CSV_PATH}validation.csv"
validation_df.to_csv(output_csv, sep=';', index=False)