# PDB Mycobrowser analysis

### Import libraries and load dependencies:

In [None]:
import pandas as pd  # Import pandas for data manipulation

from selenium import webdriver  # Import webdriver for browser automation
from selenium.webdriver.common.by import By  # Import By for locating elements
from selenium.webdriver.chrome.service import Service  # Import Service for Chrome driver
from selenium.webdriver.support.ui import WebDriverWait  # Import WebDriverWait for waiting for elements
from selenium.webdriver.support import expected_conditions as EC  # Import expected_conditions for waiting for conditions

# Configure a headless Chrome session.
options = webdriver.ChromeOptions()
for chrome_flag in (
    '--headless',               # no visible browser window
    '--no-sandbox',             # turn off Chrome's sandbox
    '--disable-dev-shm-usage',  # do not rely on /dev/shm
):
    options.add_argument(chrome_flag)

# Single browser instance shared by every cell below.
# NOTE(review): no driver.quit() is visible in this part of the notebook —
# confirm the browser is closed once scraping is done.
driver = webdriver.Chrome(options=options)


### Define global variables

In [None]:
# Directory holding the input and output CSV files (relative path).
# File names are appended to it directly in f-strings, so the trailing
# slash is required.
CSV_PATH = "../../data/csv/"

### Load dataset

In [None]:
# The input file is semicolon-separated, hence the explicit separator.
wild_csv_file = f'{CSV_PATH}complete_wild.csv'
wild_df = pd.read_csv(wild_csv_file, sep=';')


### Getting PDB information from the MycoBrowser URL

In [None]:
wild_df.head()  # Preview the first rows of the loaded dataset as a quick sanity check


In [None]:
# Report the dataset dimensions: the full shape, then rows and columns separately.
shape_report = (
    f"Dataset shape: {wild_df.shape}",
    f"Rows: {wild_df.shape[0]}",
    f"Columns: {wild_df.shape[1]}",
)
print(*shape_report, sep="\n")


In [None]:
wild_df.describe()  # Summary statistics of the loaded dataset (rendered by the notebook)


Helper functions to look up PDB information on a gene's MycoBrowser page (the results are saved to a CSV file at the end of the notebook)

In [None]:
def wait_column_menu(driver, timeout=10):
    """Block until the page's column menu element is present in the DOM.

    Args:
        driver: Selenium WebDriver that has already navigated to the page.
        timeout: Maximum number of seconds to wait. Defaults to 10, the
            value that was previously hard-coded.

    Raises:
        selenium.common.exceptions.TimeoutException: Raised by
            ``WebDriverWait.until`` when the element does not appear
            within ``timeout`` seconds.
    """
    # Positional XPath: it depends on the current page layout and will need
    # updating if the structure under #main changes.
    column_xpath = '//*[@id="main"]/div[5]/div[1]'
    WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.XPATH, column_xpath))
    )

In [None]:
def find_structural_information(driver):
    """Return the 'Structural information' panel heading, or None if absent.

    Searches the current page for ``div.panel-heading`` elements whose text
    contains 'Structural information' and returns the first match.
    """
    matches = driver.find_elements(
        By.XPATH,
        "//div[@class='panel-heading' and contains(text(), 'Structural information')]",
    )
    if not matches:
        return None
    return matches[0]


In [None]:
def find_pdb_information(structural_information):
    """Return the 'Protein Data Bank' table row of the panel, or None.

    ``structural_information`` is the panel heading element; the search is
    scoped to its parent element so only rows of that panel are considered.
    """
    panel = structural_information.find_element(By.XPATH, "..")
    # Rows that have a cell whose text mentions the Protein Data Bank.
    pdb_rows = panel.find_elements(
        By.XPATH, ".//tr[td[contains(text(), 'Protein Data Bank')]]"
    )
    if not pdb_rows:
        return None
    return pdb_rows[0]


In [None]:
def find_pdb_links(pdb_information):
    """Return the anchor elements in the row's second cell, or None if there are none."""
    anchors = pdb_information.find_elements(By.XPATH, ".//td[2]//a")
    # An empty list is normalised to None so callers can test a single value.
    return anchors or None

In [None]:
def get_pdb_info(driver):
    """Classify the page currently loaded in ``driver`` by its PDB content.

    Waits for the column menu, then drills down from the structural
    information panel to the PDB row to its links. Returns one of four
    status messages describing how far that lookup got.
    """
    wait_column_menu(driver)

    panel_heading = find_structural_information(driver)
    if not panel_heading:
        return "No structural information found"

    pdb_row = find_pdb_information(panel_heading)
    if not pdb_row:
        return "No PDB information found"

    # The row exists; what matters now is whether it actually links anywhere.
    if find_pdb_links(pdb_row):
        return "PDB information found"
    return "No url found."

Process each gene in the dataset

In [None]:
# Visit every gene's MycoBrowser page and record its PDB status.
pdb_table = []  # One dict per successfully processed gene
total_genes = len(wild_df)
gene_url_pairs = zip(wild_df["gene"], wild_df["mycobrowser_url"])

# Count positions with enumerate rather than the DataFrame index, so the
# progress message stays correct even if wild_df has a non-default index
# (for example after filtering or sorting).
for position, (gene, url) in enumerate(gene_url_pairs, start=1):
    print(f"Processing genes... ({position} from {total_genes})")
    try:
        driver.get(url)  # Load the gene page
        pdb_info = get_pdb_info(driver)  # Status message for that page
    except Exception as e:
        # Best-effort scraping: a failing page (timeout, bad URL, ...) is
        # reported and skipped so one gene cannot abort the whole run.
        # Skipped genes get no row in pdb_table.
        print(f"Error processing gene {gene}: {e}")
    else:
        pdb_table.append(
            {
                "gene": gene,
                "mycobrowser_URL": url,
                "has_PDB_info": pdb_info,
            }
        )


Create a DataFrame from the PDB information

In [None]:
# Build the results table from the per-gene dicts.
# The columns are named explicitly so that an empty pdb_table (every gene
# failed) still yields a frame with the expected columns. Without this the
# later pdb_df["has_PDB_info"] lookup would raise KeyError.
pdb_df = pd.DataFrame(
    pdb_table,
    columns=["gene", "mycobrowser_URL", "has_PDB_info"],
)
pdb_df.head()  # Preview the first rows of the results table


In [None]:
pdb_df.describe()  # Summary statistics of the results table (rendered by the notebook)


In [None]:
pdb_df["has_PDB_info"].value_counts()  # Number of genes per status message returned by get_pdb_info


Save the PDB information to a CSV file


In [None]:
# Write the results next to the input data, semicolon-separated like the
# input file and without the DataFrame index column.
output_csv_file = f"{CSV_PATH}complete_wild_pdb_mycobrowser.csv"
pdb_df.to_csv(output_csv_file, sep=';', index=False)