# Rosetta

## This notebook is an attempt to process the data by scraping the trRosetta website with Selenium: sequences are submitted through the web form and the resulting models are downloaded. Note, however, that the approach that will actually be used is to run the software locally.

### Import libraries and load dependencies

In [None]:
import time  # Import time library for sleep functionality
import pandas as pd  # Import pandas library for data manipulation
import requests  # Import requests library for making HTTP requests
import gzip  # Import gzip library for handling gzip files
import os  # Import os library for file and directory operations
import shutil  # Import shutil library for file operations
from pandarallel import pandarallel  # Import pandarallel for parallel processing with pandas
pandarallel.initialize(progress_bar=True)  # Initialize pandarallel with a progress bar
from selenium import webdriver  # Import webdriver from selenium for web scraping
from selenium.webdriver.common.by import By  # Import By for locating elements
from selenium.webdriver.chrome.service import Service  # Import Service for Chrome driver
from selenium.webdriver.support.ui import WebDriverWait  # Import WebDriverWait for waiting for elements
from selenium.webdriver.support import expected_conditions as EC  # Import expected_conditions for waiting for elements

### Define global variables

In [None]:
# --- Data locations, relative to this notebook ---
CSV_PATH = "../../data/csv/"  # Folder holding the CSV tables (e.g. fasta_variant.csv)
PDB_PATH = "../../data/pdb/rosetta/"  # Folder checked for per-variant .pdb files
FASTA_PATH = '../../data/fasta/variant'  # Folder of per-variant FASTA files (combined with os.path.join, so no trailing slash)
DOWNLOAD_PATH = os.path.expanduser("~/Downloads")  # Current user's Downloads folder
ROSETTA_URL = "https://yanglab.qd.sdu.edu.cn/trRosetta/"  # Page opened for each submission

# --- Chrome options shared by every driver created in this notebook ---
options = webdriver.ChromeOptions()
# Flags, in order: headless mode, no sandbox, do not use /dev/shm.
for _chrome_flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
    options.add_argument(_chrome_flag)

### Get models

In [None]:
def check_and_update_status(row):
    """Return the Rosetta status for one variant row.

    Args:
        row: Mapping-like row (e.g. a pandas Series) with a "variant" entry
            and, optionally, a "rosetta" entry holding the current status.

    Returns:
        'concluded' if ``{PDB_PATH}{variant without 'p.'}.pdb`` exists.
        Otherwise the row's current "rosetta" value, except that a missing,
        non-string (e.g. NaN), empty or 'None' status is reported as
        'not_concluded' so the variant is picked up again for submission.
    """
    variant = row["variant"]
    filename = variant.replace('p.', '')  # File names do not carry the 'p.' prefix
    file_path = f"{PDB_PATH}{filename}.pdb"

    if os.path.isfile(file_path):
        return 'concluded'

    current_status = row.get("rosetta", "not_concluded")
    # NaN appears when the CSV cell was empty (or held a value pandas parses
    # as missing); 'None' is what str() of a failed submission's None result
    # looks like. Neither is a usable status, so fall back to the default.
    if not isinstance(current_status, str) or current_status in ("", "None"):
        return "not_concluded"

    return current_status

def update_rosetta_status(df):
    """Refresh the 'rosetta' status column of ``df`` and return ``df``.

    The column is created with the value 'not_concluded' when absent, then
    every row is passed through ``check_and_update_status``. The DataFrame is
    modified in place; the same object is returned.
    """
    status_col = 'rosetta'
    if status_col not in df.columns:
        df[status_col] = 'not_concluded'  # Default for tables without a status yet

    refreshed = df.apply(check_and_update_status, axis=1)  # One status per row
    df[status_col] = refreshed
    print("Status updated based on existing files")
    return df

def get_fasta(variant):
    """Read the FASTA file of ``variant`` from FASTA_PATH.

    Returns the full text of ``<FASTA_PATH>/<variant>.fasta``, or None (after
    printing a notice) when that file does not exist.
    """
    fasta_file = os.path.join(FASTA_PATH, f'{variant}.fasta')

    # Guard clause: report the missing file and hand back nothing.
    if not os.path.isfile(fasta_file):
        print(f'File {fasta_file} does not exist.')
        return None

    with open(fasta_file, 'r') as handle:
        return handle.read()



In [None]:
# Load the semicolon-separated variant table and preview its first rows.
variant_df = pd.read_csv(CSV_PATH + 'fasta_variant.csv', sep=';')
variant_df.head()


In [None]:
# Re-derive each variant's status from the files on disk, then summarise the statuses.
variant_df = variant_df.pipe(update_rosetta_status)
variant_df['rosetta'].value_counts()


In [None]:
def submit_to_rosetta(driver, fasta_content, variant):
    """Submit one FASTA sequence through the trRosetta web form.

    Args:
        driver: Selenium WebDriver used to drive the page.
        fasta_content: FASTA text typed into the sequence box.
        variant: Variant identifier; "_p." is replaced by "_" before it is
            typed into the form.

    Returns:
        The href of the link found at ``/html/body/a`` after submitting.
        'not_concluded' when there is no FASTA content to submit or when that
        link merely points back to the trRosetta landing page.
        None when waiting for or reading the result link raised an exception.
    """
    # Nothing to type into the form: report it instead of letting
    # send_keys() fail on None/empty input.
    if not fasta_content:
        print(f"No FASTA content for variant {variant}; skipping submission.")
        return 'not_concluded'

    time.sleep(5)  # Fixed 5-second pause before loading the form
    driver.get(ROSETTA_URL)
    wait = WebDriverWait(driver, 600)  # Up to 600 seconds for elements to appear

    # Sequence box (id="PDB")
    fasta_input = wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="PDB"]')))
    fasta_input.send_keys(fasta_content)

    # Radio button with id="infas"
    infas_radio_button = driver.find_element(By.XPATH, '//*[@id="infas"]')
    infas_radio_button.click()

    # Open the form's dropdown and pick the option with id="msafas"
    msa_select = driver.find_element(By.XPATH, '//*[@id="form1"]/select')
    msa_select.click()
    msa_option = driver.find_element(By.XPATH, '//*[@id="msafas"]')
    msa_option.click()

    # Third <input> of the form receives the variant name
    variant_input = driver.find_element(By.XPATH, '//*[@id="form1"]/input[3]')
    variant_input.send_keys(variant.replace("_p.", "_"))

    # Tick the two checkboxes located in the form's first and second <p>
    checkbox1 = driver.find_element(By.XPATH, '//*[@id="form1"]/p[1]/input')
    checkbox1.click()
    checkbox2 = driver.find_element(By.XPATH, '//*[@id="form1"]/p[2]/input')
    checkbox2.click()

    submit_button = driver.find_element(By.XPATH, '//*[@id="submit"]')
    submit_button.click()
    time.sleep(5)  # Give the server a moment before looking for the result link
    try:
        result_element = wait.until(EC.presence_of_element_located((By.XPATH, "/html/body/a")))
        result_url = result_element.get_attribute("href")
        # Compare by value (the original `is` compared object identity and
        # never matched) and ignore a trailing slash, so both spellings of
        # the landing page are recognised as "no job was created".
        landing_page = ROSETTA_URL.rstrip("/")
        if result_url is not None and result_url.rstrip("/") == landing_page:
            print(f"Error for sequence {fasta_content}, variant {variant}.")
            return 'not_concluded'
        return result_url
    except Exception as e:
        print(f"Error: {e}")
        return None


In [None]:
# Variants still waiting to be submitted, followed by a count as a sanity check.
not_concluded_df = variant_df.loc[variant_df['rosetta'].eq('not_concluded')]
not_concluded_df['rosetta'].value_counts()


Submit sequence to Rosetta

In [None]:
# Submit every pending variant to trRosetta and record the job URL it returns.
driver = webdriver.Chrome(options=options)  # Chrome configured by the shared options object
try:
    total_pending = len(not_concluded_df)
    for i, (index, row) in enumerate(not_concluded_df.iterrows()):
        variant = row['variant'].replace("_p.", "_")  # FASTA files are named without the 'p.' prefix
        print(f"Processing variant {variant} ---------------- {i+1} from {total_pending} not concluded")
        fasta = get_fasta(variant)
        if not fasta:
            # Without a sequence the form cannot be filled in; skip this
            # variant instead of aborting the remaining submissions.
            print(f"Skipping variant {variant}: no FASTA content available")
            continue
        print(f"Submitting variant {variant} to Rosetta")
        url = submit_to_rosetta(driver, fasta, variant)
        # A failed submission returns None; keep the row as 'not_concluded'
        # (rather than the text 'None') so it is selected again on the next run.
        variant_df.at[index, 'rosetta'] = str(url) if url else 'not_concluded'
        print(f"URL: {url}")
except Exception as e:
    print(f"Error processing variants: {e}")
finally:
    driver.quit()  # Always release the browser, even after an error



In [None]:
# Persist the statuses/URLs gathered so far back to the variant table.
variant_df.to_csv(CSV_PATH + 'fasta_variant.csv', index=False, sep=';')

In [None]:
variant_df['rosetta'].nunique()  # Number of distinct values in the rosetta column

In [None]:
variant_df['rosetta'].value_counts()  # Frequency of each value in the rosetta column


In [None]:
# Keep only the rows whose status is a job URL. na=False counts missing
# statuses as non-matches; without it the mask would contain NaN for such
# rows and the boolean indexing would raise.
submitted_df = variant_df[variant_df['rosetta'].str.contains('https', na=False)]

In [None]:
submitted_df.head()  # Preview the first rows of the submitted variants

In [None]:
submitted_df['rosetta'].nunique()  # Number of distinct values in the rosetta column of the submitted rows

In [None]:
submitted_df['rosetta'].value_counts()  # Frequency of each value in the rosetta column of the submitted rows


Download models from Rosetta

In [None]:
def download_from_rosetta(driver, url, variant):
    """Download the first model of a trRosetta job if its link is on the page.

    Args:
        driver: Selenium WebDriver used to open the job page.
        url: Job page URL.
        variant: Variant identifier; the model is saved as ``<variant>.pdb``
            inside PDB_PATH.

    Returns:
        'concluded' when the model was downloaded and written to disk;
        otherwise the unchanged ``url`` so the job can be polled again later.
    """
    time.sleep(3)  # Fixed 3-second pause before opening the page
    driver.get(url)

    try:
        # The link whose href contains 'model1.pdb' is looked up directly;
        # find_element raises when it is absent.
        result_element = driver.find_element(By.XPATH, "//a[contains(@href, 'model1.pdb')]")
        result_url = result_element.get_attribute("href")
        print(f"download url: {result_url}")

        # A timeout keeps one stalled download from blocking the whole run.
        response = requests.get(result_url, timeout=60)
        response.raise_for_status()  # Treat HTTP error codes as failures

        # Save into PDB_PATH, the folder check_and_update_status() inspects,
        # instead of the current working directory.
        os.makedirs(PDB_PATH, exist_ok=True)
        with open(os.path.join(PDB_PATH, f'{variant}.pdb'), 'wb') as file:
            file.write(response.content)

        return "concluded"
    except Exception as e:
        # Any failure (link missing, HTTP error, write error) leaves the job
        # pending; the exception type is shown to tell the cases apart.
        print(f"{url} not ready yet ({type(e).__name__})")
        return url



In [None]:

# Poll every submitted job and download its model when available.
driver = webdriver.Chrome(options=options)  # Chrome configured by the shared options object
try:
    total_submitted = len(submitted_df)
    for i, (index, row) in enumerate(submitted_df.iterrows()):
        variant = row['variant'].replace("_p.", "_")  # Output files are named without the 'p.' prefix
        # Progress is counted against the submitted jobs being iterated here
        # (the previous message reported the size of not_concluded_df).
        print(f"Processing variant {variant} ---------------- {i+1} from {total_submitted} submitted")
        url = row['rosetta']
        print(f"URL: {url}")
        status = download_from_rosetta(driver, url, variant)
        # 'concluded' on success, otherwise the same URL so it is retried later
        variant_df.at[index, 'rosetta'] = status
except Exception as e:
    print(f"Error processing variants: {e}")
finally:
    driver.quit()  # Always release the browser, even after an error


In [None]:
# Write the final statuses back to the variant table.
variant_df.to_csv(CSV_PATH + 'fasta_variant.csv', index=False, sep=';')
