# I-Tasser

## This notebook is an attempt to process the data by scraping the I-TASSER website with Selenium. However, the approach that will actually be used is to run the software locally.

### Import libraries and load dependencies

In [None]:
import time  # Import time library for sleep functionality
import pandas as pd  # Import pandas library for data manipulation
import requests  # Import requests library for making HTTP requests
import gzip  # Import gzip library for handling gzip files
import os  # Import os library for file and directory operations
import shutil  # Import shutil library for file operations

from pandarallel import pandarallel  # Import pandarallel for parallel processing with pandas

pandarallel.initialize(progress_bar=True)  # Initialize pandarallel with a progress bar
from selenium import webdriver  # Import webdriver from selenium for web scraping
from selenium.webdriver.common.by import By  # Import By for locating elements
from selenium.webdriver.chrome.service import Service  # Import Service for Chrome driver
from selenium.webdriver.support.ui import WebDriverWait  # Import WebDriverWait for waiting for elements
from selenium.webdriver.support import expected_conditions as EC  # Import expected_conditions for waiting for elements

### Define global variables

In [None]:
# --- Project paths (relative to the notebook's working directory) ---
CSV_PATH = "../../data/csv/"  # CSV tables; used as a string prefix, so the trailing slash matters
PDB_PATH = "../../data/pdb/i_tasser/"  # Where the I-TASSER models are stored; also used as a string prefix
FASTA_PATH = '../../data/fasta/variant'  # Per-variant FASTA files; combined with os.path.join
DOWNLOAD_PATH = os.path.expanduser("~/Downloads")  # Directory where the browser is expected to save downloads
I_TASSER_URL = 'https://zhanglab.comp.nus.edu.sg/I-TASSER/'  # I-TASSER submission page

# --- Credentials ---
# Secrets must not be committed with the notebook, so they are read from the
# environment. Export I_TASSER_EMAIL / I_TASSER_PASSWORD before starting the kernel.
# NOTE(review): a password used to be hardcoded here; treat it as exposed and change it.
EMAIL = os.environ.get("I_TASSER_EMAIL", "veri.piva@furg.br")  # E-mail typed into the submission form
PASSWORD = os.environ.get("I_TASSER_PASSWORD", "")  # Empty string when the variable is not set

# --- Browser configuration ---
options = webdriver.ChromeOptions()  # Options passed to webdriver.Chrome further below
options.add_argument('--headless')  # Run Chrome in headless mode
options.add_argument('--no-sandbox')  # Disable sandbox mode
options.add_argument('--disable-dev-shm-usage')  # Disable dev/shm usage

### Get models

In [None]:
def check_and_update_status(row):
    """Return the I-TASSER status for one row of the variant table.

    A variant counts as ``'concluded'`` when its model is already on disk in
    ``PDB_PATH``. Two file names are accepted: the variant name with every
    ``'p.'`` removed (the name this function has always looked for) and the
    unmodified variant name, which is the name ``wait_result`` gives to the
    files it saves.

    Otherwise the status already stored in the row's ``i_tasser`` entry is kept
    (for example the result URL of a submitted job). A missing, empty or
    non-string value (such as the NaN pandas produces for an empty CSV field)
    is reported as ``'not_concluded'`` so it is never mistaken for a URL.

    Args:
        row: Mapping-like row (e.g. a ``pandas.Series``) with a ``"variant"``
            entry and, optionally, an ``"i_tasser"`` entry.

    Returns:
        str: ``'concluded'``, ``'not_concluded'`` or the stored status.
    """
    variant = row["variant"]
    # Both naming conventions that appear in this notebook are checked.
    candidate_names = (variant.replace('p.', ''), variant)
    if any(os.path.isfile(f"{PDB_PATH}{name}.pdb") for name in candidate_names):
        return 'concluded'

    current_status = row.get("i_tasser", "not_concluded")
    if not isinstance(current_status, str) or not current_status:
        return "not_concluded"
    return current_status

def update_i_tasser_status(df):
    """Recompute the ``i_tasser`` column of ``df`` row by row.

    The column is created with ``'not_concluded'`` when it is absent, then every
    row is passed to ``check_and_update_status`` and the result is written back.
    The DataFrame is modified in place and also returned.

    Args:
        df: Variant table; each row must provide what
            ``check_and_update_status`` reads.

    Returns:
        The same DataFrame object, with the ``i_tasser`` column refreshed.
    """
    has_status_column = 'i_tasser' in df.columns
    if not has_status_column:
        # Seed the column so every row has a status to start from.
        df['i_tasser'] = 'not_concluded'

    refreshed_status = df.apply(check_and_update_status, axis=1)
    df['i_tasser'] = refreshed_status

    print("Status updated based on existing files")
    return df

def get_fasta(variant):
    """Read the FASTA file of a variant from ``FASTA_PATH``.

    Args:
        variant: Variant name; the file looked up is ``<variant>.fasta``.

    Returns:
        The full text of the file, or ``None`` (after printing a message)
        when the file does not exist.
    """
    fasta_file = os.path.join(FASTA_PATH, '{}.fasta'.format(variant))

    # Guard clause: nothing to read, report it and hand back None.
    if not os.path.isfile(fasta_file):
        print(f'File {fasta_file} does not exist.')
        return None

    with open(fasta_file, 'r') as handle:
        return handle.read()

def submit_to_itasser(driver, fasta_content, variant):
    """Submit one sequence through the I-TASSER web form and return the job link.

    Args:
        driver: Selenium WebDriver used to drive the browser.
        fasta_content: FASTA text typed into the sequence box.
        variant: Identifier typed into the form's ID field.

    Returns:
        str: The ``href`` of the link found on the confirmation page, or the
        string ``"error"`` when there is no sequence to submit, the
        confirmation text does not appear in time, or the link cannot be read.
    """
    if not fasta_content:
        # Nothing to type into the form (e.g. the FASTA file was missing).
        print(f"No FASTA content for {variant}; nothing submitted.")
        return "error"

    driver.get(I_TASSER_URL)
    wait = WebDriverWait(driver, 600)  # every wait below may take up to 600 seconds

    def fill(xpath, text):
        # wait.until returns the located element, so no second lookup is needed.
        field = wait.until(EC.presence_of_element_located((By.XPATH, xpath)))
        field.send_keys(text)

    fill('//*[@id="form1"]/textarea', fasta_content)  # sequence box
    fill('//*[@id="form1"]/p[2]/input', EMAIL)  # e-mail field
    fill('//*[@id="form1"]/p[3]/input', variant)  # ID field

    submit_button = wait.until(
        EC.presence_of_element_located((By.XPATH, '//*[@id="form1"]/p[8]/input[1]'))
    )
    submit_button.click()

    try:
        # An XPath ending in text() selects a text node, and Selenium locators
        # must resolve to elements, so the confirmation sentence is looked for
        # in the text of <body> instead.
        wait.until(
            EC.text_to_be_present_in_element(
                (By.TAG_NAME, 'body'),
                "The sequence has been successfully submitted to the I-TASSER server.",
            )
        )
        link_element = driver.find_element(By.XPATH, '/html/body/p/a')
        return link_element.get_attribute('href')
    except Exception as error:
        # Keep the "error" sentinel for callers, but say what went wrong.
        print(f"Submission of {variant} to I-TASSER failed: {error}")
        return "error"

def wait_result(driver, result_url, variant, download_timeout=10):
    """Wait for an I-TASSER job to finish and store its ``model1.pdb``.

    The result page is opened and polled until a download link for
    ``model1.pdb`` is present. The link is then opened so the browser downloads
    the file, and the downloaded file is moved to ``PDB_PATH`` as
    ``<variant>.pdb``.

    Args:
        driver: Selenium WebDriver used to drive the browser.
        result_url: URL of the job's result page.
        variant: Variant name used for the stored file.
        download_timeout: Maximum number of seconds to wait for the downloaded
            file to appear in the download directory.

    Returns:
        bool: ``True`` when the file was moved, ``False`` when no downloaded
        file was found.

    Raises:
        Any Selenium error other than a wait timeout (for example a lost
        browser session) is propagated instead of being retried forever.
    """
    # Local import: only this function needs the exception class.
    from selenium.common.exceptions import TimeoutException

    download_dir = os.path.expanduser("~/Downloads")
    pdb_file = os.path.join(download_dir, "model1.pdb")

    driver.get(result_url)
    while True:
        try:
            element = WebDriverWait(driver, 600).until(
                EC.presence_of_element_located((By.XPATH, "//a[@href='model1.pdb' and @download]"))
            )
            break
        except TimeoutException as e:
            print(f"Result not ready yet. Waiting 10 seconds. {e}")
            time.sleep(10)
            # Reload so that a page updated on the server is fetched again.
            driver.refresh()

    # A leftover model1.pdb from an earlier run would otherwise be moved below
    # and stored under this variant's name.
    if os.path.exists(pdb_file):
        os.remove(pdb_file)

    pdb_url = element.get_attribute("href")
    driver.get(pdb_url)  # opening the link makes the browser download the file

    # Poll for the file instead of sleeping for a fixed time.
    deadline = time.monotonic() + download_timeout
    while not os.path.exists(pdb_file) and time.monotonic() < deadline:
        time.sleep(0.5)

    if os.path.exists(pdb_file):
        os.makedirs(PDB_PATH, exist_ok=True)
        shutil.move(pdb_file, os.path.join(PDB_PATH, f"{variant}.pdb"))
        print(f"Moved file: {pdb_file} to {PDB_PATH}")
        return True

    print("PDB file not found in the download directory.")
    return False


In [None]:
# Load the semicolon-separated variant table and preview its first rows.
variant_table_path = CSV_PATH + 'fasta_variant.csv'
variant_df = pd.read_csv(variant_table_path, sep=';')
variant_df.head()


In [None]:
# Refresh the per-variant status from the PDB files on disk, then tally the statuses.
variant_df = update_i_tasser_status(variant_df)
status_column = variant_df['i_tasser']
status_column.value_counts()


Status updated based on existing files


i_tasser
not_concluded    383
concluded          1
Name: count, dtype: int64

In [None]:
variant_df.head(n=5)  # Preview the table after the status refresh (5 rows is the default of head)


In [None]:
# Jobs that were already submitted in an earlier session: row label -> result URL.
# Recording the URL makes the processing loop wait for that job instead of
# submitting the sequence again.
# The statuses computed by update_i_tasser_status are deliberately left untouched:
# resetting the whole column here would discard what was derived from the PDB files.
RESUMED_JOBS = {
    1: 'https://zhanglab.comp.nus.edu.sg/I-TASSER/output/S44/',
}

for row_label, job_url in RESUMED_JOBS.items():
    # Only rows that exist and are still pending are changed, so re-running the cell is harmless.
    if row_label in variant_df.index and variant_df.at[row_label, 'i_tasser'] == 'not_concluded':
        variant_df.at[row_label, 'i_tasser'] = job_url

variant_df.head()  # Display the first few rows of the DataFrame

In [None]:
driver = webdriver.Chrome(options=options)  # One browser session shared by all variants

# Everything that is not 'concluded' still needs work: either a fresh submission
# ('not_concluded') or waiting on an already submitted job (the status holds its URL).
# .copy() makes the later status update on this frame explicit and warning-free.
filtered_df = variant_df[variant_df['i_tasser'] != 'concluded'].copy()
total_variants = len(filtered_df)

try:
    for i, (index, row) in enumerate(filtered_df.iterrows()):
        variant = row['variant']
        print(f"Processing variant {variant} ---------------- {i+1} from {total_variants}")

        if row['i_tasser'] == 'not_concluded':
            print("Model not processed yet.")
            fasta = get_fasta(variant)
            if not fasta:
                # get_fasta returns None when the file is missing.
                print(f"No FASTA sequence available for variant {variant}; skipping.")
                continue

            print(f"Submitting variant {variant} to I-TASSER")
            job_link = submit_to_itasser(driver, fasta, variant)
            # submit_to_itasser returns the string "error" on failure, which has
            # no '/'-separated job ID to extract.
            link_parts = job_link.split('/') if isinstance(job_link, str) else []
            if len(link_parts) < 2:
                print(f"Submission failed for variant {variant}; skipping.")
                continue

            job_id = link_parts[-2]  # second-to-last path component of the job link
            result_url = f"https://zhanglab.comp.nus.edu.sg/I-TASSER/output/{job_id}/"
            # Remember the job so an interrupted run can resume instead of resubmitting.
            variant_df.at[index, 'i_tasser'] = result_url
            filtered_df.at[index, 'i_tasser'] = result_url
        else:
            print("Model already submitted to I-TASSER")
            result_url = row['i_tasser']  # the stored status is the result URL

        print(f"Model processing... You can check the current status on {result_url}")
        if wait_result(driver, result_url, variant):
            print(f"Model processed successfully for variant {variant}")
            variant_df.at[index, 'i_tasser'] = "concluded"
            filtered_df.at[index, 'i_tasser'] = "concluded"
except Exception as e:
    print(f"Error processing variants: {e}")
finally:
    driver.quit()  # Always release the browser, even after an error

variant_df['i_tasser'].value_counts()  # Count the values in the i_tasser column

Unnamed: 0,gene,identifier,variant,fasta,swiss_model,phyre2,colab_alphafold2,i_tasser,modeller,roseta,alphafold3
0,atpE,Rv1305,atpE_p.Ala63Pro,MDPTIAAGALIGGGLIMAGGAIGAGIGDGVAGNALISGVARQPEAQ...,concluded,concluded,concluded,concluded,concluded,not_concluded,concluded
1,atpE,Rv1305,atpE_p.Asp28Ala,MDPTIAAGALIGGGLIMAGGAIGAGIGAGVAGNALISGVARQPEAQ...,concluded,concluded,concluded,https://zhanglab.comp.nus.edu.sg/I-TASSER/outp...,concluded,not_concluded,concluded
2,atpE,Rv1305,atpE_p.Asp28Gly,MDPTIAAGALIGGGLIMAGGAIGAGIGGGVAGNALISGVARQPEAQ...,concluded,concluded,concluded,not_concluded,concluded,not_concluded,concluded
3,atpE,Rv1305,atpE_p.Asp28Val,MDPTIAAGALIGGGLIMAGGAIGAGIGVGVAGNALISGVARQPEAQ...,concluded,concluded,concluded,not_concluded,concluded,not_concluded,concluded
4,atpE,Rv1305,atpE_p.Glu61Asp,MDPTIAAGALIGGGLIMAGGAIGAGIGDGVAGNALISGVARQPEAQ...,concluded,concluded,concluded,not_concluded,concluded,not_concluded,concluded


In [None]:
# Persist the updated table in the CSV directory, without writing the DataFrame index.
output_csv = f'{CSV_PATH}fasta_variant_2.csv'
variant_df.to_csv(output_csv, sep=';', index=False)