# Modeller

### To run the Modeller script, Modeller must be installed on your machine and the kernel must not use the virtual environment: click the kernel selector at the top right of this cell and choose a Python kernel.

### Import libraries and load dependencies

In [None]:
import glob  # Import glob library for file pattern matching
import os  # Import os library for path handling and file removal
import shutil  # Import shutil library for file operations

import pandas as pd  # Import pandas library for data manipulation
from modeller import *  # Import all from modeller
from modeller.automodel import *  # Import all from modeller.automodel
from pandarallel import pandarallel  # Import pandarallel for parallel processing with pandas

pandarallel.initialize()  # Initialize pandarallel


### Define global variables

In [None]:
CSV_PATH = "../../data/csv/fasta_variant.csv"  # Semicolon-separated variant table; read at start and rewritten with the 'modeller' status column
PDB_PATH = "../../data/pdb/modeller"  # Directory that receives each variant's model as '<variant>.pdb' ('_p.' replaced by '_')
ALI_PATH = "../../data/ali"  # Directory holding one alignment file per variant, '<variant>.ali' ('_p.' replaced by '_')

### Get models

In [None]:
def move_pdb_file(variant):
    """Move the PDB file produced for ``variant`` into ``PDB_PATH``.

    Looks in the current working directory for files named
    ``<variant>*.pdb`` and moves the first one (in sorted order) to
    ``PDB_PATH/<name>.pdb``, where ``<name>`` is ``variant`` with ``'_p.'``
    replaced by ``'_'``. Prints a message and returns without moving
    anything when no such file exists.

    Args:
        variant: Variant code exactly as stored in the ``variant`` column.
    """
    # Escape the variant so glob metacharacters inside it (variant names may
    # contain '*' or '?') are matched literally instead of acting as wildcards.
    pdb_files = sorted(glob.glob(f"{glob.escape(variant)}*.pdb"))
    if not pdb_files:
        print(f"No PDB files found for variant {variant}")
        return

    file_to_move = pdb_files[0]  # sorted() makes this choice deterministic
    destination_file = os.path.join(PDB_PATH, f"{variant.replace('_p.', '_')}.pdb")

    # shutil.move does not create missing parent directories.
    os.makedirs(PDB_PATH, exist_ok=True)
    shutil.move(file_to_move, destination_file)
    print(f"Moved file: {file_to_move} to {destination_file}")

def delete_other_files(variant):
    """Delete the non-PDB files in the working directory that belong to ``variant``.

    Every regular file matching ``<variant>*.*`` whose name does not end in
    ``.pdb`` is removed and reported on stdout. Directories that happen to
    match the pattern are left alone.

    Args:
        variant: Variant code used as the file-name prefix.
    """
    # Escape the variant so a name containing '*', '?' or '[' only matches its
    # own files; unescaped, 'X_p.W2*' would also select 'X_p.W2C...' files.
    pattern = f"{glob.escape(variant)}*.*"
    for path in glob.glob(pattern):
        # os.remove raises on directories, so only regular files are deleted.
        if path.endswith(".pdb") or not os.path.isfile(path):
            continue
        os.remove(path)
        print(f"Deleted file: {path}")

def process_variant(row):
    """Build one model for the variant in ``row`` and record the outcome.

    Reads ``ALI_PATH/<name>.ali`` (``<name>`` is the variant with ``'_p.'``
    replaced by ``'_'``), uses the code found in the first ``>`` header as the
    template, runs ``AutoModel`` for a single model, then moves the resulting
    PDB into ``PDB_PATH`` and deletes the other files created for the variant.

    Args:
        row: Row of the variant table; must provide ``row['variant']``.

    Returns:
        The same ``row``. ``row['modeller']`` is set to ``'concluded'`` only
        when the final PDB file exists in ``PDB_PATH``; otherwise the row is
        returned unchanged and a message is printed.
    """
    variant_raw = row['variant']
    variant = variant_raw.replace("_p.", "_")
    ali_file = os.path.join(ALI_PATH, f"{variant}.ali")

    if not os.path.exists(ali_file):
        print(f"ALI file for variant {variant} not found.")
        return row

    with open(ali_file, "r") as file:
        headers = [line for line in file if line.startswith('>')]

    # The template code is the field after the first ';' of the first header
    # (PIR-style '>P1;<code>' — see the MODELLER alignment file format).
    fields = headers[0].split(';') if headers else []
    gene = fields[1].strip() if len(fields) > 1 else ''
    if not gene:
        print(f"ALI file for variant {variant} has no usable '>' header.")
        return row

    env = Environ()
    # Directories searched for the template structure. The original entries
    # come first so existing lookups are unchanged; the '../../' variant is
    # added because every other data path in this file is relative to
    # '../../data' — NOTE(review): confirm where the BLAST templates live.
    env.io.atom_files_directory = ['.', 'data/pdb/blast', '../../data/pdb/blast']

    a = AutoModel(env,
                  alnfile=ali_file,  # Alignment filename
                  knowns=gene,  # Code of the template
                  sequence=variant_raw)  # Code of the target
    a.starting_model = 1  # Build exactly one model:
    a.ending_model = 1  # first index == last index

    a.make()

    move_pdb_file(variant_raw)
    delete_other_files(variant_raw)

    # move_pdb_file only prints a message when no PDB was produced, so verify
    # that the model really reached PDB_PATH before reporting success.
    if os.path.isfile(os.path.join(PDB_PATH, f"{variant}.pdb")):
        row['modeller'] = 'concluded'
    else:
        print(f"No model was produced for variant {variant}; status left unchanged.")
    return row

def check_and_update_status(row):
    """Report whether the model for ``row['variant']`` already exists.

    The expected file is ``PDB_PATH/<name>.pdb``, where ``<name>`` is the
    variant with ``'_p.'`` replaced by ``'_'``. The path is printed before
    it is checked.

    Returns:
        ``'concluded'`` if the file exists, otherwise ``'not_concluded'``.
    """
    pdb_name = row["variant"].replace('_p.', '_')
    file_path = f"{PDB_PATH}/{pdb_name}.pdb"
    print(file_path)
    return 'concluded' if os.path.isfile(file_path) else 'not_concluded'

def update_status(df, column):
    """Set ``df[column]`` from the PDB files currently on disk.

    Each row is passed to ``check_and_update_status`` and the returned status
    string is stored in ``column``, which is created if missing and
    overwritten otherwise.

    Args:
        df: Variant table; each row must provide a ``'variant'`` value.
        column: Name of the status column to write.

    Returns:
        The same DataFrame, modified in place.
    """
    # Built row by row instead of with df.apply(axis=1): on an empty frame
    # apply returns a DataFrame rather than a Series, and assigning that to a
    # single column fails. A list works for any number of rows, including zero.
    df[column] = [check_and_update_status(row) for _, row in df.iterrows()]
    print("Status updated based on existing files")
    return df


In [None]:

# Load the variant table and derive every row's status from the PDB files on disk.
variant_df = pd.read_csv(CSV_PATH, sep=';')
variant_df = update_status(variant_df, 'modeller')

print(f"Total count: {len(variant_df)}")  # Print the total count of rows
print(variant_df['modeller'].value_counts())  # Count the values in the modeller column
input("\nPress Enter to continue...")  # Let the user review the counts before modelling starts

not_concluded_df = variant_df[variant_df['modeller'] == 'not_concluded']  # Rows that still need a model

# On a re-run where every variant is already concluded there is nothing to do;
# skip the parallel step instead of handing it an empty DataFrame.
if not not_concluded_df.empty:
    not_concluded_df = not_concluded_df.parallel_apply(process_variant, axis=1)
    variant_df.update(not_concluded_df)  # Copy the new statuses back by index

# Prompt explicitly so this pause does not look like a hang after a long run.
input("\nPress Enter to save the results...")
variant_df.to_csv(CSV_PATH, sep=';', index=False)  # Save the updated DataFrame to CSV

print(f"\n\nTotal count: {len(variant_df)}")  # Print the total count of rows
print(variant_df['modeller'].value_counts())  # Count the values in the modeller column