

### **Note:** 
#### Be careful: every file in the input directory (`input_dir`) is deleted when this notebook is executed.

### Import libraries, load dependencies, and define variables

In [None]:
import os  # Import os library for file and directory operations
import zipfile  # Import zipfile library for handling zip files
import glob  # Import glob library for file pattern matching
import shutil  # Import shutil library for file operations
import pandas as pd  # Import pandas library for data manipulation

# Module-level paths shared by the cells below; all are relative to the directory the notebook runs from.
input_dir = 'input'  # Folder scanned for downloaded results (.zip, .fasta, .pdb); the cells below consume and delete its contents
output_dir = 'output'  # Destination passed to extract_fasta_files(); rebound later for the Phyre2 step

CSV_PATH = "../../data/csv"  # Folder holding fasta_variant.csv and fasta_wild.csv, the status tables this notebook rewrites

## Common functions

In [None]:
def check_and_update_status(row):
    """Report whether a model file exists for the variant stored in *row*.

    The file looked up is ``<PDB_PATH>/<name>.pdb`` where ``<name>`` is
    ``row["variant"]`` with every ``'_p.'`` replaced by ``'_'``. ``PDB_PATH``
    is read from the module-level global at call time, so the caller must
    set it before invoking this function.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with a ``"variant"``
            string entry.

    Returns:
        ``'concluded'`` when the file exists, otherwise ``'not_concluded'``.
    """
    pdb_stem = row["variant"].replace('_p.', '_')
    file_path = f"{PDB_PATH}/{pdb_stem}.pdb"
    print(file_path)  # Trace the path being probed
    return 'concluded' if os.path.isfile(file_path) else 'not_concluded'

def update_status(df, column):
    """Recompute the status column *column* of *df* from the files on disk.

    The column is created with ``'not_concluded'`` when missing and is then
    overwritten, row by row, with the result of
    ``check_and_update_status``. The DataFrame is modified in place.

    Args:
        df: DataFrame whose rows are passed to ``check_and_update_status``.
        column: Name of the status column to (re)write.

    Returns:
        The same DataFrame object, with *column* refreshed.
    """
    column_missing = column not in df.columns
    if column_missing:
        # Create the column first so it exists while the rows are evaluated
        df[column] = 'not_concluded'

    row_statuses = df.apply(check_and_update_status, axis=1)
    df[column] = row_statuses

    print("Status updated based on existing files")
    return df

## Swiss Model

#### Update .csv with files

In [None]:
# check_and_update_status() reads PDB_PATH as a global, so it has to be bound before update_status() is called.
PDB_PATH = "../../data/pdb/swiss_model"  # Folder probed for Swiss Model .pdb files
df = pd.read_csv(f'{CSV_PATH}/fasta_variant.csv', sep=';')  # Load the semicolon-separated variant table
df = update_status(df, "swiss_model")  # Rewrite 'swiss_model': 'concluded' if the variant's .pdb exists under PDB_PATH, else 'not_concluded'
print(df['swiss_model'].value_counts())  # Show how many rows carry each status
df.to_csv(f'{CSV_PATH}/fasta_variant.csv', index=False, sep=';')  # Overwrite the CSV in place with the refreshed column

## Colab AlphaFold2


#### Extract PDB files from Colab AlphaFold2 zip files

In [None]:
# Unpack every .zip found directly in input_dir and collect the .pdb files it
# contains into extract_dir. Each archive is deleted once it has been processed.
extract_dir = '../../data/pdb/colab_alphafold2'  # Destination for the extracted PDB files

os.makedirs(extract_dir, exist_ok=True)  # Create the destination if it does not exist

zip_files = glob.glob(os.path.join(input_dir, '*.zip'))  # Archives to process (top level of input_dir only)
temp_dir = os.path.join(input_dir, 'temp')  # Scratch directory reused for every archive

for zip_file in zip_files:
    # Start from an empty scratch directory: leftovers from an aborted run
    # would otherwise be picked up as if they belonged to this archive.
    if os.path.isdir(temp_dir):
        shutil.rmtree(temp_dir)
    os.makedirs(temp_dir, exist_ok=True)

    try:
        with zipfile.ZipFile(zip_file, 'r') as zip_ref:
            zip_ref.extractall(temp_dir)

        for root, _, files in os.walk(temp_dir):
            for file in files:
                if file.endswith('.pdb'):
                    dest_path = os.path.join(extract_dir, file)
                    # Never overwrite a PDB that was extracted earlier
                    if not os.path.exists(dest_path):
                        shutil.move(os.path.join(root, file), dest_path)
    finally:
        # Always drop the scratch directory, even when extraction fails;
        # the archive itself is kept in that case so the run can be retried.
        shutil.rmtree(temp_dir)

    os.remove(zip_file)  # The archive is fully processed

print("Extraction complete.")

#### Update .csv with files

In [None]:
# check_and_update_status() reads PDB_PATH as a global, so it has to be bound before update_status() is called.
PDB_PATH = "../../data/pdb/colab_alphafold2"  # Folder probed for Colab AlphaFold2 .pdb files
df = pd.read_csv(f'{CSV_PATH}/fasta_variant.csv', sep=';')  # Load the semicolon-separated variant table
df = update_status(df, "colab_alphafold2")  # Rewrite 'colab_alphafold2' from the files present under PDB_PATH
print(df['colab_alphafold2'].value_counts())  # Show how many rows carry each status
df.to_csv(f'{CSV_PATH}/fasta_variant.csv', index=False, sep=';')  # Overwrite the CSV in place with the refreshed column

## Phyre2

### Extract fasta files to use as input on Phyre2

In [None]:
def extract_fasta_files(input_dir, output_dir, batch_size=100):
    """Concatenate the FASTA files under *input_dir* into batched files.

    All ``*.fasta`` files found recursively under *input_dir* are processed
    in sorted path order and written to *output_dir* in groups of
    *batch_size*, as ``variant_<first>_<last>.fasta`` where the two numbers
    are the 1-based positions of the first and last file of the batch. A
    newline is appended after each source file's content.

    Source files are deleted only after the batch that contains them has
    been written. Finally the whole *input_dir* tree is removed.

    Args:
        input_dir: Directory searched recursively for ``.fasta`` files.
            It is deleted at the end if it exists.
        output_dir: Directory receiving the batch files; created if needed.
        batch_size: Maximum number of source files per output file.

    Raises:
        ValueError: If *batch_size* is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    os.makedirs(output_dir, exist_ok=True)

    # Sort so that batch contents and numbering are reproducible between runs
    fasta_files = sorted(
        glob.glob(os.path.join(input_dir, '**/*.fasta'), recursive=True)
    )

    for offset in range(0, len(fasta_files), batch_size):
        batch = fasta_files[offset:offset + batch_size]
        start_index = offset + 1
        end_index = offset + len(batch)

        concatenated_content = []
        for fasta_file in batch:
            with open(fasta_file, 'r') as f:
                concatenated_content.extend(f.readlines())
            concatenated_content.append('\n')  # Separator after each file

        output_file = os.path.join(output_dir, f'variant_{start_index}_{end_index}.fasta')
        with open(output_file, 'w') as out_f:
            out_f.writelines(concatenated_content)

        # Delete the sources only once their batch is safely on disk
        for fasta_file in batch:
            os.remove(fasta_file)

    print("Extraction complete.")
    if os.path.isdir(input_dir):  # Nothing to clean up when the folder is absent
        shutil.rmtree(input_dir)

# NOTE: destructive - the processed .fasta files and then the whole input_dir tree are deleted by this call.
extract_fasta_files(input_dir, output_dir)  # Batch the FASTA files from input_dir into output_dir

### Extract PDB files from Phyre2 result

In [None]:
def _read_phyre2_summary(summaryinfo_path):
    """Parse a ``summaryinfo`` file into ``{key: (identifier, variant)}``.

    Lines starting with ``#`` and lines with fewer than four ``|``-separated
    fields are ignored. Field 3 is the key, fields 1 and 2 the pair.
    """
    entries = {}
    with open(summaryinfo_path, 'r') as summary_file:
        for line in summary_file:
            if line.startswith('#'):
                continue
            parts = line.strip().split('|')
            if len(parts) >= 4:
                entries[parts[3].strip()] = (parts[1].strip(), parts[2].strip())
    return entries


def _gene_for_identifier(df, identifier, csv_file_path):
    """Return the ``gene`` of the first row whose ``identifier`` matches."""
    genes = df.loc[df['identifier'] == identifier, 'gene'].values
    if len(genes) == 0:
        # Same exception type as the bare ``.values[0]`` lookup, clearer message
        raise IndexError(
            f"identifier {identifier!r} not found in {csv_file_path}"
        )
    return genes[0]


def extract_pdb_files_from_phyre2(input_dir, csv_path, output_dir):
    """Collect Phyre2 models from result archives and record their status.

    For every ``*.zip`` directly inside *input_dir* the archive is unpacked
    into ``<input_dir>/temp``, its ``summaryinfo`` file (if any) is parsed,
    and each ``*.final.pdb`` whose leading name component appears in the
    summary is moved to ``<output_dir>/<gene>_<variant>.pdb``. The gene is
    looked up in ``<csv_path>/fasta_variant.csv`` through the ``identifier``
    column. The scratch directory is always removed; the archive is deleted
    only after it has been processed without error.

    Afterwards the ``phyre2`` column of the CSV is set to ``'concluded'`` or
    ``'not_concluded'`` for every identifier seen in a summary, depending on
    whether the corresponding output file exists, and the CSV is rewritten.

    Args:
        input_dir: Directory containing the Phyre2 ``.zip`` archives.
        csv_path: Directory containing ``fasta_variant.csv``
            (semicolon-separated, with ``identifier`` and ``gene`` columns).
        output_dir: Directory receiving the renamed PDB files.

    Returns:
        Dict mapping each summary key (field 3 of a summary line) to
        ``"<identifier>_<variant>"``.

    Raises:
        IndexError: If a summary identifier is missing from the CSV.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Read the table once, up front: it is not modified until the end, and a
    # missing CSV is then detected before any archive is touched.
    csv_file_path = os.path.join(csv_path, 'fasta_variant.csv')
    df = pd.read_csv(csv_file_path, sep=';')

    # Identifier and variant are kept as a pair instead of being re-split
    # from "<identifier>_<variant>", which breaks when either contains '_'.
    parsed = {}
    temp_dir = os.path.join(input_dir, 'temp')

    for zip_file in glob.glob(os.path.join(input_dir, '*.zip')):
        if os.path.isdir(temp_dir):  # Drop leftovers from an aborted run
            shutil.rmtree(temp_dir)
        os.makedirs(temp_dir, exist_ok=True)

        try:
            with zipfile.ZipFile(zip_file, 'r') as zip_ref:
                zip_ref.extractall(temp_dir)

            summaryinfo_path = os.path.join(temp_dir, 'summaryinfo')
            if os.path.isfile(summaryinfo_path):
                parsed.update(_read_phyre2_summary(summaryinfo_path))

            for pdb_file in glob.glob(os.path.join(temp_dir, '*.final.pdb')):
                # "<key>.final.pdb" -> "<key>"
                pdb_key = os.path.basename(pdb_file).split('.')[0]
                if pdb_key in parsed:
                    identifier, variant = parsed[pdb_key]
                    gene = _gene_for_identifier(df, identifier, csv_file_path)
                    new_pdb_path = os.path.join(output_dir, f"{gene}_{variant}.pdb")
                    shutil.move(pdb_file, new_pdb_path)
        finally:
            shutil.rmtree(temp_dir)  # Clean up even when processing fails

        os.remove(zip_file)  # Only reached when the archive was processed

    for identifier, variant in parsed.values():
        gene = _gene_for_identifier(df, identifier, csv_file_path)
        model_exists = os.path.isfile(os.path.join(output_dir, f"{gene}_{variant}.pdb"))
        status = 'concluded' if model_exists else 'not_concluded'
        df.loc[df['identifier'] == identifier, 'phyre2'] = status

    df.to_csv(csv_file_path, sep=';', index=False)

    return {
        key: f"{identifier}_{variant}"
        for key, (identifier, variant) in parsed.items()
    }

# Paths for the Phyre2 step. input_dir and output_dir rebind the module-level
# names defined at the top of the notebook, so any later code reading
# output_dir gets the Phyre2 folder.
input_dir = 'input'  # Folder containing the Phyre2 result archives (.zip)
csv_path = '../../data/csv'  # Folder containing fasta_variant.csv
output_dir = '../../data/pdb/phyre2'  # Destination for the renamed Phyre2 PDB files

# Moves the models, deletes the processed archives and rewrites the 'phyre2' column of the CSV.
summary_dict = extract_pdb_files_from_phyre2(input_dir, csv_path, output_dir)  # Keep the returned summary mapping for inspection

#### Update .csv with files

In [None]:
# check_and_update_status() reads PDB_PATH as a global, so it has to be bound before update_status() is called.
# NOTE: update_status() rewrites 'phyre2' for every row, replacing the values written by extract_pdb_files_from_phyre2().
PDB_PATH = "../../data/pdb/phyre2"  # Folder probed for Phyre2 .pdb files
df = pd.read_csv(f'{CSV_PATH}/fasta_variant.csv', sep=';')  # Load the semicolon-separated variant table
df = update_status(df, "phyre2")  # Rewrite 'phyre2' from the files present under PDB_PATH
print(df['phyre2'].value_counts())  # Show how many rows carry each status
df.to_csv(f'{CSV_PATH}/fasta_variant.csv', index=False, sep=';')  # Overwrite the CSV in place with the refreshed column

## Blast

#### Update blast column

In [None]:
def check_and_update_blast_status(row):
    """Report whether a Blast model file exists for the gene stored in *row*.

    The file looked up is ``<PDB_PATH>/<gene>.pdb``. ``PDB_PATH`` is read
    from the module-level global at call time, so it must be set beforehand.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with a ``"gene"`` entry.

    Returns:
        ``'concluded'`` when the file exists, otherwise ``'not_concluded'``.
    """
    gene_name = row["gene"]
    file_path = f"{PDB_PATH}/{gene_name}.pdb"
    print(file_path)  # Trace the path being probed
    return 'concluded' if os.path.isfile(file_path) else 'not_concluded'

# check_and_update_blast_status() reads PDB_PATH as a global, so it has to be bound before df.apply() runs.
PDB_PATH = "../../data/pdb/blast"  # Folder probed for Blast .pdb files (one per gene)
df = pd.read_csv(f'{CSV_PATH}/fasta_wild.csv', sep=';')  # Load the semicolon-separated wild-type table (not fasta_variant.csv)
if "blast" not in df.columns:  # Make sure the column exists before the rows are evaluated
    df["blast"] = 'not_concluded'  # Default status for a freshly created column
df["blast"] = df.apply(check_and_update_blast_status, axis=1)  # Rewrite 'blast' for every row from the files on disk
print("Status updated based on existing files")  # Progress message
print(df['blast'].value_counts())  # Show how many rows carry each status

df.to_csv(f'{CSV_PATH}/fasta_wild.csv', index=False, sep=';')  # Overwrite the CSV in place with the refreshed column

## AlphaFold3

In [None]:
def process_pdb_files():
    """Move AlphaFold3 PDB files out of ``input_dir`` and purge the rest.

    Every ``*.pdb`` directly inside the module-level ``input_dir`` is moved
    to ``../../data/pdb/alphafold3`` (resolved against the current working
    directory), with ``"_p."`` replaced by ``"_"`` and ``"_model0"`` removed
    from its name. Afterwards every remaining entry of ``input_dir`` that
    matches ``*.*`` and does not end in ``.pdb`` is deleted; directories are
    left untouched.
    """
    destination_folder = os.path.abspath("../../data/pdb/alphafold3")
    os.makedirs(destination_folder, exist_ok=True)

    for pdb_file in glob.glob(os.path.join(input_dir, "*.pdb")):
        new_filename = os.path.basename(pdb_file).replace("_p.", "_").replace("_model0", "")
        destination_file = os.path.join(destination_folder, new_filename)

        shutil.move(pdb_file, destination_file)
        print(f"Moved file: {pdb_file} to {destination_file}")

    for file in glob.glob(os.path.join(input_dir, "*.*")):
        # "*.*" also matches directories with a dot in their name;
        # os.remove() cannot delete those, so skip them.
        if file.endswith(".pdb") or os.path.isdir(file):
            continue
        os.remove(file)
        print(f"Deleted file: {file}")

# NOTE: destructive - after the .pdb files are moved, the other "*.*" entries of input_dir are deleted.
process_pdb_files()  # Move the AlphaFold3 PDB files out of input_dir

#### Update .csv with files

In [None]:
# check_and_update_status() reads PDB_PATH as a global, so it has to be bound before update_status() is called.
PDB_PATH = "../../data/pdb/alphafold3"  # Folder probed for AlphaFold3 .pdb files
df = pd.read_csv(f'{CSV_PATH}/fasta_variant.csv', sep=';')  # Load the semicolon-separated variant table
df = update_status(df, "alphafold3")  # Rewrite 'alphafold3' from the files present under PDB_PATH
print(df['alphafold3'].value_counts())  # Show how many rows carry each status
df.to_csv(f'{CSV_PATH}/fasta_variant.csv', index=False, sep=';')  # Overwrite the CSV in place with the refreshed column

## Modeller

#### Update .csv with files

In [None]:
# Update the 'modeller' column of fasta_variant.csv from the files on disk.
# check_and_update_status() reads PDB_PATH as a global, so it has to be bound before update_status() is called.
PDB_PATH = "../../data/pdb/modeller"  # Folder probed for Modeller .pdb files
df = pd.read_csv(f'{CSV_PATH}/fasta_variant.csv', sep=';')  # Load the semicolon-separated variant table
df = update_status(df, "modeller")  # Rewrite 'modeller' from the files present under PDB_PATH
print(df['modeller'].value_counts())  # Show how many rows carry each status
df.to_csv(f'{CSV_PATH}/fasta_variant.csv', index=False, sep=';')  # Overwrite the CSV in place with the refreshed column

## Clean up

In [None]:
def rename_files(path):
    """Strip the ``"_p."`` marker from every file name under *path*.

    The tree rooted at *path* is walked recursively and each file whose
    name contains ``"_p."`` is renamed in place with ``"_p."`` replaced by
    ``"_"``. Files whose name would not change are left alone and are not
    reported.

    Args:
        path: Root directory of the tree to process.
    """
    for root, _dirs, files in os.walk(path):
        for file in files:
            new_filename = file.replace("_p.", "_")
            if new_filename == file:
                # Nothing to rename; avoid a no-op rename and a misleading message
                continue
            os.rename(os.path.join(root, file), os.path.join(root, new_filename))
            print(f"Renamed file: {file} to {new_filename}")

# Normalise file names across the whole data tree so they match the "<variant with '_p.' -> '_'>.pdb" names probed by check_and_update_status().
rename_files("../../data")  # Call the function to rename files