# Format PIR

### Import libraries and load dependencies

In [None]:
import pandas as pd  # Import pandas for data manipulation and analysis
import os  # Import os for file and directory operations

#### Define global variables

In [None]:
# Data locations, given as relative paths.
# CSV_PATH keeps its trailing slash because it is used as a plain string
# prefix when the CSV file name is appended to it.
CSV_PATH = '../../data/csv/'  # Directory holding the input CSV files
PDB_PATH = "../../data/pdb/blast"  # Directory holding the per-gene PDB files
PIR_PATH = "../../data/pir"  # Directory receiving the generated PIR files

### Get info from homologous protein


In [None]:
# Load the semicolon-separated variant table; the function below reads its
# 'gene', 'variant' and 'fasta' columns.
variant_df = pd.read_csv(CSV_PATH + 'fasta_variant.csv', sep=';')
# Keep this as the last expression of the cell so the notebook renders a preview.
variant_df.head()


In [None]:
def generate_ali_files(df, pdb_path=None, pir_path=None):
    """Write one PIR-formatted alignment file per variant row of *df*.

    For each row, ``<pdb_path>/<gene>.pdb`` is looked up. When it exists,
    ``<pir_path>/<variant>.txt`` is written with a PIR ``sequence`` entry for
    the variant; otherwise the row is skipped and reported on stdout.

    Args:
        df: DataFrame providing the ``gene``, ``variant`` and ``fasta``
            columns.
        pdb_path: Directory containing the ``<gene>.pdb`` files. Defaults to
            the module-level ``PDB_PATH``.
        pir_path: Directory receiving the generated files. Defaults to the
            module-level ``PIR_PATH``. It is created when it does not exist.

    Returns:
        None. Results are the files written and the progress lines printed.
    """
    # Resolve the defaults at call time so the notebook-level constants are
    # still honoured when the caller passes nothing.
    pdb_dir = PDB_PATH if pdb_path is None else pdb_path
    pir_dir = PIR_PATH if pir_path is None else pir_path

    # Without this, the first open() below fails on a fresh checkout where
    # the output directory has not been created yet.
    os.makedirs(pir_dir, exist_ok=True)

    for gene, variant, sequence in zip(df['gene'], df['variant'], df['fasta']):
        # The structure is stored per gene, not per variant.
        pdb_file = os.path.join(pdb_dir, f"{gene}.pdb")
        if not os.path.isfile(pdb_file):
            # Report the gene whose file is actually missing.
            print(f"PDB file for gene {gene} not found; skipping variant {variant}.")
            continue

        # PIR entry: '>P1;<code>' line, then the ten colon-separated header
        # fields (only the first two are filled in), then the sequence
        # terminated by '*'.
        ali_content = (
            f">P1;{variant}\n"
            f"sequence:{variant}:::::::0.00: 0.00\n"
            f"{sequence}*\n"
        )

        # The file keeps the '.txt' extension the rest of the pipeline was
        # written against, even though the content is a PIR alignment.
        ali_file = os.path.join(pir_dir, f"{variant}.txt")
        with open(ali_file, 'w', encoding='utf-8') as out:
            out.write(ali_content)
        print(f"Generated {ali_file}")


In [None]:
# Write one PIR-formatted '<variant>.txt' file into PIR_PATH for every row of
# variant_df whose gene has a PDB file in PDB_PATH; other rows are reported.
generate_ali_files(variant_df)