<h1 style="font-family: 'Times New Roman'; text-align: center; font-weight: bold;">
    BLAST-X
</h1>

<div style="font-family: 'Times New Roman'; font-size: 15px; text-align: justify; width: 100%;">
  <div>
    <span style="display: inline-block; width: 100px;"><b>Date</b></span>: 5<sup>th</sup> October 2024
  </div>
  <div>
    <span style="display: inline-block; width: 100px;"><b>Author</b></span>: Deepan Kanagarajan Babu
  </div>
  <div>
    <span style="display: inline-block; width: 100px;"><b>Description</b></span>: In this document, BLAST-X is run against the CoxL form 1 and CoxL form 2 protein databases. It contains the complete customised code to run BLAST-X and write the output in '.csv', '.docx', and '.fasta' file formats.
  </div>
</div>


<h2 style="font-family: 'Times New Roman'; font-weight: bold;">
    Required Libraries
</h2>

In [1]:
# importing necessary libraries
import os
import pandas as pd
from Bio import SeqIO
import subprocess
import csv
from docx import Document
from docx.shared import RGBColor

<h2 style="font-family: 'Times New Roman'; font-weight: bold;">
    Required Functions
</h2>

In [2]:
# Defining single function to open multiple file formats (.csv, .txt, .fasta or .fastq (to display only 1st 100 lines by default), and .xlsx)
def open_file(file_path, max_lines=100):
    """Load a file, choosing the reader from the file extension.

    Parameters
    ----------
    file_path : str
        Path ending in .csv, .txt, .fasta, .fastq or .xlsx. The extension is
        matched case-insensitively, so e.g. ``sample.FASTA`` is accepted.
    max_lines : int, optional
        Upper bound on the number of lines returned for FASTA/FASTQ files
        (default 100). Ignored for the other formats.

    Returns
    -------
    pandas.DataFrame
        For .csv (``pd.read_csv``) and .xlsx (``pd.read_excel``).
    str
        For .txt: the whole file content.
    list of str
        For .fasta/.fastq: the first ``max_lines`` lines, each stripped of
        surrounding whitespace.

    Raises
    ------
    FileNotFoundError
        If the file does not exist (same message for every supported format).
    ValueError
        If the extension is not one of the supported formats.
    """
    # Lower-case copy used only for suffix matching; the real path is what gets opened.
    lowered_path = file_path.lower()

    try:
        if lowered_path.endswith('.csv'):
            return pd.read_csv(file_path)
        if lowered_path.endswith('.txt'):
            with open(file_path, 'r') as file:
                return file.read()
        if lowered_path.endswith(('.fasta', '.fastq')):
            with open(file_path, 'r') as file:
                # zip() with range() stops after max_lines lines or at end of
                # file, whichever comes first, without reading the rest.
                return [line.strip() for _, line in zip(range(max_lines), file)]
        if lowered_path.endswith('.xlsx'):
            return pd.read_excel(file_path)
    except FileNotFoundError:
        # One uniform message regardless of which reader hit the missing file.
        raise FileNotFoundError(f"The file {file_path} does not exist.")

    raise ValueError("Unsupported file format")

In [3]:
# Defining function to run BlastX
## Define the header for the CSV file with the match scores
csv_header = ["qseqid", "sseqid", "pident", "qcovs", "evalue", "bitscore", "qstart", "qend", "sstart", "send", "length"]

## Tabular output specification passed to -outfmt: the CSV score columns followed
## by the aligned query/subject sequences. It is built from csv_header so the
## requested BLAST columns and the CSV header cannot drift apart.
BLAST_OUTFMT = "6 " + " ".join(csv_header + ["qseq", "sseq"])

## Function to run BLASTx and capture the output
def run_blastx(input_file, protein_db, evalue="1e-6", program="blastx"):
    """Run a BLAST search and return its tab-separated output as text.

    Parameters
    ----------
    input_file : str
        Path passed to ``-query``.
    protein_db : str
        Database name/path passed to ``-db``.
    evalue : str or float, optional
        Threshold passed to ``-evalue`` (default "1e-6").
    program : str, optional
        BLAST executable to invoke (default "blastx"; e.g. blastn, blastp).

    Returns
    -------
    str
        Captured standard output: one line per hit with the columns listed
        in ``BLAST_OUTFMT`` (the ``csv_header`` fields, then qseq and sseq).

    Raises
    ------
    RuntimeError
        If the program exits with a non-zero status. Without this check a
        failed run yields an empty string, which downstream code cannot
        distinguish from a search that found no hits.
    """
    command = [
        program,
        "-query", input_file,
        "-db", protein_db,
        "-outfmt", BLAST_OUTFMT,
        "-evalue", str(evalue),
    ]
    result = subprocess.run(command, capture_output=True, text=True)

    if result.returncode != 0:
        raise RuntimeError(
            f"{program} failed for {input_file} against {protein_db} "
            f"(exit code {result.returncode}): {result.stderr.strip()}"
        )

    return result.stdout

# Defining function to color-code sequences in DOCX format
def color_code_sequence_docx(paragraph, query_seq, subject_seq):
    """Append ``query_seq`` to ``paragraph`` one character per run, coloured
    by how each character compares with the one at the same position in
    ``subject_seq``.

    Colours:
      * green  (0, 128, 0)   - both characters are equal
      * purple (128, 0, 128) - either character is the gap symbol '-'
      * red    (255, 0, 0)   - any other difference

    The sequences are walked in parallel with ``zip``, so positions beyond
    the end of the shorter sequence are not written. Only characters from
    ``query_seq`` are added to the paragraph.
    """
    for residue, partner in zip(query_seq, subject_seq):
        # Equality is tested first, so the gap colour applies only when the
        # two characters differ.
        if residue == partner:
            shade = (0, 128, 0)
        elif '-' in (residue, partner):
            shade = (128, 0, 128)
        else:
            shade = (255, 0, 0)

        colored_run = paragraph.add_run(residue)
        colored_run.font.color.rgb = RGBColor(*shade)

<h2 style="font-family: 'Times New Roman'; font-weight: bold;">
    Required Files and Paths
</h2>

In [4]:
# Specify the input folder paths
## Path to MAGs
mags = 'MAG'

## Paths to the CoxL Form 1 and Form 2 protein databases
## NOTE(review): 'Coxl2' is cased differently from 'CoxL1' - confirm it matches
## the folder name on disk (this matters on case-sensitive file systems).
coxl1_db = 'CoxL1/coxl1_db'
coxl2_db = 'Coxl2/coxl2_db'

# Output Directories
## Form 1 CoxL BLAST results
coxl1_csv_results = "CoxL1_Results/csv"
coxl1_docs_results = "CoxL1_Results/docs"
coxl1_fasta_results = "CoxL1_Results/fasta"

## Form 2 CoxL BLAST results
coxl2_csv_results = "CoxL2_Results/csv"
coxl2_docs_results = "CoxL2_Results/docs"
coxl2_fasta_results = "CoxL2_Results/fasta"

## Ensure every output directory exists (no error if it is already there)
for results_dir in (
    coxl1_csv_results,
    coxl1_docs_results,
    coxl1_fasta_results,
    coxl2_csv_results,
    coxl2_docs_results,
    coxl2_fasta_results,
):
    os.makedirs(results_dir, exist_ok=True)

<h2 style="font-family: 'Times New Roman'; font-weight: bold;">
    Performing BLAST-X
</h2>

In [9]:
# Perform BLAST for Form 1 CoxL proteins
## Loop through each .fasta file in the input directory. The listing is sorted
## because os.listdir returns entries in arbitrary order; sorting makes the
## processing order (and the printed progress) reproducible.
for filename in sorted(os.listdir(mags)):
    if not filename.endswith(".fasta"):
        continue

    input_file = os.path.join(mags, filename)

    ## Remove only the trailing ".fasta"; str.replace would also rewrite any
    ## earlier ".fasta" that happens to occur inside the file name.
    sample_name = filename[:-len(".fasta")]
    ## CSV output path
    csv_output_file = os.path.join(coxl1_csv_results, sample_name + "_scores.csv")
    ## DOCX output path
    docx_output_file = os.path.join(coxl1_docs_results, sample_name + "_blastx.docx")
    ## FASTA output path
    fasta_output_file = os.path.join(coxl1_fasta_results, sample_name + "_alignments.fasta")

    ## Run BLASTx and get the results
    blast_output = run_blastx(input_file, coxl1_db)

    ## Parse the tabular output once: each hit line carries the score columns
    ## named in csv_header followed by the aligned query and subject sequences.
    n_score_columns = len(csv_header)
    score_rows = []
    alignments = []
    for line in blast_output.strip().split("\n"):
        columns = line.split("\t")
        if len(columns) >= n_score_columns + 2:  # Ensure there are enough columns
            score_rows.append(columns[:n_score_columns])
            alignments.append((columns[0], columns[1],
                               columns[n_score_columns], columns[n_score_columns + 1]))

    ## Write the match scores to the CSV file
    with open(csv_output_file, 'w', newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(csv_header)
        csv_writer.writerows(score_rows)

    ## Create a DOCX document
    doc = Document()
    doc.add_heading(f'BLASTx Results for {filename}', level=1)
    doc.add_heading('Sequences producing significant alignments:', level=2)

    ## Add the tabular summary for all matches. The rows are taken from memory
    ## instead of re-reading the CSV file that was just written.
    table = doc.add_table(rows=1, cols=len(csv_header))
    for header_cell, header in zip(table.rows[0].cells, csv_header):
        header_cell.text = header
    for score_row in score_rows:
        for row_cell, value in zip(table.add_row().cells, score_row):
            row_cell.text = value

    ## Add all alignment sequences after the table
    doc.add_heading('Alignments:', level=2)
    for alignment_id, subject_id, qseq, sseq in alignments:
        doc.add_heading(f'Alignment for {alignment_id} vs {subject_id}', level=3)

        ## Split sequences into chunks of 60 characters
        for i in range(0, max(len(qseq), len(sseq)), 60):
            query_chunk = qseq[i:i+60]
            subject_chunk = sseq[i:i+60]

            ## Add Q label in bold black, then the query chunk coloured against the subject
            query_paragraph = doc.add_paragraph()
            q_run = query_paragraph.add_run('Q: ')
            q_run.bold = True
            q_run.font.color.rgb = RGBColor(0, 0, 0)  # Black color
            color_code_sequence_docx(query_paragraph, query_chunk, subject_chunk)

            ## Add S label in bold black, then the subject chunk coloured against the query
            subject_paragraph = doc.add_paragraph()
            s_run = subject_paragraph.add_run('S: ')
            s_run.bold = True
            s_run.font.color.rgb = RGBColor(0, 0, 0)  # Black color
            color_code_sequence_docx(subject_paragraph, subject_chunk, query_chunk)

            doc.add_paragraph('')  # Add space between lines

    ## Save the DOCX document
    doc.save(docx_output_file)

    ## Write query and subject sequences to the FASTA file with proper headings
    with open(fasta_output_file, 'w') as fasta_file:
        for index, (alignment_id, subject_id, qseq, sseq) in enumerate(alignments, start=1):
            fasta_file.write(f">{alignment_id} Query{index}\n{qseq}\n")
            fasta_file.write(f">{subject_id} Subject{index}\n{sseq}\n")

    print(f"BLASTx completed for {input_file}, results saved to {docx_output_file}, {csv_output_file}, and {fasta_output_file}")

BLASTx completed for MAG\Bdellovibrionaceae_bacterium_ERS6626579.fasta, results saved to CoxL1_Results/docs\Bdellovibrionaceae_bacterium_ERS6626579_blastx.docx, CoxL1_Results/csv\Bdellovibrionaceae_bacterium_ERS6626579_scores.csv, and CoxL1_Results/fasta\Bdellovibrionaceae_bacterium_ERS6626579_alignments.fasta
BLASTx completed for MAG\Bdellovibrionaceae_bacterium_ERS6626826.fasta, results saved to CoxL1_Results/docs\Bdellovibrionaceae_bacterium_ERS6626826_blastx.docx, CoxL1_Results/csv\Bdellovibrionaceae_bacterium_ERS6626826_scores.csv, and CoxL1_Results/fasta\Bdellovibrionaceae_bacterium_ERS6626826_alignments.fasta
BLASTx completed for MAG\Burkholderiaceae_bacterium_ERS6626828.fasta, results saved to CoxL1_Results/docs\Burkholderiaceae_bacterium_ERS6626828_blastx.docx, CoxL1_Results/csv\Burkholderiaceae_bacterium_ERS6626828_scores.csv, and CoxL1_Results/fasta\Burkholderiaceae_bacterium_ERS6626828_alignments.fasta
BLASTx completed for MAG\Burkholderiaceae_bacterium_ERS6626861.fasta, re

In [6]:
# Perform BLAST for Form 2 CoxL proteins
## Loop through each .fasta file in the input directory. The listing is sorted
## because os.listdir returns entries in arbitrary order; sorting makes the
## processing order (and the printed progress) reproducible.
for filename in sorted(os.listdir(mags)):
    if not filename.endswith(".fasta"):
        continue

    input_file = os.path.join(mags, filename)

    ## Remove only the trailing ".fasta"; str.replace would also rewrite any
    ## earlier ".fasta" that happens to occur inside the file name.
    sample_name = filename[:-len(".fasta")]
    ## CSV output path
    csv_output_file = os.path.join(coxl2_csv_results, sample_name + "_scores.csv")
    ## DOCX output path
    docx_output_file = os.path.join(coxl2_docs_results, sample_name + "_blastx.docx")
    ## FASTA output path
    fasta_output_file = os.path.join(coxl2_fasta_results, sample_name + "_alignments.fasta")

    ## Run BLASTx and get the results
    blast_output = run_blastx(input_file, coxl2_db)

    ## Parse the tabular output once: each hit line carries the score columns
    ## named in csv_header followed by the aligned query and subject sequences.
    n_score_columns = len(csv_header)
    score_rows = []
    alignments = []
    for line in blast_output.strip().split("\n"):
        columns = line.split("\t")
        if len(columns) >= n_score_columns + 2:  # Ensure there are enough columns
            score_rows.append(columns[:n_score_columns])
            alignments.append((columns[0], columns[1],
                               columns[n_score_columns], columns[n_score_columns + 1]))

    ## Write the match scores to the CSV file
    with open(csv_output_file, 'w', newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(csv_header)
        csv_writer.writerows(score_rows)

    ## Create a DOCX document
    doc = Document()
    doc.add_heading(f'BLASTx Results for {filename}', level=1)
    doc.add_heading('Sequences producing significant alignments:', level=2)

    ## Add the tabular summary for all matches. The rows are taken from memory
    ## instead of re-reading the CSV file that was just written.
    table = doc.add_table(rows=1, cols=len(csv_header))
    for header_cell, header in zip(table.rows[0].cells, csv_header):
        header_cell.text = header
    for score_row in score_rows:
        for row_cell, value in zip(table.add_row().cells, score_row):
            row_cell.text = value

    ## Add all alignment sequences after the table
    doc.add_heading('Alignments:', level=2)
    for alignment_id, subject_id, qseq, sseq in alignments:
        doc.add_heading(f'Alignment for {alignment_id} vs {subject_id}', level=3)

        ## Split sequences into chunks of 60 characters
        for i in range(0, max(len(qseq), len(sseq)), 60):
            query_chunk = qseq[i:i+60]
            subject_chunk = sseq[i:i+60]

            ## Add Q label in bold black, then the query chunk coloured against the subject
            query_paragraph = doc.add_paragraph()
            q_run = query_paragraph.add_run('Q: ')
            q_run.bold = True
            q_run.font.color.rgb = RGBColor(0, 0, 0)  # Black color
            color_code_sequence_docx(query_paragraph, query_chunk, subject_chunk)

            ## Add S label in bold black, then the subject chunk coloured against the query
            subject_paragraph = doc.add_paragraph()
            s_run = subject_paragraph.add_run('S: ')
            s_run.bold = True
            s_run.font.color.rgb = RGBColor(0, 0, 0)  # Black color
            color_code_sequence_docx(subject_paragraph, subject_chunk, query_chunk)

            doc.add_paragraph('')  # Add space between lines

    ## Save the DOCX document
    doc.save(docx_output_file)

    ## Write query and subject sequences to the FASTA file with proper headings
    with open(fasta_output_file, 'w') as fasta_file:
        for index, (alignment_id, subject_id, qseq, sseq) in enumerate(alignments, start=1):
            fasta_file.write(f">{alignment_id} Query{index}\n{qseq}\n")
            fasta_file.write(f">{subject_id} Subject{index}\n{sseq}\n")

    print(f"BLASTx completed for {input_file}, results saved to {docx_output_file}, {csv_output_file}, and {fasta_output_file}")

BLASTx completed for MAG\Bdellovibrionaceae_bacterium_ERS6626579.fasta, results saved to CoxL2_Results/docs\Bdellovibrionaceae_bacterium_ERS6626579_blastx.docx, CoxL2_Results/csv\Bdellovibrionaceae_bacterium_ERS6626579_scores.csv, and CoxL2_Results/fasta\Bdellovibrionaceae_bacterium_ERS6626579_alignments.fasta
BLASTx completed for MAG\Bdellovibrionaceae_bacterium_ERS6626826.fasta, results saved to CoxL2_Results/docs\Bdellovibrionaceae_bacterium_ERS6626826_blastx.docx, CoxL2_Results/csv\Bdellovibrionaceae_bacterium_ERS6626826_scores.csv, and CoxL2_Results/fasta\Bdellovibrionaceae_bacterium_ERS6626826_alignments.fasta
BLASTx completed for MAG\Burkholderiaceae_bacterium_ERS6626828.fasta, results saved to CoxL2_Results/docs\Burkholderiaceae_bacterium_ERS6626828_blastx.docx, CoxL2_Results/csv\Burkholderiaceae_bacterium_ERS6626828_scores.csv, and CoxL2_Results/fasta\Burkholderiaceae_bacterium_ERS6626828_alignments.fasta
BLASTx completed for MAG\Burkholderiaceae_bacterium_ERS6626861.fasta, re