Filter BLASTp results to identify Reciprocal Best Hits (RBHs)

In [7]:
import pandas as pd
import os

# Evaluate the current working directory; as the last expression in a
# notebook cell its value is displayed as the cell output.
os.getcwd()

ModuleNotFoundError: No module named 'pandas'

In [None]:
def _best_hit_per_query(blast_path, col_names):
    """Return a {query: subject} dict holding the single best hit for each query."""
    # Read the identifier columns as strings. With dtype inference, purely
    # numeric IDs (e.g. "001") would be converted to integers, which corrupts
    # them in the output and can make the reciprocal comparison miss matches
    # when a column ends up with mixed int/str values.
    hits = pd.read_csv(
        blast_path,
        sep="\t",
        names=col_names,
        dtype={"query": str, "subject": str},
    )
    # Best hit = lowest e-value; ties are broken by the highest bit score.
    ranked = hits.sort_values(
        by=["query", "evalue", "bit_score"],
        ascending=[True, True, False],
    )
    top_hits = ranked.drop_duplicates(subset=["query"], keep="first")
    return dict(zip(top_hits["query"], top_hits["subject"]))


def find_reciprocal_hits(file1, file2, output_rbh, output_rbh_no, species_a, species_b):
    """Identify reciprocal best hits (RBHs) between two BLASTP result tables.

    Both inputs are read as headerless, tab-separated tables with twelve
    columns (presumably BLAST tabular output, ``-outfmt 6`` -- confirm against
    the BLAST command used). For every query only the best hit is kept: the
    lowest e-value, with ties broken by the highest bit score. A pair
    (query, subject) from ``file1`` is an RBH when the best hit of ``subject``
    in ``file2`` is that same ``query``.

    Args:
        file1: Path to the BLASTP results whose queries are reported under
            the ``species_a`` column.
        file2: Path to the BLASTP results for the reverse search.
        output_rbh: Path of the tab-separated file to write the RBH pairs to.
        output_rbh_no: Path of the tab-separated file listing the ``file1``
            queries whose best hit is not reciprocated.
        species_a: Column header used for the ``file1`` queries.
        species_b: Column header used for the ``file1`` subjects.

    Returns:
        None. The two output files are written and a message is printed
        for each.
    """
    col_names = ["query", "subject", "perc_identity", "alignment_length",
                 "mismatches", "gap_opens", "q_start", "q_end",
                 "s_start", "s_end", "evalue", "bit_score"]

    # Dictionaries give constant-time lookup of each query's best subject.
    best_hits1_dict = _best_hit_per_query(file1, col_names)
    best_hits2_dict = _best_hit_per_query(file2, col_names)

    # Find reciprocal best hits
    rbh = []
    rbh_no = []  # to store queries without an RBH
    for query, subject in best_hits1_dict.items():
        if best_hits2_dict.get(subject) == query:
            rbh.append((query, subject))
        else:
            rbh_no.append(query)

    # Save the results with RBH
    rbh_df = pd.DataFrame(rbh, columns=[f"{species_a}", f"{species_b}"])
    rbh_df.to_csv(output_rbh, sep="\t", index=False)
    print(f"Reciprocal best hits saved to {output_rbh}")

    # Save the queries with no reciprocal match
    rbh_no_df = pd.DataFrame(rbh_no, columns=[f"{species_a}"])
    rbh_no_df.to_csv(output_rbh_no, sep="\t", index=False)
    print(f"Queries from {species_a} with no reciprocal match saved to {output_rbh_no}")

In [None]:
# Input/output locations and the species labels used as column headers.
# All files live in the same output directory, so it is spelled out once.
_output_dir = "/cluster/tufts/dopmanlab/Jacob/onub_ortholog_id/output"

# BLASTP result tables (one per search direction)
file1 = f"{_output_dir}/Onub2Dmel_blastp_v2.txt"
file2 = f"{_output_dir}/Dmel2Onub_blastp_v2.txt"

# Destinations for the RBH pairs and for the queries without an RBH
output_rbh = f"{_output_dir}/OnubDmel_rbh_v2.txt"
output_rbh_no = f"{_output_dir}/OnubDmel_rbh_no_v2.txt"

species_a, species_b = "ostrinia_nubilalis", "drosophila_melanogaster"

In [None]:
# Report how many lines each BLASTP input file contains.
for file_number, blast_path in enumerate((file1, file2), start=1):
    with open(blast_path, "r") as handle:
        line_count = sum(1 for _ in handle)
    print(f"Number of lines in file {file_number}: {line_count}")

In [None]:
# Run the RBH search on the configured inputs and write both output tables.
find_reciprocal_hits(
    file1=file1,
    file2=file2,
    output_rbh=output_rbh,
    output_rbh_no=output_rbh_no,
    species_a=species_a,
    species_b=species_b,
)

In [None]:
# Sanity check: report how many lines each output table contains.
# (The original cell opened `output_file`, a name that is never defined, and
# labelled the count as "file 2"; the outputs are `output_rbh` and
# `output_rbh_no`.)
for label, result_path in (("RBH output", output_rbh),
                           ("no-RBH output", output_rbh_no)):
    with open(result_path, "r") as handle:
        total_line = sum(1 for _ in handle)
    # Each output file starts with one header line written by to_csv.
    print(f"Number of lines in {label} (including header): {total_line}")