# Install Required Libraries
Use pip to install the necessary libraries, including pandas, pyhmmer, and glob2.

In [None]:
# Install the third-party packages this notebook depends on.
# NOTE(review): glob2 is installed here, but the import cell below uses the
# standard-library `glob` module — confirm whether glob2 is actually needed.
!pip install pandas pyhmmer glob2

# Import Required Libraries
Import the necessary libraries, including pandas, pyhmmer, glob, and collections.

In [None]:
# Import Required Libraries
import pandas as pd
import pyhmmer
import glob
import pyhmmer.easel as easel
import collections

# Define Helper Functions
Define the `retrieve_hits` function, which loads the protein sequences from a file, searches them with the given HMMs, and returns the included hits as a pandas DataFrame.

In [None]:
# Define Helper Functions
def retrieve_hits(seqs_path, hmms, fields=("query", "subject", "bitscore", "evalue")):
    """Search a protein sequence file with a set of HMMs and tabulate the hits.

    Parameters
    ----------
    seqs_path : str
        Path to the sequence file; it is read in digital mode with the
        amino-acid alphabet.
    hmms : iterable
        Query HMMs handed to ``pyhmmer.hmmsearch``.
    fields : sequence of str, optional
        Four column names for the output table. They are filled, in order,
        with: the name of the hit sequence, the name of the HMM that produced
        the hit, the hit score, and the hit E-value. With the default names
        this means ``query`` holds the hit sequence name and ``subject`` holds
        the HMM name.

    Returns
    -------
    pandas.DataFrame
        One row per included hit, with ``fields`` as columns. The frame has
        no rows when nothing was included.
    """

    def _as_text(value):
        # Names are decoded when they arrive as bytes; str values pass through
        # unchanged so the function works either way.
        # NOTE(review): confirm which type the installed pyhmmer returns.
        return value.decode() if isinstance(value, bytes) else value

    # Materialise once so the same list feeds both the namedtuple and pandas.
    columns = list(fields)
    Result = collections.namedtuple("Result", columns)

    # Load every sequence of the file into memory as a single block.
    with pyhmmer.easel.SequenceFile(seqs_path, digital=True, alphabet=easel.Alphabet.amino()) as seqs_file:
        proteins = seqs_file.read_block()

    # Run the HMMs; each iteration yields the hits of one query HMM.
    results = []
    for hits in pyhmmer.hmmsearch(hmms, proteins, E=1):
        # Prefer the query object when the installed pyhmmer exposes it and
        # fall back to the `query_name` attribute otherwise (the pip cell does
        # not pin a pyhmmer version).
        query = getattr(hits, "query", None)
        cog = _as_text(query.name if query is not None else hits.query_name)
        results.extend(
            Result(_as_text(hit.name), cog, hit.score, hit.evalue)
            for hit in hits
            if hit.included
        )

    # A list of namedtuples maps directly onto rows; passing `columns` keeps
    # the header even when there are no hits.
    return pd.DataFrame(results, columns=columns)

# Load HMMs
Mount Google Drive, then find and load every `.hmm` file in the HMM directory into a list of HMMs.

In [None]:
# Load HMMs
# Mount Google Drive so the data files under /content/drive are reachable.
from google.colab import drive
drive.mount('/content/drive')

# Find and load a collection of HMMs.
# glob returns paths in arbitrary order, so the paths are sorted to give HMMS
# the same order on every run.
HMMS = []
for fil in sorted(glob.glob('/content/drive/MyDrive/data/hmms_nifHDK/*.hmm')):
    with pyhmmer.plan7.HMMFile(fil) as hmm_file:
        # NOTE(review): read() is called once per file — confirm that each
        # .hmm file holds a single model.
        HMMS.append(hmm_file.read())
# Display the loaded HMMs as the cell output.
HMMS

# Retrieve Hits for First Dataset
Use the retrieve_hits function to process the first dataset and display the results.

In [None]:
# Search the first dataset with the loaded HMMs, then show the hit table
results = retrieve_hits(
    seqs_path='/content/drive/MyDrive/data/omd2_candidate.faa',
    hmms=HMMS,
)
results

# Retrieve Hits for Second Dataset
Use the retrieve_hits function to process the second dataset and display the results.

In [None]:
# Search the second dataset with the same HMMs, then show the hit table
results_second_dataset = retrieve_hits(
    seqs_path='/content/drive/MyDrive/data/cyanobact.faa',
    hmms=HMMS,
)
results_second_dataset