In [37]:
import pandas as pd

import os
from Bio import SeqIO

In [42]:
def files_without_extension(directory, exclude=("AMRProt",)):
    """Return the names of regular files in ``directory`` that have no extension.

    A file counts as "without extension" when its name contains no '.'
    character at all. Sub-directories are ignored.

    Parameters
    ----------
    directory : str or path-like
        Directory to scan (not recursive).
    exclude : str or iterable of str, optional
        File names to skip even if they have no extension. Defaults to
        ``("AMRProt",)``, the single name that used to be hard-coded here
        (presumably the protein database, which is unwanted when collecting
        nucleotide sequences -- confirm against the database layout).

    Returns
    -------
    list of str
        Matching file names (not full paths), sorted so that the result does
        not depend on the arbitrary order returned by ``os.listdir``.
    """
    # Accept a bare string as a single name instead of a set of characters.
    if isinstance(exclude, str):
        exclude = (exclude,)
    excluded = set(exclude)

    return sorted(
        name
        for name in os.listdir(directory)
        if name not in excluded
        and '.' not in name
        and os.path.isfile(os.path.join(directory, name))
    )

In [43]:
def import_fasta_file(file_path):
    """Read a FASTA file into a ``{record id: sequence string}`` dictionary.

    Loading is best-effort: any exception raised while opening or parsing the
    file is reported with ``print`` and whatever was read up to that point
    (possibly nothing) is returned. Records sharing an id overwrite each
    other, so the last one in the file wins.

    Parameters
    ----------
    file_path : str or path-like
        Path of the FASTA file to read.

    Returns
    -------
    dict
        Maps each record's ``id`` to its sequence converted with ``str``.
    """
    sequences = {}
    try:
        with open(file_path) as handle:
            for entry in SeqIO.parse(handle, 'fasta'):
                sequences[entry.id] = str(entry.seq)
    except Exception as err:
        # Deliberately broad: an unreadable file is reported, not fatal.
        print("Error importing file {}: {}".format(file_path, err))
    return sequences

In [55]:
def join_dicts(list_of_dicts):
    """Merge several dictionaries into one new dictionary.

    The input dictionaries are not modified. When a key occurs in more than
    one of them, the value from the dictionary appearing latest in
    ``list_of_dicts`` is kept.

    Parameters
    ----------
    list_of_dicts : iterable of dict
        Dictionaries to merge, in priority order (last wins).

    Returns
    -------
    dict
        The merged dictionary; empty when no dictionaries are given.
    """
    return {
        key: value
        for mapping in list_of_dicts
        for key, value in mapping.items()
    }

In [56]:
def import_all_nuclotide(directory_path):
    """Load every extension-less FASTA file of a directory into one dictionary.

    The files to read are chosen by ``files_without_extension``; each one is
    parsed with ``import_fasta_file`` and the per-file dictionaries are then
    combined with ``join_dicts``.

    Parameters
    ----------
    directory_path : str or path-like
        Directory containing the FASTA files.

    Returns
    -------
    dict
        Combined ``{record id: sequence}`` mapping across all files read.
    """
    per_file_records = [
        import_fasta_file(os.path.join(directory_path, name))
        for name in files_without_extension(directory_path)
    ]
    return join_dicts(per_file_records)

In [70]:
def search_dict_by_key_containing_string(dictionary, search_string):
    """Return the entries of ``dictionary`` whose key contains ``search_string``.

    Each match is stored under its own (full) key, so several matching keys
    yield several entries and the length of the result is the number of
    matches. Storing every match under ``search_string`` instead would make
    matches overwrite one another and cap the result at a single entry.

    Parameters
    ----------
    dictionary : dict
        Mapping with string keys to search.
    search_string : str
        Substring to look for in each key (case-sensitive ``in`` test).

    Returns
    -------
    dict
        ``{matching key: value}`` for every match, in the iteration order of
        ``dictionary``; empty when nothing matches.
    """
    return {
        key: value
        for key, value in dictionary.items()
        if search_string in key
    }

In [86]:
def write_fasta_dict_to_file(fasta_dict, output_file):
    """Write a ``{header: sequence}`` dictionary to a FASTA file.

    Each entry becomes a ``>header`` line followed by the sequence on a
    single line (no wrapping). An existing file at ``output_file`` is
    overwritten.

    Parameters
    ----------
    fasta_dict : dict
        Maps FASTA headers (without the leading '>') to sequences.
    output_file : str or path-like
        Destination path, opened in text write mode.
    """
    with open(output_file, "w") as fasta_out:
        fasta_out.writelines(
            f">{header}\n{sequence}\n"
            for header, sequence in fasta_dict.items()
        )

In [89]:
def get_acc_bulk(directory):
    """Collect the unique closest-sequence accessions from a directory of tables.

    Every regular file directly inside ``directory`` is read as a
    tab-separated table and the values of its
    'Accession of closest sequence' column are gathered.

    Parameters
    ----------
    directory : str or path-like
        Directory holding the tab-separated result files. Sub-directories
        are skipped; every regular file is read, so the directory should
        contain only result tables.

    Returns
    -------
    list of str
        Sorted, de-duplicated accessions. Missing values are left out.

    Raises
    ------
    KeyError
        If a file lacks the 'Accession of closest sequence' column.
    """
    column = 'Accession of closest sequence'
    unique_acc = set()

    # Sorted so the progress log is in a stable order.
    for filename in sorted(os.listdir(directory)):
        filepath = os.path.join(directory, filename)
        if not os.path.isfile(filepath):
            continue
        print("Processing file:", filename)
        amrf_out = pd.read_csv(filepath, sep='\t')
        # pandas parses cells such as "NA" as NaN (a float). Drop them and
        # force str so callers can safely use the values in substring tests.
        unique_acc.update(amrf_out[column].dropna().astype(str))

    # A set has no stable order across runs; sort for reproducible output.
    return sorted(unique_acc)

In [99]:
def get_nuc_seqs_amrfinder(DB_Path, AMRF_OUT_DIR, Output_Fasta_filename):
    """Extract the database sequences referenced by a set of result tables.

    Steps:
      1. load all sequences found under ``DB_Path`` (``import_all_nuclotide``);
      2. collect the unique accessions listed in the tables under
         ``AMRF_OUT_DIR`` (``get_acc_bulk``);
      3. look each accession up among the loaded sequence ids
         (``search_dict_by_key_containing_string``);
      4. print a summary and write everything found to
         ``Output_Fasta_filename`` (``write_fasta_dict_to_file``).

    Parameters
    ----------
    DB_Path : str or path-like
        Directory containing the database FASTA files.
    AMRF_OUT_DIR : str or path-like
        Directory containing the tab-separated result tables.
    Output_Fasta_filename : str or path-like
        Path of the FASTA file to write.

    Returns
    -------
    None
        Results are written to disk; progress and a summary are printed.
    """
    fnai_dict = import_all_nuclotide(DB_Path)
    acc_list = get_acc_bulk(AMRF_OUT_DIR)

    result_ls = []
    for acc in acc_list:
        # Search the dictionary loaded above, not a notebook-level global, so
        # the function also works on a fresh kernel.
        result = search_dict_by_key_containing_string(fnai_dict, acc)
        result_ls.append(result)
        if len(result) == 0:
            print('No result found')
        elif len(result) > 1:
            print('more than one result found')
    result_dict = join_dicts(result_ls)

    # Compare the number of accessions requested with the number of sequences
    # collected to summarise how the lookup went.
    if len(acc_list) == len(result_dict):
        print('Finished Correctly, all sequences found')
    elif len(acc_list) < len(result_dict):
        print('Somthing went wrong, some sequences were duplicated')
    else:
        print('Somthing went wrong, some sequences not found')

    write_fasta_dict_to_file(result_dict, Output_Fasta_filename)

In [100]:
# directory_path is the database directory the sequences are loaded from.
# output_dir is passed as AMRF_OUT_DIR, i.e. it is the directory the result
# tables are read from; the FASTA is written to 'testing_output.fasta'.
directory_path = "ARG_Databases/AMRFinder_db/amrfinder_db_down/latest/"
output_dir = 'testing_outputs/AMRFINDER/'
get_nuc_seqs_amrfinder(
    DB_Path=directory_path,
    AMRF_OUT_DIR=output_dir,
    Output_Fasta_filename='testing_output.fasta',
)

Processing file: barcode01_AMRFinder.tsv
Processing file: barcode02_AMRFinder.tsv
Processing file: barcode03_AMRFinder.tsv
Processing file: barcode04_AMRFinder.tsv
Processing file: barcode05_AMRFinder.tsv
Processing file: barcode06_AMRFinder.tsv
Processing file: barcode07_AMRFinder.tsv
Processing file: barcode08_AMRFinder.tsv
Processing file: barcode09_AMRFinder.tsv
Processing file: barcode10_AMRFinder.tsv
Processing file: barcode11_AMRFinder.tsv
Processing file: barcode12_AMRFinder.tsv
Finished Correctly, all sequences found
