# Daisy Algorithm for ImmeDB results

1. For a new row of '20230614_AMR in TE_Abricate-report' googlesheet, open the ImmeDB results for the strain that corresponds to the row. 
2. In the ImmeDB result file, search hits with "q.start" value smaller than "START" value of Abricate (Column "C" in '20230614_AMR in TE_Abricate-report'; mobile genetic element region should start prior to antibiotic resistance gene). 
3. Among those, search hits with "q.end" value larger than "END" value of Abricate (Column "D" in '20230614_AMR in TE_Abricate-report'; mobile genetic element region should end after the antibiotic resistance gene).
4. For a hit that satisfies both 2&3, get "subject acc.ver" in ImmeDB results file (ex; NZ_NMTU01000006.1:71946-83411).
5. Search the "subject acc.ver" in "Data1_MGE_sequences.fasta" file to figure out the name of the hit (ex;  IMEs459). 
    - If there are multiple hits, any ICE elements take priority to document. 
    - If there's no hit, put 'na' in column O (ImmeDB values) of '20230614_AMR in TE_Abricate-report' googlesheet
6. Copy the name of the hit to column O (ImmeDB values) of '20230614_AMR in TE_Abricate-report' googlesheet.
7. Copy the corresponding information from the row in the ImmeDB result file (from column B-P) and paste it to column P-AD of '20230614_AMR in TE_Abricate-report' googlesheet. 

In [1]:
import pandas as pd
from cloudpathlib import AnyPath

In [2]:
abricate_file = AnyPath("~/Downloads/20230614_AMR-Abricate_report.csv").expanduser()

abricate_df = (
    pd.read_csv(
        abricate_file,
        sep=",",
        header=0,
        index_col=False,
        dtype=str,
        usecols=set(range(0, 14)),
    )
    .rename(columns={"#FILE": "FILE"})
    .sort_values(by=["FILE"])
)

abricate_df.head()

Unnamed: 0,FILE,SEQUENCE,START,END,STRAND,GENE,COVERAGE,COVERAGE_MAP,GAPS,%COVERAGE,%IDENTITY,DATABASE,ACCESSION,PRODUCT
0,./Inputfasta/SH0001344-00329.fasta,scaffold_Scaffolds_0.cp.600,3402126,3404045,+,tet(O),1-1920/1920,===============,0/0,100.0,99.9,ncbi,NG_048267.1,tetracycline resistance ribosomal protection p...
1,./Inputfasta/SH0001365-00245.fasta,1,43127,44353,+,mef(A),1-1227/1227,===============,0/0,100.0,97.64,ncbi,NG_047970.1,macrolide efflux MFS transporter Mef(A)
2,./Inputfasta/SH0001372-00039.fasta,1,3220448,3222364,-,tet(W),1-1917/1923,===============,0/0,99.69,96.19,ncbi,NG_048295.1,tetracycline resistance ribosomal protection p...
3,./Inputfasta/SH0001373-00076.fasta,1,670898,671463,+,catA16,95-660/660,..=============,0/0,85.76,99.29,ncbi,NG_047591.1,type A-16 chloramphenicol O-acetyltransferase ...
4,./Inputfasta/SH0001404-00077.fasta,scaffold_Scaffolds_0.cp.600,1387439,1389355,-,tet(W),1-1917/1923,===============,0/0,99.69,97.91,ncbi,NG_048295.1,tetracycline resistance ribosomal protection p...


In [3]:
immedb_file = AnyPath(
    "s3://genomics-workflow-core/scratch/sunitj/daisy/immeDB/Data1_MGE_sequences.annotations.csv"
)
immedb_df = pd.read_csv(
    immedb_file,
    sep=",",
    header=0,
    index_col=False,
    dtype=str,
    names=["accession", "annotation"],
)

immedb_annotations = dict(zip(immedb_df["accession"], immedb_df["annotation"]))
# immedb_annotations

In [4]:
from collections import Counter


def read_immedb_result(genome_name: str, seq_name: str, start: int, end: int):
    result_columns = [
        "query",
        "subject",
        "perc_identity",
        "alignment_length",
        "mismatches",
        "gap_opens",
        "q_start",
        "q_end",
        "s_start",
        "s_end",
        "evalue",
        "bit_score",
        "query_length",
        "subject_length",
        "perc_query_coverage_per_subject",
        "subject sci names",
    ]
    # s3://genomics-workflow-core/Results/Blast/MITI-MCB/SH0001342-00095/immeDB/SH0001342-00095.blastn.tsv
    immedb_file = AnyPath(
        f"s3://genomics-workflow-core/Results/Blast/MITI-MCB/{genome_name}/immeDB/{genome_name}.blastn.tsv"
    )
    df = (
        pd.read_table(
            immedb_file,
            comment="#",
            header=None,
            names=result_columns,
            dtype={"query": str, "subject": str},
        )
        .query("(query == @seq_name) and (q_start <= @start) and (q_end >= @end)")
        .assign(ImmeDB_values=lambda x: x["subject"].map(immedb_annotations))
    )
    # return df

    if df.empty:
        return [None] * (len(result_columns) + 2)

    # add freq of also found categories.
    uniq_annot = Counter(df["ImmeDB_values"].to_list())
    df_0 = df.iloc[0].copy()
    df_0["All_immeDB_annotations"] = uniq_annot

    return df_0


# read_immedb_result("SH0001372-00039", "1", 3220448, 3222364)
read_immedb_result(
    "SH0001344-00329", "scaffold_Scaffolds_0.cp.600", 3402126, 3404045
).index

Index(['query', 'subject', 'perc_identity', 'alignment_length', 'mismatches',
       'gap_opens', 'q_start', 'q_end', 's_start', 's_end', 'evalue',
       'bit_score', 'query_length', 'subject_length',
       'perc_query_coverage_per_subject', 'subject sci names', 'ImmeDB_values',
       'All_immeDB_annotations'],
      dtype='object')

In [5]:
result_columns = [
    "query",
    "subject",
    "perc_identity",
    "alignment_length",
    "mismatches",
    "gap_opens",
    "q_start",
    "q_end",
    "s_start",
    "s_end",
    "evalue",
    "bit_score",
    "query_length",
    "subject_length",
    "perc_query_coverage_per_subject",
    "subject sci names",
    "ImmeDB_values",
    "All_immeDB_annotations",
]
abricate_df[result_columns] = abricate_df.apply(
    lambda x: read_immedb_result(
        AnyPath(x.FILE).stem, x.SEQUENCE, int(x.START), int(x.END)
    ),
    axis=1,
    result_type="expand",
)

In [11]:
abricate_df

Unnamed: 0,FILE,SEQUENCE,START,END,STRAND,GENE,COVERAGE,COVERAGE_MAP,GAPS,%COVERAGE,...,s_start,s_end,evalue,bit_score,query_length,subject_length,perc_query_coverage_per_subject,subject sci names,ImmeDB_values,All_immeDB_annotations
0,./Inputfasta/SH0001344-00329.fasta,scaffold_Scaffolds_0.cp.600,3402126,3404045,+,tet(O),1-1920/1920,===============,0/0,100,...,11001.0,1.0,0.0,20116.0,3890762.0,11001.0,0.0,,IMEs1,"{'IMEs1': 20, 'ICEs78': 1, 'IMEs393': 1}"
1,./Inputfasta/SH0001365-00245.fasta,1,43127,44353,+,mef(A),1-1227/1227,===============,0/0,100,...,9585.0,8157.0,0.0,2368.0,2324782.0,11179.0,0.0,,Genomic_islands50,{'Genomic_islands50': 1}
2,./Inputfasta/SH0001372-00039.fasta,1,3220448,3222364,-,tet(W),1-1917/1923,===============,0/0,99.69,...,1.0,8027.0,0.0,14277.0,4962077.0,8027.0,0.0,,IMEs42,"{'IMEs42': 3, 'IMEs201': 1}"
3,./Inputfasta/SH0001373-00076.fasta,1,670898,671463,+,catA16,95-660/660,..=============,0/0,85.76,...,,,,,,,,,,
4,./Inputfasta/SH0001404-00077.fasta,scaffold_Scaffolds_0.cp.600,1387439,1389355,-,tet(W),1-1917/1923,===============,0/0,99.69,...,193.0,2435.0,0.0,3938.0,2294929.0,8027.0,0.0,,IMEs42,"{'IMEs42': 3, 'IMEs201': 1}"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
253,./Inputfasta/SH0002528-00199.fasta,1,328911,330131,+,tet(40),1-1221/1221,===============,0/0,100,...,9442.0,10836.0,0.0,2499.0,3770667.0,10836.0,0.0,,IMEs393,"{'IMEs393': 1, 'IMEs460': 1}"
252,./Inputfasta/SH0002528-00199.fasta,1,326851,328770,+,tet(O),1-1920/1920,===============,0/0,100,...,1.0,2757.0,0.0,5092.0,3770667.0,11001.0,0.0,,IMEs1,"{'IMEs1': 20, 'ICEs78': 1, 'IMEs393': 1}"
254,./Inputfasta/SH0002528-00199.fasta,1,2718930,2719796,+,aadE,1-867/867,===============,0/0,100,...,1648.0,2.0,0.0,2863.0,3770667.0,8189.0,0.0,,IMEs35,"{'IMEs35': 3, 'IMEs363': 1, 'IMEs369': 1, 'ICE..."
255,./Inputfasta/SH0002531-00147.fasta,scaffold_1.cp.600,839789,841708,+,tet(O),1-1920/1920,===============,0/0,100,...,10430.0,1.0,0.0,15830.0,3022989.0,10938.0,0.0,,IMEs1,"{'IMEs1': 20, 'ICEs78': 1, 'IMEs393': 1}"


In [12]:
abricate_df.to_csv(
    "/Users/sunitjain/Downloads/20230614_AMR-Abricate_report.extended_immedb.csv",
    index=False,
)

In [8]:
# 2.5kb extra for on each side.