In [103]:
import sqlite3
import pandas as pd
from Bio import SeqIO
import os
import random
import json
import numpy as np
import re

from pandocfilters import Table

In [2]:
# Delete the database file if it exists
if os.path.exists("Testing.db"):
    os.remove("Testing.db")
    print("Deleted Testing.db")
else:
    print("Testing.db does not exist")

Deleted Testing.db


In [3]:
# Import Sheet
df = pd.read_csv("test_data/Testing_SQLite/samplesheet_nexus.csv")
df.rename(columns={"path": "read1"}, inplace=True)

In [4]:
df.head()

Unnamed: 0,sample,read1,reference_genome,sequencing_type
0,NIFP2402605,/90daydata/nece/bird/NECE_NEXUS_2025/Data/NIFP...,/90daydata/nece/bird/NECE_NEXUS_2025/Data/GCF_...,META_ONT_DNA
1,NIFP2402606,/90daydata/nece/bird/NECE_NEXUS_2025/Data/NIFP...,/90daydata/nece/bird/NECE_NEXUS_2025/Data/GCF_...,META_ONT_DNA
2,NIFP2402607,/90daydata/nece/bird/NECE_NEXUS_2025/Data/NIFP...,/90daydata/nece/bird/NECE_NEXUS_2025/Data/GCF_...,META_ONT_DNA
3,NIFP2402608,/90daydata/nece/bird/NECE_NEXUS_2025/Data/NIFP...,/90daydata/nece/bird/NECE_NEXUS_2025/Data/GCF_...,META_ONT_DNA
4,NIFP2402609,/90daydata/nece/bird/NECE_NEXUS_2025/Data/NIFP...,/90daydata/nece/bird/NECE_NEXUS_2025/Data/GCF_...,META_ONT_DNA


In [5]:
# Connect to (or create) the SQLite database
conn = sqlite3.connect("Testing.db")

In [6]:
#Creating Metadata table
conn.execute("""
CREATE TABLE IF NOT EXISTS metadata (
    sample TEXT,
    read1 TEXT,
    read2 TEXT,
    reference_genome TEXT,
    sequencing_type TEXT,
    location_name TEXT,
    location_coordinates TEXT,
    collection_date TEXT,
    sequencing_data TEXT,
    other_sample_information TEXT,
    long_term_storage_r1 TEXT,
    long_term_storage_r2 TEXT,
    PRIMARY KEY (sample, sequencing_type)
);
""")

<sqlite3.Cursor at 0x14a07b6c0>

In [7]:
#Columns that could be in the metadata
optional_fields = [
    "read2",
    "location_name",
    "location_coordinates",
    "collection_date",
    "sequencing_data",
    "other_sample_information",
    "long_term_storage_r1",
    "long_term_storage_r2"
]

#Colums that every sample should have
required_fields = ["sample", "read1", "reference_genome", "sequencing_type"]

#Check for required Colums
for field in required_fields:
    if field not in df.columns:
        raise ValueError(f"Missing required column: {field}")

# Add missing optional columns with None
for field in optional_fields:
    if field not in df.columns:
        df[field] = None

In [8]:
# Connect to database
cursor = conn.cursor()
# Attempt to insert each row individually and throw error if duplicate found
try:
    for row in df.itertuples(index=False):
        cursor.execute("""
            INSERT INTO metadata (
                sample, read1, reference_genome, sequencing_type,
                location_name, location_coordinates,
                collection_date, sequencing_data, other_sample_information
            ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
        """, (
            row.sample,
            row.read1,
            row.reference_genome,
            row.sequencing_type,
            row.location_name,
            row.location_coordinates,
            row.collection_date,
            row.sequencing_data,
            row.other_sample_information
        ))
    conn.commit()
except sqlite3.IntegrityError as e:
    raise RuntimeError(f"❌ Duplicate entry detected: {e}")
finally:
    conn.close()

In [9]:
# Connect to your SQLite database
conn = sqlite3.connect("Testing.db")
cursor = conn.cursor()

In [10]:
# Query all unique samples
cursor.execute("SELECT DISTINCT sample FROM metadata;")
rows = cursor.fetchall()

# Print the sample names
for row in rows:
    print(row[0])

NIFP2402605
NIFP2402606
NIFP2402607
NIFP2402608
NIFP2402609
NIFP2402610
NIFP2402611
NIFP2402612


In [11]:
# Close the connection
conn.close()

In [12]:
fasta_folder = "/Users/edwardbird/Documents/GitHub/edwardbirdlab-tools/test_data/Testing_SQLite"
db_path = "Testing.db"

In [13]:
# Connect to database
conn = sqlite3.connect(db_path)
cursor = conn.cursor()

In [14]:
#aking a new table for the contigs
cursor.execute("""
CREATE TABLE IF NOT EXISTS contigs (
    sample TEXT,
    sequencing_type TEXT,
    contig_id TEXT,
    sequence TEXT,
    PRIMARY KEY (sample, sequencing_type, contig_id),
    FOREIGN KEY (sample, sequencing_type) REFERENCES metadata(sample, sequencing_type)
);
""")

<sqlite3.Cursor at 0x14a138540>

In [15]:
# Making a function to get the sequencing type from the metadata we imported already
def get_sequencing_type(sample_name):
    result = cursor.execute("""
        SELECT sequencing_type FROM metadata
        WHERE sample = ?
    """, (sample_name,)).fetchall()

    if len(result) == 1:
        return result[0][0]
    elif len(result) > 1:
        raise ValueError(f"Multiple sequencing types found for sample {sample_name}")
    else:
        raise ValueError(f"No metadata entry found for sample {sample_name}")

In [16]:
# Go though fasta files and import them
for fname in os.listdir(fasta_folder):
    if fname.endswith(".fasta") or fname.endswith(".fa"):
        sample = os.path.splitext(fname)[0].split("_")[0]
        try:
            sequencing_type = get_sequencing_type(sample)
        except ValueError as e:
            print(f"Skipping {fname}: {e}")
            continue

        fasta_path = os.path.join(fasta_folder, fname)
        # Counter for inserted sequences
        inserted_count = 0

        for record in SeqIO.parse(fasta_path, "fasta"):
            contig_id = record.id
            sequence = str(record.seq)

            try:
                cursor.execute("""
                    INSERT INTO contigs (sample, sequencing_type, contig_id, sequence)
                    VALUES (?, ?, ?, ?)
                """, (sample, sequencing_type, contig_id, sequence))
                inserted_count += 1
            except sqlite3.IntegrityError:
                print(f"Duplicate entry skipped: {sample}, {sequencing_type}, {contig_id}")

        # Print summary for this sample
        print(f"✅ Added {inserted_count} sequences for sample '{sample}' (type: {sequencing_type})")

✅ Added 19744 sequences for sample 'NIFP2402609' (type: META_ONT_DNA)
✅ Added 14230 sequences for sample 'NIFP2402606' (type: META_ONT_DNA)
✅ Added 21851 sequences for sample 'NIFP2402607' (type: META_ONT_DNA)
✅ Added 60455 sequences for sample 'NIFP2402612' (type: META_ONT_DNA)
✅ Added 71171 sequences for sample 'NIFP2402608' (type: META_ONT_DNA)
✅ Added 45472 sequences for sample 'NIFP2402605' (type: META_ONT_DNA)
✅ Added 22078 sequences for sample 'NIFP2402610' (type: META_ONT_DNA)
✅ Added 31189 sequences for sample 'NIFP2402611' (type: META_ONT_DNA)


In [17]:
conn.commit()
conn.close()

In [18]:
# Testing out grabing a random contig
# Connect to the database
conn = sqlite3.connect("Testing.db", timeout=10)
cursor = conn.cursor()

# Query to get all contig ids for the sample 'NIFP2402606'
cursor.execute("""
    SELECT contig_id, sequence FROM contigs
    WHERE sample = 'NIFP2402606';
""")

# Fetch all results
contigs = cursor.fetchall()

# If there are contigs, select a random one
if contigs:
    random_contig = random.choice(contigs)
    contig_id, sequence = random_contig
    print(f"Random contig ID: {contig_id}")
    print(f"Sequence: {sequence}")
else:
    print("No contigs found for sample NIFP2402606")

# Close the connection
conn.close()

Random contig ID: NoClass_6155e154-5c33-45e8-9a55-a5d29f8847b0
Sequence: GTATTGAAGAAAAAGGTAAGAAGCTAAACAGCAGCATGTTGAGAACACAATGAAACAAGAAAAAGAACTCCTGATCGCAATTACTGCTCTCTTCGTTACCCTCCTTGCTTTCCGAATCGCCGATTGTGTACTTCACAAACACGTACTCAACGACATGATAGCCAGGAGAAATAATTAGAAGAGAATGAGAGAAATGCTTTCCTTTATTATTGTAAAAGAGATTAGGATTGCACAGCGCAGAACCGGTGAAAGGGGGAGGCTAGTTGACAGTCATAGCAAAAGCCTGGATGTTGTTCTTTCGTTTGTTGGTTTTGTGTAGATTCCCCCCTTTTTTTTCTGTTATCTGGTCTGACTCTGTCTCCCAGGATGCTTTTTGTTTGTTTGTTTGTTTGCTCTTTTTTGTTTGGTTTGGTTTTGTTCTCTCACGCTACGAATTGTGGCTGTTTCGCAAGATGAACGACCAAGATGACATATGCCGCTCTCATTGTTGACATCAACTTACGAGATTTTTCACCATGCAAAATTATTTGGCATGTGGATATCTTGAGAAAAATCACATACTCGTGATTTTTCTCCTTTAAAGAAAAATAAAAAAAATAGGGAGAAACAGAGAGAAAGAGAGCTCGTGAGTCCTGTCGCAGCTCTCATTCCAGTTGTTTTTGTTGTTGGCGTTGTTCAT


In [19]:
#Paths for importing amrfinder reuslts
report_folder = "/Users/edwardbird/Documents/GitHub/edwardbirdlab-tools/test_data/Testing_SQLite/AMRFINDER"
json_mapping_path = "/Users/edwardbird/Documents/GitHub/edwardbirdlab-tools/conf/abstraction_layers/amrfinderplus_abstraction.json"

In [20]:
# Load column mapping json
with open(json_mapping_path, "r") as f:
    column_map = json.load(f)

In [21]:
# Open DB connection
conn = sqlite3.connect(db_path)
cursor = conn.cursor()

In [22]:
# Create table if needed
columns_sql = ",\n".join([f"{val} TEXT" for val in column_map.values()])
cursor.execute(f"""
CREATE TABLE IF NOT EXISTS amrfinder (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    sample TEXT,
    sequencing_type TEXT,
    {columns_sql},
    FOREIGN KEY (sample, sequencing_type, contig_id) REFERENCES contigs(sample, sequencing_type, contig_id)
);
""")

<sqlite3.Cursor at 0x14a103d40>

In [23]:
# Bulk import AMRFinder reports
for fname in os.listdir(report_folder):
    if fname.endswith("_AMRFinder.tsv"):
        sample = fname.split("_AMRFinder.tsv")[0]
        file_path = os.path.join(report_folder, fname)

        try:
            sequencing_type = get_sequencing_type(sample)
        except ValueError as e:
            print(f"Skipping {fname}: {e}")
            continue

        try:
            df = pd.read_csv(file_path, sep="\t")
            df.rename(columns=column_map, inplace=True)

            # Add required relationship columns
            df["sample"] = sample
            df["sequencing_type"] = sequencing_type

            if "contig_id" not in df.columns:
                raise ValueError(f"No 'contig_id' column found in {fname} after mapping.")

            # Reorder columns
            df = df[["sample", "sequencing_type", "contig_id"] + list(column_map.values())]

            # Insert into database
            df.to_sql("amrfinder", conn, if_exists="append", index=False)
            print(f"✅ Imported {len(df)} AMRFinder hits for sample '{sample}' (type: {sequencing_type})")

        except Exception as e:
            print(f"⚠️ Failed to import '{fname}': {e}")

✅ Imported 1 AMRFinder hits for sample 'NIFP2402608' (type: META_ONT_DNA)
✅ Imported 6 AMRFinder hits for sample 'NIFP2402611' (type: META_ONT_DNA)
✅ Imported 1 AMRFinder hits for sample 'NIFP2402606' (type: META_ONT_DNA)
✅ Imported 0 AMRFinder hits for sample 'NIFP2402605' (type: META_ONT_DNA)
✅ Imported 7 AMRFinder hits for sample 'NIFP2402612' (type: META_ONT_DNA)
✅ Imported 0 AMRFinder hits for sample 'NIFP2402607' (type: META_ONT_DNA)
✅ Imported 4 AMRFinder hits for sample 'NIFP2402610' (type: META_ONT_DNA)
✅ Imported 1 AMRFinder hits for sample 'NIFP2402609' (type: META_ONT_DNA)


In [24]:
conn.commit()
conn.close()

In [25]:
#Setting directories for kraken2 import
K2Reports = "/Users/edwardbird/Documents/GitHub/edwardbirdlab-tools/test_data/Testing_SQLite/KRAKEN2_PLUSPF_SE"
K2_Column_Abs = "/Users/edwardbird/Documents/GitHub/edwardbirdlab-tools/conf/abstraction_layers/kraken2_abstraction.json"

In [26]:
# Extract Column Names, Null names will be skipped
with open(K2_Column_Abs, "r") as f:
    column_mapping = json.load(f)

col_indices = {k: v for k, v in column_mapping.items() if v is not None}
col_names = list(col_indices.keys())
col_positions = list(col_indices.values())

In [27]:
conn = sqlite3.connect(db_path)
cursor = conn.cursor()

In [28]:
# Make sure the new kraken_report table exists
cursor.execute("""
CREATE TABLE IF NOT EXISTS kraken_report (
    sample TEXT,
    sequencing_type TEXT,
    percentage REAL,
    clade_fragments INTEGER,
    taxon_fragments INTEGER,
    minimizers INTEGER,
    distinct_minimizers INTEGER,
    rank_code TEXT,
    tax_id INTEGER,
    name TEXT,
    FOREIGN KEY (sample, sequencing_type) REFERENCES metadata(sample, sequencing_type)
)
""")

<sqlite3.Cursor at 0x12ff41740>

In [29]:
# Process each report
for fname in os.listdir(K2Reports):
    if fname.endswith("_report.tsv"):
        sample = fname.replace("_report.tsv", "")
        sequencing_type = get_sequencing_type(sample)
        full_path = os.path.join(K2Reports, fname)

        df_raw = pd.read_csv(full_path, sep="\t", header=None, usecols=col_positions)
        df_raw.columns = col_names

        df_raw["sample"] = sample
        df_raw["sequencing_type"] = sequencing_type

        df_raw.to_sql("kraken_report", conn, if_exists="append", index=False)

        print(f"✅ Imported Kraken report for sample: {sample} (type: {sequencing_type})")

✅ Imported Kraken report for sample: NIFP2402606 (type: META_ONT_DNA)
✅ Imported Kraken report for sample: NIFP2402612 (type: META_ONT_DNA)
✅ Imported Kraken report for sample: NIFP2402609 (type: META_ONT_DNA)
✅ Imported Kraken report for sample: NIFP2402610 (type: META_ONT_DNA)
✅ Imported Kraken report for sample: NIFP2402607 (type: META_ONT_DNA)
✅ Imported Kraken report for sample: NIFP2402611 (type: META_ONT_DNA)
✅ Imported Kraken report for sample: NIFP2402605 (type: META_ONT_DNA)
✅ Imported Kraken report for sample: NIFP2402608 (type: META_ONT_DNA)


In [30]:
# Connect to (or create) the SQLite database
conn = sqlite3.connect("Testing.db")

In [31]:
#Paths for importing amrfinder reuslts
report_folder = "/Users/edwardbird/Documents/GitHub/edwardbirdlab-tools/test_data/Testing_SQLite/CARD_CONTIG"
json_mapping_path = "/Users/edwardbird/Documents/GitHub/edwardbirdlab-tools/conf/abstraction_layers/card_phm_abstraction.json"

In [32]:
# Load column mapping json
with open(json_mapping_path, "r") as f:
    column_map = json.load(f)

In [33]:
# Create table if needed
columns_sql = ",\n".join([f"{val} TEXT" for val in column_map.values()])
cursor.execute(f"""
CREATE TABLE IF NOT EXISTS card_phm (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    sample TEXT,
    sequencing_type TEXT,
    contig_id TEXT,
    {columns_sql},
    FOREIGN KEY (sample, sequencing_type, contig_id) REFERENCES contigs(sample, sequencing_type, contig_id)
);
""")

<sqlite3.Cursor at 0x12ff41740>

In [34]:
# Bulk import AMRFinder reports
for fname in os.listdir(report_folder):
    if fname.endswith("_out.txt"):
        sample = fname.split("_out.txt")[0]
        file_path = os.path.join(report_folder, fname)

        try:
            sequencing_type = get_sequencing_type(sample)
        except ValueError as e:
            print(f"Skipping {fname}: {e}")
            continue

        try:
            df = pd.read_csv(file_path, sep="\t")
            df.rename(columns=column_map, inplace=True)
            df['contig_id'] = df['contig'].str.replace(r'_\d+$', '', regex=True)

            # Add required relationship columns
            df["sample"] = sample
            df["sequencing_type"] = sequencing_type

            if "contig_id" not in df.columns:
                raise ValueError(f"No 'contig_id' column found in {fname} after mapping.")

            # Reorder columns
            df = df[["sample", "sequencing_type", "contig_id"] + list(column_map.values())]

            # Insert into database
            df.to_sql("card_phm", conn, if_exists="append", index=False)
            print(f"✅ Imported {len(df)} card_phm hits for sample '{sample}' (type: {sequencing_type})")

        except Exception as e:
            print(f"⚠️ Failed to import '{fname}': {e}")

✅ Imported 8 card_phm hits for sample 'NIFP2402605' (type: META_ONT_DNA)
✅ Imported 1 card_phm hits for sample 'NIFP2402607' (type: META_ONT_DNA)
✅ Imported 0 card_phm hits for sample 'NIFP2402606' (type: META_ONT_DNA)
✅ Imported 8 card_phm hits for sample 'NIFP2402612' (type: META_ONT_DNA)
✅ Imported 2 card_phm hits for sample 'NIFP2402609' (type: META_ONT_DNA)
✅ Imported 12 card_phm hits for sample 'NIFP2402610' (type: META_ONT_DNA)
✅ Imported 7 card_phm hits for sample 'NIFP2402611' (type: META_ONT_DNA)
✅ Imported 5 card_phm hits for sample 'NIFP2402608' (type: META_ONT_DNA)


In [35]:
conn.commit()
conn.close()

In [None]:
# Connect to (or create) the SQLite database
conn = sqlite3.connect("Testing.db")

In [93]:
conn.commit()
conn.close()

In [36]:
# Connect to (or create) the SQLite database
conn = sqlite3.connect("Testing.db")

In [37]:
# Checking CARD PHM Import
query_counts = """
SELECT
  COUNT(*) AS total_rows,
  SUM(CASE
        WHEN contigs.contig_id IS NOT NULL THEN 1
        ELSE 0
      END) AS matched_rows,
  SUM(CASE
        WHEN contigs.contig_id IS NULL THEN 1
        ELSE 0
      END) AS missing_rows
FROM card_phm
LEFT JOIN contigs
  ON card_phm.sample = contigs.sample
  AND card_phm.sequencing_type = contigs.sequencing_type
  AND card_phm.contig_id = contigs.contig_id;
"""

counts = pd.read_sql_query(query_counts, conn)
print(counts)



   total_rows  matched_rows  missing_rows
0          43            43             0


In [38]:
#Checking AMRFinder
query_counts = """
SELECT
  COUNT(*) AS total_rows,
  SUM(CASE
        WHEN contigs.contig_id IS NOT NULL THEN 1
        ELSE 0
      END) AS matched_rows,
  SUM(CASE
        WHEN contigs.contig_id IS NULL THEN 1
        ELSE 0
      END) AS missing_rows
FROM amrfinder
LEFT JOIN contigs
  ON amrfinder.sample = contigs.sample
  AND amrfinder.sequencing_type = contigs.sequencing_type
  AND amrfinder.contig_id = contigs.contig_id;
"""

counts = pd.read_sql_query(query_counts, conn)
print(counts)



   total_rows  matched_rows  missing_rows
0          20            20             0


In [39]:
# Load JSON from file
with open("/Users/edwardbird/Documents/GitHub/edwardbirdlab-tools/test_data/Testing_SQLite/RESFINDER/NIFP2402612/NIFP2402612_allclass.json") as f:
    data = json.load(f)

# Convert to DataFrame
df = pd.DataFrame(data)

# Export to CSV
df.to_csv("resfinder_test.csv", index=False)

In [53]:
def parse_resfinder_json(path):
    with open(path) as f:
        data = json.load(f)

    genes = []
    mutations = []

    # Get helper dicts
    phenotypes = data.get("phenotypes", {})
    regions = data.get("seq_regions", {})
    variations = data.get("seq_variations", {})

    # Loop over resistant phenotypes
    for p in phenotypes.values():
        #if not p.get("amr_resistant", False):
        #    continue

        drug = p.get("amr_resistance", "unknown")
        drug_class = ", ".join(p.get("amr_classes", []))

        # --- Gene hits ---
        for region_key in p.get("seq_regions", []):
            r = regions.get(region_key, {})
            genes.append({
                "gene": r.get("name"),
                "drug": drug,
                "drug_class": drug_class,
                "identity": r.get("identity"),
                "coverage": r.get("coverage"),
                "ref_acc": r.get("ref_acc"),
            })

        # --- Mutations ---
        for v_key in p.get("seq_variations", []):
            v = variations.get(v_key, {})
            for r_key in v.get("seq_regions", []):
                r = regions.get(r_key, {})
                mutations.append({
                    "mutation": v.get("seq_var"),
                    "codon_change": v.get("codon_change"),
                    "gene": r.get("name"),
                    "drug": drug,
                    "drug_class": drug_class
                })

    return genes, mutations

In [54]:
genes, mutations = parse_resfinder_json("/Users/edwardbird/Documents/GitHub/edwardbirdlab-tools/test_data/Testing_SQLite/RESFINDER/NIFP2402612/NIFP2402612_allclass.json")

# Print gene results
for g in genes:
    print(g)

# Print mutations
for m in mutations:
    print(m)


{'gene': 'aadA1', 'drug': 'streptomycin', 'drug_class': 'aminoglycoside', 'identity': 98.23232323232324, 'coverage': 99.49302915082383, 'ref_acc': 'JQ480156'}
{'gene': 'ant(6)-Ia', 'drug': 'streptomycin', 'drug_class': 'aminoglycoside', 'identity': 97.14599341383095, 'coverage': 98.45984598459846, 'ref_acc': 'AF330699'}
{'gene': "aph(3')-III", 'drug': 'amikacin', 'drug_class': 'aminoglycoside', 'identity': 97.24310776942356, 'coverage': 99.11949685534591, 'ref_acc': 'M26832'}
{'gene': "aph(3')-III", 'drug': 'amikacin', 'drug_class': 'aminoglycoside', 'identity': 98.6232790988736, 'coverage': 99.87421383647799, 'ref_acc': 'M26832'}
{'gene': "aph(3')-III", 'drug': 'isepamicin', 'drug_class': 'aminoglycoside', 'identity': 97.24310776942356, 'coverage': 99.11949685534591, 'ref_acc': 'M26832'}
{'gene': "aph(3')-III", 'drug': 'isepamicin', 'drug_class': 'aminoglycoside', 'identity': 98.6232790988736, 'coverage': 99.87421383647799, 'ref_acc': 'M26832'}
{'gene': "aph(3')-III", 'drug': 'kanamyc

In [55]:
len(genes)

67

In [56]:
len(mutations)

0

In [83]:
def parse_resfinder_json_2(path):
    with open(path) as f:
        data = json.load(f)

    genes = []
    mutations = []

    # Get helper dicts
    phenotypes = data.get("phenotypes", {})
    regions = data.get("seq_regions", {})
    variations = data.get("seq_variations", {})

    # Maps to store gene and mutation information
    region_info = {}
    variation_info = {}

    # --- Loop through all phenotypes ---
    for p in phenotypes.values():
        drug = p.get("amr_resistance", "unknown")
        drug_class = ", ".join(p.get("amr_classes", []))
        resistance = p.get("amr_resistant", "unknown")  # Directly using the value of amr_resistant
        grade = p.get("grade", "unknown")  # Fetching the grade information

        # Debugging output to check the values
        print("amr_resistant:", resistance)

        # --- Loop through seq_regions in phenotypes ---
        for region_key in p.get("seq_regions", []):
            if region_key not in region_info:
                region_info[region_key] = {
                    "drugs": set(),
                    "drug_classes": set(),
                    "resistance": resistance,
                    "grade": grade,  # Adding grade information
                    "gene": regions.get(region_key, {}).get("name", "unknown"),
                    "identity": regions.get(region_key, {}).get("identity", "unknown"),
                    "coverage": regions.get(region_key, {}).get("coverage", "unknown"),
                    "ref_acc": regions.get(region_key, {}).get("ref_acc", "unknown"),
                }

            # Add drug, drug class, predicted resistance, and grade to the respective sets
            region_info[region_key]["drugs"].add(drug)
            region_info[region_key]["drug_classes"].add(drug_class)

        # --- Loop through seq_variations in phenotypes ---
        for variation_key in p.get("seq_variations", []):
            for region_key in variations.get(variation_key, {}).get("seq_regions", []):
                if region_key not in variation_info:
                    variation_info[region_key] = {
                        "drugs": set(),
                        "drug_classes": set(),
                        "resistance": resistance,
                        "grade": grade,  # Adding grade information
                        "gene": regions.get(region_key, {}).get("name", "unknown"),
                        "identity": regions.get(region_key, {}).get("identity", "unknown"),
                        "coverage": regions.get(region_key, {}).get("coverage", "unknown"),
                        "ref_acc": regions.get(region_key, {}).get("ref_acc", "unknown"),
                    }

                # Add drug, drug class, and predicted resistance to the respective sets
                variation_info[region_key]["drugs"].add(drug)
                variation_info[region_key]["drug_classes"].add(drug_class)

    # --- Now prepare final result for genes and mutations ---
    for region_key, info in region_info.items():
        genes.append({
            "gene": info["gene"],
            "drug": ", ".join(info["drugs"]),  # Combine all drugs into one string
            "drug_class": ", ".join(info["drug_classes"]),  # Combine all drug classes into one string
            "resistance": info["resistance"],  # Convert to string
            "grade": info["grade"],  # Adding grade information here
            "identity": info["identity"],
            "coverage": info["coverage"],
            "ref_acc": info["ref_acc"],
        })

    for region_key, info in variation_info.items():
        mutations.append({
            "mutation": ", ".join(info["drugs"]),  # Combine all drugs into one string
            "codon_change": ", ".join(info["drug_classes"]),  # Combine all drug classes into one string
            "resistance": info["resistance"],  # Convert to string
            "grade": info["grade"],  # Adding grade information here
            "gene": info["gene"],
            "drug": ", ".join(info["drugs"]),  # Combine all drugs into one string
            "drug_class": ", ".join(info["drug_classes"]),  # Combine all drug classes into one string
        })

    return genes, mutations


In [84]:
genes, mutations = parse_resfinder_json_2("/Users/edwardbird/Documents/GitHub/edwardbirdlab-tools/test_data/Testing_SQLite/RESFINDER/NIFP2402612/NIFP2402612_allclass.json")

# Print gene results
for g in genes:
    print(g)

# Print mutations
for m in mutations:
    print(m)

amr_resistant: False
amr_resistant: False
amr_resistant: True
amr_resistant: True
amr_resistant: True
amr_resistant: False
amr_resistant: True
amr_resistant: True
amr_resistant: True
amr_resistant: True
amr_resistant: True
amr_resistant: False
amr_resistant: False
amr_resistant: True
amr_resistant: False
amr_resistant: False
amr_resistant: False
amr_resistant: False
amr_resistant: False
amr_resistant: False
amr_resistant: False
amr_resistant: False
amr_resistant: True
amr_resistant: False
amr_resistant: True
amr_resistant: False
amr_resistant: False
amr_resistant: False
amr_resistant: False
amr_resistant: True
amr_resistant: False
amr_resistant: False
amr_resistant: False
amr_resistant: False
amr_resistant: True
amr_resistant: False
amr_resistant: False
amr_resistant: False
amr_resistant: False
amr_resistant: False
amr_resistant: False
amr_resistant: True
amr_resistant: False
amr_resistant: False
amr_resistant: False
amr_resistant: False
amr_resistant: False
amr_resistant: False
amr_re

In [64]:
len(genes)

26

In [128]:
# Load index-to-column mapping
with open("conf/abstraction_layers/resfinder_pheno_abstraction.json", "r") as f:
    column_map = json.load(f)

# Build the column names in order
ordered_column_names = [None] * len(column_map)
for name, index in column_map.items():
    ordered_column_names[index] = name

# Read the ResFinder table
testing_phenotype_df = pd.read_csv(
    "test_data/Testing_SQLite/RESFINDER/NIFP2402612/pheno_table.txt",
    sep="\t",
    comment="#",
    header=None,
    names=ordered_column_names
)

testing_phenotype_df.head()

Unnamed: 0,antibiotic,drug_class,wgs_predicted_phenotype,match_level,genetic_background
0,gentamicin,aminoglycoside,No resistance,0,
1,tobramycin,aminoglycoside,No resistance,0,
2,streptomycin,aminoglycoside,Resistant,1,"aadA1 (aadA1_JQ480156), ant(6)-Ia (ant(6)-Ia_A..."
3,amikacin,aminoglycoside,Resistant,1,aph(3')-III (aph(3')-III_M26832)
4,isepamicin,aminoglycoside,Resistant,1,aph(3')-III (aph(3')-III_M26832)


In [129]:
def split_genetic_background(value):
    if pd.isna(value):
        return None, None  # Return None for both if NaN

    # Split by commas if multiple gene-reference pairs
    pairs = value.split(", ")

    # Initialize lists to store genes and references
    genes = []
    references = []

    # Loop over each gene-reference pair
    for pair in pairs:
        seqs = pair.split(" ")
        gene = seqs[0]
        reference = seqs[1].split(gene + '_')[1][:-1]
        genes.append(gene)
        references.append(reference)

    return genes, references

In [131]:

# Apply the function to split the "genetic_background" column
testing_phenotype_df[['genes', 'references']] = testing_phenotype_df['genetic_background'].apply(lambda x: pd.Series(split_genetic_background(x)))

# Display the resulting DataFrame
testing_phenotype_df.head(300)

Unnamed: 0,antibiotic,drug_class,wgs_predicted_phenotype,match_level,genetic_background,genes,references
0,gentamicin,aminoglycoside,No resistance,0,,,
1,tobramycin,aminoglycoside,No resistance,0,,,
2,streptomycin,aminoglycoside,Resistant,1,"aadA1 (aadA1_JQ480156), ant(6)-Ia (ant(6)-Ia_A...","[aadA1, ant(6)-Ia]","[JQ480156, AF330699]"
3,amikacin,aminoglycoside,Resistant,1,aph(3')-III (aph(3')-III_M26832),[aph(3')-III],[M26832]
4,isepamicin,aminoglycoside,Resistant,1,aph(3')-III (aph(3')-III_M26832),[aph(3')-III],[M26832]
...,...,...,...,...,...,...,...
86,rifampicin,rifamycin,No resistance,0,,,
87,metronidazole,nitroimidazole,No resistance,0,,,
88,narasin,ionophores,No resistance,0,,,
89,salinomycin,ionophores,No resistance,0,,,


In [132]:
# Load the column name mapping
with open("conf/abstraction_layers/resfinder_hits_abstraction.json", "r") as f:
    column_map = json.load(f)

# Read the TSV (with header)
testing_hits_df = pd.read_csv("test_data/Testing_SQLite/RESFINDER/NIFP2402612/ResFinder_results_tab.txt", sep="\t")

# Rename columns using the map
testing_hits_df.rename(columns=column_map, inplace=True)

# Split the 'contig_position' column into 'start_position' and 'stop_position'
testing_hits_df[['start_position', 'stop_position']] = testing_hits_df['contig_position'].str.split(r"\.\.", expand=True).astype(int)

testing_hits_df.head(90)

Unnamed: 0,resistance_gene,identity,alignment_vs_gene_length,coverage,reference_position,contig_id,contig_position,phenotype,accession_number,start_position,stop_position
0,aadA1,98.23,792/789,99.493029,1..789,PLAS_P:0.674|C:0.326_7dc2184f-d64d-406a-8889-1...,3846..4633,"Spectinomycin, Streptomycin",JQ480156,3846,4633
1,ant(6)-Ia,97.15,911/909,98.459846,1..909,PLAS_P:0.546|C:0.454_cd01ab1e-6bd0-4f3c-826e-3...,7256..8152,Streptomycin,AF330699,7256,8152
2,aph(3')-III,97.24,798/795,99.119497,1..795,PLAS_P:0.546|C:0.454_cd01ab1e-6bd0-4f3c-826e-3...,1779..2569,"Kanamycin, Amikacin, Neomycin, Butirosin, Isep...",M26832,1779,2569
3,aph(3')-III,98.62,799/795,99.874214,1..795,CHROMO_P:0.156|C:0.844_0b206394-1c46-4fd8-9150...,240..1037,"Kanamycin, Amikacin, Neomycin, Butirosin, Isep...",M26832,240,1037
4,cfxA5,86.78,590/966,60.869565,365..953,PLAS_P:0.842|C:0.158_781834e3-5c9e-4188-8760-4...,1..589,Unknown Beta-lactam,AY769934,1,589
5,cfxA4,86.78,590/966,60.869565,365..953,PLAS_P:0.842|C:0.158_781834e3-5c9e-4188-8760-4...,1..589,Unknown Beta-lactam,AY769933,1,589
6,cfxA3,86.78,590/966,60.869565,365..953,PLAS_P:0.842|C:0.158_781834e3-5c9e-4188-8760-4...,1..589,Ampicillin,AF472622,1,589
7,cfxA2,86.78,590/966,60.869565,365..953,PLAS_P:0.842|C:0.158_781834e3-5c9e-4188-8760-4...,1..589,Unknown Beta-lactam,AF504914,1,589
8,cfxA,86.78,590/966,60.869565,365..953,PLAS_P:0.842|C:0.158_781834e3-5c9e-4188-8760-4...,1..589,"Cefoxitin, Ampicillin",U38243,1,589
9,mef(A),92.51,854/1218,68.801314,371..1218,PLAS_P:0.542|C:0.458_3e3ebdde-6338-4ad9-b84d-0...,2..845,"Erythromycin, Azithromycin",AF227520,2,845


In [136]:
#Getting drug class info
for pheno in testing_hits_df['phenotype']:
    pheno_ls = pheno.split(",")
    pheno_ls = [pheno.strip() for pheno in pheno_ls]
    print(pheno_ls)

['Spectinomycin', 'Streptomycin']
['Streptomycin']
['Kanamycin', 'Amikacin', 'Neomycin', 'Butirosin', 'Isepamicin', 'Lividomycin', 'Paromomycin', 'Ribostamycin']
['Kanamycin', 'Amikacin', 'Neomycin', 'Butirosin', 'Isepamicin', 'Lividomycin', 'Paromomycin', 'Ribostamycin']
['Unknown Beta-lactam']
['Unknown Beta-lactam']
['Ampicillin']
['Unknown Beta-lactam']
['Cefoxitin', 'Ampicillin']
['Erythromycin', 'Azithromycin']
['Erythromycin', 'Azithromycin']
['Lincomycin']
['Erythromycin', 'Azithromycin']
['Erythromycin', 'Lincomycin', 'Clindamycin', 'Quinupristin', 'Pristinamycin IA', 'Virginiamycin S']
['Ciprofloxacin']
['Ciprofloxacin']
['Ciprofloxacin']
['Doxycycline', 'Tetracycline', 'Minocycline']
['Doxycycline', 'Tetracycline', 'Minocycline']
['Doxycycline', 'Tetracycline', 'Minocycline']
['Doxycycline', 'Tetracycline', 'Minocycline']
['Doxycycline', 'Tetracycline', 'Minocycline']
['Doxycycline', 'Tetracycline']
['Doxycycline', 'Tetracycline', 'Minocycline']
['Doxycycline', 'Tetracycline

In [137]:
from eralchemy import render_er

# Define the path to your SQLite database
db_path = 'sqlite:///Testing.db'

# Generate the ERD and save it as a .png image
output_path = 'Testing_DB_Outline.png'

# This function generates the ERD and saves it as a PNG file
render_er(db_path, output_path)

print(f"ERD has been generated and saved to {output_path}")

ERD has been generated and saved to Testing_DB_Outline.png
