## Construction of structured antibiotic resistance database (SARD) — Group Level Classification

Please ensure the following files are located in the './Tutorials/Data' directory. They can be downloaded from https://drive.google.com/drive/folders/1ZM0p5YHCg2FBTQwBCHyl11L-0fzNEk_A?usp=drive_link:

- embARG-Full-V1.0-2021.7.csv: Contains the mapping of IDs to categories in the Expanded Antibiotic Resistance Genes (ARGs) dataset.

- embARG-Full-V1.0-2021.7.fasta: Stores IDs and their corresponding amino acid sequences in the Expanded ARGs dataset.

- kernel.txt: Core antibiotic resistance database.

In [8]:
from Bio import SeqIO
from sklearn.model_selection import train_test_split
import pandas as pd
import os
# Switch to the Tutorials directory so the relative "Data/..." paths below resolve.
os.chdir("../Tutorials")

# The three input files this tutorial expects.
csv_file_path = "Data/embARG-Full-V1.0-2021.7.csv"
fasta_file_path = "Data/embARG-Full-V1.0-2021.7.fasta"
core_file_path = "Data/kernel.txt"

# Only a reminder is printed when an input is missing; execution continues.
required_inputs = (csv_file_path, fasta_file_path, core_file_path)
if any(not os.path.exists(path) for path in required_inputs):
    print("Please download the required files from the following download link (https://drive.google.com/drive/folders/1ZM0p5YHCg2FBTQwBCHyl11L-0fzNEk_A?usp=drive_link) and save them to the 'Data/' directory on your local machine.")


## (1) Construct Group Level training, validation, and testing datasets for the Antibiotic Resistance Database
Partition the dataset into training, validation, and test sets in a 6:2:2 ratio.



In [4]:
csv_file_path = "Data/embARG-Full-V1.0-2021.7.csv"
outfile = "Data/ARGs_GroupLevel"
if not os.path.exists(outfile):
    os.makedirs(outfile)

# Sequence ID (column 3) -> class (column 7) / group (column 9).
Id_to_class = {}
Id_to_class_little = {}

# class -> {group: number of rows}, tallied separately for the two parts of the CSV.
Expand_BigCategory_to_littleCategory = {}
Kernel_BigCategory_to_littleCategory = {}

# Sequence ID -> group, again separately for the two parts of the CSV.
Only_little_expand = {}
Only_little_kernel = {}

with open(csv_file_path, "r") as file:
    for i, line in enumerate(file):
        if i == 0:
            continue  # the first line is skipped (presumably the header)
        content = line.strip().split(",")
        big_name = content[7]
        if big_name == 'Elfamycins':
            continue  # rows of this class are left out entirely

        seq_id = content[3]
        Id_to_class[seq_id] = big_name
        little_name = content[9]
        Id_to_class_little[seq_id] = little_name

        # Rows after line index 2972 are tallied as "Expand", the others as "Kernel".
        if i > 2972:
            counts_by_class = Expand_BigCategory_to_littleCategory
            id_to_little = Only_little_expand
        else:
            counts_by_class = Kernel_BigCategory_to_littleCategory
            id_to_little = Only_little_kernel

        group_counts = counts_by_class.setdefault(big_name, {})
        group_counts[little_name] = group_counts.get(little_name, 0) + 1
        id_to_little[seq_id] = little_name

# Walk over every group of the "Expand" tally: groups with at least five
# sequences are counted, the remaining ones are collected by name.
number_little_category_big_5 = 0
category_lower_5 = []
for little_counts in Expand_BigCategory_to_littleCategory.values():
    for little, n_sequences in little_counts.items():
        if n_sequences >= 5:
            number_little_category_big_5 += 1
        else:
            category_lower_5.append(little)

print(number_little_category_big_5)
print(category_lower_5)
print(len(category_lower_5))

#************************************************************************
# IDs found in Only_little_kernel, and IDs whose group is listed in
# category_lower_5, are written to train.txt right away. Every other ID is
# kept together with its group label for the stratified split below.
Split_Train_Vali_Test = {}
with open(f"{outfile}/train.txt", "w") as write:
    for seq_id, group in Id_to_class_little.items():
        if seq_id in Only_little_kernel or group in category_lower_5:
            write.write(seq_id + "\n")
        else:
            Split_Train_Vali_Test[seq_id] = group

all_data = list(Split_Train_Vali_Test)
all_label = [Split_Train_Vali_Test[seq_id] for seq_id in all_data]

# First 80/20 (train+validation vs. test), then 75/25 of the 80% share:
# 60% train, 20% validation, 20% test of the IDs being split.
data_train, data_test, label_train, label_test = train_test_split(
    all_data, all_label, train_size=0.8, random_state=2021, stratify=all_label)
data_train_last, data_validation, label_train_last, label_validation = train_test_split(
    data_train, label_train, train_size=0.75, random_state=2021, stratify=label_train)

for split_name, split_ids in (("validation", data_validation), ("test", data_test)):
    with open(f"{outfile}/{split_name}.txt", "w") as write:
        write.writelines(seq_id + "\n" for seq_id in split_ids)

# Append the training share of the split to the IDs already written above.
with open(f"{outfile}/train.txt", "a+") as write:
    write.writelines(seq_id + "\n" for seq_id in data_train_last)

353
['RMTB', 'CPAA', 'NPMA', 'ARMA', 'ADEG', 'LSAA', 'LMRC', 'CMEA', 'PARR', 'MEXV', 'MEXP', 'MEXA', 'CFRB', 'CFRC', 'CEOA', 'ABES', 'MEXJ', 'CARA', 'OPRZ', 'ARLS', 'LSAE', 'SALA', 'QACB', 'MEXG', 'QACH', 'AXYY', 'OPMB', 'CDEA', 'CRCB', 'OPTRA', 'CPRR', 'CPRS', 'BASS', 'VPH', 'ALMG', 'ICR', 'PAC', 'BLAF', 'PEP', 'GES', 'LEN', 'LAP', 'MUS', 'LMB', 'IMI', 'NMCR', 'SMB', 'FONA', 'CCRA', 'MIR', 'ACI', 'AER', 'CFE', 'CMH', 'CPS', 'MECB', 'CKO', 'PDC', 'RCP', 'VMB', 'EBR', 'SLB', 'CGA', 'VARG', 'KHM', 'THINB', 'BLAS', 'IDC', 'PNGM', 'DES', 'SPM', 'SFH', 'HMB', 'MECI', 'ROB', 'CEPH', 'BLAB', 'SCO', 'SFB', 'NMCA', 'DFRD', 'DFRB', 'DFRE', 'DFRF', 'DFRK', 'LNUF', 'MPHE', 'EREA', 'LING', 'LNUA', 'ERMV', 'MYRA', 'LIN', 'MEFB', 'LNUD', 'EREB', 'OLEB', 'ERED', 'MSRE', 'FOSK', 'FOMA', 'FOMB', 'FOSD', 'MUPA', 'TETD', 'TETX', 'OTRB', 'TETY', 'TRIB', 'FUSC', 'FUSF', 'SUL3', 'BRP', 'VANB', 'VANT', 'VANO', 'VANG', 'VANF', 'QNRE', 'QEPA']
120




In [5]:
import matplotlib.pyplot as plt

# All data
csv_file_path = 'Data/embARG-Full-V1.0-2021.7.csv'

# category:     class -> {group: number of rows}
# plt_category: class -> total number of rows (this is what gets plotted)
category = {}
plt_category = {}
with open(csv_file_path, "r") as csv_handle:
    for i, item in enumerate(csv_handle):
        if i == 0:
            continue  # the first line is skipped (presumably the header)
        content = item.strip().split(",")
        # The sequence ID (column 3) is not needed here; it is deliberately
        # not bound to a variable named `id`, which would shadow the builtin
        # for the rest of the notebook session.
        big_category = content[7]
        little_category = content[9]

        plt_category[big_category] = plt_category.get(big_category, 0) + 1
        group_counts = category.setdefault(big_category, {})
        group_counts[little_category] = group_counts.get(little_category, 0) + 1

# One bar per class, sized by the number of rows in that class.
fig = plt.gcf()
fig.set_size_inches(40, 15)
# Materialise the dict views so plt.bar receives plain sequences.
name_list = list(plt_category.keys())
num_list = list(plt_category.values())
plt.bar(name_list, num_list)
plt.show()
print(category)

<Figure size 4000x1500 with 1 Axes>

{'Betalactams': {'CBLA': 30, 'SHV': 258, 'CTX-M': 214, 'NDM': 70, 'ACT': 75, 'CARB': 88, 'TEM': 348, 'LRA': 28, 'KPC': 57, 'OXA': 725, 'IMP': 63, 'CMY': 204, 'VEB': 10, 'DHA': 26, 'OXY': 27, 'MOX': 15, 'GES': 28, 'VIM': 55, 'FOX': 31, 'PEP': 2, 'GIM': 2, 'TLA': 9, 'PDC': 106, 'IND': 32, 'BLASME': 1, 'CPHA': 32, 'OKP': 33, 'CFXA': 24, 'OCH': 6, 'SRT': 9, 'LEN': 44, 'JOHN': 20, 'MIR': 20, 'IMIS': 1, 'CEPS': 8, 'FEZ': 7, 'R39': 1, 'TUS': 1, 'ACC': 37, 'SLB': 5, 'BLAF': 5, 'PER': 11, 'ROB': 11, 'IMI': 9, 'SFB': 4, 'CFE': 3, 'MECR': 44, 'AIM': 9, 'BCL': 332, 'BLAZ': 66, 'SMB': 2, 'MECC': 13, 'GOB': 17, 'CGB': 13, 'LCR': 1, 'MECI': 2, 'RCP': 4, 'BLAA': 14, 'BEL': 3, 'CAU': 10, 'EBR': 4, 'AER': 3, 'SIM': 1, 'SED': 28, 'MECA': 237, 'BCII': 295, 'BLA1': 209, 'BLA2': 26, 'CCRA': 2, 'THINB': 2, 'EXO': 12, 'LAT': 1, 'MUS': 3, 'MECB': 4, 'L1': 89, 'CEPA': 14, 'CEPH': 4, 'AQU': 21, 'DIM': 1, 'KHM': 3, 'BLAR': 13, 'NMCR': 3, 'NMCA': 2, 'AMPC': 294, 'BJP': 65, 'VCC': 1, 'PEDO': 8, 'CPS': 3, 'ESP': 10,

## (2) Constructing the training dataset required for model training


#### Step 1

In [6]:
file_path = "Data/ARGs_GroupLevel"
csv_file_path = 'Data/embARG-Full-V1.0-2021.7.csv'
fasta_file_path = 'Data/embARG-Full-V1.0-2021.7.fasta'
outfile = "Data/ARGs_GroupLevel/all_train"
if not os.path.exists(outfile):
    os.makedirs(outfile)

# Sequence ID (column 3) -> CSV columns 7, 9 and 12 respectively.
Id_to_class = {}
Id_to_class_little = {}
Id_to_class_little_2 = {}
with open(csv_file_path, "r") as file:
    for i, line in enumerate(file):
        if i == 0:
            continue  # the first line is skipped (presumably the header)
        content = line.strip().split(",")
        if content[7] == 'Elfamycins':
            continue  # rows of this class are left out entirely
        seq_id = content[3]
        Id_to_class[seq_id] = content[7]
        Id_to_class_little[seq_id] = content[9]
        Id_to_class_little_2[seq_id] = content[12]

# FASTA record ID -> sequence object as returned by SeqIO.
Id_to_seq = {
    seq_record.id: seq_record.seq
    for seq_record in SeqIO.parse(fasta_file_path, "fasta")
}

##########################################################
# Write one "<group>.txt" file per group found in train.txt; each line is
# "<id>\t<class>\t<group>". The lines are collected in memory first and each
# file is then written once in "w" mode, so re-running this cell rebuilds the
# files instead of appending duplicate lines to them.
filename = 0  # not used in this cell; kept so the notebook's global names are unchanged
training_category = {}  # class -> {group: number of training sequences}
lines_by_group = {}     # group -> lines destined for "<group>.txt"
with open(file_path + "/" + "train.txt", "r") as read:
    for raw_line in read:
        seq_id = raw_line.strip()
        if not seq_id:
            continue  # tolerate blank lines in train.txt
        big_category = Id_to_class[seq_id]
        Small_category = Id_to_class_little[seq_id]

        lines_by_group.setdefault(Small_category, []).append(
            seq_id + "\t" + big_category + "\t" + Small_category + "\n")

        group_counts = training_category.setdefault(big_category, {})
        group_counts[Small_category] = group_counts.get(Small_category, 0) + 1

for Small_category, group_lines in lines_by_group.items():
    with open(outfile + "/" + str(Small_category) + ".txt", "w") as write_category:
        write_category.writelines(group_lines)

print(training_category)

{}
{'Betalactams': {'CBLA': 18, 'SHV': 230, 'CTX-M': 186, 'NDM': 53, 'ACT': 59, 'CARB': 60, 'TEM': 287, 'LRA': 22, 'KPC': 54, 'OXA': 627, 'IMP': 57, 'CMY': 184, 'VEB': 10, 'DHA': 24, 'OXY': 25, 'MOX': 13, 'GES': 28, 'VIM': 49, 'FOX': 23, 'PEP': 2, 'GIM': 2, 'TLA': 7, 'PDC': 106, 'IND': 26, 'BLASME': 1, 'CPHA': 22, 'OKP': 33, 'CFXA': 17, 'OCH': 6, 'SRT': 7, 'LEN': 44, 'JOHN': 12, 'MIR': 20, 'IMIS': 1, 'CEPS': 6, 'FEZ': 5, 'R39': 1, 'TUS': 1, 'ACC': 25, 'SLB': 5, 'BLAF': 5, 'PER': 9, 'ROB': 11, 'IMI': 9, 'SFB': 4, 'CFE': 3, 'MECR': 27, 'AIM': 6, 'BCL': 200, 'BLAZ': 40, 'SMB': 2, 'MECC': 9, 'GOB': 17, 'CGB': 9, 'LCR': 1, 'MECI': 2, 'RCP': 4, 'BLAA': 9, 'BEL': 3, 'CAU': 6, 'EBR': 4, 'AER': 3, 'SIM': 1, 'SED': 18, 'MECA': 143, 'BCII': 177, 'BLA1': 125, 'BLA2': 16, 'CCRA': 2, 'THINB': 2, 'EXO': 8, 'LAT': 1, 'MUS': 3, 'MECB': 4, 'L1': 53, 'CEPA': 9, 'CEPH': 4, 'AQU': 14, 'DIM': 1, 'KHM': 3, 'BLAR': 9, 'NMCR': 3, 'NMCA': 2, 'AMPC': 178, 'BJP': 39, 'VCC': 1, 'PEDO': 6, 'CPS': 3, 'ESP': 6, 'MSI'

#### Step 2

In [None]:
import random
import linecache
from Bio import SeqIO

all = {'Betalactams': {'CBLA': 18, 'SHV': 230, 'CTX-M': 186, 'NDM': 53, 'ACT': 59, 'CARB': 60, 'TEM': 287, 'LRA': 22, 'KPC': 54, 'OXA': 627, 'IMP': 57, 'CMY': 184, 'VEB': 10, 'DHA': 24, 'OXY': 25, 'MOX': 13, 'GES': 28, 'VIM': 49, 'FOX': 23, 'PEP': 2, 'GIM': 2, 'TLA': 7, 'PDC': 106, 'IND': 26, 'BLASME': 1, 'CPHA': 22, 'OKP': 33, 'CFXA': 17, 'OCH': 6, 'SRT': 7, 'LEN': 44, 'JOHN': 12, 'MIR': 20, 'IMIS': 1, 'CEPS': 6, 'FEZ': 5, 'R39': 1, 'TUS': 1, 'ACC': 25, 'SLB': 5, 'BLAF': 5, 'PER': 9, 'ROB': 11, 'IMI': 9, 'SFB': 4, 'CFE': 3, 'MECR': 27, 'AIM': 6, 'BCL': 200, 'BLAZ': 40, 'SMB': 2, 'MECC': 9, 'GOB': 17, 'CGB': 9, 'LCR': 1, 'MECI': 2, 'RCP': 4, 'BLAA': 9, 'BEL': 3, 'CAU': 6, 'EBR': 4, 'AER': 3, 'SIM': 1, 'SED': 18, 'MECA': 143, 'BCII': 177, 'BLA1': 125, 'BLA2': 16, 'CCRA': 2, 'THINB': 2, 'EXO': 8, 'LAT': 1, 'MUS': 3, 'MECB': 4, 'L1': 53, 'CEPA': 9, 'CEPH': 4, 'AQU': 14, 'DIM': 1, 'KHM': 3, 'BLAR': 9, 'NMCR': 3, 'NMCA': 2, 'AMPC': 178, 'BJP': 39, 'VCC': 1, 'PEDO': 6, 'CPS': 3, 'ESP': 6, 'MSI': 1, 'SPG': 1, 'AMPH': 377, 'ADC': 61, 'SPM': 2, 'FAR': 9, 'RM3': 10, 'FIM': 1, 'FONA': 7, 'FPH': 13, 'FRI': 9, 'STA': 1, 'FTU': 1, 'SFH': 2, 'ARL': 6, 'BRO': 2, 'HMB': 2, 'TMB': 2, 'OMP': 1, 'BAT': 6, 'BES': 1, 'MECD': 1, 'NPS': 1, 'VARG': 5, 'LAP': 4, 'BUT': 1, 'ACI': 4, 'RSA': 2, 'HERA': 7, 'TRU': 1, 'BLAB': 2, 'BIL': 2, 'CBP': 1, 'CGA': 3, 'BKC': 1, 'BPU': 79, 'CIA': 4, 'AST': 5, 'CKO': 3, 'CME': 18, 'OMPK': 56, 'SCO': 2, 'CMH': 4, 'DES': 2, 'CAM': 1, 'PNGM': 2, 'ERP': 1, 'IDC': 4, 'VMB': 2, 'LMB': 4, 'FLC': 1, 'OMPA': 58, 'GPC': 1, 'BLAS': 4, 'PAC': 4}, 'Trimethoprim': {'DFRF': 5, 'DFRA': 72, 'DFRG': 12, 'DFRD': 3, 'DFRB': 12, 'DFRC': 53, 'DFRE': 3, 'DFRK': 2, 'DFRI': 1, 'DFR22': 1}, 'MLS': {'ERM': 153, 'LNUA': 3, 'MSRE': 3, 'MYRA': 4, 'CHRB': 1, 'LNUC': 4, 'TLRB': 1, 'MPH': 14, 'LNUB': 1, 'VAT': 59, 'ERMC': 9, 'ERMA': 155, 'ERMG': 4, 'EREA': 5, 'ERMO': 6, 'MEFB': 4, 'MSRA': 40, 'ERMV': 5, 'MPHA': 9, 'ERED': 2, 'OLED': 66, 'OLEB': 2, 'MEFE': 38, 'LNUF': 3, 'VGB': 2, 'MPHB': 
12, 'OLEI': 1, 'LING': 3, 'EREB': 2, 'LNUD': 2, 'TLRC': 6, 'MSRC': 6, 'MPHC': 6, 'GIMA': 164, 'LNUP': 1, 'MPHG': 1, 'LNUE': 1, 'MEFC': 1, 'OLEC': 362, 'RLMA': 136, 'MPHM': 28, 'LIN': 4, 'EMTA': 6, 'LLMA': 14, 'MPHI': 8, 'MGTA': 39, 'LNUG': 1, 'MPHE': 5, 'MPHH': 1, 'MPHK': 25, 'MPHN': 1, 'MPHO': 4, 'MPHJ': 4, 'CLCD': 1, 'CFR': 1, 'MSRF': 1, 'MSRH': 1, 'MEFD': 1, 'LSA': 5}, 'Fusidic acid': {'FUSB': 1, 'FUSH': 9, 'FUSD': 1, 'FUSC': 2, 'FUSF': 4}, 'Fosfomycin': {'FOSA': 104, 'FOSX': 12, 'FOMA': 2, 'FOSC': 2, 'FOSB': 27, 'FOSK': 4, 'FOMB': 3, 'FOSD': 5, 'MURA': 10, 'ABAF': 34}, 'Aminoglycosides': {'APH3-PRIME': 70, 'RMTH': 1, 'AAC3': 117, 'AAC6-PRIME': 250, 'ANT3-DPRIME': 220, 'APH2-DPRIME': 11, 'CPXA': 203, 'APH4': 6, 'NPMA': 2, 'APH9': 7, 'ANT6': 39, 'APH7-DPRIME': 1, 'AAC2-PRIME': 36, 'APH6': 68, 'RMTB': 4, 'ANT4-PRIME': 59, 'ANT2-DPRIME': 7, 'RMTG': 1, 'ANT9': 7, 'ARMA': 2, 'RMTC': 1, 'APH3-DPRIME': 46, 'RMTD': 8, 'RMTA': 1, 'SGM': 1, 'KDPE': 471, 'APMA': 1, 'CPAA': 3, 'RMTE': 2, 'CPXR': 458, 'KAMB': 1, 'RANB': 50, 'RANA': 16}, 'Fluoroquinolones': {'QNRB': 86, 'QNRS': 27, 'QNRA': 10, 'QNRVC': 8, 'QNRD': 5, 'MFPA': 39, 'QEPA': 3, 'QNRC': 6, 'QNR': 1, 'PATA': 206, 'PATB': 10, 'QNRE': 5, 'OEPA': 6, 'CRPP': 9, 'ABAQ': 26}, 'Multi-drug resistance': {'GOLS': 12, 'MDTP': 103, 'SME': 156, 'GADX': 27, 'MDTF': 137, 'CLBB': 8, 'SRMB': 7, 'ADEF': 133, 'MGRA': 18, 'VGA': 11, 'ARLS': 5, 'ADES': 197, 'SDIA': 12, 'LSAE': 2, 'SMED': 31, 'OPRN': 30, 'MEXX': 129, 'ADEC': 35, 'YKKD': 1, 'QACA': 6, 'MDTH': 313, 'MEPA': 12, 'MDTA': 374, 'MDSA': 15, 'OPMD': 9, 'MTRE': 6, 'ADER': 77, 'AMRB': 356, 'MEXA': 4, 'MACB': 5, 'ROSB': 33, 'ACRS': 36, 'EMRY': 34, 'ADEH': 248, 'QACB': 3, 'CLBA': 24, 'LMRB': 80, 'ABES': 4, 'MDTG': 999, 'EMRK': 437, 'EMEA': 9, 'ADEJ': 14, 'CFRA': 6, 'CMEB': 21, 'CMEC': 13, 'MEXC': 6, 'OPRJ': 24, 'MDTB': 577, 'TOLC': 263, 'MEPR': 8, 'ADEB': 323, 'CRP': 108, 'FEXA': 4, 'CARA': 4, 'LMRC': 2, 'MEXG': 4, 'EVGA': 9, 'ACRB': 561, 'MEXE': 16, 'LSAB': 111, 'NORB': 45, 'MDSC': 
12, 'CMEA': 4, 'MDTM': 758, 'ADEG': 2, 'CMER': 9, 'HNS': 55, 'CEOB': 445, 'BMR': 13, 'ROSA': 22, 'LSAA': 5, 'ADEA': 114, 'MDTC': 1334, 'EVGS': 99, 'EMRR': 272, 'BAER': 290, 'MTRC': 65, 'MEXH': 19, 'ABEM': 60, 'MDSB': 72, 'ARLR': 34, 'ACRD': 527, 'ACRF': 1377, 'MDTN': 50, 'ACRE': 267, 'ADEI': 39, 'AMRA': 5, 'NORA': 45, 'MTRA': 90, 'OPRA': 6, 'MEXD': 27, 'LMRD': 1, 'CLBC': 1, 'mexI': 100, 'ADEK': 128, 'RAMA': 36, 'MACA': 6, 'CEOA': 5, 'TAP': 97, 'LSAC': 33, 'MDTK': 1877, 'EMRA': 33, 'MEXY': 15, 'YKKC': 4, 'EMRB': 285, 'ADEN': 38, 'MDTE': 16, 'MARA': 126, 'MTRD': 68, 'MEXB': 206, 'MDTO': 24, 'OPCM': 59, 'PMRA': 26, 'OPMH': 27, 'MEXP': 5, 'MEXQ': 152, 'MEXJ': 3, 'MEXK': 707, 'MEXV': 4, 'MEXW': 639, 'OPME': 25, 'MEXM': 21, 'MEXN': 30, 'MEXL': 6, 'SALA': 2, 'OPTRA': 4, 'GADW': 12, 'CDEA': 3, 'CFRB': 3, 'OQXA': 26, 'OQXB': 461, 'ABCA': 16, 'CIPA': 12, 'EFRA': 12, 'EFRB': 1, 'MSBA': 223, 'HMRM': 6, 'EFMA': 5, 'EFPA': 15, 'FARA': 24, 'FARB': 24, 'HP1181': 6, 'IRFA': 41, 'IMRP': 6, 'TAEA': 4, 'TVAA': 4, 'MUXA': 10, 'OPMB': 4, 'ADEL': 89, 'EMRE': 17, 'ACRA': 76, 'BCR': 46, 'ARMR': 1, 'MEXF': 836, 'OPRM': 6, 'MUXB': 39, 'MUXC': 16, 'PMPM': 18, 'LPEA': 4, 'LPEB': 6, 'SOXR': 15, 'OPRZ': 3, 'AXYX': 5, 'AXYY': 3, 'CFRC': 4, 'MDFA': 500, 'POXT': 1, 'VMLR': 41, 'AMVA': 12, 'KPNE': 143, 'KPNF': 21, 'KPN': 204, 'QACE': 25, 'YAJC': 4, 'CRCB': 2, 'LPTD': 74, 'PARS': 6, 'PARR': 5, 'QACH': 4, 'QACL': 12, 'LMRS': 16, 'RSMA': 965}, 'Glycopeptides': {'VANY': 23, 'VANXY': 13, 'VANR': 841, 'VANN': 1, 'VANT': 7, 'VANL': 16, 'VANS': 134, 'VANC': 5, 'VANH': 12, 'VANV': 1, 'VAND': 1, 'VANU': 5, 'VANZ': 16, 'VANW': 9, 'VANB': 5, 'VANG': 2, 'VANX': 121, 'VANE': 1, 'VANO': 2, 'VANF': 3, 'VANM': 5, 'VANA': 9, 'VANJ': 19, 'BRP': 3, 'BLM': 5}, 'Phenicol': {'CAT': 199, 'CMLR': 48, 'CMLA': 28, 'CMRA': 6, 'CMLB': 10, 'CMLV': 176, 'CMX': 12, 'FLOR': 21, 'PEXA': 1}, 'Rifampin': {'ARR': 63, 'IRI': 30, 'RPHA': 8, 'RBPA': 64, 'RPHB': 17, 'RPOB': 1378}, 'Tetracyclines': {'TET': 217, 'TETA': 314, 'TETW': 98, 
'TETB': 180, 'TETY': 4, 'TCR': 1, 'TETX': 5, 'OTRB': 5, 'TETC': 15, 'TETG': 9, 'TETH': 12, 'TETJ': 4, 'TETV': 42, 'TET44': 4, 'TETQ': 35, 'OTRA': 21, 'TETD': 3, 'TETK': 5, 'TETS': 7, 'TETM': 77, 'TETZ': 4, 'OTRC': 1, 'TETO': 15, 'TETT': 31, 'TETU': 1, 'TETL': 122, 'TXR': 27}, 'Peptide': {'TSNR': 1, 'EPTA': 178, 'UGD': 56, 'VPH': 3, 'ARNA': 75, 'ARNC': 518, 'MPRF': 22, 'MCR': 109, 'YOJI': 122, 'EDEQ': 1, 'BASS': 3, 'ALMG': 2, 'ICR': 4, 'PGPB': 9, 'EPTB': 752, 'LPSB': 83, 'LPSA': 27, 'ARNT': 21, 'CPRR': 3, 'CPRS': 3}, 'Bacitracin': {'BCRC': 1, 'BCRA': 13, 'BCRB': 9, 'BACA': 344, 'BAHA': 6}, 'Sulfonamide': {'SUL3': 3, 'SUL2': 24, 'SUL1': 52, 'SUL4': 1}, 'Nucleosides': {'SAT': 32, 'TMRB': 5, 'SATA': 120}, 'Aminocoumarins': {'NOVA': 713}, 'Triclosan': {'TRIA': 4, 'TRIB': 2, 'TRIC': 27}, 'Mupirocin': {'ILES': 341, 'MUPB': 1, 'MUPA': 5}}
csv_file_path = 'Data/embARG-Full-V1.0-2021.7.csv'
fasta_file_path = 'Data/embARG-Full-V1.0-2021.7.fasta'
file = "Data/ARGs_GroupLevel"

# Flat list of every group name, in the order the groups appear in `all`.
little_all = [little for item in all for little in all[item]]
print(little_all)

# Group name -> name of the class that contains it.
little_to_big = {little: item for item in all for little in all[item]}

# Sequence ID -> sequence string, cut to at most 1022 characters.
# The sequences live in the FASTA file; the CSV only holds the annotations,
# so it must not be handed to the FASTA parser.
id_to_seq = {}
for seq_record in SeqIO.parse(fasta_file_path, "fasta"):
    id_to_seq[seq_record.id] = str(seq_record.seq)[:1022]

def _read_group_lines(group_name, cache):
    """Return the lines of ``<file>/all_train/<group_name>.txt``.

    Each file is read only once and kept in ``cache`` (group name -> list of
    lines); the handle is closed as soon as the file has been read.
    """
    if group_name not in cache:
        # Plain "r": the legacy "rU" mode is rejected by Python 3.11+.
        with open(file + "/all_train/" + group_name + ".txt", "r") as handle:
            cache[group_name] = handle.readlines()
    return cache[group_name]


# Write 800 lines per group to all_train_data.txt. Each line holds three
# tab-separated sequences: two different sequences of the current group and
# one sequence drawn from another group.
# The order of the random draws is kept fixed so that, with the seed below,
# the same input files always produce the same output file.
num_all = 0
random.seed(2021)
group_lines_cache = {}
with open(file + "/all_train_data.txt", "w") as write:
    for filename in little_all:
        anchor_lines = _read_group_lines(filename, group_lines_cache)
        count = len(anchor_lines)  # The number of lines in the file.
        # Two different lines are needed below, so skip groups that do not
        # contain at least two sequences.
        if count < 2:
            continue

        same_class_groups = list(all[little_to_big[filename]].keys())
        origin = same_class_groups.index(filename)
        other_class_groups = [x for x in little_all if x not in same_class_groups]

        # The class category to which "NOVA" belongs contains only this specific
        # group category, and the same applies to the class category of "FACT".
        # Hence, it is not possible to select other group categories within the
        # same class category. The length check keeps the redraw loop below from
        # spinning forever for any other single-group class.
        can_sample_same_class = (
            filename not in ("NOVA", "FACT") and len(same_class_groups) > 1
        )

        for i in range(800):
            num_1 = random.randrange(0, count, 1)  # Select a line at random.
            num_2 = random.randrange(0, count, 1)  # Select a line at random.
            # Ensure that the two selected lines are different.
            while num_2 == num_1:
                num_2 = random.randrange(0, count, 1)

            # Selecting negative sequences
            if can_sample_same_class and i < 400:
                # Sample sequences from other group categories within the same class.
                random_select = random.randrange(0, len(same_class_groups), 1)
                while random_select == origin:
                    random_select = random.randrange(0, len(same_class_groups), 1)
                negative_group = same_class_groups[random_select]
            else:
                # Sample sequences from other group categories within different classes.
                random_select = random.randrange(0, len(other_class_groups), 1)
                negative_group = other_class_groups[random_select]

            negative_lines = _read_group_lines(negative_group, group_lines_cache)
            num_3 = random.randrange(0, len(negative_lines), 1)  # Select a line at random.

            # Every line is "<id>\t<class>\t<group>"; only the ID is needed here.
            id_category_1, _, _ = anchor_lines[num_1].strip().split("\t")
            id_category_2, _, _ = anchor_lines[num_2].strip().split("\t")
            id_category_3, _, _ = negative_lines[num_3].strip().split("\t")

            write.write(id_to_seq[id_category_1] + "\t" + id_to_seq[id_category_2] + "\t" + id_to_seq[id_category_3] + "\n")
        print(filename)
        num_all += 1
print(num_all)

#### Step 3

In [None]:
csv_file_path = 'Data/embARG-Full-V1.0-2021.7.csv'
fasta_file_path = 'Data/embARG-Full-V1.0-2021.7.fasta'
input_file = 'Data/ARGs_GroupLevel'

# FASTA record ID -> sequence string, cut to at most 1022 characters.
id_to_seq = {}
for seq_record in SeqIO.parse(fasta_file_path, "fasta"):
    id_to_seq[seq_record.id] = str(seq_record.seq)[:1022]

# Sequence ID (column 3) -> CSV columns 7, 9 and 12 respectively;
# list_id keeps the IDs in file order.
Id_to_class = {}
Id_to_class_little = {}
Id_to_class_little_2 = {}
list_id = []
with open(csv_file_path, "r") as file:
    for i, line in enumerate(file):
        if i == 0:
            continue  # the first line is skipped (presumably the header)
        content = line.strip().split(",")
        if content[7] == 'Elfamycins':
            continue  # rows of this class are left out entirely
        seq_id = content[3]
        list_id.append(seq_id)
        Id_to_class[seq_id] = content[7]
        Id_to_class_little[seq_id] = content[9]
        Id_to_class_little_2[seq_id] = content[12]

def _write_annotated_split(id_list_name, output_name):
    """Expand a file of sequence IDs into a tab-separated table.

    Reads one ID per line from ``input_file/id_list_name`` and writes
    "<id>\\t<class>\\t<group>\\t<CSV column 12>\\t<sequence>" lines to
    ``input_file/output_name``.
    """
    with open(input_file + "/" + output_name, "w") as write:
        with open(input_file + "/" + id_list_name, "r") as read:
            for item in read:
                seq_id = item.strip()
                fields = (
                    seq_id,
                    Id_to_class[seq_id],
                    Id_to_class_little[seq_id],
                    Id_to_class_little_2[seq_id],
                    id_to_seq[seq_id],
                )
                write.write("\t".join(fields) + "\n")


_write_annotated_split("validation.txt", "validation_last.txt")
_write_annotated_split("test.txt", "test_last.txt")

# For every ID collected in list_id, write its annotations to Data/kernel.txt
# and its sequence to kernel_sequence.txt.
# NOTE(review): "Data/kernel.txt" is also listed as a required input of this
# tutorial and is overwritten here - confirm that this is intended.
with open(input_file + "/" + "kernel_sequence.txt", "w") as write1, open("Data/kernel.txt", "w") as write:
    for item in list_id:
        write.write("\t".join((item, Id_to_class[item], Id_to_class_little[item], Id_to_class_little_2[item])) + "\n")
        write1.write(item + "\t" + id_to_seq[item] + "\n")

👏 You can use the training, validation, and test sets to reproduce the Group Level ARGTyper results presented in our paper.
We also provide a direct download link to these datasets: https://drive.google.com/drive/folders/1QZHu0lY1-l_qdL9xu7BVZMtaEzVnydwO?usp=drive_link