# Processing MAGs for Model Usage

In [1]:
import pandas as pd
import os
import requests
import re
import sys

# Make the repository root importable so the later `modules.utils` import resolves.
sys.path.append("..")

# Root folder holding all input/output data. Set the MAGS_DATA_DIR environment
# variable to run the notebook on another machine; the default is the original
# author's local path, so existing behaviour is unchanged.
DATA_DIR = os.environ.get("MAGS_DATA_DIR", "C:/Users/jakel/Desktop/code/data/")

## Parsing EC-annotated .gff files

In [2]:
import modules.utils as utils

# Parse every DRAM .gff file in the directory into one table of annotations.
directory = os.path.join(DATA_DIR, "MAGs", "FENIX21", "DRAM", "all") #can also separate sediments / fluids
bins = utils.process_directory(directory) # Parse .gff files for ec numbers

bins.to_csv(os.path.join(DATA_DIR, 'MAGs', 'FENIX21', 'FENIX21-all.csv'), index=False)

# Keep only the rows that carry an EC number.
# - `na=True` treats a missing `ec` value like the "N/A" placeholder, so the
#   inverted mask never contains NaN (which would make `~` raise).
# - `regex=False` matches the placeholder literally.
# - The mask is no longer called `filter`, which shadowed the Python builtin.
missing_ec = bins["ec"].str.contains("N/A", regex=False, na=True)
test_ecs = bins[~missing_ec].copy()

print(len(test_ecs), "annotations")
test_ecs.head()

Processing file: C:/Users/jakel/Desktop/code/data/MAGs\FENIX21\DRAM\all\KBase_derived_Bin.001.fastaBA_F_extracted_bins.AssemblySet_DRAM.gff
Processing file: C:/Users/jakel/Desktop/code/data/MAGs\FENIX21\DRAM\all\KBase_derived_Bin.001.fastaBA_S_extracted_bins.AssemblySet_DRAM.gff
Processing file: C:/Users/jakel/Desktop/code/data/MAGs\FENIX21\DRAM\all\KBase_derived_Bin.001.fastaCF_F_extracted_bins.AssemblySet_DRAM.gff
Processing file: C:/Users/jakel/Desktop/code/data/MAGs\FENIX21\DRAM\all\KBase_derived_Bin.001.fastaCG_S_extracted_bins.AssemblySet_DRAM.gff
Processing file: C:/Users/jakel/Desktop/code/data/MAGs\FENIX21\DRAM\all\KBase_derived_Bin.001.fastaFE_F_extracted_bins.AssemblySet_DRAM.gff
Processing file: C:/Users/jakel/Desktop/code/data/MAGs\FENIX21\DRAM\all\KBase_derived_Bin.001.fastaGA_F_extracted_bins.AssemblySet_DRAM.gff
Processing file: C:/Users/jakel/Desktop/code/data/MAGs\FENIX21\DRAM\all\KBase_derived_Bin.001.fastaLS_F_extracted_bins.AssemblySet_DRAM.gff
Processing file: C:/

Unnamed: 0,filename,seqname,source,feature,ID,product,ec,ko
3,KBase_derived_Bin.001.fastaBA_F_extracted_bins...,NODE_1052_length_11999_cov_4.021266,KBase,gene,NODE_1052_length_11999_cov_4.021266_4,release factor glutamine methyltransferase %5B...,2.1.1.297,KO:K02493
5,KBase_derived_Bin.001.fastaBA_F_extracted_bins...,NODE_1052_length_11999_cov_4.021266,KBase,gene,NODE_1052_length_11999_cov_4.021266_6,histidyl-tRNA synthetase %5BEC:6.1.1.21%5D,6.1.1.21,KO:K01892
13,KBase_derived_Bin.001.fastaBA_F_extracted_bins...,NODE_1109_length_11407_cov_3.999383,KBase,gene,NODE_1109_length_11407_cov_3.999383_3,type III pantothenate kinase %5BEC:2.7.1.33%5D,2.7.1.33,KO:K03525
29,KBase_derived_Bin.001.fastaBA_F_extracted_bins...,NODE_1114_length_11353_cov_4.046645,KBase,gene,NODE_1114_length_11353_cov_4.046645_8,tRNA-uridine 2-sulfurtransferase %5BEC:2.8.1.1...,2.8.1.13,KO:K00566
36,KBase_derived_Bin.001.fastaBA_F_extracted_bins...,NODE_1141_length_11155_cov_4.291892,KBase,gene,NODE_1141_length_11155_cov_4.291892_2,branched-chain amino acid aminotransferase %5B...,2.6.1.42,KO:K00826


In [3]:
# Load dataframes
train = pd.read_csv(os.path.join(DATA_DIR, "taxa2ec-final.csv"), low_memory=False)
test = pd.read_csv(os.path.join(DATA_DIR, 'MAGs', 'FENIX21', 'FENIX21-all.csv'), low_memory=False)

# Drop genes without an EC annotation. read_csv already parses placeholders
# such as "N/A" and empty cells as real missing values, so dropna() removes
# them directly. The previous approach cast to str first (turning dropna into
# a no-op) and then substring-matched "nan"/"NaN".
# Building the frame in one chain also avoids assigning columns on a filtered
# slice (SettingWithCopyWarning).
test = (
    test.dropna(subset=["ec"])
    .assign(
        ec=lambda frame: frame["ec"].astype(str),
        # Start formatting to merge with taxa2ec-final
        media_id="unknown",
        species="unknown",  ### can update with taxonomy predictions from the KBase workspace
    )
    .rename(columns={"filename": "taxon_id", "ID": "node"})
)

# Subset and Concat
test = test[["media_id","species","taxon_id", "source", "ec"]]
frames = train, test
ml_input = pd.concat(frames)

ml_input.to_csv(os.path.join(DATA_DIR, "model", "ml_input.csv"), index=False)
ml_input

Unnamed: 0,media_id,species,taxon_id,source,ec
0,1a,Comamonas testosteroni,1886637.0,uniprot,2.6.1.1
1,1a,Comamonas testosteroni,1886637.0,uniprot,4.1.1.12
2,1a,Comamonas testosteroni,1886637.0,uniprot,1.13.11.74
3,1a,Comamonas testosteroni,1886637.0,uniprot,1.13.11.76
4,1a,Comamonas testosteroni,1886637.0,uniprot,1.14.13.23
...,...,...,...,...,...
590610,unknown,unknown,KBase_derived_Bin.039.fastaLS_S_extracted_bins...,KBase,2.7.1.107
590612,unknown,unknown,KBase_derived_Bin.039.fastaLS_S_extracted_bins...,KBase,1.7.2.5
590618,unknown,unknown,KBase_derived_Bin.039.fastaLS_S_extracted_bins...,KBase,3.2.1.81
590619,unknown,unknown,KBase_derived_Bin.039.fastaLS_S_extracted_bins...,KBase,3.1.21.7


## Converting KO-annotated MAGs to EC numbers

In [None]:
# Load ko-annotated MAG (LCHF)
mag = pd.read_csv(os.path.join(DATA_DIR, "Thermodesulfo_MAG.csv"))
# NOTE(review): dropna() discards a row if *any* column is missing, not only
# "KO" — confirm this is intended.
mag = mag.dropna()
# Query each KO only once: results are stored per KO, so repeated identifiers
# would just trigger redundant KEGG requests. First-seen order is preserved.
ko_list = mag["KO"].drop_duplicates().to_list()

def fetch_and_extract_ec_numbers(ko, timeout=30):
    """Fetch a KEGG orthology entry and return the EC numbers it mentions.

    Parameters
    ----------
    ko : str
        KO identifier, inserted verbatim into the KEGG REST URL.
    timeout : float, optional
        Seconds to wait for the KEGG server before giving up (default 30).
        Without a timeout, one stalled connection would hang the whole run.

    Returns
    -------
    list of str
        Unique fully specified EC numbers (four dot-separated integers) in
        order of first appearance. Empty if the request fails or the server
        does not answer with HTTP 200.
    """
    url = f'https://rest.kegg.jp/get/ko:{ko}'
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Network problems are reported like any other failed lookup, so one
        # bad request does not abort the whole batch.
        print(f"Failed to retrieve data for KO: {ko} ({exc})")
        return []

    if response.status_code != 200:
        print(f"Failed to retrieve data for KO: {ko}")
        return []

    # The pattern scans the whole entry text, so an EC number that appears in
    # more than one place is matched repeatedly. dict.fromkeys() drops the
    # repeats while keeping first-seen order.
    matches = re.findall(r'\b\d+\.\d+\.\d+\.\d+\b', response.text)
    return list(dict.fromkeys(matches))

# Look up the EC numbers for every KO in the MAG (one KEGG request per KO).
all_ec_numbers = {ko: fetch_and_extract_ec_numbers(ko) for ko in ko_list}

# Flatten the KO -> [EC, ...] mapping into one record per (KO, EC) pair.
data_for_df = [
    {'KO': ko, 'EC Number': ec}
    for ko, ec_numbers in all_ec_numbers.items()
    for ec in ec_numbers
]
# Explicit columns keep the schema intact when no EC number was found at all;
# otherwise the empty frame has no "ec" column for the later merge step.
df = pd.DataFrame(data_for_df, columns=['KO', 'EC Number'])

# Attach the constant metadata describing this MAG so the table matches the
# layout used for the training data.
cp = df.rename(columns={"EC Number": "ec"}).assign(
    taxon_id="LC_bin_0",
    media_id="unknown",
    species="Thermodesulfovibrionales",
    source="MAG",
)

# NOTE(review): written with the default index column, unlike the other CSV
# exports in this notebook — confirm whether downstream readers rely on it.
cp.to_csv(os.path.join(DATA_DIR, "MAGs", "test.csv"))


# Combine the MAG-derived EC table with the reference taxa table.
merge_columns = ["media_id", "species", "taxon_id", "source", "ec"]
taxa_path = os.path.join(DATA_DIR, "taxa2ec-final.csv")
mag_ec_path = os.path.join(DATA_DIR, "MAGs", "test.csv")

taxa = pd.read_csv(taxa_path, low_memory=False)
cp = pd.read_csv(mag_ec_path)

# MAG rows come first, followed by the reference rows.
test = cp[merge_columns]
data = [test, taxa]
model = pd.concat(data)

#model.to_csv(os.path.join(DATA_DIR, "test_model.csv"), index=False)
model.head()