In [1]:
import numpy as np
import torch
from mcspace.model import MCSPACE
from mcspace.trainer import train_model
from mcspace.data_utils import get_data, get_human_timeseries_dataset
from mcspace.utils import get_device, pickle_load, pickle_save,\
    get_summary_results, estimate_process_variance, MODEL_FILE, DATA_FILE, \
    get_mcspace_cooccur_prob
from pathlib import Path
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import mcspace.visualization as vis
from mcspace.dataset import DataSet
import pandas as pd

import ete3
from Bio import SeqIO, Phylo

In [2]:
ls

 Volume in drive C is OS
 Volume Serial Number is 5E7E-789E

 Directory of C:\Users\Gary2\Partners HealthCare Dropbox\Gurdip Uppal\research_bwh\MCSPACE_FINAL\MCSPACE\mcspace\data\human_experiments\gappa_taxonomy

08/21/2024  01:30 PM    <DIR>          .
08/21/2024  01:28 PM    <DIR>          ..
08/21/2024  01:30 PM    <DIR>          .ipynb_checkpoints
08/21/2024  01:29 PM    <DIR>          human_results_clean
08/21/2024  01:30 PM                72 process_gappa_taxonomy.ipynb
08/21/2024  01:29 PM                31 README.md
               2 File(s)            103 bytes
               4 Dir(s)  994,139,885,568 bytes free


## load gappa taxonomy

In [3]:
sattaxpath = Path("./human_results_clean/")

In [4]:
sativa = pd.read_csv(sattaxpath / "sativa.tsv", sep="\t", header=None)

In [5]:
sativa.head()

Unnamed: 0,0,1,2
0,OTU1720,Bacteria;Firmicutes;Clostridia;Clostridiales;O...,1;1;1;1;0.766642;0.766642;0.766642
1,OTU3529,Bacteria;Proteobacteria;Gammaproteobacteria;Th...,1;1;1;0.740465;0.740465;0.740465;0.485297
2,OTU3620,Bacteria;Bacteroidetes;Bacteroidia;Bacteroidal...,1;1;1;1;1;0.999934
3,OTU93,Bacteria;Bacteroidetes;Bacteroidia;Bacteroidal...,1;1;1;1;0.985272;0.985272
4,OTU3524,Bacteria;Firmicutes;Clostridia;Clostridiales;L...,1;1;1;1;1;1;0.999878


In [6]:
ranks = [
    'Kingdom',
    'Phylum',
    'Class',
    'Order',
    'Family',
    'Genus',
    'Species'
]

In [7]:
rconf = [f'{r}_conf' for r in ranks]
print(rconf)

['Kingdom_conf', 'Phylum_conf', 'Class_conf', 'Order_conf', 'Family_conf', 'Genus_conf', 'Species_conf']


In [8]:
ntaxa = sativa.shape[0]
print(ntaxa)

3855


In [9]:
# create dict of lists
dfdata = {}
for rkey in ranks:
    dfdata[rkey] = []
for ckey in rconf:
    dfdata[ckey] = []
dfdata['Otu'] = []

for idx in range(ntaxa):
    otuid = sativa.iloc[idx,0]
    taxstr = sativa.iloc[idx,1]
    confstr = sativa.iloc[idx,2]

    dfdata['Otu'].append(otuid)
    taxlist = taxstr.split(';')
    conflist = confstr.split(';')
    for i,rkey in enumerate(ranks):
        if i <= (len(taxlist)-1):
            taxlevel = taxlist[i]
            conflevel = conflist[i]
        else:
            taxlevel = 'na'
            conflevel = 0
        dfdata[rkey].append(taxlevel)
        ckey = rconf[i]
        dfdata[ckey].append(conflevel)

In [10]:
sativa_taxonomy = pd.DataFrame(dfdata)

In [11]:
sativa_taxonomy.loc[sativa_taxonomy['Otu']=='OTU3',:]

Unnamed: 0,Kingdom,Phylum,Class,Order,Family,Genus,Species,Kingdom_conf,Phylum_conf,Class_conf,Order_conf,Family_conf,Genus_conf,Species_conf,Otu
1273,Bacteria,Bacteroidetes,Bacteroidia,Bacteroidales,Bacteroidaceae,Bacteroides,Bacteroides dorei,1,1,1,1,1,1,0.497429,OTU3


In [12]:
def apply_taxonomy_threshold(taxonomy, threshold=0.5):
#     ranks= ['domain', 'phylum', 'class', 'order', 'family', 'genus']
    conf = rconf #['dconf', 'pconf', 'cconf', 'oconf', 'fconf', 'gconf']
    
    taxcopy = taxonomy.reset_index()
    ntaxa = taxcopy.shape[0]
    for i in range(ntaxa):
        for r,c in zip(ranks, conf):
#             print(taxcopy.loc[i,c])
#             print(type(taxcopy.loc[i,c]))
            if float(taxcopy.loc[i,c]) < threshold:
                taxcopy.loc[i,r] = 'na'
    
    ptaxa = taxcopy.set_index("Otu")
    ptaxa2 = ptaxa[ranks]
    
    mapper = {x:x.capitalize() for x in list(ptaxa2.columns)}
    ptaxa3 = ptaxa2.rename(columns=mapper)
    return ptaxa3

In [13]:
sativa_tax = apply_taxonomy_threshold(sativa_taxonomy)

In [14]:
sativa_tax.shape

(3855, 7)

In [15]:
sativa_tax.head()

Unnamed: 0_level_0,Kingdom,Phylum,Class,Order,Family,Genus,Species
Otu,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
OTU1720,Bacteria,Firmicutes,Clostridia,Clostridiales,Oscillospiraceae,Oscillibacter,Oscillibacter valericigenes
OTU3529,Bacteria,Proteobacteria,Gammaproteobacteria,Thiotrichales,Thiotrichaceae,Thiothrix,na
OTU3620,Bacteria,Bacteroidetes,Bacteroidia,Bacteroidales,Porphyromonadaceae,Parabacteroides,na
OTU93,Bacteria,Bacteroidetes,Bacteroidia,Bacteroidales,Bacteroidaceae,Bacteroides,na
OTU3524,Bacteria,Firmicutes,Clostridia,Clostridiales,Lachnospiraceae,Roseburia,Roseburia inulinivorans


In [16]:
ls

 Volume in drive C is OS
 Volume Serial Number is 5E7E-789E

 Directory of C:\Users\Gary2\Partners HealthCare Dropbox\Gurdip Uppal\research_bwh\MCSPACE_FINAL\MCSPACE\mcspace\data\human_experiments\gappa_taxonomy

08/21/2024  01:32 PM    <DIR>          .
08/21/2024  01:28 PM    <DIR>          ..
08/21/2024  01:30 PM    <DIR>          .ipynb_checkpoints
08/21/2024  01:29 PM    <DIR>          human_results_clean
08/21/2024  01:32 PM            11,139 process_gappa_taxonomy.ipynb
08/21/2024  01:29 PM                31 README.md
               2 File(s)         11,170 bytes
               4 Dir(s)  994,136,465,408 bytes free


In [17]:
sativa_tax.to_csv("sativa_taxonomy.csv")

In [18]:
ls

 Volume in drive C is OS
 Volume Serial Number is 5E7E-789E

 Directory of C:\Users\Gary2\Partners HealthCare Dropbox\Gurdip Uppal\research_bwh\MCSPACE_FINAL\MCSPACE\mcspace\data\human_experiments\gappa_taxonomy

08/21/2024  01:34 PM    <DIR>          .
08/21/2024  01:28 PM    <DIR>          ..
08/21/2024  01:30 PM    <DIR>          .ipynb_checkpoints
08/21/2024  01:29 PM    <DIR>          human_results_clean
08/21/2024  01:32 PM            11,139 process_gappa_taxonomy.ipynb
08/21/2024  01:29 PM                31 README.md
08/21/2024  01:34 PM           319,433 sativa_taxonomy.csv
               3 File(s)        330,603 bytes
               4 Dir(s)  994,136,334,336 bytes free
