In [1]:
import numpy as np
from pathlib import Path
import pandas as pd

In [2]:
ls

 Volume in drive C is OS
 Volume Serial Number is 7CF0-0838

 Directory of C:\Users\guppa\Dropbox (Partners HealthCare)\research_bwh\dynamical_rules\RuleBasedDynamics\dynrules\CDIFF_DATA

07/19/2023  06:18 PM    <DIR>          .
07/19/2023  01:43 PM    <DIR>          ..
07/19/2023  04:52 PM    <DIR>          .ipynb_checkpoints
02/14/2023  09:41 PM             3,487 biomass.txt
02/14/2023  09:41 PM            11,563 counts.txt
07/19/2023  06:18 PM            38,802 create_standard_format_cdiff_data.ipynb
02/14/2023  09:41 PM             5,085 crossvalidation_parameters.cfg
02/14/2023  09:55 PM             1,863 meta.tsv
02/14/2023  09:41 PM             2,058 metadata.txt
02/14/2023  09:41 PM             4,952 parameters.cfg
02/14/2023  09:55 PM            44,128 process_make_data_files.ipynb
07/19/2023  06:07 PM    <DIR>          processed_data
               8 File(s)        111,938 bytes
               4 Dir(s)  19,926,556,672 bytes free


In [3]:
basepath = Path("./")

In [4]:
outpath = basepath / "processed_data"

In [5]:
outpath.mkdir(exist_ok=True, parents=True)

## metadata

In [6]:
meta = pd.read_csv("meta.tsv", sep="\t")

In [7]:
meta

Unnamed: 0.1,Unnamed: 0,sampleID,subjectID,time
0,0,1,1,0.75
1,1,2,1,1.00
2,2,3,1,2.00
3,3,4,1,3.00
4,4,5,1,4.00
...,...,...,...,...
125,125,126,5,42.00
126,126,127,5,45.00
127,127,128,5,49.00
128,128,129,5,52.00


In [8]:
meta = meta.loc[:,["sampleID", "subjectID", "time"]]

In [9]:
meta.to_csv(outpath / "meta.tsv", sep="\t", index=False)

## count data

In [10]:
counts = pd.read_csv(basepath / "counts.txt", sep="\t", index_col=0)

In [11]:
counts.head()

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,121,122,123,124,125,126,127,128,129,130
#OTU ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Collinsella-aerofaciens,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Clostridium-hiranonis,1483,1416,1038,84,101,72,46,55,89,102,...,0,0,0,0,0,0,0,0,0,0
Clostridium-difficile,0,0,0,0,0,1,0,0,0,0,...,399,1903,895,1720,1420,1089,882,816,537,503
Proteus-mirabilis,1330,1327,1092,726,1271,1602,1290,1375,670,1093,...,40,110,403,425,618,1118,1120,657,449,509
Clostridium-scindens,1065,460,199,98,209,343,195,234,120,255,...,23,73,53,68,79,92,88,122,68,72


In [12]:
idx = list(counts.index)

In [13]:
mapper = {}
for i, taxon in enumerate(idx):
    mapper[taxon] = f"Otu{i+1}"

In [14]:
mapper

{'Collinsella-aerofaciens': 'Otu1',
 'Clostridium-hiranonis': 'Otu2',
 'Clostridium-difficile': 'Otu3',
 'Proteus-mirabilis': 'Otu4',
 'Clostridium-scindens': 'Otu5',
 'Ruminococcus-obeum': 'Otu6',
 'Clostridium-ramosum': 'Otu7',
 'Bacteroides-ovatus': 'Otu8',
 'Akkermansia-muciniphila': 'Otu9',
 'Parabacteroides-distasonis': 'Otu10',
 'Bacteroides-fragilis': 'Otu11',
 'Bifidobacterium-longum': 'Otu12',
 'Bacteroides-vulgatus': 'Otu13',
 'Enterococcus-faecalis': 'Otu14',
 'Prevotella-melaninogenica': 'Otu15',
 'Klebsiella-oxytoca': 'Otu16',
 'Lactobacillus-fermentum': 'Otu17',
 'Lactobacillus-reuteri': 'Otu18',
 'Roseburia-hominis': 'Otu19',
 'Staphylococcus-epidermidis': 'Otu20',
 'Propionibacterium-acnes': 'Otu21',
 'Escherichia-coli': 'Otu22',
 'Streptococcus-mitis': 'Otu23'}

In [15]:
counts_otus = counts.rename(index=mapper)

In [16]:
counts_otus.to_csv(outpath / "reads.tsv", sep="\t")

## taxonomy

In [17]:
ranks = ["kingdom","phylum","class","order","family","genus","species"]

In [18]:
otus = [f"Otu{i+1}" for i in range(len(idx))]

In [19]:
unk = ["na" for i in range(len(idx))]

In [20]:
taxdata = {}

In [21]:
taxdata["name"] = otus
for key in ranks:
    taxdata[key] = unk
taxdata["species"] = idx


In [22]:
taxonomy = pd.DataFrame(taxdata)

In [23]:
taxonomy = taxonomy.set_index("name")

In [24]:
taxonomy.head()

Unnamed: 0_level_0,kingdom,phylum,class,order,family,genus,species
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Otu1,na,na,na,na,na,na,Collinsella-aerofaciens
Otu2,na,na,na,na,na,na,Clostridium-hiranonis
Otu3,na,na,na,na,na,na,Clostridium-difficile
Otu4,na,na,na,na,na,na,Proteus-mirabilis
Otu5,na,na,na,na,na,na,Clostridium-scindens


In [25]:
taxonomy.to_csv(outpath / "taxonomy.tsv", sep="\t")

## qpcr

In [26]:
biomass = pd.read_csv("biomass.txt", sep="\t")

In [27]:
biomass.head()

Unnamed: 0,mass1,mass2,mass3
0,4460000000.0,4750000000.0,6930000000.0
1,11600000000.0,4840000000.0,5380000000.0
2,5940000000.0,13200000000.0,8790000000.0
3,4760000000.0,5120000000.0,2270000000.0
4,7650000000.0,2690000000.0,2890000000.0


In [28]:
nsamples = biomass.shape[0]

In [29]:
sampleIDs = [f"{i+1}" for i in range(nsamples)]

In [30]:
bmap = {}
for i in range(3):
    bmap[f"mass{i+1}"] = f"measurement{i+1}"

In [31]:
bmap

{'mass1': 'measurement1', 'mass2': 'measurement2', 'mass3': 'measurement3'}

In [32]:
biomass = biomass.rename(columns=bmap)

In [33]:
biomass["sampleID"] = sampleIDs

In [34]:
biomass = biomass.set_index("sampleID")

In [36]:
biomass.to_csv(outpath / "qpcr.tsv", sep="\t")

## taxa metadata

In [37]:
taxonomy

Unnamed: 0_level_0,kingdom,phylum,class,order,family,genus,species
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Otu1,na,na,na,na,na,na,Collinsella-aerofaciens
Otu2,na,na,na,na,na,na,Clostridium-hiranonis
Otu3,na,na,na,na,na,na,Clostridium-difficile
Otu4,na,na,na,na,na,na,Proteus-mirabilis
Otu5,na,na,na,na,na,na,Clostridium-scindens
Otu6,na,na,na,na,na,na,Ruminococcus-obeum
Otu7,na,na,na,na,na,na,Clostridium-ramosum
Otu8,na,na,na,na,na,na,Bacteroides-ovatus
Otu9,na,na,na,na,na,na,Akkermansia-muciniphila
Otu10,na,na,na,na,na,na,Parabacteroides-distasonis


In [39]:
# tsv file with otu index, time and type columns
# intro time = 28.9 for c. diff

In [41]:
otus = list(taxonomy.index)

In [45]:
cdiff_otu = taxonomy.index[taxonomy.loc[:,"species"] == "Clostridium-difficile"][0]

In [46]:
times = [0 for _ in range(len(otus))]
types = ['bacteria' for _ in range(len(otus))]

In [47]:
data = {'name': otus, 'time': times, 'type': types}

In [48]:
taxmeta = pd.DataFrame(data)

In [51]:
taxmeta = taxmeta.set_index('name')

In [53]:
taxmeta.loc[cdiff_otu,'time'] = 28.9

In [54]:
taxmeta

Unnamed: 0_level_0,time,type
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Otu1,0.0,bacteria
Otu2,0.0,bacteria
Otu3,28.9,bacteria
Otu4,0.0,bacteria
Otu5,0.0,bacteria
Otu6,0.0,bacteria
Otu7,0.0,bacteria
Otu8,0.0,bacteria
Otu9,0.0,bacteria
Otu10,0.0,bacteria


In [55]:
taxmeta.to_csv(outpath / "taxa_meta.tsv", sep="\t")