In [1]:
#Module imports 
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import re 
import seaborn as sns 
import os 

In [2]:
##Preprocessing##
#Read all input files into DataFrames 
formatted_dir = "formatted_input"
bpm_path = "{0}/19isolates_BPM.csv".format(formatted_dir)
mcseed_path = "{0}/19isolates_mcseed_pathwaycomplete.csv".format(formatted_dir)
# dc_tpm_path = "{0}/dc2ndadd_merged_tpm_data.tsv".format(formatted_dir) #Deprecated dataset 
# rob_tpm_path = "{0}/rob3rd_merged_tpm_data.tsv".format(formatted_dir)
# dc_count_path = "{0}/dc2ndadd_merged_count_data.tsv".format(formatted_dir)
# rob_count_path = "{0}/rob3rd_merged_count_data.tsv".format(formatted_dir)
#Flexible whitespace parsing - tpm first column is tab separated then space separated 
# rob_tpm_df = pd.read_csv(rob_tpm_path,sep=r"\s+")
# dc_tpm_df = pd.read_csv(dc_tpm_path,sep=r"\s+")
# rob_count_df = pd.read_csv(rob_count_path,sep=r"\s+")
# dc_count_df = pd.read_csv(dc_count_path,sep=r"\s+")
count_4th_trial_path = "{0}/4thtrial_merged_count_data.tsv".format(formatted_dir)
tpm_4th_trial_path = "{0}/4thtrial_merged_tpm_data.tsv".format(formatted_dir)
count_4t_df = pd.read_csv(count_4th_trial_path,sep=r"\s+")
tpm_4t_df = pd.read_csv(tpm_4th_trial_path,sep=r"\s+")

bpm_df = pd.read_csv(bpm_path)
#Process GO annotations from semicolon separated string into list of entries for columns Functional Pathway and Phenotype
mcseed_df = pd.read_csv(mcseed_path)
for col in ["Functional pathway", "Phenotype"]:
    mcseed_df.loc[:,col] = mcseed_df.loc[:,col].str.split(";")

#Set indices: BPM -> Isolate name; mcseed -> Locus tag; TPM -> target_id (locus)
bpm_df.set_index("Isolate name",inplace=True)
mcseed_df.set_index("Locus tag",inplace=True)
# rob_tpm_df.set_index("target_id",inplace=True)
# dc_tpm_df.set_index("target_id",inplace=True)
# rob_count_df.set_index("target_id",inplace=True)
# dc_count_df.set_index("target_id",inplace=True)
count_4t_df.set_index("target_id",inplace=True)
tpm_4t_df.set_index("target_id",inplace=True)

bpm_df = bpm_df.transpose().drop(index="# functions")
STRAIN_ABBREVS = ["Bbr","Bca","Bli2D9","Blu","Rob","Dfo","Dlo","Eav","Eco","FprB","Lga4B6","Lru","Mmu","Pco","Pst",
                  "Rgn","Rto","Sga","Spa"]
STRAIN_TAGS = dict(zip(bpm_df.columns,STRAIN_ABBREVS))
SHORT_ABBREVS = [abbrev[:3] for abbrev in STRAIN_ABBREVS]
STRAIN_ABBREV_TO_FULL = dict(zip(SHORT_ABBREVS, bpm_df.columns))

for strain in STRAIN_TAGS:
    strain_abbrev = STRAIN_TAGS[strain]
    if not (strain == "Blautia obeum Bg7063_SSTS2015" and strain_abbrev == "Rob"): 
        assert(strain[0] == strain_abbrev[0])

#BPM Summary statistics 
print("BPM=1 pathways by strain")
print(bpm_df.sum())
print("Total BPM=1 pathways: {0}".format(bpm_df.sum().sum()))

BPM=1 pathways by strain
Isolate name
Bifidobacterium breve Bgsng463_m5_93            46
Bifidobacterium catenulatum Bgsng468_m22_84     44
Bifidobacterium longum infantis 40721_2D9_SN    50
Blautia luti Bg7063                             53
Blautia obeum Bg7063_SSTS2015                   53
Dorea formicigenerans Bg7063                    43
Dorea longicatena Bg7063                        45
Enterococcus_avium_Bang_SAM2_39_S1              57
Escherichia coli PS_131_S11                     77
Faecalibacterium prausnitzii Bg7063             43
Lactococcus garvieae Bang155_08_4B6_JG2017      32
Ligilactobacillus ruminis ATCC_25644            33
Mitsuokella multacida DSM_20544                 45
Prevotella copri PS_131_S11                     49
Prevotella stercorea DSM_18206                  29
Ruminococcus gnavus M8243_3A11_TMS_2014         63
Ruminococcus torques Bg7063                     50
Streptococcus gallolyticus PS_064_S07           42
Streptococcus pasteriuanus Bang_SAM2_39_S1  

In [3]:
#Assertion testing and visual inspection of DFs 
#assert(len(mcseed_df)==len(mcseed_df.index.unique())) #False - locus tags can have multiple entries in mcseed_df, corresponding
#to different subcomponents of same locus; must handle duplicate locus entries in tpm_df 
# assert(len(rob_tpm_df)==len(rob_tpm_df.index.unique()))
# assert(len(dc_tpm_df)==len(dc_tpm_df.index.unique()))
#Concatenate DC and Rob tpm DFs
# tpm_df = pd.concat([dc_tpm_df,rob_tpm_df])
#Concatenate defined community and Rob counts data, convert counts to int 
# count_df = pd.concat([dc_count_df,rob_count_df])
#Sanity checks on concatenated tpm and counts
# assert(len(tpm_df) == (len(rob_tpm_df) + len(dc_tpm_df)))
# assert(len(count_df) == (len(rob_count_df) + len(dc_count_df)))


tpm_df = tpm_4t_df.copy()
count_df = count_4t_df.copy()
# display(tpm_df)
# display(count_df)
#Remove .tpm and .est_counts tags from column names in tpm and count dfs 
tpm_df.columns = tpm_df.columns.str.extract('(.*)\.tpm',expand=False)
count_df.columns = count_df.columns.str.extract('(.*)\.est_counts',expand=False)


assert((tpm_df.columns==count_df.columns).all())
#Filter down to only 1C (Pre-weaning P.copri) and 2A (No P. copri)
samples_1C, samples_2B = [tpm_df.columns.str.contains(tag) for tag in ["1C_Pup","2B_Pup"]] #Boolean arrays 
samples_1C, samples_2B = tpm_df.columns[samples_1C], tpm_df.columns[samples_2B]#Filtered column lists
reordered_cols = pd.Index(list(samples_1C)+list(samples_2B))
print(reordered_cols)
tpm_df, count_df = tpm_df.loc[:,reordered_cols], count_df.loc[:,reordered_cols]
#
count_df_int = count_df.copy()
count_df_int.iloc[:,1:] = count_df_int.iloc[:,1:].astype(int)
#mcseed_df_duplicates = mcseed_df.index[mcseed_df.index.duplicated(keep=False)]

show_tables = True 
if show_tables:
    display(mcseed_df)
    display(tpm_df)
    display(bpm_df)
    
full_dir_path = "formatted_output/4th_trial/full"
if not os.path.exists(full_dir_path):
    os.makedirs(full_dir_path)
overwrite_files = True
if overwrite_files:
    tpm_df.to_csv("{0}/full_merged_tpm.csv".format(full_dir_path))
    count_df.to_csv("{0}/full_merged_count.csv".format(full_dir_path))
    count_df_int.to_csv("{0}/full_merged_count_int.csv".format(full_dir_path))

display(count_df)

Index(['Pup_1-cecal_contents_53_1C_Pup_1', 'Pup_1-ileal_contents_53_1C_Pup_1',
       'Pup_2-cecal_contents_53_1C_Pup_2', 'Pup_2-ileal_contents_53_1C_Pup_2',
       'Pup_3-cecal_contents_53_1C_Pup_3', 'Pup_3-ileal_contents_53_1C_Pup_3',
       'Pup_4-cecal_contents_53_1C_Pup_4', 'Pup_4-ileal_contents_53_1C_Pup_4',
       'Pup_5-cecal_contents_53_1C_Pup_5', 'Pup_5-ileal_contents_53_1C_Pup_5',
       'Pup_6-cecal_contents_53_1C_Pup_6', 'Pup_6-ileal_contents_53_1C_Pup_6',
       'Pup_7-cecal_contents_53_1C_Pup_7', 'Pup_7-ileal_contents_53_1C_Pup_7',
       'Pup_8-cecal_contents_53_1C_Pup_8', 'Pup_8-ileal_contents_53_1C_Pup_8',
       'Pup_1-cecal_contents_54_2B_Pup_1', 'Pup_1-ileal_contents_54_2B_Pup_1',
       'Pup_2-cecal_contents_54_2B_Pup_2', 'Pup_2-ileal_contents_54_2B_Pup_2',
       'Pup_3-cecal_contents_54_2B_Pup_3', 'Pup_3-ileal_contents_54_2B_Pup_3',
       'Pup_4-cecal_contents_54_2B_Pup_4', 'Pup_4-ileal_contents_54_2B_Pup_4',
       'Pup_5-cecal_contents_54_2B_Pup_5', 'Pup_5-il

Unnamed: 0_level_0,Isolate name,Protein name,Protein product,Functional category,Functional pathway,Phenotype
Locus tag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ANCJAENF_00011,Bifidobacterium breve Bgsng463_m5_93,MalE,"Maltose/maltodextrin ABC transporter, substrat...",Carbohydrate utilization,"[maltose utilization, maltooligosaccharides u...","[Mal, (Mal)n]"
ANCJAENF_00013,Bifidobacterium breve Bgsng463_m5_93,MalF,"Maltose/maltodextrin ABC transporter, permease...",Carbohydrate utilization,"[maltose utilization, maltooligosaccharides u...","[Mal, (Mal)n]"
ANCJAENF_00014,Bifidobacterium breve Bgsng463_m5_93,MalG,"Maltose/maltodextrin ABC transporter, permease...",Carbohydrate utilization,"[maltose utilization, maltooligosaccharides u...","[Mal, (Mal)n]"
ANCJAENF_00052,Bifidobacterium breve Bgsng463_m5_93,GalE,UDP-glucose 4-epimerase (EC 5.1.3.2),Carbohydrate utilization,"[galactose utilization, lactose utilization]","[Gal, Lac]"
ANCJAENF_00063,Bifidobacterium breve Bgsng463_m5_93,GalE,UDP-glucose 4-epimerase (EC 5.1.3.2),Carbohydrate utilization,"[galactose utilization, lactose utilization]","[Gal, Lac]"
...,...,...,...,...,...,...
LDOIJNDB_02225,Streptococcus pasteriuanus Bang_SAM2_39_S1,TreB_c,"PTS system, trehalose-specific IIC component (...",Carbohydrate utilization,[trehalose utilization],[Tre]
LDOIJNDB_02226,Streptococcus pasteriuanus Bang_SAM2_39_S1,TreB_b,"PTS system, trehalose-specific IIB component (...",Carbohydrate utilization,[trehalose utilization],[Tre]
LDOIJNDB_02245,Streptococcus pasteriuanus Bang_SAM2_39_S1,GalE,UDP-glucose 4-epimerase (EC 5.1.3.2),Carbohydrate utilization,"[galactose utilization, lactose utilization]","[Gal, Lac]"
LDOIJNDB_02252,Streptococcus pasteriuanus Bang_SAM2_39_S1,MetA,Homoserine O-succinyltransferase (EC 2.3.1.46),Amino acids,[methionine biosynthesis],[Met]


Unnamed: 0_level_0,Pup_1-cecal_contents_53_1C_Pup_1,Pup_1-ileal_contents_53_1C_Pup_1,Pup_2-cecal_contents_53_1C_Pup_2,Pup_2-ileal_contents_53_1C_Pup_2,Pup_3-cecal_contents_53_1C_Pup_3,Pup_3-ileal_contents_53_1C_Pup_3,Pup_4-cecal_contents_53_1C_Pup_4,Pup_4-ileal_contents_53_1C_Pup_4,Pup_5-cecal_contents_53_1C_Pup_5,Pup_5-ileal_contents_53_1C_Pup_5,...,Pup_3-cecal_contents_54_2B_Pup_3,Pup_3-ileal_contents_54_2B_Pup_3,Pup_4-cecal_contents_54_2B_Pup_4,Pup_4-ileal_contents_54_2B_Pup_4,Pup_5-cecal_contents_54_2B_Pup_5,Pup_5-ileal_contents_54_2B_Pup_5,Pup_6-cecal_contents_54_2B_Pup_6,Pup_6-ileal_contents_54_2B_Pup_6,Pup_7-cecal_contents_54_2B_Pup_7,Pup_7-ileal_contents_54_2B_Pup_7
target_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ANCJAENF_00001,678.600000,20.97070,167.432000,9.825210,715.706000,3.465060e+01,1534.45000,23.051100,1631.420000,50.70230,...,3.799670,1.530190,0.108305,0.067922,88.54310,630.0980,255.22700,0.244386,153.265000,0.881001
ANCJAENF_00002,6196.870000,124.49900,1705.060000,59.888400,9143.190000,1.574560e+02,13517.20000,126.393000,14086.200000,312.95700,...,36.250800,4.755110,0.971734,1.917570,742.91600,3082.2900,2084.49000,1.285410,1150.030000,4.975310
ANCJAENF_00003,678.600000,20.97070,167.432000,9.825210,715.706000,3.465060e+01,1534.45000,23.051100,1631.420000,50.70230,...,3.799670,1.530190,0.108305,0.067922,88.54310,630.0980,255.22700,0.244386,153.265000,0.881001
ANCJAENF_00004,6196.870000,124.49900,1705.060000,59.888400,9143.190000,1.574560e+02,13517.20000,126.393000,14086.200000,312.95700,...,36.250800,4.755110,0.971734,1.917570,742.91600,3082.2900,2084.49000,1.285410,1150.030000,4.975310
ANCJAENF_00005,6.251760,0.00000,0.554181,0.000000,3.599670,9.038230e-02,3.17250,0.000000,0.796541,0.00000,...,0.000000,0.000000,0.000000,0.000000,1.92123,0.0000,0.00000,0.000000,0.591864,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
LDOIJNDB_02259,0.000000,6.27988,0.000000,0.000000,0.000000,5.026050e-09,0.00000,22.167900,0.000000,2.41611,...,0.000000,12.563000,0.000000,0.202981,0.00000,0.0000,0.00000,0.000000,0.000000,8.712830
LDOIJNDB_02260,1.411490,0.00000,5.109380,0.000000,4.975030,4.114010e-01,5.10357,0.842406,2.567400,0.00000,...,0.611705,0.000000,2.475080,0.654158,15.89970,0.0000,5.58507,10.036100,3.149660,0.000000
LDOIJNDB_02261,63.744900,59.79710,184.225000,173.775000,109.157000,4.862450e+01,114.33300,138.692000,135.114000,172.26000,...,50.878800,76.087600,88.956900,42.067600,94.70170,2508.0500,881.98300,65.596400,168.805000,218.904000
LDOIJNDB_02262,0.946617,1.37405,3.146830,0.450998,0.750576,9.599220e-01,1.18046,0.142805,1.885920,1.10349,...,0.430976,0.636417,1.223350,0.650411,5.43815,65.8396,1.20440,3.550830,2.036340,0.063649


Isolate name,Bifidobacterium breve Bgsng463_m5_93,Bifidobacterium catenulatum Bgsng468_m22_84,Bifidobacterium longum infantis 40721_2D9_SN,Blautia luti Bg7063,Blautia obeum Bg7063_SSTS2015,Dorea formicigenerans Bg7063,Dorea longicatena Bg7063,Enterococcus_avium_Bang_SAM2_39_S1,Escherichia coli PS_131_S11,Faecalibacterium prausnitzii Bg7063,Lactococcus garvieae Bang155_08_4B6_JG2017,Ligilactobacillus ruminis ATCC_25644,Mitsuokella multacida DSM_20544,Prevotella copri PS_131_S11,Prevotella stercorea DSM_18206,Ruminococcus gnavus M8243_3A11_TMS_2014,Ruminococcus torques Bg7063,Streptococcus gallolyticus PS_064_S07,Streptococcus pasteriuanus Bang_SAM2_39_S1
Glc,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1
Gal,0,0,1,0,0,1,1,1,1,1,0,0,0,1,1,0,0,0,1
Fru,1,1,1,1,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1
Man,0,0,0,0,0,0,0,1,1,0,1,1,1,0,0,0,0,1,1
Tag,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Lys_d,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
Met_d,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
Pro_d,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
Thr_d,0,0,0,0,0,0,1,1,1,0,1,0,0,0,1,0,0,0,0


Unnamed: 0_level_0,Pup_1-cecal_contents_53_1C_Pup_1,Pup_1-ileal_contents_53_1C_Pup_1,Pup_2-cecal_contents_53_1C_Pup_2,Pup_2-ileal_contents_53_1C_Pup_2,Pup_3-cecal_contents_53_1C_Pup_3,Pup_3-ileal_contents_53_1C_Pup_3,Pup_4-cecal_contents_53_1C_Pup_4,Pup_4-ileal_contents_53_1C_Pup_4,Pup_5-cecal_contents_53_1C_Pup_5,Pup_5-ileal_contents_53_1C_Pup_5,...,Pup_3-cecal_contents_54_2B_Pup_3,Pup_3-ileal_contents_54_2B_Pup_3,Pup_4-cecal_contents_54_2B_Pup_4,Pup_4-ileal_contents_54_2B_Pup_4,Pup_5-cecal_contents_54_2B_Pup_5,Pup_5-ileal_contents_54_2B_Pup_5,Pup_6-cecal_contents_54_2B_Pup_6,Pup_6-ileal_contents_54_2B_Pup_6,Pup_7-cecal_contents_54_2B_Pup_7,Pup_7-ileal_contents_54_2B_Pup_7
target_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ANCJAENF_00001,31592.600000,2852.4500,9574.71000,1148.4300,28120.200000,6.356800e+03,6.063660e+04,1768.360000,6.397330e+04,3300.96000,...,427.762000,202.3200,10.47990,14.633800,10404.10000,1370.69,20874.30000,51.33900,20041.30000,121.7910
ANCJAENF_00002,614048.000000,36221.9000,207898.00000,14958.8000,766224.000000,6.184710e+04,1.138520e+06,20773.500000,1.178570e+06,43596.60000,...,8698.240000,1344.3600,200.07000,884.377000,185254.00000,14274.50,363192.00000,577.35000,319494.00000,1471.7700
ANCJAENF_00003,31592.600000,2852.4500,9574.71000,1148.4300,28120.200000,6.356800e+03,6.063660e+04,1768.360000,6.397330e+04,3300.96000,...,427.762000,202.3200,10.47990,14.633800,10404.10000,1370.69,20874.30000,51.33900,20041.30000,121.7910
ANCJAENF_00004,614048.000000,36221.9000,207898.00000,14958.8000,766224.000000,6.184710e+04,1.138520e+06,20773.500000,1.178570e+06,43596.60000,...,8698.240000,1344.3600,200.07000,884.377000,185254.00000,14274.50,363192.00000,577.35000,319494.00000,1471.7700
ANCJAENF_00005,19.000000,0.0000,2.00000,0.0000,9.000000,1.000000e+00,8.000000e+00,0.000000,2.000000e+00,0.00000,...,0.000000,0.0000,0.00000,0.000000,15.00000,0.00,0.00000,0.00000,5.00000,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
LDOIJNDB_02259,0.000000,28.4212,0.00000,0.0000,0.000000,3.755120e-08,0.000000e+00,65.345600,0.000000e+00,6.05787,...,0.000000,54.3603,0.00000,1.266000,0.00000,0.00,0.00000,0.00000,0.00000,58.0740
LDOIJNDB_02260,0.285714,0.0000,1.28571,0.0000,0.857143,2.857140e-01,8.571430e-01,0.285714,4.285710e-01,0.00000,...,0.285714,0.0000,1.14286,0.571429,7.85714,0.00,2.14286,8.85714,1.71429,0.0000
LDOIJNDB_02261,5997.760000,16515.7000,21327.30000,41207.2000,8685.180000,1.813040e+04,9.143430e+03,21638.000000,1.073310e+04,22780.20000,...,11591.400000,20421.6000,17391.40000,18417.500000,22426.50000,11028.90,145913.00000,27970.50000,44534.00000,61472.0000
LDOIJNDB_02262,2.062500,6.5000,6.73293,1.6875,0.875000,7.437500e+00,1.500000e+00,0.437500,2.687500e+00,2.87500,...,1.062500,2.8804,4.50000,4.267950,20.25000,3.75,4.31250,25.06250,10.68750,0.4375


In [4]:
cecal_abundance_path = "formatted_input/cecal_abundance.csv"
cecal_abundance_df = pd.read_csv(cecal_abundance_path,sep=",",index_col="MouseID")
cecal_abundance_df.columns = [col.strip() for col in cecal_abundance_df.columns]
# display(cecal_abundance_df)

ABUNDANCE_SCALING_FACTOR = 10**7

STRAIN_ABBREVS = ["Bbr","Bca","Bli463","Bli2D9","Blu","Rob","Dfo","Dlo","Eav","Eco","FprB","Lga4B6","Lru","Mmu","Pco","Pst",
                  "Rgn","Rto","Sga","Spa"]
bact_columns = [col.strip() for col in cecal_abundance_df.columns[3:-1].values]
abbrevs_map = dict(zip(bact_columns,STRAIN_ABBREVS))

cecal_abundance_df = cecal_abundance_df.rename(columns=abbrevs_map)
cecal_abundance_converted  = cecal_abundance_df.copy()
cecal_abundance_converted = cecal_abundance_converted.replace("ND",0)
cecal_abundance_converted.loc[:,STRAIN_ABBREVS] = cecal_abundance_converted.loc[:,STRAIN_ABBREVS].astype(float)

for col in STRAIN_ABBREVS:
    cecal_abundance_converted.loc[:,col] = np.power(cecal_abundance_converted.loc[:,col].values,
                                                    [10]*len(cecal_abundance_converted))
    cecal_abundance_converted.loc[:,col] = cecal_abundance_converted.loc[:,col]/(ABUNDANCE_SCALING_FACTOR) #Scale by millions of genome equivalents 

#Drop 1B samples from cecal_abundance 
sample_arm_re=r'[\w_]+[\w_]+_([12][ABC])_[\w+]'
cecal_abundance_converted.loc[:,"Arm"] = cecal_abundance_converted.index.str.extract(sample_arm_re,expand=False)
cecal_abundance_converted = cecal_abundance_converted.loc[~(cecal_abundance_converted["Arm"]=="1B"),:]
with pd.option_context('display.max_columns',None):
    display(cecal_abundance_converted)
# cecal_abundance_converted = cecal_abundance_converted.loc[]

Unnamed: 0_level_0,Treatment,Mouse Number,Sex,Bbr,Bca,Bli463,Bli2D9,Blu,Rob,Dfo,Dlo,Eav,Eco,FprB,Lga4B6,Lru,Mmu,Pco,Pst,Rgn,Rto,Sga,Spa,Total Bacterial Load,Arm
MouseID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
Pup_1-cecal_contents_53_1C_Pup_1,Arm 1 (Pre-weaning P. copri colonization),1,female,5.65539,9.094947,0.514907,0.0,0.001209,7.995611,0.0,8.258601,4.614541,44.775428,0.0,3.143136,0.525991,1.957257,42.391158,11.350332,13.671302,6.902967,2.674554,5.287094,7.75,1C
Pup_2-cecal_contents_53_1C_Pup_2,Arm 1 (Pre-weaning P. copri colonization),2,male,1.850428,6.462036,0.170802,0.0,0.000514,4.857144,7e-06,11.173952,4.234339,31.202841,8e-06,1.716156,0.221194,1.312418,24.461941,7.251237,12.656837,2.926456,2.269807,4.536131,7.48,1C
Pup_3-cecal_contents_53_1C_Pup_3,Arm 1 (Pre-weaning P. copri colonization),3,male,8.6677,24.109723,0.57251,0.0,0.000514,9.541056,0.000201,16.165156,6.148154,35.908028,4.7e-05,2.069606,3.495291,5.287094,48.578532,8.6677,13.882765,5.561215,2.926456,8.392994,7.82,1C
Pup_4-cecal_contents_53_1C_Pup_4,Arm 1 (Pre-weaning P. copri colonization),4,male,9.541056,25.917595,0.676816,0.0,0.001583,10.828571,0.0,14.75837,7.133429,43.568617,0.0,3.087654,2.674554,6.251222,44.775428,13.462743,13.054194,4.857144,2.579384,7.866963,7.85,1C
Pup_5-cecal_contents_53_1C_Pup_5,Arm 1 (Pre-weaning P. copri colonization),5,female,12.270399,32.552436,1.994081,0.0,0.029203,18.779876,0.000559,16.165156,8.258601,44.775428,0.0,3.315007,3.620333,10.328496,45.390044,14.534932,13.671302,5.287094,0.231675,8.808064,7.91,1C
Pup_6-cecal_contents_53_1C_Pup_6,Arm 1 (Pre-weaning P. copri colonization),6,male,17.168893,42.976258,1.190424,0.0,0.011259,11.173952,0.000433,25.180303,11.894612,53.381193,0.0,5.198461,5.025195,9.541056,46.642105,14.097166,18.228378,5.468453,0.690989,15.922622,8.03,1C
Pup_7-cecal_contents_53_1C_Pup_7,Arm 1 (Pre-weaning P. copri colonization),7,male,10.492829,26.674261,0.781787,0.0,0.017277,18.228378,0.001033,19.061218,8.950471,44.775428,0.0,2.532952,1.078733,5.946593,69.548489,13.882765,25.546555,8.392994,1.652366,14.314543,8.04,1C
Pup_8-cecal_contents_53_1C_Pup_8,Arm 1 (Pre-weaning P. copri colonization),8,female,8.529351,25.546555,3.087654,0.0,0.003662,12.46227,0.001257,22.746036,10.65951,63.447974,0.0,3.620333,1.23794,6.679228,58.607037,0.002858,24.818783,15.447356,2.108325,22.746036,8.07,1C
Pup_1-cecal_contents_54_2B_Pup_1,Arm 3 (No P. copri colonization),1,male,1.921046,8.392994,0.0,0.0,0.020644,0.067918,0.001467,24.109723,27.450751,99.587545,0.0,7.370793,0.015786,0.014412,0.0,0.0,69.548489,19.061218,0.584704,26.674261,8.22,2B
Pup_2-cecal_contents_54_2B_Pup_2,Arm 3 (No P. copri colonization),2,female,0.124722,0.127821,0.0,0.0,0.004825,0.00088,7.6e-05,0.049401,12.854134,71.385861,0.0,0.069713,0.001307,0.002858,0.0,0.0,62.616971,0.083516,0.01627,3.882567,7.95,2B


In [5]:
def vc_strain_locus_tag(expr_df,mcseed_df):
    """Returns a DataFrame indexed on locus tags for each strain, containing columns: 
        "Filtered Loci" - number of loci corresponding to that locus tag (ie loci per strain)
        "Strain" - full strain name corresponding to locus tag 
        :param pd.DataFrame expr_df: DataFrame indexed by ORFs containing (transformed) expression data, columns
        are samples 
        :param pd.DataFrame mcseed_df: DataFrame containing mcSEED annotations for loci, not necessarily for 
        all loci in expr_df 
    """
    vc_by_strain_locus_tag = expr_df.index.str.extract(r'(\w+)_\d+',expand=False).value_counts()
#     vc_by_strain_locus_tag.drop("ROSSTS7063_a2",inplace=True) #2nd3rd_trial specific error 
    locus_tag_strains = [mcseed_df.loc[mcseed_df.index.str.contains(lt),"Isolate name"].values[0] 
                             for lt in vc_by_strain_locus_tag.index] #if lt != "ROSSTS7063_a2"]
    locus_vc_df = pd.DataFrame(index=vc_by_strain_locus_tag.index,columns=["Filtered Loci","Strain"])
    locus_vc_df.loc[:,"Filtered Loci"] = vc_by_strain_locus_tag
    locus_vc_df.loc[:,"Strain"] = locus_tag_strains
    locus_vc_df.loc[:,"Abbreviation"] = [STRAIN_TAGS[lts] for lts in locus_tag_strains]
    return locus_vc_df

def abundance_correct_expr_df(expr_df,abundance_df,locus_vc_df):
    abundance_corrected = expr_df.copy()
    abundance_corrected = abundance_corrected.loc[:,abundance_corrected.columns.isin(abundance_df.index)]
    abundance_corrected.loc[:,"Locus Tag"] = abundance_corrected.index.str.extract(r'(\w+)_\d+',expand=False)
    for lt in abundance_corrected["Locus Tag"].unique():
        strain_abbrev = locus_vc_df.loc[lt,"Abbreviation"]
        lt_expr_df = abundance_corrected.loc[abundance_corrected.loc[:,"Locus Tag"]==lt]
        abundance_data = abundance_df.loc[:,strain_abbrev]
        samples = list(abundance_data.index)
#         print(samples)
        lt_ac = lt_expr_df.loc[:,lt_expr_df.columns.isin(samples)]/abundance_data
        abundance_corrected.loc[lt_ac.index,samples] = lt_ac.loc[:,samples]
    abundance_corrected.drop(columns="Locus Tag",inplace=True)
#     display(abundance_corrected.loc[np.isinf(abundance_corrected).any(axis=1),:])
    abundance_corrected.replace([np.inf, -np.inf], np.nan, inplace=True)

    abundance_corrected.replace(np.nan,0,inplace=True)
    abundance_corrected = abundance_corrected.astype('float')
    return abundance_corrected
    
locus_vc_df = vc_strain_locus_tag(count_df,mcseed_df)
# display(locus_vc_df)
abundance_corrected_expr = abundance_correct_expr_df(count_df,cecal_abundance_converted,locus_vc_df)
assert(len(count_df.loc[count_df.isna().any(axis=1)])==0)
assert(len(abundance_corrected_expr.loc[abundance_corrected_expr.isna().any(axis=1)])==0)
abundance_corrected_expr_int = np.floor(abundance_corrected_expr)
abundance_corrected_expr_int = abundance_corrected_expr_int.astype('int')
display(abundance_corrected_expr)
display(abundance_corrected_expr_int)

filt_tpm = tpm_df.loc[:,tpm_df.columns.isin(abundance_corrected_expr.columns)]
# display(filt_tpm)

full_dir_path = "formatted_output/4th_trial/full"
if not os.path.exists(full_dir_path):
    os.makedirs(full_dir_path)
overwrite_files = True
if overwrite_files or not os.path.exists("{0}/abundance_corrected_count.csv".format(full_dir_path)):
    abundance_corrected_expr.to_csv("{0}/abundance_corrected_count.csv".format(full_dir_path))
    abundance_corrected_expr_int.to_csv("{0}/abundance_corrected_count_int.csv".format(full_dir_path))
    filt_tpm.to_csv("{0}/column_filtered_tpm.csv".format(full_dir_path))

Unnamed: 0_level_0,Pup_1-cecal_contents_53_1C_Pup_1,Pup_2-cecal_contents_53_1C_Pup_2,Pup_3-cecal_contents_53_1C_Pup_3,Pup_4-cecal_contents_53_1C_Pup_4,Pup_5-cecal_contents_53_1C_Pup_5,Pup_6-cecal_contents_53_1C_Pup_6,Pup_7-cecal_contents_53_1C_Pup_7,Pup_8-cecal_contents_53_1C_Pup_8,Pup_1-cecal_contents_54_2B_Pup_1,Pup_2-cecal_contents_54_2B_Pup_2,Pup_3-cecal_contents_54_2B_Pup_3,Pup_4-cecal_contents_54_2B_Pup_4,Pup_5-cecal_contents_54_2B_Pup_5,Pup_6-cecal_contents_54_2B_Pup_6,Pup_7-cecal_contents_54_2B_Pup_7
target_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
ANCJAENF_00001,5586.280919,5174.323037,3244.251830,6355.334362,5213.628480,3051.344095,2877.279440,5582.569924,7935.467468,3352.167320,2277.947107,705.339424,1200.330028,22249.409698,13864.124597
ANCJAENF_00002,108577.471494,112351.330834,88399.926539,119328.512453,96049.853882,69706.300016,47068.528300,104737.508090,120582.208950,45779.786352,46320.455402,13465.515763,21372.914436,387117.537213,221018.827320
ANCJAENF_00003,5586.280919,5174.323037,3244.251830,6355.334362,5213.628480,3051.344095,2877.279440,5582.569924,7935.467468,3352.167320,2277.947107,705.339424,1200.330028,22249.409698,13864.124597
ANCJAENF_00004,108577.471494,112351.330834,88399.926539,119328.512453,96049.853882,69706.300016,47068.528300,104737.508090,120582.208950,45779.786352,46320.455402,13465.515763,21372.914436,387117.537213,221018.827320
ANCJAENF_00005,3.359627,1.080831,1.038338,0.838482,0.162994,0.232979,0.667122,0.117242,1.561649,0.000000,0.000000,0.000000,1.730563,0.000000,3.458889
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
LDOIJNDB_02259,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
LDOIJNDB_02260,0.054040,0.283438,0.102126,0.108955,0.048657,0.026916,0.039919,0.025122,0.069623,0.220767,0.044953,0.227426,0.157428,0.095594,0.096899
LDOIJNDB_02261,1134.415146,4701.650370,1034.813125,1162.256605,1218.553802,694.954622,919.840761,1133.894294,1715.560181,1311.712176,1823.739361,3460.840593,449.344490,6509.280243,2517.254234
LDOIJNDB_02262,0.390101,1.484289,0.104254,0.190671,0.305118,0.199656,0.056760,0.255539,0.403010,0.321952,0.167169,0.895488,0.405735,0.192384,0.604104


Unnamed: 0_level_0,Pup_1-cecal_contents_53_1C_Pup_1,Pup_2-cecal_contents_53_1C_Pup_2,Pup_3-cecal_contents_53_1C_Pup_3,Pup_4-cecal_contents_53_1C_Pup_4,Pup_5-cecal_contents_53_1C_Pup_5,Pup_6-cecal_contents_53_1C_Pup_6,Pup_7-cecal_contents_53_1C_Pup_7,Pup_8-cecal_contents_53_1C_Pup_8,Pup_1-cecal_contents_54_2B_Pup_1,Pup_2-cecal_contents_54_2B_Pup_2,Pup_3-cecal_contents_54_2B_Pup_3,Pup_4-cecal_contents_54_2B_Pup_4,Pup_5-cecal_contents_54_2B_Pup_5,Pup_6-cecal_contents_54_2B_Pup_6,Pup_7-cecal_contents_54_2B_Pup_7
target_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
ANCJAENF_00001,5586,5174,3244,6355,5213,3051,2877,5582,7935,3352,2277,705,1200,22249,13864
ANCJAENF_00002,108577,112351,88399,119328,96049,69706,47068,104737,120582,45779,46320,13465,21372,387117,221018
ANCJAENF_00003,5586,5174,3244,6355,5213,3051,2877,5582,7935,3352,2277,705,1200,22249,13864
ANCJAENF_00004,108577,112351,88399,119328,96049,69706,47068,104737,120582,45779,46320,13465,21372,387117,221018
ANCJAENF_00005,3,1,1,0,0,0,0,0,1,0,0,0,1,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
LDOIJNDB_02259,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
LDOIJNDB_02260,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
LDOIJNDB_02261,1134,4701,1034,1162,1218,694,919,1133,1715,1311,1823,3460,449,6509,2517
LDOIJNDB_02262,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [92]:


# print(len(count_df.columns))