In [1]:
#Module imports 
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import re 
import seaborn as sns 
import os 

In [3]:
##Preprocessing##
# Load every input table, index it, and build the strain-name lookup tables.
formatted_dir = "formatted_input"
bpm_path = "{0}/19isolates_BPM.csv".format(formatted_dir)
mcseed_path = "{0}/19isolates_mcseed_pathwaycomplete.csv".format(formatted_dir)
dc_tpm_path = "{0}/dc2ndadd_merged_tpm_data.tsv".format(formatted_dir) #Deprecated dataset 
rob_tpm_path = "{0}/rob3rd_merged_tpm_data.tsv".format(formatted_dir)
dc_count_path = "{0}/dc2ndadd_merged_count_data.tsv".format(formatted_dir)
rob_count_path = "{0}/rob3rd_merged_count_data.tsv".format(formatted_dir)
count_4th_trial_path = "{0}/4thtrial_merged_count_data.tsv".format(formatted_dir)
tpm_4th_trial_path = "{0}/4thtrial_merged_tpm_data.tsv".format(formatted_dir)

# Flexible whitespace parsing - tpm first column is tab separated then space separated.
# Every expression table is indexed by target_id (locus).
rob_tpm_df = pd.read_csv(rob_tpm_path, sep=r"\s+").set_index("target_id")
dc_tpm_df = pd.read_csv(dc_tpm_path, sep=r"\s+").set_index("target_id")
rob_count_df = pd.read_csv(rob_count_path, sep=r"\s+").set_index("target_id")
dc_count_df = pd.read_csv(dc_count_path, sep=r"\s+").set_index("target_id")
count_4t_df = pd.read_csv(count_4th_trial_path, sep=r"\s+").set_index("target_id")
tpm_4t_df = pd.read_csv(tpm_4th_trial_path, sep=r"\s+").set_index("target_id")

# mcSEED annotations, indexed by locus tag. The "Functional pathway" and "Phenotype"
# columns are semicolon-separated strings; split them into lists of entries.
mcseed_df = pd.read_csv(mcseed_path)
for col in ["Functional pathway", "Phenotype"]:
    mcseed_df[col] = mcseed_df[col].str.split(";")
mcseed_df = mcseed_df.set_index("Locus tag")

# BPM table: index by isolate name, then transpose so each column is one isolate,
# and drop the "# functions" row so only per-pathway rows remain.
bpm_df = pd.read_csv(bpm_path).set_index("Isolate name")
bpm_df = bpm_df.transpose().drop(index="# functions")

# Abbreviations are paired positionally with the isolate columns of bpm_df.
STRAIN_ABBREVS = ["Bbr","Bca","Bli2D9","Blu","Rob","Dfo","Dlo","Eav","Eco","FprB","Lga4B6","Lru","Mmu","Pco","Pst",
                  "Rgn","Rto","Sga","Spa"]
# zip() stops at the shorter input, so a BPM table with a different number of isolates
# would otherwise yield silently truncated lookup tables.
assert len(bpm_df.columns) == len(STRAIN_ABBREVS), (
    "Expected {0} isolates in the BPM table, found {1}".format(len(STRAIN_ABBREVS), len(bpm_df.columns)))
STRAIN_TAGS = dict(zip(bpm_df.columns,STRAIN_ABBREVS))
SHORT_ABBREVS = [abbrev[:3] for abbrev in STRAIN_ABBREVS]
# Three-letter prefixes are dict keys below; a collision would silently drop a strain.
assert len(set(SHORT_ABBREVS)) == len(SHORT_ABBREVS), "Three-letter strain abbreviations are not unique"
STRAIN_ABBREV_TO_FULL = dict(zip(SHORT_ABBREVS, bpm_df.columns))

# Sanity check on the positional pairing: the abbreviation must start with the same letter
# as the full strain name. Blautia obeum is deliberately exempt because it is abbreviated "Rob"
# (NOTE(review): presumably after an older genus name - confirm).
for strain, strain_abbrev in STRAIN_TAGS.items():
    if strain == "Blautia obeum Bg7063_SSTS2015" and strain_abbrev == "Rob":
        continue
    assert strain[0] == strain_abbrev[0], (
        "Abbreviation {0!r} does not match strain {1!r}".format(strain_abbrev, strain))

#BPM Summary statistics 
print("BPM=1 pathways by strain")
print(bpm_df.sum())
print("Total BPM=1 pathways: {0}".format(bpm_df.sum().sum()))

BPM=1 pathways by strain
Isolate name
Bifidobacterium breve Bgsng463_m5_93            46
Bifidobacterium catenulatum Bgsng468_m22_84     44
Bifidobacterium longum infantis 40721_2D9_SN    50
Blautia luti Bg7063                             53
Blautia obeum Bg7063_SSTS2015                   53
Dorea formicigenerans Bg7063                    43
Dorea longicatena Bg7063                        45
Enterococcus_avium_Bang_SAM2_39_S1              57
Escherichia coli PS_131_S11                     77
Faecalibacterium prausnitzii Bg7063             43
Lactococcus garvieae Bang155_08_4B6_JG2017      32
Ligilactobacillus ruminis ATCC_25644            33
Mitsuokella multacida DSM_20544                 45
Prevotella copri PS_131_S11                     49
Prevotella stercorea DSM_18206                  29
Ruminococcus gnavus M8243_3A11_TMS_2014         63
Ruminococcus torques Bg7063                     50
Streptococcus gallolyticus PS_064_S07           42
Streptococcus pasteriuanus Bang_SAM2_39_S1  

In [8]:
# Stack the two count tables row-wise (pd.concat default, axis=0).
full_count_df = pd.concat([dc_count_df,rob_count_df])
# Strip the ".est_counts" suffix from the sample column names. The pattern is a raw string
# so that "\." reaches the regex engine as an escaped dot instead of being an invalid
# Python string escape.
full_count_df.columns = full_count_df.columns.str.extract(r'(.*)\.est_counts',expand=False)
# Keep only the samples whose names contain "1C_Pup" or "2B_Pup", with the 1C columns first.
# NOTE(review): the previous comment described these arms as "1C (Pre-weaning P.copri) and
# 2A (No P. copri)", but the filter has always selected "2B_Pup" - confirm the intended arm.
samples_1C = full_count_df.columns[full_count_df.columns.str.contains("1C_Pup")]
samples_2B = full_count_df.columns[full_count_df.columns.str.contains("2B_Pup")]
reordered_cols = pd.Index(list(samples_1C)+list(samples_2B))
print(reordered_cols)
full_count_df = full_count_df.loc[:,reordered_cols]
display(full_count_df)

Index(['Pup_1-cecal_contents_53_1C_Pup_1', 'Pup_1-ileal_contents_53_1C_Pup_1',
       'Pup_2-cecal_contents_53_1C_Pup_2', 'Pup_2-ileal_contents_53_1C_Pup_2',
       'Pup_3-cecal_contents_53_1C_Pup_3', 'Pup_3-ileal_contents_53_1C_Pup_3',
       'Pup_4-cecal_contents_53_1C_Pup_4', 'Pup_4-ileal_contents_53_1C_Pup_4',
       'Pup_5-cecal_contents_53_1C_Pup_5', 'Pup_5-ileal_contents_53_1C_Pup_5',
       'Pup_6-cecal_contents_53_1C_Pup_6', 'Pup_6-ileal_contents_53_1C_Pup_6',
       'Pup_7-cecal_contents_53_1C_Pup_7', 'Pup_7-ileal_contents_53_1C_Pup_7',
       'Pup_8-cecal_contents_53_1C_Pup_8', 'Pup_8-ileal_contents_53_1C_Pup_8',
       'Pup_1-cecal_contents_54_2B_Pup_1', 'Pup_1-ileal_contents_54_2B_Pup_1',
       'Pup_2-cecal_contents_54_2B_Pup_2', 'Pup_2-ileal_contents_54_2B_Pup_2',
       'Pup_3-cecal_contents_54_2B_Pup_3', 'Pup_3-ileal_contents_54_2B_Pup_3',
       'Pup_4-cecal_contents_54_2B_Pup_4', 'Pup_4-ileal_contents_54_2B_Pup_4',
       'Pup_5-cecal_contents_54_2B_Pup_5', 'Pup_5-il

Unnamed: 0_level_0,Pup_1-cecal_contents_53_1C_Pup_1,Pup_1-ileal_contents_53_1C_Pup_1,Pup_2-cecal_contents_53_1C_Pup_2,Pup_2-ileal_contents_53_1C_Pup_2,Pup_3-cecal_contents_53_1C_Pup_3,Pup_3-ileal_contents_53_1C_Pup_3,Pup_4-cecal_contents_53_1C_Pup_4,Pup_4-ileal_contents_53_1C_Pup_4,Pup_5-cecal_contents_53_1C_Pup_5,Pup_5-ileal_contents_53_1C_Pup_5,...,Pup_3-cecal_contents_54_2B_Pup_3,Pup_3-ileal_contents_54_2B_Pup_3,Pup_4-cecal_contents_54_2B_Pup_4,Pup_4-ileal_contents_54_2B_Pup_4,Pup_5-cecal_contents_54_2B_Pup_5,Pup_5-ileal_contents_54_2B_Pup_5,Pup_6-cecal_contents_54_2B_Pup_6,Pup_6-ileal_contents_54_2B_Pup_6,Pup_7-cecal_contents_54_2B_Pup_7,Pup_7-ileal_contents_54_2B_Pup_7
target_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ANCJAENF_00001,30237.2,2738.56,8749.38,1043.89,26962.1,6203.63,59116.9,1711.67,60898.8,3040.56,...,427.902,190.797,10.4777,14.6336,10382.4,1371.69,20849.4,51.3538,20010.4,118.953
ANCJAENF_00002,557122.0,31289.50,152409.00,10907.20,698817.0,56615.40,1057710.0,18267.60,1037680.0,31228.30,...,8447.150,771.666,189.2840,427.4510,181671.0,14000.40,357398.0,553.2270,313029.0,1432.360
ANCJAENF_00003,30237.2,2738.56,8749.38,1043.89,26962.1,6203.63,59116.9,1711.67,60898.8,3040.56,...,427.902,190.797,10.4777,14.6336,10382.4,1371.69,20849.4,51.3538,20010.4,118.953
ANCJAENF_00004,557122.0,31289.50,152409.00,10907.20,698817.0,56615.40,1057710.0,18267.60,1037680.0,31228.30,...,8447.150,771.666,189.2840,427.4510,181671.0,14000.40,357398.0,553.2270,313029.0,1432.360
ANCJAENF_00005,19.0,0.00,2.00,0.00,9.0,1.00,8.0,0.00,2.0,0.00,...,0.000,0.000,0.0000,0.0000,15.0,0.00,0.0,0.0000,5.0,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HIAFFLEM_03493,8.0,0.00,12.00,0.00,34.0,0.00,7.0,0.00,12.0,0.00,...,0.000,0.000,0.0000,0.0000,0.0,0.00,0.0,0.0000,0.0,0.000
HIAFFLEM_03494,76.0,0.00,68.00,0.00,54.0,0.00,45.0,0.00,53.0,0.00,...,0.000,0.000,0.0000,0.0000,0.0,0.00,0.0,0.0000,0.0,0.000
HIAFFLEM_03495,43.0,0.00,32.00,0.00,25.0,0.00,11.0,0.00,21.0,0.00,...,0.000,0.000,0.0000,0.0000,0.0,0.00,0.0,0.0000,0.0,0.000
HIAFFLEM_03496,0.0,0.00,0.00,0.00,1.0,0.00,1.0,0.00,1.0,0.00,...,0.000,0.000,0.0000,0.0000,0.0,0.00,0.0,0.0000,0.0,0.000


In [10]:
def vc_strain_locus_tag(expr_df,mcseed_df,strain_tags=None):
    """Count loci per strain locus-tag prefix and attach strain names.

    The prefix of each ORF identifier (everything before the final ``_<digits>``) is
    treated as the strain's locus tag. Returns a DataFrame indexed on those locus tags,
    ordered as ``value_counts`` orders them, with columns:
        "Filtered Loci" - number of loci in expr_df carrying that locus tag (ie loci per strain)
        "Strain" - full strain name, taken from the "Isolate name" of the first mcseed_df
                   row whose index contains the locus tag
        "Abbreviation" - abbreviation looked up for that strain name in strain_tags

    :param pd.DataFrame expr_df: DataFrame indexed by ORFs containing (transformed) expression data,
        columns are samples
    :param pd.DataFrame mcseed_df: DataFrame containing mcSEED annotations for loci, not necessarily for
        all loci in expr_df; must have an "Isolate name" column and be indexed by locus identifiers
    :param dict strain_tags: mapping of full strain name -> abbreviation; defaults to the
        module-level STRAIN_TAGS when omitted
    :raises IndexError: if a locus tag has no matching row in mcseed_df
    :raises KeyError: if a strain name is missing from strain_tags
    """
    if strain_tags is None:
        strain_tags = STRAIN_TAGS
    loci_per_tag = expr_df.index.str.extract(r'(\w+)_\d+',expand=False).value_counts()
    # "ROSSTS7063_a2" is a 2nd3rd_trial specific error tag. errors="ignore" keeps the
    # function usable on tables that do not contain it (a plain drop raises KeyError).
    loci_per_tag = loci_per_tag.drop("ROSSTS7063_a2",errors="ignore")
    locus_tag_strains = [
        mcseed_df.loc[mcseed_df.index.str.contains(lt),"Isolate name"].values[0]
        for lt in loci_per_tag.index
    ]
    # Build the frame in one step so "Filtered Loci" keeps its integer dtype.
    locus_vc_df = pd.DataFrame(
        {
            "Filtered Loci": loci_per_tag.to_numpy(),
            "Strain": locus_tag_strains,
            "Abbreviation": [strain_tags[lts] for lts in locus_tag_strains],
        },
        index=loci_per_tag.index,
    )
    return locus_vc_df
# Summarise loci per strain locus tag for the combined count table and show the result.
locus_vc_df = vc_strain_locus_tag(expr_df=full_count_df, mcseed_df=mcseed_df)
display(locus_vc_df)

Unnamed: 0,Filtered Loci,Strain,Abbreviation
HMLHAAEP,5114,Escherichia coli PS_131_S11,Eco
AKLJOBCP,4367,Enterococcus_avium_Bang_SAM2_39_S1,Eav
GJBELKAJ,3984,Ruminococcus gnavus M8243_3A11_TMS_2014,Rgn
AGCELHME,3805,Blautia luti Bg7063,Blu
NJCFFJJN,3521,Prevotella copri PS_131_S11,Pco
HIAFFLEM,3497,Blautia obeum Bg7063_SSTS2015,Rob
OOAPABDJ,3484,Dorea longicatena Bg7063,Dlo
EMCKBHPA,3453,Dorea formicigenerans Bg7063,Dfo
CGCJBKJN,3025,Ruminococcus torques Bg7063,Rto
NBCBLOMG,2933,Prevotella stercorea DSM_18206,Pst


In [24]:
#Reads Percentage Mapping 
# Split the loci into those belonging to the strains in partition_abbrevs and the rest,
# then report, per sample, the fraction of total counts that falls in "the rest".
partition_abbrevs = ["Bli2D9","Pco","Pst"]
partition_lts = locus_vc_df.loc[locus_vc_df["Abbreviation"].isin(partition_abbrevs)]
# An empty selection would give the regex '' which matches every locus and silently
# puts all counts in the partition; fail loudly instead.
assert len(partition_lts) > 0, "None of {0} found in locus_vc_df".format(partition_abbrevs)
# Alternation of the selected locus tags, escaped so each tag is matched literally.
partition_lt_re = '|'.join(re.escape(lt) for lt in partition_lts.index)
# Compute the membership mask once and reuse it for both halves of the split.
in_partition = full_count_df.index.str.contains(partition_lt_re)
partition_match_counts = full_count_df.loc[in_partition]
partition_rest_counts = full_count_df.loc[~in_partition]

# Per-sample (column-wise) count totals.
pm_sum = partition_match_counts.sum(axis=0)
pr_sum = partition_rest_counts.sum(axis=0)
fc_sum = full_count_df.sum(axis=0)

# Fraction of each sample's counts assigned to loci outside the partition,
# narrowed to samples whose names contain "cecal" and then "1C".
PR_FC_ratio = pr_sum/fc_sum
PR_FC_cecal = PR_FC_ratio[PR_FC_ratio.index.str.contains("cecal")]
PR_FC_cecal_1C = PR_FC_cecal[PR_FC_cecal.index.str.contains("1C")]
display(PR_FC_cecal_1C)
print("Average non-Pco/Pst/Bli2D9 mapped reads: {:.3f}%".format(PR_FC_cecal_1C.mean()*100))

Pup_1-cecal_contents_53_1C_Pup_1    0.457900
Pup_2-cecal_contents_53_1C_Pup_2    0.555972
Pup_3-cecal_contents_53_1C_Pup_3    0.627373
Pup_4-cecal_contents_53_1C_Pup_4    0.766143
Pup_5-cecal_contents_53_1C_Pup_5    0.801044
Pup_6-cecal_contents_53_1C_Pup_6    0.861177
Pup_7-cecal_contents_53_1C_Pup_7    0.593982
Pup_8-cecal_contents_53_1C_Pup_8    0.669007
dtype: float64

Average non-Pco/Pst/Bli2D9 mapped reads: 66.657%
