In [1]:
#Module imports 
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import re 
import seaborn as sns 
import os 

In [None]:
## Preprocessing ##
# Every input table lives under one directory; build all six paths from the
# same "{0}/<file name>" templates.
formatted_dir = "formatted_input"
(bpm_path, mcseed_path,
 dc_tpm_path, rob_tpm_path,
 dc_count_path, rob_count_path) = [
    template.format(formatted_dir)
    for template in (
        "{0}/19isolates_BPM.csv",
        "{0}/19isolates_mcseed_pathwaycomplete.csv",
        "{0}/dc2ndadd_merged_tpm_data.tsv",
        "{0}/rob3rd_merged_tpm_data.tsv",
        "{0}/dc2ndadd_merged_count_data.tsv",
        "{0}/rob3rd_merged_count_data.tsv",
    )
]

# BPM table, indexed by isolate name.
bpm_df = pd.read_csv(bpm_path).set_index("Isolate name")

# mcSEED table: the "Functional pathway" and "Phenotype" columns hold
# semicolon-separated strings; turn each cell into a list of entries before
# indexing the table by locus tag.
mcseed_df = pd.read_csv(mcseed_path)
for col in ["Functional pathway", "Phenotype"]:
    mcseed_df.loc[:, col] = mcseed_df.loc[:, col].str.split(";")
mcseed_df = mcseed_df.set_index("Locus tag")

# TPM / count tables: parsed with a generic whitespace separator (the original
# author's note says the first column is tab separated and the rest space
# separated), then indexed by target_id (locus).
rob_tpm_df, dc_tpm_df, rob_count_df, dc_count_df = [
    pd.read_csv(table_path, sep=r"\s+").set_index("target_id")
    for table_path in (rob_tpm_path, dc_tpm_path, rob_count_path, dc_count_path)
]


# Transpose so the isolates (previously the index) become the columns, then
# discard the "# functions" row.
bpm_df = bpm_df.transpose().drop(index="# functions")

# Abbreviations for the strains, listed in the same order as the columns of
# bpm_df; the two are paired positionally below.
STRAIN_ABBREVS = ["Bbr","Bca","Bli2D9","Blu","Rob","Dfo","Dlo","Eav","Eco","FprB","Lga4B6","Lru","Mmu","Pco","Pst",
                  "Rgn","Rto","Sga","Spa"]
# zip() stops at the shorter input without complaint, so a BPM table with a
# different number of strains would otherwise yield silently incomplete maps.
assert len(bpm_df.columns) == len(STRAIN_ABBREVS), (
    "BPM table has {0} strain columns but {1} abbreviations are defined".format(
        len(bpm_df.columns), len(STRAIN_ABBREVS)))

# Full strain name -> abbreviation.
STRAIN_TAGS = dict(zip(bpm_df.columns, STRAIN_ABBREVS))
# Three-letter form of each abbreviation (e.g. "Bli2D9" -> "Bli").
SHORT_ABBREVS = [abbrev[:3] for abbrev in STRAIN_ABBREVS]
# The short forms are used as dict keys, so they must not collide.
assert len(set(SHORT_ABBREVS)) == len(SHORT_ABBREVS), "Three-letter strain abbreviations are not unique"
# Three-letter abbreviation -> full strain name.
STRAIN_ABBREV_TO_FULL = dict(zip(SHORT_ABBREVS, bpm_df.columns))

# Cheap ordering check: each abbreviation must start with the same letter as
# the strain it is paired with. The only permitted exception is
# "Blautia obeum Bg7063_SSTS2015", which is abbreviated "Rob".
for strain, strain_abbrev in STRAIN_TAGS.items():
    if strain == "Blautia obeum Bg7063_SSTS2015" and strain_abbrev == "Rob":
        continue
    assert strain[0] == strain_abbrev[0], (
        "Strain '{0}' is paired with abbreviation '{1}'".format(strain, strain_abbrev))

# BPM summary statistics: column sums of bpm_df (one per strain, since the
# strains are the columns), followed by the total over all columns.
bpm_column_totals = bpm_df.sum()
print("BPM=1 pathways by strain")
print(bpm_column_totals)
print("Total BPM=1 pathways: {0}".format(bpm_column_totals.sum()))

In [None]:
# Sanity checks on the inputs, then assembly of the merged TPM / count tables.
# NOTE: no uniqueness check on mcseed_df - locus tags can legitimately appear
# several times there (one entry per subcomponent of the same locus), so
# duplicate locus entries must be handled when combining with tpm_df.
assert rob_tpm_df.index.is_unique
assert dc_tpm_df.index.is_unique

# Stack the DC rows on top of the Rob rows, for TPM and for counts alike.
tpm_df = pd.concat([dc_tpm_df, rob_tpm_df])
count_df = pd.concat([dc_count_df, rob_count_df])

# Concatenation must neither drop nor add rows.
assert len(tpm_df) == len(dc_tpm_df) + len(rob_tpm_df)
assert len(count_df) == len(dc_count_df) + len(rob_count_df)

# Strip the ".tpm" / ".est_counts" suffixes from the column names so both
# tables share the same sample labels. The patterns are raw strings: "\." in a
# plain string literal is an invalid escape sequence that Python warns about.
tpm_df.columns = tpm_df.columns.str.extract(r'(.*)\.tpm', expand=False)
count_df.columns = count_df.columns.str.extract(r'(.*)\.est_counts', expand=False)
# A column name that lacks the suffix becomes NaN and fails this comparison too.
assert (tpm_df.columns == count_df.columns).all(), "TPM and count tables have different sample columns"

# Filter down to only 1C (Pre-weaning P.copri) and 2A (No P. copri):
# keep the columns whose name contains "1C_Pup" or "2A_Pup", 1C samples first.
samples_1C, samples_2A = [
    tpm_df.columns[tpm_df.columns.str.contains(tag)] for tag in ["1C_Pup", "2A_Pup"]
]
reordered_cols = pd.Index(list(samples_1C) + list(samples_2A))
tpm_df, count_df = tpm_df.loc[:, reordered_cols], count_df.loc[:, reordered_cols]
#
# Integer version of the count table. Convert EVERY column: target_id is the
# index, so column 0 is a sample column like the others and must not be skipped.
# astype on the whole frame also guarantees integer dtypes; assigning ints back
# through .iloc writes into the existing columns and can leave them float
# (pandas >= 2.0). astype(int) truncates any fractional part toward zero and
# raises if a value is NaN or infinite. count_df itself is left untouched.
count_df_int = count_df.astype(int)
#mcseed_df_duplicates = mcseed_df.index[mcseed_df.index.duplicated(keep=False)]

# Notebook switches: render the tables inline / (re)write the merged CSV files.
show_tables = True
overwrite_files = False

if show_tables:
    display(mcseed_df, tpm_df, bpm_df)

# The output directory is created whenever it is missing, even if nothing is
# written on this run.
if not os.path.exists("formatted_output"):
    os.makedirs("formatted_output")

if overwrite_files:
    for out_path, frame in (
        ("formatted_output/full_merged_tpm.csv", tpm_df),
        ("formatted_output/full_merged_count.csv", count_df),
        ("formatted_output/full_merged_count_int.csv", count_df_int),
    ):
        frame.to_csv(out_path)

display(count_df)