In [20]:
from IPython.display import Image, FileLink
import pandas as pd
import pandas.rpy.common as com

In [21]:
%load_ext rpy2.ipython

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


In [22]:
%%R
library(data.table)
library(dplyr)
library(magrittr)

In [39]:
%%R
FDR = 0.10

df.l2fc = read.csv("../data/l2fc_table.csv") %>% 
    filter(padj <= FDR) %>%
    group_by(OTU, Treatment) %>%
    slice(which.max(log2FoldChange))

In [52]:
%%R
df.blast = tbl_df(read.table("../data/tmp/blast_out_otusn_LTP115.m6",
                  sep = "\t"))
names(df.blast) = c("OTU", "acc", "pid", "alnlen", "mismatches", "gaps", 
                    "qstart", "qend", "sstart", "send", "evalue", "bit")

df.blast

Source: local data frame [1,919,502 x 12]

     OTU      acc pid alnlen mismatches gaps qstart qend sstart send evalue bit
1  OTU.1 AF235091 100    219          0    0      1  219    508  726 2e-113 405
2  OTU.1 AF330692 100    219          0    0      1  219    502  720 2e-113 405
3  OTU.1 AB588633 100    219          0    0      1  219    539  757 2e-113 405
4  OTU.1   X80741 100    219          0    0      1  219    532  750 2e-113 405
5  OTU.1 AM176541 100    219          0    0      1  219    541  759 2e-113 405
6  OTU.1   X80740 100    219          0    0      1  219    533  751 2e-113 405
7  OTU.1   X83408 100    219          0    0      1  219    529  747 2e-113 405
8  OTU.1 AB279889 100    219          0    0      1  219    538  756 2e-113 405
9  OTU.1 GQ406811 100    219          0    0      1  219    492  710 2e-113 405
10 OTU.1 AB279890 100    219          0    0      1  219    542  760 2e-113 405
..   ...      ... ...    ...        ...  ...    ...  ...    ...  ...    ... .

In [53]:
%%R
df.tax = tbl_df(read.csv("/var/seq_data/silva/silva_blastdb/full_names.csv"))
df.tax

Source: local data frame [1,426,450 x 2]

        acc                           full_name
1  AX003092          Enterococcus casseliflavus
2  AX044029              Neisseria meningitidis
3  EU271959            Myzocytiopsis intermedia
4  AX039535            Dehalococcoides mccartyi
5  EU273602                   Acorus americanus
6  AX175616                Marinomonas communis
7  AX044033              Neisseria meningitidis
8  AB000389         Pseudoalteromonas elyakovii
9  EU271960          Myzocytiopsis sp. venatrix
10 AB001439 Pseudomonas syringae pv. actinidiae
..      ...                                 ...


In [54]:
%%R
df.blast = left_join(df.blast, df.tax)

Joining by: "acc"


In [55]:
%%R
df.blast = right_join(df.blast, df.l2fc %>% select(OTU, padj, log2FoldChange, Day, Treatment, Rank2, Rank3, Rank4))

Joining by: "OTU"


In [56]:
%%R
df.top.hits = df.blast %>%
    group_by(OTU, Treatment) %>%
    mutate(bit.rank = rank(desc(bit), ties.method = "min")) %>%
    filter(bit.rank == 1) %>%
    group_by()

In [58]:
%%R
df.top.hits

Source: local data frame [305 x 21]

        OTU      acc    pid alnlen mismatches gaps qstart qend sstart send
1   OTU.100 EF575564 100.00    219          0    0      1  219    563  781
2   OTU.100 DQ178977 100.00    219          0    0      1  219    543  761
3  OTU.1023 AB166885  80.54    221         39    4      1  219    542  760
4  OTU.1040 AF391124 100.00    218          0    0      1  218    555  772
5  OTU.1065   X62912  84.55    220         31    3      1  219    526  743
6  OTU.1069 AB248087 100.00    218          0    0      1  218    553  770
7   OTU.107 GQ281769  99.54    219          1    0      1  219    529  747
8   OTU.107 GQ281770  99.54    219          1    0      1  219    533  751
9  OTU.1087 DQ303125  99.09    219          2    0      1  219    503  721
10 OTU.1087 AJ549086  99.09    219          2    0      1  219    476  694
..      ...      ...    ...    ...        ...  ...    ...  ...    ...  ...
Variables not shown: evalue (dbl), bit (dbl), full_name (fctr),

In [60]:
%%R
FDR = 0.10

df.table = df.top.hits %>%
    filter(padj <= FDR) %>%
    group_by(OTU, Treatment) %>%
    summarize(hits = paste(full_name, collapse = "|"), 
              pid = first(pid), 
              log2FoldChange = first(log2FoldChange),
              Day = first(Day),
              Phylum = first(as.character(Rank2)),
              Class = first(as.character(Rank3)),
              Order = first(as.character(Rank4))) %>%
    arrange(Phylum, Class, Order, desc(log2FoldChange))

In [61]:
df_table = com.load_data("df.table")

In [62]:
df_table.head()

Unnamed: 0,OTU,Treatment,hits,pid,log2FoldChange,Day,Phylum,Class,Order
1,OTU.100,13CCPS,Pseudoxanthomonas sacheonensis|Pseudoxanthomon...,100.0,2.662703,14,Proteobacteria,Gammaproteobacteria,Xanthomonadales
2,OTU.1023,13CCPS,Stenotrophomonas koreensis,80.54,4.605162,30,Verrucomicrobia,Spartobacteria,Chthoniobacterales
3,OTU.1040,13CXPS,Paenibacillus daejeonensis,100.0,4.779735,1,Firmicutes,Bacilli,Bacillales
4,OTU.1065,13CCPS,Blastopirellula marina,84.55,5.31421,14,Planctomycetes,Planctomycetacia,Planctomycetales
5,OTU.1069,13CXPS,Paenibacillus terrigena,100.0,3.853167,1,Firmicutes,Bacilli,Bacillales


In [63]:
def list_genera(l):
    if len(l.split("|")) > 10:
        names = l.split("|")
        names_abbr = []
        seen = []
        for name in names:
            genus, species = name.split(" ", 1)
            if genus in seen:
                continue
            else:
                seen.append(genus)
                names_abbr.append("\mbox{" + "\\textit{" + genus + " spp." + "}" + "}")
        return ", ".join(names_abbr) 
    elif len(l.split("|")) > 1:
        return  ", ".join(["\mbox{" + "\\textit{" + i + "}" + "}" for i in set(l.split("|"))])
    else:
        return "\mbox{" + "\\textit{" + l + "}" + "}"
    
def get_latex(row):
    latex = ""
    OTU = row["OTU"]
    pid = row["pid"]
    Day = row["Day"]
    
    if pid >= 90:
        genera = list_genera(row["hits"])
    else:
        genera = "{No hits of at least 90\% identity}"

    tax = " ".join(["\mbox{" + "\\textit{" + i + "}" + "}" for i in row[["Phylum","Class","Order"]]]).replace("_","-")
    
    l2fc = pd.Series(row["log2FoldChange"]).round(2).values[0]
    #acc = df.acc.iloc[0]
    return OTU + " & " + str(l2fc) + " & " + str(Day) + " & " + genera + " & " + str(pid) + " & " + tax + r" \\ " + r"\midrule" + "\n"

In [64]:
s = df_table[df_table["Treatment"]=="13CCPS"].apply(get_latex, axis=1)

table_framework = r"""
\documentclass[10pt]{article}
\usepackage{multirow, array, booktabs, longtable, threeparttablex}
\usepackage{array}
\newcolumntype{P}[1]{>{\raggedright\arraybackslash}p{#1}}

\usepackage{geometry} 
\geometry{tmargin=1cm, bmargin=1cm, lmargin=0.25cm, rmargin=0.25cm} 

\begin{document}

\thispagestyle{empty}

\begin{ThreePartTable}
\begin{TableNotes}
\item[a] Maximum observed $log_{2}$ of fold change. 
\item[b] Day of maximum fold change.
\end{TableNotes}

\begin{longtable}{lrrP{5cm}rP{5cm}}

\caption{$^{13}$C-cellulose responders BLAST against Living Tree Project} \\
\toprule 
    \textbf{OTU ID} & 
    \textbf{Fold change} \tnote{a} & 
    \textbf{Day} \tnote{b} & 
    \textbf{Top BLAST hits} & 
    \textbf{BLAST \%%ID} & 
    \textbf{Phylum;Class;Order} \\
\midrule
\endfirsthead

\multicolumn{3}{c}
{{\tablename\ \thetable{} -- continued from previous page}} \\
\midrule
    \textbf{OTU ID} & 
    \textbf{Fold change} & 
    \textbf{Day} & 
    \textbf{Top BLAST hits} & 
    \textbf{BLAST \%%ID} & 
    \textbf{Phylum;Class;Order} \\
\midrule
\endhead
    %s
\bottomrule
\insertTableNotes
\end{longtable}

\end{ThreePartTable}
 
\end{document}"""%"".join(s.values)

with open("../data/tmp/table_cellulose.tex", "w") as out:
    out.write(table_framework)

In [65]:
s.shape

(63,)

In [66]:
!latex ../data/tmp/table_cellulose.tex >/dev/null
!dvipdf table_cellulose.dvi figs/LTP_blast_table_cellulose.pdf

In [67]:
FileLink("figs/LTP_blast_table_cellulose.pdf")

In [19]:
FileLink("../data/tmp/table_cellulose.tex")