In [1]:
import docx as dc
from docx.shared import Inches
from docx.enum.text import WD_ALIGN_PARAGRAPH

In [2]:
# Document creation: build the python-docx document object that every
# following cell appends content to and that the last cell saves.
doc = dc.Document()

In [3]:
# Take the first section of the document; its page margins are configured
# in the next cell.
sections = doc.sections
section = sections[0]

In [4]:
# Give the first section the same half-inch margin on all four sides.
section.left_margin = Inches(0.5)
section.right_margin = Inches(0.5)
section.bottom_margin = Inches(0.5)
section.top_margin = Inches(0.5)

In [5]:
# Title
doc.add_heading('Understanding the Cyanothece 51142 clock module using Mutual Information',0)

# Introductory paragraph, built from alternating plain and italic runs so the
# organism names can be italicised. Consecutive runs (and adjacent string
# literals) are joined with no separator, so every plain-text fragment has to
# carry its own boundary space.
p = doc.add_paragraph("The cyanobacterial circadian clock mechanism has been experimentally validated in ")
p.add_run('Synechococcus Elongatus').italic=True
p.add_run(" sp.7942. The core clock comprises of 3 proteins, KaiA, KaiB and KaiC. The clock proteins receive signals from external environmental cues through elements of the input pathway such as CikA which is a histidine kinase. The output pathway is mediated through the sensory histidine kinase sasA and a transcriptional regulator RpaA. In ")
p.add_run('Cyanothece').italic=True
p.add_run(" sp. 51142 however, there are multiple copies of the Kai genes, the two-component kinases that interact"
          " with the clock genes and the two component regulators that are supposed to be a part of the output"
          " pathway. The circadian clock signaling network in ")
p.add_run('Cyanothece').italic=True
p.add_run(" is thus much more complicated than Synechococcus clock network described above. We intend to enhance our"
          " knowledge of its signaling network by selecting and studying the expression profile of the two component "
          "sensors and regulators "
          "that are highly correlated with the clock genes. We will use Mutual Information as a correlation metric "
          "because of its ability to catch non-linear interaction between variables as opposed to Pearson "
          "correlation which only captures linear correlation. In addition to finding out the sensors and regulators "
          "that are highly correlated with the clock genes, we can also infer whether a sensor-regulator pair "
          "belongs "
          "to the same two-component system. Since they are a part of the same unit, their expression profiles must "
          "also be "
          "very highly correlated. This way, we can gain more insight into the clock module of ")
p.add_run('Cyanothece').italic=True
p.add_run(' by using Mutual Information.')

# Heading 1 - Mutual Information
doc.add_heading('Mutual Information',level=1)

# Mutual Information metric paragraph.
# - The estimator is written in terms of k, the number of nearest neighbours,
#   which is the only neighbour-count symbol the paragraph defines.
# - The LaTeX-style "\epsilon" tokens are spelled with a doubled backslash so the
#   document still receives a literal backslash without relying on Python
#   passing the unrecognised "\e" escape sequence through.
p2 = doc.add_paragraph("If we consider two random variables X and Y, the Mutual Information between them would be "
    "I(X,Y) = H(X) + H(Y) - H(X,Y), where H(.) computes the Shannon Entropy. For continuous variables, the idea is to "
    "estimate H(.) from the average distance to the k nearest neighbors. MI is estimated as I(X,Y) = psi(k) - 1/k "
    "- <psi(n_x) + psi(n_y)> + psi(N), where N is the size of the dataset, k is the number of nearest neighbors and "
    "psi(x) is the digamma function, psi(x+1)=psi(x)+1/x, psi(1)= -0.5772156 and <...> denotes averages of n_x and "
    "n_y over all 1<=i<=N and over all realizations of the random samples, n_x(i) and n_y(i) are the number of "
    "points in the region ||x_i - x_j||<=\\epsilon_x(i)/2 and ||y_i - y_j||<=\\epsilon_y(i)/2, \\epsilon_x(i) and "
    "\\epsilon_y(i) are the edge lengths of the smallest rectangle around point i containing k nearest neighbour.")

# Heading 2 - MicroArray Dataset
doc.add_heading('MicroArray Dataset', level=1)

# One-paragraph description of the two expression datasets and the role each
# one plays in the analysis.
p3 = doc.add_paragraph(
    "The microarray expression datasets are obtained from the ArrayExpress database "
    "submitted by Stockel et. al. and Toepel et. al.. "
    "The analysis was initially carried on in the Stockel et. al. dataset and the "
    "Toepel et. al. dataset was later used to verify the results."
)

# Heading 3 - Analysis

doc.add_heading('Analysis',level=1)

## Sub Heading 1 -  Stockel
doc.add_heading('Stockel Dataset',level=2)

# Method description for the Stockel dataset; the pictures added right after it
# are the matrices the text refers to.
p4 = doc.add_paragraph("The expression profile of the genes in the Stockel Dataset were first filtered to include "
                    "only those genes that are annotated as circadian clock protein or two-component sensors and regulators. The gene to "
                    "annotation mapping was obtained from the cyanothece genomic database submitted by Welsh et. al.. Using the expression "
                    "profiles of those selected genes, the mutual information between the individual clock genes and the sensors and "
                    "regulators were obtained. To visualize the interactions, correlation matrices were created as shown below. "
                    "The X-axis contains the Kai clock genes and the y-axis contains the functionally annotated sensors and regulators "
                    "in Cyanothece. The value corresponding to the x_i th column and the y_ith row represents the mutual information score "
                    "between the x_i th Kai Gene and the y_ith sensor/regulator. The interaction of the clock proteins with the two-"
                    "component sensors, regulators and the two-component hybrid sensor & regulator is given in Figure 1, Figure 2 and "
                    "Figure 3 respectively.")


# Add 3 pictures (sensors, regulators, hybrids), each 7.25 inches wide
doc.add_picture('Report_images/Two Component Sensors_Matrix_Stockel.png',width=Inches(7.25))
doc.add_picture('Report_images/Two Component Regulators_Matrix_Stockel.png',width=Inches(7.25))
doc.add_picture('Report_images/Two Component Hybrids_Matrix_Stockel.png',width=Inches(7.25))

# Clock-gene vs clock-gene matrix
p5 = doc.add_paragraph('The correlation matrix between the clock genes is also presented below.')
doc.add_picture('Report_images/Kai Genes_Matrix_Stockel.png',width=Inches(7.25))

## Sub Heading 2 -  Toepel
doc.add_heading('Toepel Dataset', level=2)

# Verification paragraph for the Toepel dataset.
p4 = doc.add_paragraph(
    "The expression profile of the genes in the Toepel Dataset were used to verify "
    "the results obtained from the stockel dataset. "
    "The interaction of the clock proteins with the two-component sensors, regulators "
    "and the two-component hybrid sensor & regulator is given in "
    "Figure 5, Figure 6 and Figure 7 respectively."
)


# Three matrices (sensors, regulators, hybrids), each 7.25 inches wide.
f1 = doc.add_picture(
    'Report_images/Two Component Sensors_Matrix_Toepel.png',
    width=Inches(7.25),
)
doc.add_picture(
    'Report_images/Two Component Regulators_Matrix_Toepel.png',
    width=Inches(7.25),
)
doc.add_picture(
    'Report_images/Two Component Hybrids_Matrix_Toepel.png',
    width=Inches(7.25),
)

# Clock-gene vs clock-gene matrix for the same dataset.
p6 = doc.add_paragraph(
    'The correlation matrix between the clock genes is also presented below.'
)
doc.add_picture(
    'Report_images/Kai Genes_Matrix_Toepel.png',
    width=Inches(7.25),
)

# Heading 4 - The clock gene copies that interact with the maximum number of sensors and regulators

def _add_kai_table(document, records):
    """Append a five-column KaiABC summary table to ``document`` and return it.

    Parameters
    ----------
    document
        Object exposing ``add_table(rows=..., cols=...)``; here the report
        document.
    records : dict
        Maps a tuple of gene names such as ``('kaiA', 'kaiB1', 'kaiC1')`` to a
        list ``[sensors, regulators, hybrids, total]``.

    Returns
    -------
    The table object returned by ``document.add_table``.

    Notes
    -----
    Rows are written in descending order of the ``total`` entry. The sort is
    explicit because the accompanying text promises that order, while plain
    dict iteration only follows insertion order from Python 3.7 onwards.
    Entries with equal totals keep the iteration order of ``records``.
    """
    table = document.add_table(rows=1,cols=5)

    titles = ('Kai Combination', 'No. of sensors', 'No. of regulators',
              'No. of hybrids', 'Total')
    header_cells = table.rows[0].cells
    for column, title in enumerate(titles):
        header_cells[column].text = title

    ranked = sorted(records.items(), key=lambda item: item[1][3], reverse=True)
    for combination, counts in ranked:
        row_cells = table.add_row().cells
        # Collapse ('kaiA', 'kaiB1', 'kaiC1') into the label 'kaiAB1C1' by
        # stripping the three-character 'kai' prefix from each gene name.
        row_cells[0].text = 'kai' + ''.join(name[3:] for name in combination)
        for column, count in enumerate(counts[:4], 1):
            row_cells[column].text = str(count)
    return table


doc.add_heading('Finding the most active set of KaiABC combination',level=1)

p7 = doc.add_paragraph("Since the clock genes have multiple copies, the objective of this analysis is to find the "
"KaiA, KaiB and KaiC combination that are correlated to the maximum number of sensors and regulators.")


## Subheading 1

doc.add_heading('Stockel Dataset',level=2)

p8 = doc.add_paragraph("The table below presents the KaiABC combinations and the number of common sensors, regulators "
"and hybrid sensors & regulators that they interact with. The table is arranged in descending order of the total "
"number of common sensors, regulators and hybrids that the given KaiABC combination interacts with.")

# Each value is [sensors, regulators, hybrids, total] for that Kai combination.
records_337 = {('kaiA', 'kaiB1', 'kaiC1'): [13, 19, 19, 51],
 ('kaiA', 'kaiB1', 'kaiC2'): [13, 19, 19, 51],
 ('kaiA', 'kaiB3', 'kaiC2'): [10, 13, 13, 36],
 ('kaiA', 'kaiB3', 'kaiC1'): [9, 12, 12, 33],
 ('kaiA', 'kaiB4', 'kaiC2'): [6, 12, 12, 30],
 ('kaiA', 'kaiB4', 'kaiC1'): [4, 12, 12, 28]}

table1 = _add_kai_table(doc, records_337)

## Subheading 2

doc.add_heading('Toepel Dataset',level=2)

p9 = doc.add_paragraph('A similar table as above is presented below using the Toepel Dataset.')

# Same layout as records_337: [sensors, regulators, hybrids, total].
records_386 = {('kaiA', 'kaiB1', 'kaiC2'): [5, 5, 5, 15],
 ('kaiA', 'kaiB4', 'kaiC1'): [3, 6, 6, 15],
 ('kaiA', 'kaiB3', 'kaiC2'): [3, 5, 5, 13],
 ('kaiA', 'kaiB1', 'kaiC1'): [2, 5, 5, 12],
 ('kaiA', 'kaiB4', 'kaiC2'): [2, 5, 5, 12],
 ('kaiA', 'kaiB3', 'kaiC1'): [2, 4, 4, 10]}

table2 = _add_kai_table(doc, records_386)

# Heading 5 - Case studies

doc.add_heading('Case Study of some genes of interest',level=1)

# Adjacent literals are concatenated with nothing in between, so each fragment
# ends with the space that separates it from the next one.
p10 = doc.add_paragraph(
    'Here we present case studies of some genes that either were referenced in our previous report or were '
    'proposed to be a part of the signaling network in other cyanobacteria or were directly obtained from this study. The complete '
    'list of interactions is given in Appendix B. If it is a sensor, the list of interaction with the regulators are given in the '
    'list and vice versa.')

# Subheading 0 - cce_1983/aphA
doc.add_heading('cce_1983/aphA', level=2)
# Paragraph text is assembled from adjacent literals; each fragment carries its
# own boundary space.
p11 = doc.add_paragraph(
    'Cce_1983 has been shown to have a very high mutual information with multiple clock genes in both '
    'studies using Stockel or Toepel dataset. '
    'This histidine kinase has been functionally annotated as a probable phytochrome A '
    'in the Cyanothece Genomic Database developed by Welsh et. al.. '
    'Moreover, it also shares high mutual information score with '
    'the other genes of interest presented in this case study.'
)
# Subheading 1 - cce_0678
doc.add_heading('cce_0678',level=2)
# Adjacent literals are concatenated with nothing in between, so each fragment
# ends with the space that separates it from the next one.
p12 = doc.add_paragraph(
    'Cce_0678 has previously been reported to share a high correlation with the RubisCo genes. In this study '
    'it is seen to share a high mutual information score with a number of important genes like cce_1983, cce_0220, cce_0164, cce_2232 '
    'which have all been annotated as sensors. While cce_1983 has already been discussed above, cce_0220, cce_0164 and cce_2232 have a '
    'very high sequence similarity with the circadian input kinase cikA of Synechococcus 7942. The E-values of the BLAST search results '
    'are all presented in Appendix A.')
# Subheading 2 - cce_0298/rpaA
doc.add_heading('cce_0298/rpaA',level=2)
# Adjacent literals are concatenated with nothing in between, so each fragment
# ends with the space that separates it from the next one.
p13 = doc.add_paragraph(
    'Cce_0298/rpaA has been shown to be a master regulator in Synechococcus. In this study as well, it is seen '
    'to interact with multiple clock genes, sensors and regulators. However, while in Synechococcus, sasA and rpaA belong to the '
    'two-component system, in Cyanothece, cce_1751, which has the highest sequence similarity to sasA of Synechococcus, does not '
    'have any correlation with rpaA. On the other hand, cce_1751 has a very high mutual information score with rpaB or cce_4002. '
    'Therefore, sasA or cce_1751 may not be the sensor that interacts with rpaA. Our analysis gives a list of probable sensors that '
    'interact with rpaA, among which the most probable ones are cce_0888 and cce_2546 both of which also have a high sequence similarity '
    'with sasA of Synechococcus.')
# Subheading 3 - cce_1751/sasA
doc.add_heading('cce_1751/sasA', level=2)
# Paragraph text is assembled from adjacent literals; each fragment carries its
# own boundary space.
p14 = doc.add_paragraph(
    'Cce_1751 has the highest sequence similarity with the sasA kinase of Synechococcus '
    'among all genes in Cyanothece. '
    'Unlike Synechococcus however, cce_1751 is most closely associated with rpaB '
    'instead of rpaA as evident from the analysis of both the datasets.'
)
# Subheading 4 - cce_4002/rpaB
doc.add_heading('cce_4002/rpaB',level=2)
# Adjacent literals are concatenated with nothing in between, so each fragment
# ends with the space that separates it from the next one.
p15 = doc.add_paragraph(
    'From this analysis, it is clear that rpaB is also a key regulator of the circadian output pathway along '
    'with rpaA because of its association with multiple clock genes, sasA like sensor cce_1751 and multiple regulators. It has also '
    'been shown experimentally by Hanaoka et. al. that RpaB binds to the kaiBC promoter and is a part of the circadian output pathway.')
# Subheading 5 - cce_4751/CikA
doc.add_heading('cce_4751/cikA',level=2)
# Adjacent literals are concatenated with nothing in between, so the first
# fragment ends with the space that separates it from the second.
p16 = doc.add_paragraph(
    'Among the two-component hybrid sensor & regulators, cce_4751 stands out because of its high correlation with '
    'multiple clock genes and high sequence similarity with the cikA gene of Synechococcus.')
# Subheading 6 - cce_0888/nblS
doc.add_heading('cce_0888/nblS',level=2)
# Adjacent literals are concatenated with nothing in between, so the first
# fragment ends with the space that separates it from the second.
p17 = doc.add_paragraph(
    'Cce_0888 is another interesting gene that is annotated as a two-component sensor histidine kinase and correlates '
    'highly with multiple clock genes and the regulator rpaA as seen in the analysis.')

# Heading 6 - Conclusion

doc.add_heading('Conclusion',level=1)

# Lead-in sentence followed by one 'List Bullet' paragraph per finding.
p18 = doc.add_paragraph('From the above analysis we can conclude that:')
p23 = doc.add_paragraph('cce_1983/aphA might be a photoreceptor that interacts with the clock.',style='List Bullet')
p19 = doc.add_paragraph("cce_0678's importance as a regulator is further highlighted in this study.",style='List Bullet')
p20 = doc.add_paragraph("rpaA and rpaB are equally important in the cyanothece circadian output pathway.",style='List Bullet')
p21= doc.add_paragraph("cce_1751/sasA may not be the kinase that interacts with rpaA. On the other hand, it may regulate "
            "rpaB.",style='List Bullet')
p22 = doc.add_paragraph("The Kai copies may be present not just to maintain robustness. There might be two separate clocks that "
            "operate simultaneously and regulate similar transcription factors for different processes. This can be the reason why cyanothece has multiple "
            "copies of not only the clock genes but also the kinases and regulators that are supposed to interact with the clock. Maybe that's how they separate "
            "two conflicting processes, Photosynthesis and Nitrogen Fixation.",style='List Bullet')

In [6]:
# Append a page break after the conclusion; the call returns the paragraph
# that holds the break, which is what the cell output displays.
doc.add_page_break()

<docx.text.paragraph.Paragraph at 0x10aa85a50>

In [7]:
# Write the assembled report to 'Report.docx' (a relative path).
doc.save('Report.docx')