# Imported the entire genome of cyanothece 51142

In [1]:
import numpy as np
import pandas as pd
import itertools

In [2]:
# Read the genome spreadsheet, using its first column as the index, then
# discard that index so the rows are simply numbered 0..n-1.
GenCyano = pd.read_excel("0 - EAWData/GenSeq.xlsx", index_col=0)
GenCyano = GenCyano.reset_index(drop=True)

In [3]:
# Preview the first rows of the freshly loaded genome table.
GenCyano.head()

Unnamed: 0,ORF,Function,CommonName
0,cce_0001,hypothetical protein,cce_0001
1,cce_0002,alcohol dehydrogenase,cce_0002
2,cce_0003,hypothetical protein,cce_0003
3,cce_0004,cation efflux system membrane protein,czcA
4,cce_0005,conserved hypothetical protein,cce_0005


# Gathered all the identified regulators in cyanothece 51142 for their cyclicity and the bottlenecks to capture their expression time
Source: 

2. A model of cyclic transcriptomic behavior in the cyanobacterium Cyanothece sp. ATCC 51142†

Jason E. McDermott, Christopher S. Oehmen, Lee Ann McCue, Eric Hill, Daniel M. Choi, Jana Stöckel, Michelle Liberton, Himadri B. Pakrasi and Louis A. Sherman

https://pubs.rsc.org/en/content/articlelanding/2011/MB/c1mb05006k#!divAbstract

In [4]:
import tabula

In [5]:
# Regulator table: extracted from pages 10-13 of the article PDF; missing
# cells are replaced by the string "unknown".
# NOTE(review): the code treats each read_pdf result as a single DataFrame —
# confirm this holds for the installed tabula version.
df = tabula.read_pdf("2 - JEMData/c1mb05006k.pdf", pages=['10', '11', '12', '13'])
df = df.fillna("unknown")

# Transition table: extracted from page 5 of a second PDF. The first extracted
# row holds the real column names and rows 1-25 hold the data.
trandf = tabula.read_pdf("Papers/2.pdf", pages=["5"])
new_header = trandf.iloc[0]
trandf = trandf.iloc[1:26]
trandf.columns = new_header
# Discard the rows whose Transition entry is '*'.
trandf = trandf.loc[trandf['Transition'] != '*']

# Added their cyclicity and expression

In [6]:
def funcparse(orf):
    '''Annotate one ORF with regulator status, cyclicity and expression.

    The ORF is looked up in two module-level tables: ``df`` (columns ``ORF``
    and ``Cyclicity``) and ``trandf`` (columns ``ID`` and ``Transition``).

    Returns a pandas Series with three entries:
      * ``TfBool``     - True when ``orf`` occurs in ``df['ORF']``, else False.
      * ``Cyclicity``  - first matching ``Cyclicity`` value from ``df``,
                         or 'unknown' when there is no match.
      * ``Expression`` - first matching ``Transition`` value from ``trandf``,
                         or 'unknown' when there is no match.
    '''
    # One lookup per table; an empty result means the ORF is not listed there.
    cyc_hits = df.loc[df.ORF == orf, 'Cyclicity'].values
    expr_hits = trandf.loc[trandf.ID == orf, 'Transition'].values

    is_tf = len(cyc_hits) > 0
    has_transition = len(expr_hits) > 0

    return pd.Series({'TfBool': is_tf,
                      'Cyclicity': cyc_hits[0] if is_tf else 'unknown',
                      'Expression': expr_hits[0] if has_transition else 'unknown'})
    
# Build the TfBool / Cyclicity / Expression columns one ORF at a time, then
# attach them to the genome table by matching on the row index.
annotations = GenCyano.ORF.apply(funcparse)
GenCyano = GenCyano.merge(annotations, left_index=True, right_index=True)

In [7]:
# Preview the table again, now with the TfBool, Cyclicity and Expression columns.
GenCyano.head()

Unnamed: 0,ORF,Function,CommonName,TfBool,Cyclicity,Expression
0,cce_0001,hypothetical protein,cce_0001,False,unknown,unknown
1,cce_0002,alcohol dehydrogenase,cce_0002,False,unknown,unknown
2,cce_0003,hypothetical protein,cce_0003,False,unknown,unknown
3,cce_0004,cation efflux system membrane protein,czcA,False,unknown,unknown
4,cce_0005,conserved hypothetical protein,cce_0005,False,unknown,unknown


In [8]:
# Every regulator (TfBool is True) gets its own empty dict to hold its
# target-gene interactions; every other row gets NaN.
GenCyano['TFTG'] = [dict() if is_tf == True else np.nan
                    for is_tf in GenCyano['TfBool']]

# From the 6 papers, include the interactions  

Interaction signs: -1 denotes an inhibitor, +1 denotes an activator, and 0 denotes an interaction whose sign is unknown.

**1. A model of the circadian clock in the cyanobacterium Cyanothece sp. ATCC 51142**

Nguyen Xuan Vinh, Madhu Chetty, Ross Coppel, Sandeep Gaudana & Pramod P Wangikar

https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-14-S2-S14


In [9]:
# Include the clock mechanism.
# Only interactions backed by valid images or files were included.
# Each entry maps a regulator ORF to {target ORF: sign}; the edges are stored
# under key '1' (paper 1) in that regulator's TFTG dict.
clock_edges = {
    'cce_0424': {'cce_2350': 1, 'cce_4751': 1},
    'cce_0423': {'cce_0145': 1},
    'cce_0435': {'cce_0423': 1, 'cce_1751': 1},
    'cce_4716': {'cce_0424': 1, 'cce_0422': 1, 'cce_3317': 1, 'cce_0298': 1},
    'cce_0298': {'cce_0435': 1, 'cce_2642': 1},
    'cce_1751': {'cce_4716': 1},
}

for regulator, targets in clock_edges.items():
    # .values[0] is the regulator's own dict, so update() mutates it in place.
    GenCyano.loc[GenCyano.ORF == regulator, 'TFTG'].values[0].update({'1': targets})

**2. A model of cyclic transcriptomic behavior in the cyanobacterium Cyanothece sp. ATCC 51142†**

Jason E. McDermott, Christopher S. Oehmen, Lee Ann McCue, Eric Hill, Daniel M. Choi, Jana Stöckel, Michelle Liberton, Himadri B. Pakrasi and Louis A. Sherman

https://pubs.rsc.org/en/content/articlelanding/2011/MB/c1mb05006k#!divAbstract

In [10]:
# The paper infers interactions with 2 methods; only the interactions common
# to both were taken.
# Each entry maps a regulator ORF to {target ORF: sign}; the edges are stored
# under key '2' (paper 2) in that regulator's TFTG dict.
paper2_edges = {
    'cce_0298': {'cce_3378': 1, 'cce_2330': 1, 'cce_3594': 1, 'cce_0461': 1},
    'cce_1898': {'cce_0298': -1, 'cce_0461': -1, 'cce_0559': 1, 'cce_0560': 1, 'cce_0561': 1},
    'cce_0644': {'cce_0461': 1},
    'cce_3298': {'cce_0644': 1, 'cce_0461': 1},
    'cce_4141': {'cce_3298': 1},
    'cce_0678': {'cce_3164': 1, 'cce_3166': 1},
}

for regulator, targets in paper2_edges.items():
    # .values[0] is the regulator's own dict, so update() mutates it in place.
    GenCyano.loc[GenCyano.ORF == regulator, 'TFTG'].values[0].update({'2': targets})

**3. Metabolic Rhythms of the Cyanobacterium Cyanothece sp. ATCC 51142 Correlate with Modeled Dynamics of Circadian Clock**

Jan Červený* and Ladislav Nedbal*,†

https://journals.sagepub.com/doi/pdf/10.1177/0748730409338367

**4. Global transcriptomic analysis of Cyanothece 51142 reveals robust diurnal oscillation of central metabolic processes**

Jana Stöckel, Eric A. Welsh, Michelle Liberton, Rangesh Kunnvakkam, Rajeev Aurora, and Himadri B. Pakrasi

https://www.pnas.org/content/105/16/6156

In [11]:
# Import the expression and metabolic pathway file.
# Spreadsheet column 0 becomes the index; columns 1, 3 and 4 are kept as data.
jsdf = pd.read_excel(
    "4 - JSData/EPinfo.xlsx",
    index_col=0,
    usecols=[0, 1, 3, 4],
)

In [12]:
# Trim surrounding whitespace from the ORF names before they are matched
# against GenCyano.ORF.
jsdf['ORF'] = jsdf.ORF.str.strip()

Add the pathway information for every ORF. Add the expression information only when no expression value has been recorded yet (i.e. it is still 'unknown'). The cell below also shows how to set a value in a new column for one specific row index.

In [13]:
for orf in jsdf['ORF']:
    in_genome = GenCyano.ORF == orf
    if not in_genome.any():
        # Report ORFs from the pathway file that the genome table does not contain.
        print(orf)
        continue

    source_rows = jsdf.loc[jsdf.ORF == orf]

    # Address a single row label so that the 'pathway' column can be created
    # on the fly the first time a value is assigned.
    first_label = GenCyano.index[in_genome][0]
    GenCyano.loc[first_label, 'pathway'] = source_rows['pathway'].values[0]

    # Expression is filled in only when no earlier source supplied a value.
    if GenCyano.loc[in_genome, 'Expression'].values[0] == 'unknown':
        GenCyano.loc[in_genome, 'Expression'] = source_rows['Expression'].values[0]

Contig0_4506_4631407


**5. Identifying Regulatory Changes to Facilitate Nitrogen Fixation in the Nondiazotroph Synechocystis sp. PCC 6803**

Thomas J. Mueller, Eric A. Welsh, Himadri B. Pakrasi, Costas D. Maranas

https://pubs.acs.org/doi/full/10.1021/acssynbio.

In [14]:
# Import the TJM data file made before.
tjmdf = pd.read_excel("5 - TJMData/interTJM.xlsx", index_col=0)

# a: ORFs flagged as regulators in the genome table.
# b: regulator ORFs named in the TJM file.
alltfs = GenCyano.loc[GenCyano['TfBool'] == True, 'ORF']
tjmtfs = tjmdf['TF ORF']
a, b = set(alltfs), set(tjmtfs)

# TJM regulators that are not flagged as regulators in the genome table.
b.difference(a)

{'cce_0556', 'cce_2943', 'cce_4219'}

In [15]:
# NOTE(review): this cell recomputes the same set comparison as the cell
# directly above it — confirm whether both are needed.
alltfs = GenCyano.loc[GenCyano['TfBool'] == True, 'ORF']
tjmtfs = tjmdf['TF ORF']
a = set(alltfs)
b = set(tjmtfs)

# TJM regulators that are not flagged as regulators in the genome table.
b.difference(a)

{'cce_0556', 'cce_2943', 'cce_4219'}

The three extra ORFs are 1 ferredoxin and 2 kinases. They are excluded from the import below.

In [16]:
# b-(b-a) keeps only the TJM regulators that are also flagged in GenCyano.
for tforf in b-(b-a):
    # All TJM rows for this regulator, filtered once.
    tf_rows = tjmdf.loc[tjmdf['TF ORF'] == tforf]
    # Map each target ORF to its 'cjval' weight.
    innerdict = dict(zip(tf_rows['TG ORF'], tf_rows['cjval']))
    # Store the edges under key '5' (paper 5) in the regulator's TFTG dict.
    GenCyano.loc[GenCyano.ORF == tforf, 'TFTG'].values[0].update({'5': innerdict})

**6. TRANSCRIPTIONAL ANALYSIS OF THE UNICELLULAR, DIAZOTROPHIC CYANOBACTERIUM CYANOTHECE SP. ATCC 51142 GROWN UNDER SHORT DAY/NIGHT CYCLES**

Jörg Toepel, Jason E. McDermott, Tina C. Summerfield, Louis A. Sherman

https://onlinelibrary.wiley.com/doi/full/10.1111/j.1529-8817.2009.00674.x

Table 1 of the paper lists some genes that follow a diurnal or circadian rhythm and states whether they are expressed in the light or in the dark. Our objective is to manually check whether those details have already been included in our model and to include them if not.

In [17]:
def numRange(x,y):
    '''Build the ORF identifiers for every integer from x to y, inclusive.

    Each number is left-padded with zeros to at least four characters and
    prefixed with 'cce_', e.g. numRange(32, 33) -> ['cce_0032', 'cce_0033'].
    Numbers longer than four characters are left unpadded, and an empty
    list is returned when y < x.
    '''
    return ['cce_' + str(number).rjust(4, '0') for number in range(x, y + 1)]

include Diurnal D up-regulated genes

In [18]:
# ORFs to label as diurnal and dark-expressed ('D').
ORFS = (numRange(2315, 2319) + ['cce_3746', 'cce_2536'] + numRange(2236, 2237)
        + numRange(32, 33) + numRange(1854, 1856))

for orf in ORFS:
    hits = GenCyano.ORF == orf
    if not hits.any():
        # Report ORFs that the genome table does not contain.
        print(orf)
        continue

    # Only values that earlier sources left as 'unknown' are filled in.
    if GenCyano.loc[hits, 'Expression'].values[0] == 'unknown':
        GenCyano.loc[hits, 'Expression'] = 'D'
    if GenCyano.loc[hits, 'Cyclicity'].values[0] == 'unknown':
        GenCyano.loc[hits, 'Cyclicity'] = 'diurnal'

include Diurnal L up-regulated genes

In [19]:
# PSI and PSII genes w/o reaction centers
def CycFunc(row):
    '''Label a row as diurnal and light-expressed unless it is a reaction-center gene.

    Rows whose 'Function' text contains 'reaction center' are returned
    unchanged. For every other row, 'Cyclicity' is set to 'diurnal' and
    'Expression' to 'L', overwriting whatever was there. The row is modified
    in place and the same object is returned.
    '''
    if 'reaction center' in row['Function']:
        return row

    row['Cyclicity'] = 'diurnal'
    row['Expression'] = 'L'
    return row

# Run CycFunc over every row of each photosystem pathway and write the rows
# back. Unlike the 'unknown'-only fills in the neighbouring cells, this
# overwrites any existing Cyclicity/Expression value.
for pathway_name in ('Photosystem I', 'Photosystem II'):
    in_pathway = GenCyano.pathway == pathway_name
    GenCyano.loc[in_pathway] = GenCyano.loc[in_pathway].apply(CycFunc, axis=1)

In [20]:
# ORFs to label as diurnal and light-expressed ('L').
ORFS = (['cce_0636', 'cce_1214', 'cce_1222', 'cce_1223']
        + numRange(1681, 1689) + numRange(1690, 1693))

for orf in ORFS:
    hits = GenCyano.ORF == orf
    if not hits.any():
        # Report ORFs that the genome table does not contain.
        print(orf)
        continue

    # Only values that earlier sources left as 'unknown' are filled in.
    if GenCyano.loc[hits, 'Expression'].values[0] == 'unknown':
        GenCyano.loc[hits, 'Expression'] = 'L'
    if GenCyano.loc[hits, 'Cyclicity'].values[0] == 'unknown':
        GenCyano.loc[hits, 'Cyclicity'] = 'diurnal'

include Circadian L up-regulated genes

In [21]:
# ORFs to label as circadian and light-expressed ('L').
ORFS = (numRange(2908, 2909) + numRange(2651, 2656) + numRange(2812, 2813)
        + numRange(4482, 4489) + numRange(989, 990) + ['cce_0267']
        + numRange(2106, 2112) + numRange(969, 970) + ['cce_4542'])

for orf in ORFS:
    hits = GenCyano.ORF == orf
    if not hits.any():
        # Report ORFs that the genome table does not contain.
        print(orf)
        continue

    # Only values that earlier sources left as 'unknown' are filled in.
    if GenCyano.loc[hits, 'Expression'].values[0] == 'unknown':
        GenCyano.loc[hits, 'Expression'] = 'L'
    if GenCyano.loc[hits, 'Cyclicity'].values[0] == 'unknown':
        GenCyano.loc[hits, 'Cyclicity'] = 'circadian'

include Circadian D up-regulated genes

In [22]:
# ORFs to label as circadian and dark-expressed ('D').
# NOTE(review): numRange(545,588) already contains numRange(575,576), so one
# of the two ranges may be a typo — verify against Table 1 of the paper.
ORFS = (numRange(1975, 1977)
        + ['cce_1603', 'cce_3465', 'cce_3477', 'cce_3126']
        + numRange(1063, 1064) + numRange(545, 588) + numRange(575, 576)
        + ['cce_0298'])

for orf in ORFS:
    hits = GenCyano.ORF == orf
    if not hits.any():
        # Report ORFs that the genome table does not contain.
        print(orf)
        continue

    # Only values that earlier sources left as 'unknown' are filled in.
    if GenCyano.loc[hits, 'Expression'].values[0] == 'unknown':
        GenCyano.loc[hits, 'Expression'] = 'D'
    if GenCyano.loc[hits, 'Cyclicity'].values[0] == 'unknown':
        GenCyano.loc[hits, 'Cyclicity'] = 'circadian'

In [23]:
# Spot-check one ORF from the circadian 'D' list filled in by the previous cell.
GenCyano.loc[GenCyano.ORF=='cce_0576']

Unnamed: 0,ORF,Function,CommonName,TfBool,Cyclicity,Expression,TFTG,pathway
574,cce_0576,ferrous iron transport protein A,feoA2,False,circadian,D,,


# The Final Dataframe to excel for use by the class

In [24]:
# Write the assembled table to an Excel workbook.
# NOTE(review): the TFTG column holds dict objects (or NaN) — confirm they are
# written to the spreadsheet in a form the downstream reader can parse.
GenCyano.to_excel('GRNCyanoDB/GenCyanoDB.xlsx')