In [274]:
import pandas as pd
import re
import matplotlib.pyplot as plt

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last
# one, so a single cell can show e.g. both `df.shape` and `df.head()`.
InteractiveShell.ast_node_interactivity = "all"

# Data import and transformation

## Data import from RegulonDB:
Regulatory interactions between a Transcription Factor and its regulated entity - promoter, gene, or Transcriptional Units (subset of RISet).

`TF-RIset_selected`:
1) riId. Regulatory interaction (RI) identifier assigned by RegulonDB
2) riType. Regulatory interaction type [tf-promoter, tf-tu, tf-gene]; only TF-mediated interactions are considered here.
3) regulatorId. Transcription Factor (TF) identifier assigned by RegulonDB
4) regulatorName. Regulator name
5) cnfName. regulator active conformation name [now: "reg_active_conf_Name"]
19) targetTuOrGene. Transcription unit or gene (id:name) regulated by the TF [now: "TF_target_TuOrGene"]
27) riPMIDS. PMIDS of the regulatory interaction

In [275]:
# Load the selected TF regulatory-interaction columns exported from RegulonDB
# (tab-separated) and replace the numbered headers with shorter names.
tf_riset = pd.read_csv(
    "./Data from RegulonDB/TF-RIset_selected.tsv",
    sep="\t",
).set_axis(
    [
        'riId',
        'riType',
        'regulatorId',
        'regulatorName',
        'reg_active_conf_Name',
        'TF_target_TuOrGene',
        'riPMIDS',
    ],
    axis="columns",
)

tf_riset.shape
tf_riset.head()

(5582, 7)

Unnamed: 0,riId,riType,regulatorId,regulatorName,reg_active_conf_Name,TF_target_TuOrGene,riPMIDS
0,RDBECOLIRIC00001,tf-promoter,RDBECOLITFC00023,GadW,GadW,RDBECOLITUC03133:slp-dctR,12730179;18808381
1,RDBECOLIRIC00002,tf-promoter,RDBECOLITFC00023,GadW,GadW,RDBECOLITUC03315:hdeAB-yhiD,12730179;18083817;18808381
2,RDBECOLIRIC00003,tf-promoter,RDBECOLITFC00088,GalR,GalR-D-galactose,RDBECOLITUC02515:galP,19007420
3,RDBECOLIRIC00004,tf-promoter,RDBECOLITFC00033,GalS,GalS-D-galactose,RDBECOLITUC02515:galP,19007420
4,RDBECOLIRIC00005,tf-promoter,RDBECOLITFC00088,GalR,GalR-D-galactose,RDBECOLITUC02515:galP,19007420


Transcription units with information of operon, promoter and terminator.

`TUset_selected`:
1) tuId. Transcription Unit identifier assigned by RegulonDB
2) tuName. Transcription unit name
3) operonName. Operon name containing the transcription unit
4) tuGenes. Name of the gene(s) contained in the transcription unit

In [276]:
# Load the selected transcription-unit columns exported from RegulonDB
# (tab-separated) and replace the numbered headers with shorter names.
tuset = pd.read_csv(
    "./Data from RegulonDB/TUset_selected.tsv",
    sep="\t",
).set_axis(
    ['tuId', 'tuName', 'operonName', 'tuGenes'],
    axis="columns",
)

tuset.shape
tuset

(3718, 4)

Unnamed: 0,tuId,tuName,operonName,tuGenes
0,RDBECOLITUC00080,yjtD,yjtD,yjtD;
1,RDBECOLITUC00081,yaaX,yaaX,yaaX;
2,RDBECOLITUC00082,yaaA,yaaA,yaaA;
3,RDBECOLITUC00083,yaaJ,yaaJ,yaaJ;
4,RDBECOLITUC00084,mog,mog,mog;
...,...,...,...,...
3713,RDBECOLITUC03879,ydfABC-dicFB-ydfD,dicB-ydfDE-insD-intQ,ydfD;dicB;dicF;ydfC;ydfB;ydfA;
3714,RDBECOLITUC03880,fepA-entD,fepA-entD,entD;fepA;
3715,RDBECOLITUC03881,fhuACDB,fhuACDB,fhuB;fhuD;fhuC;fhuA;
3716,RDBECOLITUC03882,glnLG,glnAZLG,glnG;glnL;


Transcription factors and their conformations (subset of RegulatorSet).

`TFset_selected`:
1) tfId. Transcription Factor (TF) identifier assigned by RegulonDB
2) tfName. TF name
3) tfSynonyms. List of TF synonyms
4) geneCodingForTF. Gene coding for the TF
5) tfActiveConformations. Active conformations of the TF
6) tfInactiveConformations. Inactive conformations of the TF
17) tfConformationPMID. Conformation reference identifier (PMID)

In [277]:
# Load the selected transcription-factor columns exported from RegulonDB
# (tab-separated) and replace the numbered headers with shorter names.
tfset_conf = pd.read_csv(
    "./Data from RegulonDB/TFset_selected.tsv",
    sep='\t',
).set_axis(
    [
        'tfId',
        'tfName',
        'tfSynonyms',
        'geneCodingForTF',
        'tfActiveConformations',
        'tfInactiveConformations',
        'tfConformationPMID',
    ],
    axis="columns",
)

tfset_conf.shape
tfset_conf.head()

(240, 7)

Unnamed: 0,tfId,tfName,tfSynonyms,geneCodingForTF,tfActiveConformations,tfInactiveConformations,tfConformationPMID
0,RDBECOLITFC00001,ExuR,"ExuR;negative regulator of exu regulon, exuT, ...",exuR,"ExuR;ExuR-&alpha;-D-galacturonate, &alpha;-D-g...",ExuR-&alpha;-D-glucuronate,6357945
1,RDBECOLITFC00002,AsnC,AsnC,asnC,DNA-binding transcriptional dual regulator AsnC,AsnC-L-asparagine,6357950; 7686882; 16528101; 2864330
2,RDBECOLITFC00003,Dan,Dan;TtdR;YgiP,ttdR,Dan-L-tartrate DNA-binding transcriptional act...,,19661178; 20156994
3,RDBECOLITFC00004,EbgR,EbgR,ebgR,EbgR;EbgR,,
4,RDBECOLITFC00005,AgaR,YhaW;AgaR,agaR,AgaR;AgaR,,14731281


In [278]:
# Active-conformation names that mention phosphorylation (either capitalisation).
regex = r'.*[Pp]hosphorylated.*'

# Vectorised filter. `na=False` treats a missing conformation name as
# "no match" instead of raising a TypeError inside `re.search`.
# The index is reset so the result is numbered 0..n-1.
temp_col = (
    tf_riset['reg_active_conf_Name']
    .loc[lambda names: names.str.contains(regex, na=False)]
    .reset_index(drop=True)
    .rename('active_when_phosphorylated')
)
temp_col

0      DNA-binding transcriptional activator EvgA-pho...
1      DNA-binding transcriptional dual regulator Arc...
2      DNA-binding transcriptional dual regulator Nar...
3      DNA-binding transcriptional dual regulator Nar...
4      DNA-binding transcriptional dual regulator Nar...
                             ...                        
817                                  CpxR-phosphorylated
818                                  CpxR-phosphorylated
819    DNA-binding transcriptional dual regulator Pho...
820    DNA-binding transcriptional dual regulator Pho...
821    DNA-binding transcriptional activator ZraR-pho...
Name: active_when_phosphorylated, Length: 822, dtype: object

In [279]:
# Target identifier = the first 16 characters of "id:name"
# (e.g. "RDBECOLITUC03133:slp-dctR" -> "RDBECOLITUC03133").
ins_col = tf_riset['TF_target_TuOrGene'].str[:16]

# Idempotent: `DataFrame.insert` raises ValueError if the column already
# exists, so on a re-run of this cell the column is overwritten instead.
if 'TF_target_Id' in tf_riset.columns:
    tf_riset['TF_target_Id'] = ins_col
else:
    # Place the identifier right after the column it was derived from.
    tf_riset.insert(
        tf_riset.columns.get_loc('TF_target_TuOrGene') + 1,
        'TF_target_Id',
        ins_col,
    )

tf_riset

Unnamed: 0,riId,riType,regulatorId,regulatorName,reg_active_conf_Name,TF_target_TuOrGene,TF_target_Id,riPMIDS
0,RDBECOLIRIC00001,tf-promoter,RDBECOLITFC00023,GadW,GadW,RDBECOLITUC03133:slp-dctR,RDBECOLITUC03133,12730179;18808381
1,RDBECOLIRIC00002,tf-promoter,RDBECOLITFC00023,GadW,GadW,RDBECOLITUC03315:hdeAB-yhiD,RDBECOLITUC03315,12730179;18083817;18808381
2,RDBECOLIRIC00003,tf-promoter,RDBECOLITFC00088,GalR,GalR-D-galactose,RDBECOLITUC02515:galP,RDBECOLITUC02515,19007420
3,RDBECOLIRIC00004,tf-promoter,RDBECOLITFC00033,GalS,GalS-D-galactose,RDBECOLITUC02515:galP,RDBECOLITUC02515,19007420
4,RDBECOLIRIC00005,tf-promoter,RDBECOLITFC00088,GalR,GalR-D-galactose,RDBECOLITUC02515:galP,RDBECOLITUC02515,19007420
...,...,...,...,...,...,...,...,...
5577,RDBECOLIRIC06256,tf-tu,RDBECOLITFC00093,Fur,Fur-a [2Fe-2S] iron-sulfur cluster DNA-binding...,RDBECOLITUC03882:glnLG,RDBECOLITUC03882,37026477
5578,RDBECOLIRIC06257,tf-gene,RDBECOLITFC00093,Fur,Fur-a [2Fe-2S] iron-sulfur cluster DNA-binding...,RDBECOLIGNC01387:waaL,RDBECOLIGNC01387,37026477
5579,RDBECOLIRIC06258,tf-gene,RDBECOLITFC00093,Fur,Fur-a [2Fe-2S] iron-sulfur cluster DNA-binding...,RDBECOLIGNC02391:ybaN,RDBECOLIGNC02391,37026477
5580,RDBECOLIRIC06259,tf-tu,RDBECOLITFC00093,Fur,Fur-a [2Fe-2S] iron-sulfur cluster DNA-binding...,RDBECOLITUC03883:dppBCDF,RDBECOLITUC03883,37026477


Let's merge the TF-RIset and the TUset.

In [280]:
# Attach transcription-unit details to each regulatory interaction.
# The inner join keeps only interactions whose target identifier is a
# transcription unit listed in `tuset`.
temp_merge = pd.merge(
    tf_riset,
    tuset,
    how='inner',
    left_on='TF_target_Id',
    right_on='tuId',
)
temp_merge

Unnamed: 0,riId,riType,regulatorId,regulatorName,reg_active_conf_Name,TF_target_TuOrGene,TF_target_Id,riPMIDS,tuId,tuName,operonName,tuGenes
0,RDBECOLIRIC00001,tf-promoter,RDBECOLITFC00023,GadW,GadW,RDBECOLITUC03133:slp-dctR,RDBECOLITUC03133,12730179;18808381,RDBECOLITUC03133,slp-dctR,slp-dctR,dctR;slp;
1,RDBECOLIRIC00002,tf-promoter,RDBECOLITFC00023,GadW,GadW,RDBECOLITUC03315:hdeAB-yhiD,RDBECOLITUC03315,12730179;18083817;18808381,RDBECOLITUC03315,hdeAB-yhiD,hdeAB-yhiD,yhiD;hdeB;hdeA;
2,RDBECOLIRIC00003,tf-promoter,RDBECOLITFC00088,GalR,GalR-D-galactose,RDBECOLITUC02515:galP,RDBECOLITUC02515,19007420,RDBECOLITUC02515,galP,galP,galP;
3,RDBECOLIRIC00004,tf-promoter,RDBECOLITFC00033,GalS,GalS-D-galactose,RDBECOLITUC02515:galP,RDBECOLITUC02515,19007420,RDBECOLITUC02515,galP,galP,galP;
4,RDBECOLIRIC00005,tf-promoter,RDBECOLITFC00088,GalR,GalR-D-galactose,RDBECOLITUC02515:galP,RDBECOLITUC02515,19007420,RDBECOLITUC02515,galP,galP,galP;
...,...,...,...,...,...,...,...,...,...,...,...,...
4236,RDBECOLIRIC06252,tf-promoter,RDBECOLITFC00093,Fur,Fur-a [2Fe-2S] iron-sulfur cluster DNA-binding...,RDBECOLITUC01910:uxaB,RDBECOLITUC01910,37026477,RDBECOLITUC01910,uxaB,uxaB,uxaB;
4237,RDBECOLIRIC06253,tf-tu,RDBECOLITFC00093,Fur,Fur-a [2Fe-2S] iron-sulfur cluster DNA-binding...,RDBECOLITUC03880:fepA-entD,RDBECOLITUC03880,37026477,RDBECOLITUC03880,fepA-entD,fepA-entD,entD;fepA;
4238,RDBECOLIRIC06255,tf-tu,RDBECOLITFC00093,Fur,Fur-a [2Fe-2S] iron-sulfur cluster DNA-binding...,RDBECOLITUC03881:fhuACDB,RDBECOLITUC03881,37026477,RDBECOLITUC03881,fhuACDB,fhuACDB,fhuB;fhuD;fhuC;fhuA;
4239,RDBECOLIRIC06256,tf-tu,RDBECOLITFC00093,Fur,Fur-a [2Fe-2S] iron-sulfur cluster DNA-binding...,RDBECOLITUC03882:glnLG,RDBECOLITUC03882,37026477,RDBECOLITUC03882,glnLG,glnAZLG,glnG;glnL;


In [281]:
# Flag interactions whose active conformation name is not simply the regulator name.
temp_merge['mod_conformation'] = temp_merge['regulatorName'].ne(temp_merge['reg_active_conf_Name'])

In [282]:
# Inspect the merged table, now including the `mod_conformation` flag column.
temp_merge

Unnamed: 0,riId,riType,regulatorId,regulatorName,reg_active_conf_Name,TF_target_TuOrGene,TF_target_Id,riPMIDS,tuId,tuName,operonName,tuGenes,mod_conformation
0,RDBECOLIRIC00001,tf-promoter,RDBECOLITFC00023,GadW,GadW,RDBECOLITUC03133:slp-dctR,RDBECOLITUC03133,12730179;18808381,RDBECOLITUC03133,slp-dctR,slp-dctR,dctR;slp;,False
1,RDBECOLIRIC00002,tf-promoter,RDBECOLITFC00023,GadW,GadW,RDBECOLITUC03315:hdeAB-yhiD,RDBECOLITUC03315,12730179;18083817;18808381,RDBECOLITUC03315,hdeAB-yhiD,hdeAB-yhiD,yhiD;hdeB;hdeA;,False
2,RDBECOLIRIC00003,tf-promoter,RDBECOLITFC00088,GalR,GalR-D-galactose,RDBECOLITUC02515:galP,RDBECOLITUC02515,19007420,RDBECOLITUC02515,galP,galP,galP;,True
3,RDBECOLIRIC00004,tf-promoter,RDBECOLITFC00033,GalS,GalS-D-galactose,RDBECOLITUC02515:galP,RDBECOLITUC02515,19007420,RDBECOLITUC02515,galP,galP,galP;,True
4,RDBECOLIRIC00005,tf-promoter,RDBECOLITFC00088,GalR,GalR-D-galactose,RDBECOLITUC02515:galP,RDBECOLITUC02515,19007420,RDBECOLITUC02515,galP,galP,galP;,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4236,RDBECOLIRIC06252,tf-promoter,RDBECOLITFC00093,Fur,Fur-a [2Fe-2S] iron-sulfur cluster DNA-binding...,RDBECOLITUC01910:uxaB,RDBECOLITUC01910,37026477,RDBECOLITUC01910,uxaB,uxaB,uxaB;,True
4237,RDBECOLIRIC06253,tf-tu,RDBECOLITFC00093,Fur,Fur-a [2Fe-2S] iron-sulfur cluster DNA-binding...,RDBECOLITUC03880:fepA-entD,RDBECOLITUC03880,37026477,RDBECOLITUC03880,fepA-entD,fepA-entD,entD;fepA;,True
4238,RDBECOLIRIC06255,tf-tu,RDBECOLITFC00093,Fur,Fur-a [2Fe-2S] iron-sulfur cluster DNA-binding...,RDBECOLITUC03881:fhuACDB,RDBECOLITUC03881,37026477,RDBECOLITUC03881,fhuACDB,fhuACDB,fhuB;fhuD;fhuC;fhuA;,True
4239,RDBECOLIRIC06256,tf-tu,RDBECOLITFC00093,Fur,Fur-a [2Fe-2S] iron-sulfur cluster DNA-binding...,RDBECOLITUC03882:glnLG,RDBECOLITUC03882,37026477,RDBECOLITUC03882,glnLG,glnAZLG,glnG;glnL;,True


In [283]:
# Let's see how many trues and falses we got.
counter = temp_merge['mod_conformation'].value_counts()
counter

mod_conformation
True     2593
False    1648
Name: count, dtype: int64

In [284]:
# Keep only the regulator, conformation and transcription-unit columns needed
# downstream; `.copy()` detaches the result from `temp_merge`.
useful_set = temp_merge.loc[
    :,
    [
        'regulatorId',
        'regulatorName',
        'reg_active_conf_Name',
        'mod_conformation',
        'tuId',
        'tuName',
        'tuGenes',
    ],
].copy()
useful_set

Unnamed: 0,regulatorId,regulatorName,reg_active_conf_Name,mod_conformation,tuId,tuName,tuGenes
0,RDBECOLITFC00023,GadW,GadW,False,RDBECOLITUC03133,slp-dctR,dctR;slp;
1,RDBECOLITFC00023,GadW,GadW,False,RDBECOLITUC03315,hdeAB-yhiD,yhiD;hdeB;hdeA;
2,RDBECOLITFC00088,GalR,GalR-D-galactose,True,RDBECOLITUC02515,galP,galP;
3,RDBECOLITFC00033,GalS,GalS-D-galactose,True,RDBECOLITUC02515,galP,galP;
4,RDBECOLITFC00088,GalR,GalR-D-galactose,True,RDBECOLITUC02515,galP,galP;
...,...,...,...,...,...,...,...
4236,RDBECOLITFC00093,Fur,Fur-a [2Fe-2S] iron-sulfur cluster DNA-binding...,True,RDBECOLITUC01910,uxaB,uxaB;
4237,RDBECOLITFC00093,Fur,Fur-a [2Fe-2S] iron-sulfur cluster DNA-binding...,True,RDBECOLITUC03880,fepA-entD,entD;fepA;
4238,RDBECOLITFC00093,Fur,Fur-a [2Fe-2S] iron-sulfur cluster DNA-binding...,True,RDBECOLITUC03881,fhuACDB,fhuB;fhuD;fhuC;fhuA;
4239,RDBECOLITFC00093,Fur,Fur-a [2Fe-2S] iron-sulfur cluster DNA-binding...,True,RDBECOLITUC03882,glnLG,glnG;glnL;


Let's count the duplicated rows:

In [285]:
# Number of rows that repeat an earlier row in every column
# (the first occurrence of each row is not counted).
duplicates_count = useful_set.duplicated(keep='first').sum()
print(duplicates_count)

1576


We have a lot of duplicated rows so we have to remove them:

In [286]:
# Drop the repeated rows and renumber the index from 0.
no_dupl_set = useful_set.drop_duplicates().reset_index(drop=True)
no_dupl_set

Unnamed: 0,regulatorId,regulatorName,reg_active_conf_Name,mod_conformation,tuId,tuName,tuGenes
0,RDBECOLITFC00023,GadW,GadW,False,RDBECOLITUC03133,slp-dctR,dctR;slp;
1,RDBECOLITFC00023,GadW,GadW,False,RDBECOLITUC03315,hdeAB-yhiD,yhiD;hdeB;hdeA;
2,RDBECOLITFC00088,GalR,GalR-D-galactose,True,RDBECOLITUC02515,galP,galP;
3,RDBECOLITFC00033,GalS,GalS-D-galactose,True,RDBECOLITUC02515,galP,galP;
4,RDBECOLITFC00027,NagC,NagC,False,RDBECOLITUC02515,galP,galP;
...,...,...,...,...,...,...,...
2660,RDBECOLITFC00093,Fur,Fur-a [2Fe-2S] iron-sulfur cluster DNA-binding...,True,RDBECOLITUC01910,uxaB,uxaB;
2661,RDBECOLITFC00093,Fur,Fur-a [2Fe-2S] iron-sulfur cluster DNA-binding...,True,RDBECOLITUC03880,fepA-entD,entD;fepA;
2662,RDBECOLITFC00093,Fur,Fur-a [2Fe-2S] iron-sulfur cluster DNA-binding...,True,RDBECOLITUC03881,fhuACDB,fhuB;fhuD;fhuC;fhuA;
2663,RDBECOLITFC00093,Fur,Fur-a [2Fe-2S] iron-sulfur cluster DNA-binding...,True,RDBECOLITUC03882,glnLG,glnG;glnL;


In [287]:
# One row per (regulator, transcription unit, gene): split the ';'-separated
# gene list of each transcription unit and give every gene its own row.
#
# This is done with vectorised `str.split` + `explode` rather than growing a
# DataFrame with `pd.concat` inside a loop, which is quadratic in the number of
# rows and coerces every column to object dtype. It also tolerates gene lists
# without a trailing ';', empty gene lists and missing values; rows without any
# gene are dropped instead of raising.

# `tuGenes` is kept as the last column.
gene_last_order = [col for col in no_dupl_set.columns if col != 'tuGenes'] + ['tuGenes']

exploded_set = (
    no_dupl_set
    .assign(tuGenes=no_dupl_set['tuGenes'].str.split(';'))
    .explode('tuGenes')
    # The trailing ';' of the raw lists yields an empty token and a missing
    # list yields NaN; neither is a gene.
    .loc[lambda frame: frame['tuGenes'].notna() & frame['tuGenes'].ne('')]
    .loc[:, gene_last_order]
    .reset_index(drop=True)
)
exploded_set

Unnamed: 0,regulatorId,regulatorName,reg_active_conf_Name,mod_conformation,tuId,tuName,tuGenes
0,RDBECOLITFC00023,GadW,GadW,False,RDBECOLITUC03133,slp-dctR,dctR
1,RDBECOLITFC00023,GadW,GadW,False,RDBECOLITUC03133,slp-dctR,slp
2,RDBECOLITFC00023,GadW,GadW,False,RDBECOLITUC03315,hdeAB-yhiD,yhiD
3,RDBECOLITFC00023,GadW,GadW,False,RDBECOLITUC03315,hdeAB-yhiD,hdeB
4,RDBECOLITFC00023,GadW,GadW,False,RDBECOLITUC03315,hdeAB-yhiD,hdeA
...,...,...,...,...,...,...,...
5939,RDBECOLITFC00093,Fur,Fur-a [2Fe-2S] iron-sulfur cluster DNA-binding...,True,RDBECOLITUC03882,glnLG,glnL
5940,RDBECOLITFC00093,Fur,Fur-a [2Fe-2S] iron-sulfur cluster DNA-binding...,True,RDBECOLITUC03883,dppBCDF,dppF
5941,RDBECOLITFC00093,Fur,Fur-a [2Fe-2S] iron-sulfur cluster DNA-binding...,True,RDBECOLITUC03883,dppBCDF,dppD
5942,RDBECOLITFC00093,Fur,Fur-a [2Fe-2S] iron-sulfur cluster DNA-binding...,True,RDBECOLITUC03883,dppBCDF,dppC


Searching for duplicates in the `exploded_set`:

In [288]:
# Number of fully repeated rows left after the gene lists were exploded.
duplicates_count = exploded_set.duplicated(keep='first').sum()
print(duplicates_count)

0


Searching for a regulator that needs to be phosphorylated to be active.

In [289]:
# All regulator/target-gene rows for the regulator named CpxR.
cpxr = exploded_set.loc[exploded_set['regulatorName'].eq('CpxR')]
cpxr

Unnamed: 0,regulatorId,regulatorName,reg_active_conf_Name,mod_conformation,tuId,tuName,tuGenes
394,RDBECOLITFC00170,CpxR,CpxR-phosphorylated,True,RDBECOLITUC03415,csgDEFG,csgG
395,RDBECOLITFC00170,CpxR,CpxR-phosphorylated,True,RDBECOLITUC03415,csgDEFG,csgF
396,RDBECOLITFC00170,CpxR,CpxR-phosphorylated,True,RDBECOLITUC03415,csgDEFG,csgE
397,RDBECOLITFC00170,CpxR,CpxR-phosphorylated,True,RDBECOLITUC03415,csgDEFG,csgD
714,RDBECOLITFC00170,CpxR,CpxR-phosphorylated,True,RDBECOLITUC03413,csgBAC,csgC
...,...,...,...,...,...,...,...
5392,RDBECOLITFC00170,CpxR,CpxR-phosphorylated,True,RDBECOLITUC02574,tsr,tsr
5799,RDBECOLITFC00170,CpxR,CpxR-phosphorylated,True,RDBECOLITUC03145,carAB,carA
5800,RDBECOLITFC00170,CpxR,CpxR-phosphorylated,True,RDBECOLITUC03145,carAB,carB
5801,RDBECOLITFC00170,CpxR,CpxR-phosphorylated,True,RDBECOLITUC03401,focA-pflB,pflB


## Data Import from GitHub
I downloaded the [GitHub SBRG repository](https://github.com/SBRG/precise-db/tree/master/data) and imported the `log_tpm_norm.csv` file as it'll be our working dataset.

In [301]:
# Load the expression matrix; the unnamed first CSV column holds the
# E. coli b-number of each gene, so give it an explicit name.
log_tpm = (
    pd.read_csv("./SBRG/data/log_tpm_norm.csv")
    .rename(columns={"Unnamed: 0": 'ecoli_b_id'})
)

log_tpm.shape
log_tpm

(3923, 279)

Unnamed: 0,ecoli_b_id,control__wt_glc__1,control__wt_glc__2,fur__wt_dpd__1,fur__wt_dpd__2,fur__wt_fe__1,fur__wt_fe__2,fur__delfur_dpd__1,fur__delfur_dpd__2,fur__delfur_fe2__1,...,efeU__menFentC_ale29__1,efeU__menFentC_ale29__2,efeU__menFentC_ale30__1,efeU__menFentC_ale30__2,efeU__menFentCubiC_ale36__1,efeU__menFentCubiC_ale36__2,efeU__menFentCubiC_ale37__1,efeU__menFentCubiC_ale37__2,efeU__menFentCubiC_ale38__1,efeU__menFentCubiC_ale38__2
0,b0002,-0.061772,0.061772,0.636527,0.819793,-0.003615,-0.289353,-1.092023,-0.777289,0.161343,...,-0.797097,-0.791859,0.080114,0.102154,0.608180,0.657673,0.813105,0.854813,0.427986,0.484338
1,b0003,-0.053742,0.053742,0.954439,1.334385,0.307588,0.128414,-0.872563,-0.277893,0.428542,...,-0.309105,-0.352535,-0.155074,-0.077145,0.447030,0.439881,0.554528,0.569030,0.154905,0.294799
2,b0004,-0.065095,0.065095,-0.202697,0.119195,-0.264995,-0.546017,-1.918349,-1.577736,-0.474815,...,-0.184898,-0.225615,0.019575,0.063986,0.483343,0.452754,0.524828,0.581878,0.293239,0.341040
3,b0005,0.028802,-0.028802,-0.865171,-0.951179,0.428769,0.123564,-1.660351,-1.531147,0.240353,...,-0.308221,-0.581714,0.018820,0.004040,-1.228763,-1.451750,-0.839203,-0.529349,-0.413336,-0.478682
4,b0006,0.009087,-0.009087,-0.131039,-0.124079,-0.144870,-0.090152,-0.219917,-0.046648,-0.044537,...,1.464603,1.415706,1.230831,1.165153,0.447447,0.458852,0.421417,0.408077,1.151066,1.198529
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3918,b4688,-0.261325,0.261325,-1.425581,-2.734490,0.181893,0.514395,-1.943947,-1.992701,0.066037,...,-0.885297,-0.462485,-2.734490,-1.451148,-1.379069,-1.567420,-0.999610,-1.726577,-2.734490,-1.189069
3919,b4693,-0.278909,0.278909,1.361362,1.020310,0.608108,0.988541,2.558416,2.142724,3.120867,...,-0.374963,0.856574,-1.147824,-0.814089,2.054471,1.853620,1.957717,1.943582,2.233115,2.023755
3920,b4696_1,0.050526,-0.050526,1.166436,1.043373,-0.531441,-0.581626,0.914055,0.731165,-0.127269,...,0.261604,0.278426,0.201089,-0.017780,0.138178,0.122287,0.504402,0.425213,0.629383,0.805945
3921,b4696_2,-0.031653,0.031653,0.785573,0.881353,-0.477271,-0.916095,0.837603,0.801393,-0.071710,...,-0.499371,0.398783,0.096609,-0.103446,-0.519098,0.615363,0.343959,0.580288,0.366905,0.702608


In [ ]:
# Disabled: earlier attempt to build the gene-name / b-number table from the
# SBRG `gene_info.csv` file. Kept for reference only; nothing here is executed.
# NOTE(review): the `drop_duplicates()` call below discards its result, so it
# would have no effect if this cell were re-enabled as written.
# all_b = pd.read_csv("./SBRG/data/gene_info.csv")
# all_b.shape
# 
# all_b.drop_duplicates()
# all_b.reset_index(drop = True, inplace = True)
# all_b.shape
# 
# all_b = all_b.rename(columns={"Unnamed: 0":'ecoli_b_id'})

I downloaded this set from RegulonDB instead of using the `gene_info.csv` file from the SBRG repository, because the RegulonDB set does not produce duplicates in the GeneName column.

In [307]:
# Gene name <-> cross-database identifiers table exported from RegulonDB.
second_way = pd.read_csv('./Data from RegulonDB/GeneName-bcodeset_selected.tsv', sep = '\t')

second_way = second_way.rename(columns={'2)geneName':'GeneName','7)otherDbsGeneIds':'IdsFromOtherDbs'})

second_way

# extracting only the b_id
# The cross-reference string looks like "[ECOCYC:...][REFSEQ:b4843]"; the
# capture group keeps only the b-number of the first REFSEQ entry.
regex2 = r'\[REFSEQ:(b\d{4})\]'

# Vectorised replacement for a row-by-row loop that dropped rows in place
# while iterating: rows without a REFSEQ b-number (including rows whose
# cross-reference string is missing) become NaN here and are removed below.
second_way['IdsFromOtherDbs'] = second_way['IdsFromOtherDbs'].str.extract(regex2, expand=False)
second_way = second_way.dropna(subset=['IdsFromOtherDbs']).reset_index(drop=True)

second_way.shape
second_way

# things that might be useful in the future:
# temp_col = pd.Series((i for i in tf_riset['reg_active_conf_Name'] if re.search(regex, i)), name = 'active_when_phosphorylated')
# second_way.to_csv('./Data from RegulonDB/let_me_see.txt', sep = '\t')

Unnamed: 0,GeneName,IdsFromOtherDbs
0,alr,[STRING:511145.b4053][ASAP:ABE-0013272][ECHOBA...
1,modB,[STRING:511145.b0764][ASAP:ABE-0002603][ECHOBA...
2,cysZ,[STRING:511145.b2413][ASAP:ABE-0007953][ECHOBA...
3,dfp,[STRING:511145.b3639][ASAP:ABE-0011896][ECHOBA...
4,dcuB,[STRING:511145.b4123][ASAP:ABE-0013503][ECHOBA...
...,...,...
4754,asphoP,[ECOCYC:G0-17124][REFSEQ:b4843]
4755,ameF,[ECOCYC:G0-17125][REFSEQ:b4844]
4756,xtpA,[ECOCYC:G0-17126][REFSEQ:b4845]
4757,yrdG,[ECOCYC:G0-17130][REFSEQ:b4846]


(4702, 2)

Unnamed: 0,GeneName,IdsFromOtherDbs
0,alr,b4053
1,modB,b0764
2,cysZ,b2413
3,dfp,b3639
4,dcuB,b4123
...,...,...
4697,asompR,b4842
4698,asphoP,b4843
4699,ameF,b4844
4700,xtpA,b4845


In [328]:
# Remove fully repeated (gene name, b-number) rows, keeping the first of each.
all_b = second_way.drop_duplicates(keep='first').copy()
all_b.shape
all_b

# Name the key column like the one in `log_tpm` so the two tables can be merged.
all_b = all_b.rename({'IdsFromOtherDbs': 'ecoli_b_id'}, axis='columns')
all_b

(4691, 2)

Unnamed: 0,GeneName,IdsFromOtherDbs
0,alr,b4053
1,modB,b0764
2,cysZ,b2413
3,dfp,b3639
4,dcuB,b4123
...,...,...
4697,asompR,b4842
4698,asphoP,b4843
4699,ameF,b4844
4700,xtpA,b4845


Unnamed: 0,GeneName,ecoli_b_id
0,alr,b4053
1,modB,b0764
2,cysZ,b2413
3,dfp,b3639
4,dcuB,b4123
...,...,...
4697,asompR,b4842
4698,asphoP,b4843
4699,ameF,b4844
4700,xtpA,b4845


Merging of the datasets:

In [330]:
# Add gene names to the expression matrix; the inner join keeps only genes
# whose b-number is present in both tables.
merged_set = pd.merge(all_b, log_tpm, on="ecoli_b_id", how='inner')

merged_set.shape
merged_set

(3865, 280)

Unnamed: 0,GeneName,ecoli_b_id,control__wt_glc__1,control__wt_glc__2,fur__wt_dpd__1,fur__wt_dpd__2,fur__wt_fe__1,fur__wt_fe__2,fur__delfur_dpd__1,fur__delfur_dpd__2,...,efeU__menFentC_ale29__1,efeU__menFentC_ale29__2,efeU__menFentC_ale30__1,efeU__menFentC_ale30__2,efeU__menFentCubiC_ale36__1,efeU__menFentCubiC_ale36__2,efeU__menFentCubiC_ale37__1,efeU__menFentCubiC_ale37__2,efeU__menFentCubiC_ale38__1,efeU__menFentCubiC_ale38__2
0,alr,b4053,-0.042292,0.042292,-0.740852,-0.866985,-0.168235,-0.376016,-0.633646,-0.713788,...,-0.476805,-0.508813,0.026904,-0.127094,0.109230,0.000617,0.271930,0.297802,-0.131690,0.150876
1,modB,b0764,0.184964,-0.184964,-0.596042,-0.593549,-0.005954,-0.235184,-0.595247,-0.655526,...,-0.332093,-0.034741,0.279264,-0.003593,0.201033,0.177794,0.248041,0.141078,-0.165527,-0.100868
2,cysZ,b2413,-0.043865,0.043865,0.107350,0.180211,-0.212306,-0.282697,0.123492,0.074825,...,-0.461764,-0.377653,0.203116,0.016200,0.002025,-0.012040,0.628398,0.698518,0.420355,0.275015
3,dfp,b3639,-0.052513,0.052513,0.010719,-0.068170,-0.089001,-0.044974,-0.167640,-0.319147,...,0.191096,0.149360,0.009208,0.066414,-0.123367,-0.268587,-0.247457,-0.231516,0.519031,0.343247
4,dcuB,b4123,0.365215,-0.365215,-0.896131,-0.173375,0.400422,0.258877,-0.348530,-0.482547,...,4.129609,4.256892,0.929593,0.728666,1.911724,2.347201,2.091809,1.923489,2.933930,2.729055
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3860,mhpD,b0350,0.158453,-0.158453,0.662125,0.086680,0.063934,-0.294443,0.553998,0.481348,...,-1.524784,-0.673153,-0.066243,-0.691109,-0.328744,-0.698742,0.331235,0.157622,-0.773144,-0.025589
3861,mhpF,b0351,-0.372588,0.372588,1.513200,1.251159,0.063993,-0.412425,1.083952,0.441693,...,0.695099,0.888539,0.432664,0.482035,0.656983,0.454324,0.499764,0.693546,0.286952,0.759214
3862,hcaF,b2539,0.065282,-0.065282,0.149477,-0.352588,-0.122738,-0.402794,-0.357749,-0.873491,...,-0.915097,-0.939694,-0.597442,-1.301669,-1.471270,-1.357409,-0.504646,-0.870155,-1.046379,-0.757495
3863,ruvA,b1861,-0.079333,0.079333,0.251261,0.038981,-0.247013,-0.240293,0.210413,0.113396,...,0.025995,-0.019448,-0.019484,-0.116220,0.113955,0.125625,0.121400,0.304738,0.436502,0.384450


In [333]:
# checking for duplicates:
dup1 = merged_set[merged_set['ecoli_b_id'].duplicated()]
dup2 = merged_set[merged_set['GeneName'].duplicated()]


dup1 # empty
dup2 # empty

Unnamed: 0,GeneName,ecoli_b_id,control__wt_glc__1,control__wt_glc__2,fur__wt_dpd__1,fur__wt_dpd__2,fur__wt_fe__1,fur__wt_fe__2,fur__delfur_dpd__1,fur__delfur_dpd__2,...,efeU__menFentC_ale29__1,efeU__menFentC_ale29__2,efeU__menFentC_ale30__1,efeU__menFentC_ale30__2,efeU__menFentCubiC_ale36__1,efeU__menFentCubiC_ale36__2,efeU__menFentCubiC_ale37__1,efeU__menFentCubiC_ale37__2,efeU__menFentCubiC_ale38__1,efeU__menFentCubiC_ale38__2


Unnamed: 0,GeneName,ecoli_b_id,control__wt_glc__1,control__wt_glc__2,fur__wt_dpd__1,fur__wt_dpd__2,fur__wt_fe__1,fur__wt_fe__2,fur__delfur_dpd__1,fur__delfur_dpd__2,...,efeU__menFentC_ale29__1,efeU__menFentC_ale29__2,efeU__menFentC_ale30__1,efeU__menFentC_ale30__2,efeU__menFentCubiC_ale36__1,efeU__menFentCubiC_ale36__2,efeU__menFentCubiC_ale37__1,efeU__menFentCubiC_ale37__2,efeU__menFentCubiC_ale38__1,efeU__menFentCubiC_ale38__2


___
___
## Correlation tests
___

### 1) Let's check the linearity

To check for linearity, we can visually inspect the relationship between the variables using scatter plots. If the relationship appears linear, it suggests that the Pearson correlation coefficient may be appropriate.

In [ ]:
# Pearson correlation between the expression of each TF-coding gene and the
# expression of every gene that TF regulates, across all samples of `log_tpm`.
#
# This replaces a placeholder cell that referenced undefined names
# (`np`, `df`, `tf_target_mapping`) and could not run.

# Expression matrix indexed by gene name: one row per gene, one column per sample.
expression = merged_set.drop(columns='ecoli_b_id').set_index('GeneName')
assert expression.index.is_unique, "gene names must be unique to look up one expression profile per gene"

# (regulator, TF-coding gene, target gene) triples, built from the exploded
# regulator/target table and the TF table.
# NOTE(review): a TF whose `geneCodingForTF` entry is not a single gene name
# present in the expression matrix is skipped by the filter below. Confirm that
# this is acceptable, or split multi-gene entries first.
tf_target_pairs = (
    exploded_set[['regulatorId', 'regulatorName', 'tuGenes']]
    .merge(tfset_conf[['tfId', 'geneCodingForTF']], left_on='regulatorId', right_on='tfId', how='inner')
    .rename(columns={'geneCodingForTF': 'tf_gene_name', 'tuGenes': 'target_gene_name'})
    .loc[:, ['regulatorName', 'tf_gene_name', 'target_gene_name']]
    .drop_duplicates()
)

# Keep pairs for which both genes were measured, and drop self-regulation
# pairs, whose correlation is trivially 1.
usable_pair = (
    tf_target_pairs['tf_gene_name'].isin(expression.index)
    & tf_target_pairs['target_gene_name'].isin(expression.index)
    & tf_target_pairs['tf_gene_name'].ne(tf_target_pairs['target_gene_name'])
)
tf_target_pairs = tf_target_pairs[usable_pair].reset_index(drop=True)

# `Series.corr` computes the Pearson coefficient by default.
tf_target_pairs['pearson_r'] = [
    expression.loc[tf_name].corr(expression.loc[target_name])
    for tf_name, target_name in zip(tf_target_pairs['tf_gene_name'], tf_target_pairs['target_gene_name'])
]
tf_target_pairs = tf_target_pairs.dropna(subset=['pearson_r']).reset_index(drop=True)
tf_target_pairs

# Visualize the relationship for one illustrative pair: the TF/target pair
# with the largest absolute correlation.
if tf_target_pairs.empty:
    print("No TF/target pair with expression data for both genes was found.")
else:
    strongest_pair = tf_target_pairs.loc[tf_target_pairs['pearson_r'].abs().idxmax()]
    tf_gene = expression.loc[strongest_pair['tf_gene_name']]
    target_gene = expression.loc[strongest_pair['target_gene_name']]
    correlation_coefficient = strongest_pair['pearson_r']

    plt.scatter(tf_gene, target_gene)
    plt.xlabel(f"TF Gene Expression ({strongest_pair['tf_gene_name']})")
    plt.ylabel(f"Target Gene Expression ({strongest_pair['target_gene_name']})")
    plt.title(f'Scatter Plot of TF Gene vs. Most Correlated Target Gene (Correlation: {correlation_coefficient:.2f})')
    plt.show()
