#0 - Basic Settings

In [None]:
# Mount Google Drive at /content/drive; this is what gives the notebook
# permission to access the Drive files used in the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Raise the pandas display limits so wide/long frames are shown untruncated
import pandas as pd

pd.options.display.max_columns = 7000
pd.options.display.max_rows = 1000000
pd.options.display.width = 7000

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

Tue Dec 20 11:49:45 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   40C    P0    26W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
from psutil import virtual_memory

# Total memory reported by psutil, expressed in decimal gigabytes (bytes / 1e9)
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# 20 GB is the cut-off this notebook uses to call a runtime "high-RAM"
if ram_gb >= 20:
  print('You are using a high-RAM runtime!')
else:
  print('To enable a high-RAM runtime, select the Runtime > "Change runtime type"')
  print('menu, and then select High-RAM in the Runtime shape dropdown. Then, ')
  print('re-execute this cell.')

Your runtime has 27.3 gigabytes of available RAM

You are using a high-RAM runtime!


#1 - Reading and processing RING edge files


In this section, the **edgesDB.txt** file will be read, which contains all the edge files of all wild PDBs (from all tissues processed and recorded in the **PDB_id_All.txt** file in *drive/MyDrive/ProcessaNovaBase/Junta_PDBs_id_wild/Tecidos_PDB_wild_id*) that have notation in RING

The joining of all edge files was processed through adaptations of Laíse's **1ArqNodesEdges** script. This script is located in the **TrataArqsRING** folder of this drive.

The generated file is 12 GB, so it was necessary to split it into 2 files (**edgesDB_01.txt** and **edgesDB_02.txt**) using the following Linux commands:

First, I found out how many lines the file had:

$ wc -l < edgesDB.txt

Second, I applied the split command:

$ split -l 91917025 edgesDB.txt

The attributes that will be used from the edge files are:

- **PDB_id_RING**: PDB code
- **NodeId1_RING**: The source node of the interaction. The node can be an amino acid or a ligand molecule. It contains the following information: Chain, Node Position and the node itself.
- **Interaction_RING**: contains the interaction type and the subtype of node1 and the subtype of node2. The subtype values are: *main chain* (MC), *side chain* (SC) and *ligand* (LIG).
- **NodeId2_RING**: The target node of the interaction. The node can be an amino acid or a ligand molecule. It contains the following information: chain, node position and the node itself   

##1.1 Processing the *edgesDB_01.txt* database

In [None]:
import pandas as pd

# Load the raw RING edge file: tab-separated, no header row, so the columns
# are labelled by position (0, 1, 2, ...).
# The four identifier columns used downstream (0 = PDB id, 1 = source node,
# 2 = interaction, 3 = target node) are pinned to str. Without this, pandas
# infers types chunk by chunk and an id that looks numeric (e.g. "1e10") can
# be parsed as a float, which later breaks the string processing of that column.
df_RING_edge_01 = pd.read_csv(
    "drive/My Drive/ProcessaNovaBase/TrataArqsRING/edgesDB_01.txt",
    index_col=False,
    header=None,
    delimiter='\t',
    dtype={0: str, 1: str, 2: str, 3: str},
)


  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
df_RING_edge_01.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91917025 entries, 0 to 91917024
Data columns (total 13 columns):
 #   Column  Dtype  
---  ------  -----  
 0   0       object 
 1   1       object 
 2   2       object 
 3   3       object 
 4   4       float64
 5   5       float64
 6   6       float64
 7   7       object 
 8   8       object 
 9   9       object 
 10  10      object 
 11  11      object 
 12  12      object 
dtypes: float64(3), object(10)
memory usage: 8.9+ GB


In [None]:
df_RING_edge_01.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,10gs,A:210:_:VWW,VDW:LIG_SC,A:7:_:TYR,4.012,-999.9,6.0,SG,CE1,,,,
1,10gs,A:210:_:VWW,VDW:LIG_SC,A:7:_:TYR,4.061,-999.9,6.0,SG,CZ,,,,
2,10gs,A:210:_:VWW,IAC:LIG_SC,A:7:_:TYR,2.728,-999.9,0.0,SG,HH,,,,
3,10gs,A:210:_:VWW,IAC:LIG_SC,A:7:_:TYR,2.816,-999.9,0.0,CZ2,HH,,,,
4,10gs,A:210:_:VWW,IAC:LIG_SC,A:7:_:TYR,3.213,-999.9,0.0,SG,OH,,,,


###1.1.1 Renaming the fields

In [None]:
# Replace the positional column labels (0..12) with the RING edge-file field
# names; the list order is the column order. Renaming is done in place so the
# large frame is not copied.
df_RING_edge_01.rename(
    columns=dict(enumerate([
        'PDB_id_RING',
        'NodeId1_RING',
        'Interaction_RING',
        'NodeId2_RING',
        'Distance',
        'Angle',
        'Energy',
        'Atom1',
        'Atom2',
        'Donor',
        'Positive',
        'Cation',
        'Orientation',
    ])),
    inplace=True,
)

In [None]:
df_RING_edge_01.head()

Unnamed: 0,PDB_id_RING,NodeId1_RING,Interaction_RING,NodeId2_RING,Distance,Angle,Energy,Atom1,Atom2,Donor,Positive,Cation,Orientation
0,10gs,A:210:_:VWW,VDW:LIG_SC,A:7:_:TYR,4.012,-999.9,6.0,SG,CE1,,,,
1,10gs,A:210:_:VWW,VDW:LIG_SC,A:7:_:TYR,4.061,-999.9,6.0,SG,CZ,,,,
2,10gs,A:210:_:VWW,IAC:LIG_SC,A:7:_:TYR,2.728,-999.9,0.0,SG,HH,,,,
3,10gs,A:210:_:VWW,IAC:LIG_SC,A:7:_:TYR,2.816,-999.9,0.0,CZ2,HH,,,,
4,10gs,A:210:_:VWW,IAC:LIG_SC,A:7:_:TYR,3.213,-999.9,0.0,SG,OH,,,,


###1.1.2 Selecting the fields that will be used

In [None]:
# Keep only the four identifier/interaction columns used in the next steps
df_RING_edge_01 = df_RING_edge_01[
    ['PDB_id_RING', 'NodeId1_RING', 'Interaction_RING', 'NodeId2_RING']
]

In [None]:
df_RING_edge_01.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91917025 entries, 0 to 91917024
Data columns (total 4 columns):
 #   Column            Dtype 
---  ------            ----- 
 0   PDB_id_RING       object
 1   NodeId1_RING      object
 2   Interaction_RING  object
 3   NodeId2_RING      object
dtypes: object(4)
memory usage: 2.7+ GB


###1.1.3 Generating an intermediate file with the selected fields from the *edgesDB_01.txt* database  

In [None]:
df_RING_edge_01.to_csv("drive/My Drive/ProcessaNovaBase/TrataArqsRING/edgesDB_01_sel.csv",sep='\t',index=False)

###1.1.4 Dividing the *Interaction* field into two fields

The **Interaction** field will be divided into two: interaction and subinteraction

In [None]:
# Raise the pandas display limits so wide/long frames are shown untruncated
import pandas as pd

pd.options.display.max_columns = 7000
pd.options.display.max_rows = 90000

In [None]:
import pandas as pd

# Reload the checkpoint holding the four selected columns. All of them are
# text, so they are pinned to str: left to chunk-wise inference, an id that
# looks numeric (e.g. "1e10") could come back as a float.
df_RING_edge_01 = pd.read_csv(
    "drive/My Drive/ProcessaNovaBase/TrataArqsRING/edgesDB_01_sel.csv",
    delimiter='\t',
    dtype=str,
)

In [None]:
df_RING_edge_01.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91917025 entries, 0 to 91917024
Data columns (total 4 columns):
 #   Column            Dtype 
---  ------            ----- 
 0   PDB_id_RING       object
 1   NodeId1_RING      object
 2   Interaction_RING  object
 3   NodeId2_RING      object
dtypes: object(4)
memory usage: 2.7+ GB


In [None]:
# Interaction_RING has the form "<type>:<subtype>" (e.g. "VDW:LIG_SC").
# The part before the colon is the interaction type ...
df_RING_edge_01["interacao_RING"] = df_RING_edge_01["Interaction_RING"].map(
    lambda interaction: interaction.split(":")[0]
)

# ... and the part after it is the sub-interaction (node1 subtype _ node2 subtype).
df_RING_edge_01["subinteracao_RING"] = df_RING_edge_01["Interaction_RING"].map(
    lambda interaction: interaction.split(":")[1]
)

In [None]:
df_RING_edge_01.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91917025 entries, 0 to 91917024
Data columns (total 6 columns):
 #   Column             Dtype 
---  ------             ----- 
 0   PDB_id_RING        object
 1   NodeId1_RING       object
 2   Interaction_RING   object
 3   NodeId2_RING       object
 4   interacao_RING     object
 5   subinteracao_RING  object
dtypes: object(6)
memory usage: 4.1+ GB


In [None]:
df_RING_edge_01.head()

Unnamed: 0,PDB_id_RING,NodeId1_RING,Interaction_RING,NodeId2_RING,interacao_RING,subinteracao_RING
0,10gs,A:210:_:VWW,VDW:LIG_SC,A:7:_:TYR,VDW,LIG_SC
1,10gs,A:210:_:VWW,VDW:LIG_SC,A:7:_:TYR,VDW,LIG_SC
2,10gs,A:210:_:VWW,IAC:LIG_SC,A:7:_:TYR,IAC,LIG_SC
3,10gs,A:210:_:VWW,IAC:LIG_SC,A:7:_:TYR,IAC,LIG_SC
4,10gs,A:210:_:VWW,IAC:LIG_SC,A:7:_:TYR,IAC,LIG_SC


###1.1.5 Processing the *PDB_id* attribute

In [None]:
# Normalise the PDB codes to upper case (e.g. "10gs" -> "10GS")
df_RING_edge_01["PDB_id_RING"] = df_RING_edge_01["PDB_id_RING"].map(str.upper)

###1.1.6 Generating an intermediate file with the processing of the *PDB_id attribute* from the *edgesDB_01.txt* database  

In [None]:
df_RING_edge_01.to_csv("drive/My Drive/ProcessaNovaBase/TrataArqsRING/edgesDB_01_sel_proc_PDB.csv",sep='\t',index=False)

###1.1.7 Extracting the residue, its position and chain in the *NodeId1* attribute

The source residue, its position and its chain contained in **NodeId1** will be extracted:

**Residue1_RING**: source node of the edge

**Residue1_pos_RING**: the position of the node

**Residue1_chain_RING**: the chain where the node is located

In [None]:
# Raise the pandas display limits so wide/long frames are shown untruncated
import pandas as pd

pd.options.display.max_columns = 7000
pd.options.display.max_rows = 90000

In [None]:
import pandas as pd

# Reload the checkpoint written after the PDB-id processing. Every column in
# this file is text, so all of them are pinned to str: left to chunk-wise
# inference, an upper-cased id such as "1E10" could be re-read as a number.
df_RING_edge_01 = pd.read_csv(
    "drive/My Drive/ProcessaNovaBase/TrataArqsRING/edgesDB_01_sel_proc_PDB.csv",
    delimiter='\t',
    dtype=str,
)

In [None]:
df_RING_edge_01.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91917025 entries, 0 to 91917024
Data columns (total 6 columns):
 #   Column             Dtype 
---  ------             ----- 
 0   PDB_id_RING        object
 1   NodeId1_RING       object
 2   Interaction_RING   object
 3   NodeId2_RING       object
 4   interacao_RING     object
 5   subinteracao_RING  object
dtypes: object(6)
memory usage: 4.1+ GB


In [None]:
df_RING_edge_01["NodeId1_RING"].value_counts()

A:142:_:HEM     448327
B:147:_:HEM     408763
C:142:_:HEM     381901
D:147:_:HEM     342517
D:1:_:DG        254235
                 ...  
N:150:_:GLN          1
I:224:_:CYS          1
P:157:_:GLY          1
B:-344:_:GLY         1
C:1711:_:ARG         1
Name: NodeId1_RING, Length: 294932, dtype: int64

In [None]:
# NodeId1 looks like "A:210:_:VWW"; the 4th colon-separated field is the residue/ligand name
df_RING_edge_01["Residue1_RING"] = df_RING_edge_01["NodeId1_RING"].map(
    lambda node_id: node_id.split(":")[3]
)

In [None]:
# Three-letter amino-acid codes (upper case, as they appear in the node ids).
# Names found in this list are rewritten in the standard capitalised form
# (ALA -> Ala, ARG -> Arg, ...); any other name (ligands, nucleotides, ...)
# is kept unchanged.
Amin = [
    'ALA', 'ARG', 'ASN', 'ASP', 'CYS', 'GLN', 'GLU', 'GLY',
    'HIS', 'ILE', 'LEU', 'LYS', 'MET', 'PHE', 'PRO', 'PYL',
    'SER', 'SEC', 'THR', 'TRP', 'TYR', 'VAL', 'ASX', 'GLX',
]

df_RING_edge_01["Residue1_RING"] = df_RING_edge_01["Residue1_RING"].map(
    lambda residue: residue if residue not in Amin else residue.capitalize()
)

In [None]:
df_RING_edge_01["Residue1_RING"].value_counts()

HEM    3548604
MSE    2978827
Leu    2542365
NAG    2463048
GOL    2313845
Phe    2280713
SO4    2167392
DG     2111103
EDO    1758493
Tyr    1748982
FAD    1681364
Val    1546229
DC     1533779
NAP    1504557
Arg    1419786
Ile    1407592
DT     1355388
DA     1286555
Lys    1110185
Glu    1068679
Ala     982259
Trp     931759
Asn     889599
Gln     878626
Asp     860479
Thr     859371
Ser     826280
NAD     820915
His     783991
Pro     779056
ADP     746356
Met     710381
ZN      581181
Cys     542197
GDP     525826
PTR     518155
Gly     510891
SAH     475698
NDP     467869
GNP     462302
PO4     452815
A       450865
CA      378666
NAI     369023
ATP     352889
U       351805
TPO     349902
ANP     340658
G       338432
SEP     311574
TYS     310446
ACT     303835
C       298473
MYR     254707
CL      250660
MG      237762
GSH     236309
GTP     224124
CME     223946
GLC     203779
PEG     192063
UNX     188481
FMN     185855
UDP     185149
COA     167762
CAS     166474
MES     16

In [None]:
# The 2nd colon-separated field of NodeId1 is the node position (kept as text at this point)
df_RING_edge_01["Residue1_pos_RING"] = df_RING_edge_01["NodeId1_RING"].map(
    lambda node_id: node_id.split(":")[1]
)

In [None]:
# The 1st colon-separated field of NodeId1 is the chain id; it is upper-cased
df_RING_edge_01["Residue1_chain_RING"] = df_RING_edge_01["NodeId1_RING"].map(
    lambda node_id: node_id.split(":")[0].upper()
)

In [None]:
df_RING_edge_01["Residue1_chain_RING"].value_counts()

A    42183011
B    19768042
C     8136864
D     6730684
E     2451759
H     2260883
F     1966027
G     1372531
P     1239844
T     1220758
I     1000904
L      963596
J      647964
K      316682
X      286494
R      244909
M      202331
N      180737
Q      168739
O      154652
S      148179
U       76350
Y       58801
V       44519
Z       33306
W       17638
2       14248
1       10634
3        7975
4        1743
5        1449
0        1436
8        1272
7        1073
9         597
6         394
Name: Residue1_chain_RING, dtype: int64

In [None]:
df_RING_edge_01.head(25)

Unnamed: 0,PDB_id_RING,NodeId1_RING,Interaction_RING,NodeId2_RING,interacao_RING,subinteracao_RING,Residue1_RING,Residue1_pos_RING,Residue1_chain_RING
0,10GS,A:210:_:VWW,VDW:LIG_SC,A:7:_:TYR,VDW,LIG_SC,VWW,210,A
1,10GS,A:210:_:VWW,VDW:LIG_SC,A:7:_:TYR,VDW,LIG_SC,VWW,210,A
2,10GS,A:210:_:VWW,IAC:LIG_SC,A:7:_:TYR,IAC,LIG_SC,VWW,210,A
3,10GS,A:210:_:VWW,IAC:LIG_SC,A:7:_:TYR,IAC,LIG_SC,VWW,210,A
4,10GS,A:210:_:VWW,IAC:LIG_SC,A:7:_:TYR,IAC,LIG_SC,VWW,210,A
5,10GS,A:210:_:VWW,IAC:LIG_SC,A:7:_:TYR,IAC,LIG_SC,VWW,210,A
6,10GS,A:210:_:VWW,IAC:LIG_SC,A:7:_:TYR,IAC,LIG_SC,VWW,210,A
7,10GS,A:210:_:VWW,IAC:LIG_SC,A:7:_:TYR,IAC,LIG_SC,VWW,210,A
8,10GS,A:210:_:VWW,IAC:LIG_SC,A:7:_:TYR,IAC,LIG_SC,VWW,210,A
9,10GS,A:210:_:VWW,IAC:LIG_SC,A:7:_:TYR,IAC,LIG_SC,VWW,210,A


###1.1.8 Generating an intermediate file with the extraction of the residue from the *Node_id1* attribute of the *edgesDB_01.txt* database

In [None]:
df_RING_edge_01.to_csv("drive/My Drive/ProcessaNovaBase/TrataArqsRING/edgesDB_01_sel_proc_PDB_Res1.csv",sep='\t',index=False)

###1.1.9 Extracting the residue, its position and chain in the *NodeId2* attribute

The target residue, its position and chain contained in **NodeId2** will be extracted

**Residue2_RING**: target node of the edge

**Residue2_pos_RING**: the position of the node

**Residue2_chain_RING**: the chain where the node is located

In [None]:
# Raise the pandas display limits so wide/long frames are shown untruncated
import pandas as pd

pd.options.display.max_columns = 7000
pd.options.display.max_rows = 90000

In [None]:
import pandas as pd

# Reload the checkpoint written after the NodeId1 extraction.
# - dtype: every text column is pinned to str, so chunk-wise type inference
#   cannot turn digit-only values (e.g. chain ids "1", "2", ...) into numbers.
#   Residue1_pos_RING is left to inference and is read as an integer.
# - keep_default_na / na_values: only empty fields count as missing. With the
#   pandas defaults a residue or ligand code spelled "NA" is read as NaN, and
#   rows with NaN keys are then dropped by the later groupby.
df_RING_edge_01 = pd.read_csv(
    "drive/My Drive/ProcessaNovaBase/TrataArqsRING/edgesDB_01_sel_proc_PDB_Res1.csv",
    delimiter='\t',
    dtype={
        'PDB_id_RING': str,
        'NodeId1_RING': str,
        'Interaction_RING': str,
        'NodeId2_RING': str,
        'interacao_RING': str,
        'subinteracao_RING': str,
        'Residue1_RING': str,
        'Residue1_chain_RING': str,
    },
    keep_default_na=False,
    na_values=[''],
)

In [None]:
df_RING_edge_01.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91917025 entries, 0 to 91917024
Data columns (total 9 columns):
 #   Column               Dtype 
---  ------               ----- 
 0   PDB_id_RING          object
 1   NodeId1_RING         object
 2   Interaction_RING     object
 3   NodeId2_RING         object
 4   interacao_RING       object
 5   subinteracao_RING    object
 6   Residue1_RING        object
 7   Residue1_pos_RING    int64 
 8   Residue1_chain_RING  object
dtypes: int64(1), object(8)
memory usage: 6.2+ GB


In [None]:
df_RING_edge_01["NodeId2_RING"].value_counts()

A:83:_:LEU     85566
A:234:_:LYS    81051
A:198:_:LEU    76208
A:10:_:ILE     72465
A:134:_:LEU    67807
               ...  
a:106:_:SER        1
O:544:_:LEU        1
E:560:_:GLN        1
H:383:_:THR        1
4:33:_:ALA         1
Name: NodeId2_RING, Length: 270533, dtype: int64

In [None]:
# NodeId2 looks like "A:7:_:TYR"; the 4th colon-separated field is the residue/ligand name
df_RING_edge_01["Residue2_RING"] = df_RING_edge_01["NodeId2_RING"].map(
    lambda node_id: node_id.split(":")[3]
)

In [None]:
# Three-letter amino-acid codes (upper case, as they appear in the node ids).
# Names found in this list are rewritten in the standard capitalised form
# (ALA -> Ala, ARG -> Arg, ...); any other name (ligands, nucleotides, ...)
# is kept unchanged.
Amin = [
    'ALA', 'ARG', 'ASN', 'ASP', 'CYS', 'GLN', 'GLU', 'GLY',
    'HIS', 'ILE', 'LEU', 'LYS', 'MET', 'PHE', 'PRO', 'PYL',
    'SER', 'SEC', 'THR', 'TRP', 'TYR', 'VAL', 'ASX', 'GLX',
]

df_RING_edge_01["Residue2_RING"] = df_RING_edge_01["Residue2_RING"].map(
    lambda residue: residue if residue not in Amin else residue.capitalize()
)

In [None]:
# The 2nd colon-separated field of NodeId2 is the node position (kept as text at this point)
df_RING_edge_01["Residue2_pos_RING"] = df_RING_edge_01["NodeId2_RING"].map(
    lambda node_id: node_id.split(":")[1]
)

In [None]:
df_RING_edge_01.to_csv("drive/My Drive/ProcessaNovaBase/TrataArqsRING/edgesDB_01_sel_proc_PDB_Res2.csv",sep='\t',index=False)

In [None]:
# Raise the pandas display limits so wide/long frames are shown untruncated
import pandas as pd

pd.options.display.max_columns = 7000
pd.options.display.max_rows = 90000

In [None]:
import pandas as pd

# Reload the checkpoint written after the Residue2 name/position extraction.
# - dtype: every text column is pinned to str, so chunk-wise type inference
#   cannot turn digit-only values (e.g. chain ids "1", "2", ...) into numbers.
#   The two *_pos_RING columns are left to inference and are read as integers.
# - keep_default_na / na_values: only empty fields count as missing. With the
#   pandas defaults a residue or ligand code spelled "NA" is read as NaN, and
#   rows with NaN keys are then dropped by the later groupby.
df_RING_edge_01 = pd.read_csv(
    "drive/My Drive/ProcessaNovaBase/TrataArqsRING/edgesDB_01_sel_proc_PDB_Res2.csv",
    delimiter='\t',
    dtype={
        'PDB_id_RING': str,
        'NodeId1_RING': str,
        'Interaction_RING': str,
        'NodeId2_RING': str,
        'interacao_RING': str,
        'subinteracao_RING': str,
        'Residue1_RING': str,
        'Residue1_chain_RING': str,
        'Residue2_RING': str,
    },
    keep_default_na=False,
    na_values=[''],
)

In [None]:
df_RING_edge_01.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91917025 entries, 0 to 91917024
Data columns (total 11 columns):
 #   Column               Dtype 
---  ------               ----- 
 0   PDB_id_RING          object
 1   NodeId1_RING         object
 2   Interaction_RING     object
 3   NodeId2_RING         object
 4   interacao_RING       object
 5   subinteracao_RING    object
 6   Residue1_RING        object
 7   Residue1_pos_RING    int64 
 8   Residue1_chain_RING  object
 9   Residue2_RING        object
 10  Residue2_pos_RING    int64 
dtypes: int64(2), object(9)
memory usage: 7.5+ GB


In [None]:
# The 1st colon-separated field of NodeId2 is the chain id; it is upper-cased
df_RING_edge_01["Residue2_chain_RING"] = df_RING_edge_01["NodeId2_RING"].map(
    lambda node_id: node_id.split(":")[0].upper()
)

In [None]:
df_RING_edge_01["Residue2_chain_RING"].value_counts()

A    47151992
B    20147498
C     7256997
D     5709076
H     2297421
E     1955871
F     1448521
G     1082157
I      988579
L      836785
J      726664
T      416461
P      342364
X      261350
M      229698
K      228276
O      150638
R      140521
S      130317
Q      106805
N       96643
U       62113
Y       49532
Z       24546
V       22340
2       18270
W       12477
1        9375
3        4272
4        3444
0        1384
5        1353
7        1237
8        1209
9         496
6         343
Name: Residue2_chain_RING, dtype: int64

In [None]:
df_RING_edge_01.head(25)

Unnamed: 0,PDB_id_RING,NodeId1_RING,Interaction_RING,NodeId2_RING,interacao_RING,subinteracao_RING,Residue1_RING,Residue1_pos_RING,Residue1_chain_RING,Residue2_RING,Residue2_pos_RING,Residue2_chain_RING
0,10GS,A:210:_:VWW,VDW:LIG_SC,A:7:_:TYR,VDW,LIG_SC,VWW,210,A,Tyr,7,A
1,10GS,A:210:_:VWW,VDW:LIG_SC,A:7:_:TYR,VDW,LIG_SC,VWW,210,A,Tyr,7,A
2,10GS,A:210:_:VWW,IAC:LIG_SC,A:7:_:TYR,IAC,LIG_SC,VWW,210,A,Tyr,7,A
3,10GS,A:210:_:VWW,IAC:LIG_SC,A:7:_:TYR,IAC,LIG_SC,VWW,210,A,Tyr,7,A
4,10GS,A:210:_:VWW,IAC:LIG_SC,A:7:_:TYR,IAC,LIG_SC,VWW,210,A,Tyr,7,A
5,10GS,A:210:_:VWW,IAC:LIG_SC,A:7:_:TYR,IAC,LIG_SC,VWW,210,A,Tyr,7,A
6,10GS,A:210:_:VWW,IAC:LIG_SC,A:7:_:TYR,IAC,LIG_SC,VWW,210,A,Tyr,7,A
7,10GS,A:210:_:VWW,IAC:LIG_SC,A:7:_:TYR,IAC,LIG_SC,VWW,210,A,Tyr,7,A
8,10GS,A:210:_:VWW,IAC:LIG_SC,A:7:_:TYR,IAC,LIG_SC,VWW,210,A,Tyr,7,A
9,10GS,A:210:_:VWW,IAC:LIG_SC,A:7:_:TYR,IAC,LIG_SC,VWW,210,A,Tyr,7,A


In [None]:
df_RING_edge_01.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91917025 entries, 0 to 91917024
Data columns (total 12 columns):
 #   Column               Dtype 
---  ------               ----- 
 0   PDB_id_RING          object
 1   NodeId1_RING         object
 2   Interaction_RING     object
 3   NodeId2_RING         object
 4   interacao_RING       object
 5   subinteracao_RING    object
 6   Residue1_RING        object
 7   Residue1_pos_RING    int64 
 8   Residue1_chain_RING  object
 9   Residue2_RING        object
 10  Residue2_pos_RING    int64 
 11  Residue2_chain_RING  object
dtypes: int64(2), object(10)
memory usage: 8.2+ GB


In [None]:

def categories_column(df, columns=('PDB_id_RING', 'Residue1_RING', 'Residue2_RING', 'interacao_RING')):
    """Print the value counts of selected columns, one dictionary per column.

    For each column the function prints the column name followed by a dict
    mapping every distinct value to the number of rows holding it (in the
    order produced by ``value_counts``), then a blank separator.

    Args:
        df: DataFrame to inspect.
        columns: Column names to summarise. Defaults to the four RING
            columns this notebook inspects, so existing calls that pass only
            ``df`` behave as before.

    Raises:
        KeyError: if one of ``columns`` is not present in ``df``.
    """
    for col in columns:
        counts_by_value = df[col].value_counts().to_dict()
        print(col, counts_by_value)
        print('\n')

categories_column(df_RING_edge_01)

PDB_id_RING {'3SOM': 158140, '2NZ4': 128986, '4BL5': 110833, '2Q3E': 109988, '4DL1': 102084, '1ZMD': 94881, '3KO0': 92030, '1DO8': 90763, '4DVQ': 90401, '4UHL': 90379, '1GZ4': 88268, '1KX5': 86644, '1ZMC': 85393, '4BC2': 84646, '4CQM': 82994, '2F5Z': 82961, '2VTB': 80806, '1CVJ': 80613, '4BC4': 79424, '4BC3': 75866, '3D1N': 75763, '4IEM': 73308, '3REK': 70328, '2CV5': 69845, '3GQC': 69344, '3AFA': 69314, '2QG4': 68824, '2PYO': 68823, '3REJ': 68538, '3REH': 68082, '3AZG': 67543, '1KX3': 67203, '1F66': 66583, '1KX4': 66458, '3AV1': 66120, '2VIG': 66091, '2NQB': 64976, '1P3L': 63813, '1P3I': 63788, '4AY1': 63534, '4K4I': 58742, '2VCV': 58734, '4K4H': 58006, '2QC8': 57875, '4X4V': 57788, '1O01': 57253, '4K4G': 57155, '1O02': 56781, '4R08': 55472, '4EJH': 54994, '4R07': 54691, '3JSX': 54647, '4X4T': 54253, '3PTZ': 53317, '4GLS': 53241, '3ODI': 51919, '3ODL': 51518, '4OKN': 49945, '3TMJ': 49626, '3W3L': 49505, '4N7O': 49302, '4EJG': 49159, '2C6Q': 49133, '2J6L': 48514, '4D0Z': 48283, '1CW3':

In [None]:
# Residue1_pos_RING is an int64 column after the CSV reload (see .info() above),
# so the position is compared with the integer 98 rather than the string "98".
df_RING_edge_01.query('PDB_id_RING == "10GS" and Residue1_RING == "Asp" and Residue1_pos_RING == 98 and Residue1_chain_RING == "A"')

Unnamed: 0,PDB_id_RING,NodeId1_RING,Interaction_RING,NodeId2_RING,interacao_RING,subinteracao_RING,Residue1_RING,Residue1_pos_RING,Residue1_chain_RING,Residue2_RING,Residue2_pos_RING,Residue2_chain_RING
3177,10GS,A:98:_:ASP,HBOND:MC_MC,A:101:_:CYS,HBOND,MC_MC,Asp,98,A,Cys,101,A
3178,10GS,A:98:_:ASP,HBOND:MC_MC,A:102:_:LYS,HBOND,MC_MC,Asp,98,A,Lys,102,A


In [None]:
# Residue2_pos_RING is an int64 column after the CSV reload (see .info() above),
# so the position is compared with the integer 98 rather than the string "98".
df_RING_edge_01.query('PDB_id_RING == "10GS" and Residue2_RING == "Asp" and Residue2_pos_RING == 98 and Residue2_chain_RING == "A"')

Unnamed: 0,PDB_id_RING,NodeId1_RING,Interaction_RING,NodeId2_RING,interacao_RING,subinteracao_RING,Residue1_RING,Residue1_pos_RING,Residue1_chain_RING,Residue2_RING,Residue2_pos_RING,Residue2_chain_RING
3160,10GS,A:94:_:ASP,HBOND:MC_MC,A:98:_:ASP,HBOND,MC_MC,Asp,94,A,Asp,98,A
3161,10GS,A:95:_:GLY,HBOND:MC_MC,A:98:_:ASP,HBOND,MC_MC,Gly,95,A,Asp,98,A
3560,10GS,B:210:_:VWW,IAC:LIG_SC,A:98:_:ASP,IAC,LIG_SC,VWW,210,B,Asp,98,A
3561,10GS,B:210:_:VWW,IAC:LIG_SC,A:98:_:ASP,IAC,LIG_SC,VWW,210,B,Asp,98,A
3562,10GS,B:210:_:VWW,IAC:LIG_SC,A:98:_:ASP,IAC,LIG_SC,VWW,210,B,Asp,98,A
3563,10GS,B:210:_:VWW,IAC:LIG_SC,A:98:_:ASP,IAC,LIG_SC,VWW,210,B,Asp,98,A
3564,10GS,B:210:_:VWW,IAC:LIG_SC,A:98:_:ASP,IAC,LIG_SC,VWW,210,B,Asp,98,A
3565,10GS,B:210:_:VWW,IAC:LIG_SC,A:98:_:ASP,IAC,LIG_SC,VWW,210,B,Asp,98,A
3566,10GS,B:210:_:VWW,IAC:LIG_SC,A:98:_:ASP,IAC,LIG_SC,VWW,210,B,Asp,98,A
3567,10GS,B:210:_:VWW,IAC:LIG_SC,A:98:_:ASP,IAC,LIG_SC,VWW,210,B,Asp,98,A


###1.1.10 Generating an intermediate file with the extraction of the residue from the *Node_id2* attribute of the *edgesDB_01.txt* database

In [None]:
df_RING_edge_01.to_csv("drive/My Drive/ProcessaNovaBase/TrataArqsRING/edgesDB_01_sel_proc_PDB_Res1_Res2.csv",sep='\t',index=False)

###1.1.11 Processing interaction types of Source nodes of edges

In [None]:
# Raise the pandas display limits so wide/long frames are shown untruncated
import pandas as pd

pd.options.display.max_columns = 7000
pd.options.display.max_rows = 90000

In [None]:
import pandas as pd

# Reload the checkpoint holding both extracted nodes (source and target).
# - dtype: every text column is pinned to str, so chunk-wise type inference
#   cannot turn digit-only values (e.g. chain ids "1", "2", ...) into numbers.
#   The two *_pos_RING columns are left to inference and are read as integers.
# - keep_default_na / na_values: only empty fields count as missing. With the
#   pandas defaults a residue or ligand code spelled "NA" is read as NaN, and
#   rows with NaN keys are then dropped by the groupby below.
df_RING_edge_01 = pd.read_csv(
    "drive/My Drive/ProcessaNovaBase/TrataArqsRING/edgesDB_01_sel_proc_PDB_Res1_Res2.csv",
    delimiter='\t',
    dtype={
        'PDB_id_RING': str,
        'NodeId1_RING': str,
        'Interaction_RING': str,
        'NodeId2_RING': str,
        'interacao_RING': str,
        'subinteracao_RING': str,
        'Residue1_RING': str,
        'Residue1_chain_RING': str,
        'Residue2_RING': str,
        'Residue2_chain_RING': str,
    },
    keep_default_na=False,
    na_values=[''],
)

In [None]:
df_RING_edge_01.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91917025 entries, 0 to 91917024
Data columns (total 12 columns):
 #   Column               Dtype 
---  ------               ----- 
 0   PDB_id_RING          object
 1   NodeId1_RING         object
 2   Interaction_RING     object
 3   NodeId2_RING         object
 4   interacao_RING       object
 5   subinteracao_RING    object
 6   Residue1_RING        object
 7   Residue1_pos_RING    int64 
 8   Residue1_chain_RING  object
 9   Residue2_RING        object
 10  Residue2_pos_RING    int64 
 11  Residue2_chain_RING  object
dtypes: int64(2), object(10)
memory usage: 8.2+ GB


In [None]:
df_RING_edge_01.head(100)

Unnamed: 0,PDB_id_RING,NodeId1_RING,Interaction_RING,NodeId2_RING,interacao_RING,subinteracao_RING,Residue1_RING,Residue1_pos_RING,Residue1_chain_RING,Residue2_RING,Residue2_pos_RING,Residue2_chain_RING
0,10GS,A:210:_:VWW,VDW:LIG_SC,A:7:_:TYR,VDW,LIG_SC,VWW,210,A,Tyr,7,A
1,10GS,A:210:_:VWW,VDW:LIG_SC,A:7:_:TYR,VDW,LIG_SC,VWW,210,A,Tyr,7,A
2,10GS,A:210:_:VWW,IAC:LIG_SC,A:7:_:TYR,IAC,LIG_SC,VWW,210,A,Tyr,7,A
3,10GS,A:210:_:VWW,IAC:LIG_SC,A:7:_:TYR,IAC,LIG_SC,VWW,210,A,Tyr,7,A
4,10GS,A:210:_:VWW,IAC:LIG_SC,A:7:_:TYR,IAC,LIG_SC,VWW,210,A,Tyr,7,A
5,10GS,A:210:_:VWW,IAC:LIG_SC,A:7:_:TYR,IAC,LIG_SC,VWW,210,A,Tyr,7,A
6,10GS,A:210:_:VWW,IAC:LIG_SC,A:7:_:TYR,IAC,LIG_SC,VWW,210,A,Tyr,7,A
7,10GS,A:210:_:VWW,IAC:LIG_SC,A:7:_:TYR,IAC,LIG_SC,VWW,210,A,Tyr,7,A
8,10GS,A:210:_:VWW,IAC:LIG_SC,A:7:_:TYR,IAC,LIG_SC,VWW,210,A,Tyr,7,A
9,10GS,A:210:_:VWW,IAC:LIG_SC,A:7:_:TYR,IAC,LIG_SC,VWW,210,A,Tyr,7,A


In [None]:
df_RING_edge_01.query('PDB_id_RING == "10GS" & NodeId1_RING == "A:100:_:ARG"')

Unnamed: 0,PDB_id_RING,NodeId1_RING,Interaction_RING,NodeId2_RING,interacao_RING,subinteracao_RING,Residue1_RING,Residue1_pos_RING,Residue1_chain_RING,Residue2_RING,Residue2_pos_RING,Residue2_chain_RING
3185,10GS,A:100:_:ARG,HBOND:MC_MC,A:103:_:TYR,HBOND,MC_MC,Arg,100,A,Tyr,103,A
3186,10GS,A:100:_:ARG,VDW:MC_SC,A:103:_:TYR,VDW,MC_SC,Arg,100,A,Tyr,103,A
3187,10GS,A:100:_:ARG,HBOND:MC_MC,A:104:_:ILE,HBOND,MC_MC,Arg,100,A,Ile,104,A
3188,10GS,A:100:_:ARG,VDW:SC_SC,A:104:_:ILE,VDW,SC_SC,Arg,100,A,Ile,104,A
3189,10GS,A:100:_:ARG,HBOND:SC_SC,A:154:_:ASN,HBOND,SC_SC,Arg,100,A,Asn,154,A
3190,10GS,A:100:_:ARG,HBOND:SC_MC,A:154:_:ASN,HBOND,SC_MC,Arg,100,A,Asn,154,A
3191,10GS,A:100:_:ARG,VDW:SC_SC,A:154:_:ASN,VDW,SC_SC,Arg,100,A,Asn,154,A



Let's group the edge Source node records by **PDB_id_RING**, **Residue1_RING**, **Residue1_pos_RING** and **Residue1_chain_RING**, and count the types of interaction they have.

Solution based on: https://stackoverflow.com/questions/29836477/pandas-create-new-column-with-count-from-groupby

In [None]:
df_RING_edge_01.groupby(['PDB_id_RING',	'NodeId1_RING', 'interacao_RING'])["Interaction_RING"].count().reset_index(name="count")

Unnamed: 0,PDB_id_RING,NodeId1_RING,interacao_RING,count
0,10GS,A:100:_:ARG,HBOND,4
1,10GS,A:100:_:ARG,VDW,3
2,10GS,A:101:_:CYS,HBOND,2
3,10GS,A:102:_:LYS,HBOND,3
4,10GS,A:102:_:LYS,VDW,2
...,...,...,...,...
8220876,4X6J,A:6:_:ASP,HBOND,1
8220877,4X6J,A:6:_:ASP,VDW,3
8220878,4X6J,A:7:_:TYR,HBOND,3
8220879,4X6J,A:7:_:TYR,PIPISTACK,1


In [None]:
# One row per (PDB, source residue, position, chain, interaction type), with
# the number of edges in that group stored in a "count" column.
df_groupnode1_0 = (
    df_RING_edge_01
    .groupby([
        'PDB_id_RING',
        'Residue1_RING',
        'Residue1_pos_RING',
        'Residue1_chain_RING',
        'interacao_RING',
    ])["Interaction_RING"]
    .count()
    .reset_index(name="count")
)

In [None]:
df_groupnode1_0.head(10)

Unnamed: 0,PDB_id_RING,Residue1_RING,Residue1_pos_RING,Residue1_chain_RING,interacao_RING,count
0,10GS,Ala,15,A,HBOND,2
1,10GS,Ala,15,A,VDW,1
2,10GS,Ala,15,B,HBOND,2
3,10GS,Ala,15,B,VDW,2
4,10GS,Ala,16,A,HBOND,2
5,10GS,Ala,16,A,VDW,1
6,10GS,Ala,16,B,HBOND,2
7,10GS,Ala,16,B,VDW,1
8,10GS,Ala,22,A,HBOND,1
9,10GS,Ala,22,A,VDW,3


In [None]:
# One row per (PDB, source residue, position, chain, full "type:subtype"
# interaction), with the number of edges in that group stored in "count".
df_groupnode1 = (
    df_RING_edge_01
    .groupby([
        'PDB_id_RING',
        'Residue1_RING',
        'Residue1_pos_RING',
        'Residue1_chain_RING',
        'Interaction_RING',
    ])["interacao_RING"]
    .count()
    .reset_index(name="count")
)

In [None]:
df_groupnode1.head(10)

Unnamed: 0,PDB_id_RING,Residue1_RING,Residue1_pos_RING,Residue1_chain_RING,Interaction_RING,count
0,10GS,Ala,15,A,HBOND:MC_MC,2
1,10GS,Ala,15,A,VDW:SC_SC,1
2,10GS,Ala,15,B,HBOND:MC_MC,2
3,10GS,Ala,15,B,VDW:MC_SC,1
4,10GS,Ala,15,B,VDW:SC_SC,1
5,10GS,Ala,16,A,HBOND:MC_MC,2
6,10GS,Ala,16,A,VDW:SC_MC,1
7,10GS,Ala,16,B,HBOND:MC_MC,2
8,10GS,Ala,16,B,VDW:SC_MC,1
9,10GS,Ala,22,A,HBOND:MC_MC,1


In [None]:
# Residue1_pos_RING is an int64 column in df_groupnode1, so the position is
# compared with the integer 7 rather than the string "7".
df_groupnode1.query('PDB_id_RING == "10GS" & Residue1_RING == "Tyr" & Residue1_pos_RING == 7 & Residue1_chain_RING == "A"')

Unnamed: 0,PDB_id_RING,Residue1_RING,Residue1_pos_RING,Residue1_chain_RING,Interaction_RING,count
525,10GS,Tyr,7,A,HBOND:MC_MC,2
526,10GS,Tyr,7,A,HBOND:SC_MC,1
527,10GS,Tyr,7,A,VDW:SC_SC,12


In [None]:
df_groupnode1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10359108 entries, 0 to 10359107
Data columns (total 6 columns):
 #   Column               Dtype 
---  ------               ----- 
 0   PDB_id_RING          object
 1   Residue1_RING        object
 2   Residue1_pos_RING    int64 
 3   Residue1_chain_RING  object
 4   Interaction_RING     object
 5   count                int64 
dtypes: int64(2), object(4)
memory usage: 474.2+ MB


Creating a Dataframe that will store the processing of the **df_groupnode1** dataframe.
The Dataframe will have the following attributes:
- **PDB_id_RING**: PDB              
- **Node_RING**: the node can be a residue or a ligand        
- **Node_pos_RING**: position of the node in the chain
- **Node_chain_RING**: chain       
- **Node_type**: node type: *source* (S) or *target* (T)              
- **Inter_Lig_tot**: total residue-ligand interactions            
- **Inter_Res_tot**: total residue-residue interactions        
- **Inter_IAC_Lig_tot**: total interactions of type IAC with ligand
- **Inter_VDW_Lig_tot**: total interactions of type VDW with ligand
- **Inter_HBOND_Lig_tot**: total interactions of type HBOND with ligand       
- **Inter_PIPISTACK_Lig_tot**: total interactions of type PIPISTACK with ligand   
- **Inter_IONIC_Lig_tot**: total interactions of type IONIC with ligand      
- **Inter_SSBOND_Lig_tot**: total interactions of type SSBOND with ligand      
- **Inter_PICATION_Lig_tot**: total interactions of type PICATION with ligand    
- **Inter_IAC_Res_tot**: total interactions of type IAC between residues
- **Inter_VDW_Res_tot**: total interactions of type VDW between residues
- **Inter_HBOND_Res_tot**: total interactions of type HBOND between residues               
- **Inter_PIPISTACK_Res_tot**: total interactions of type PIPISTACK between residues
- **Inter_IONIC_Res_tot**: total interactions of type IONIC between residues       
- **Inter_SSBOND_Res_tot**: total interactions of type SSBOND between residues              
- **Inter_PICATION_Res_tot**: total interactions of type PICATION between residues         


In [None]:
# Columns of the per-node summary frame, in output order:
# node identity, overall totals, then one total per interaction type for
# residue-ligand edges followed by the same types for residue-residue edges.
COLUMN_NAMES = (
    # node identity and role
    ['PDB_id_RING', 'Node_RING', 'Node_pos_RING', 'Node_chain_RING', 'Node_type']
    # overall totals
    + ['Inter_Lig_tot', 'Inter_Res_tot']
    # per-type totals, residue-ligand
    + ['Inter_IAC_Lig_tot', 'Inter_VDW_Lig_tot', 'Inter_HBOND_Lig_tot',
       'Inter_PIPISTACK_Lig_tot', 'Inter_IONIC_Lig_tot', 'Inter_SSBOND_Lig_tot',
       'Inter_PICATION_Lig_tot']
    # per-type totals, residue-residue
    + ['Inter_IAC_Res_tot', 'Inter_VDW_Res_tot', 'Inter_HBOND_Res_tot',
       'Inter_PIPISTACK_Res_tot', 'Inter_IONIC_Res_tot', 'Inter_SSBOND_Res_tot',
       'Inter_PICATION_Res_tot']
)
# Empty frame with those columns, to be filled by the processing below
df_proc_RING_1 = pd.DataFrame(columns=COLUMN_NAMES)

In [None]:
df_proc_RING_1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Data columns (total 21 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   PDB_id_RING              0 non-null      object
 1   Node_RING                0 non-null      object
 2   Node_pos_RING            0 non-null      object
 3   Node_chain_RING          0 non-null      object
 4   Node_type                0 non-null      object
 5   Inter_Lig_tot            0 non-null      object
 6   Inter_Res_tot            0 non-null      object
 7   Inter_IAC_Lig_tot        0 non-null      object
 8   Inter_VDW_Lig_tot        0 non-null      object
 9   Inter_HBOND_Lig_tot      0 non-null      object
 10  Inter_PIPISTACK_Lig_tot  0 non-null      object
 11  Inter_IONIC_Lig_tot      0 non-null      object
 12  Inter_SSBOND_Lig_tot     0 non-null      object
 13  Inter_PICATION_Lig_tot   0 non-null      object
 14  Inter_IAC_Res_tot        0 non-null      object
 15  Inter_

Processing nodes that are source (nodes present in the **Residue1_RING** attribute) stored in the **df_groupnode1** dataframe

In [None]:

# Module-level result lists, all starting empty. The first family is named
# after the summary-frame columns; presumably they are filled by the
# processing function below (its body is not fully visible here).
# NOTE(review): `l__Node_pos_RING` has a double underscore; the name is kept
# as-is because other cells may reference it.

# -- node identity --
l_PDB_id_RING, l_Node_RING, l__Node_pos_RING = [], [], []
l_Node_chain_RING, l_Node_type = [], []
# -- overall totals --
l_Inter_Lig_tot, l_Inter_Res_tot = [], []
# -- per-type totals, residue-ligand --
l_Inter_IAC_Lig_tot, l_Inter_VDW_Lig_tot, l_Inter_HBOND_Lig_tot = [], [], []
l_Inter_PIPISTACK_Lig_tot, l_Inter_IONIC_Lig_tot = [], []
l_Inter_SSBOND_Lig_tot, l_Inter_PICATION_Lig_tot = [], []
# -- per-type totals, residue-residue --
l_Inter_IAC_Res_tot, l_Inter_VDW_Res_tot, l_Inter_HBOND_Res_tot = [], [], []
l_Inter_PIPISTACK_Res_tot, l_Inter_IONIC_Res_tot = [], []
l_Inter_SSBOND_Res_tot, l_Inter_PICATION_Res_tot = [], []

# Second family: short-named lists with the same layout
l_PDB, l_Node, l_pos, l_chain, l_type = [], [], [], [], []
l_lig, l_res = [], []
l_IAC_L, l_VDW_L, l_HBOND_L, l_PIPISTACK_L = [], [], [], []
l_IONIC_L, l_SSBOND_L, l_PICATION_L = [], [], []
l_IAC_R, l_VDW_R, l_HBOND_R, l_PIPISTACK_R = [], [], [], []
l_IONIC_R, l_SSBOND_R, l_PICATION_R = [], [], []

def process_reg_group(df, residue_prefix="Residue1", node_type="S"):
  """Collapse per-interaction edge counts into one record per node.

  Consecutive rows of ``df`` that share the same (PDB id, residue, position,
  chain) are treated as one node and their ``count`` values are summed.
  Rows of the same node must therefore be adjacent in ``df``.

  Parameters
  ----------
  df : pandas.DataFrame
      Needs the columns ``PDB_id_RING``, ``<residue_prefix>_RING``,
      ``<residue_prefix>_pos_RING``, ``<residue_prefix>_chain_RING``,
      ``Interaction_RING`` (a ``"<type>:<subtype>"`` label) and ``count``.
  residue_prefix : str, default "Residue1"
      Column prefix of the node being summarised ("Residue1" = edge source).
  node_type : str, default "S"
      Value written to the ``Node_type`` column of every record.

  Returns
  -------
  tuple of 21 lists, one per output column, in this order: PDB id, node,
  position, chain, node type, ligand total, residue total, then the ligand
  totals for IAC, VDW, HBOND, PIPISTACK, IONIC, SSBOND, PICATION, then the
  residue-residue totals for the same seven types.

  Notes
  -----
  * The final node is emitted after the loop; flushing only on a key change
    would silently drop it.
  * Results are returned in fresh lists on every call; the module-level
    ``l_*`` lists are not modified, so calling the function repeatedly never
    duplicates records. Callers should use the return value.
  * The first row is recognised by position, not by ``Index == 0``, so frames
    with a non-default index are handled as well.
  """
  interaction_types = ("IAC", "VDW", "HBOND", "PIPISTACK", "IONIC", "SSBOND", "PICATION")
  slot_of = {name: slot for slot, name in enumerate(interaction_types)}
  n_types = len(interaction_types)

  # 4 key columns + node type + 2 grand totals + per-type ligand + per-type residue
  columns = [[] for _ in range(7 + 2 * n_types)]

  def emit(key, lig_total, res_total, lig_by_type, res_by_type):
    # Append one finished node record across all output columns.
    record = (*key, node_type, lig_total, res_total, *lig_by_type, *res_by_type)
    for column, value in zip(columns, record):
      column.append(value)

  rows = zip(
    df["PDB_id_RING"],
    df[residue_prefix + "_RING"],
    df[residue_prefix + "_pos_RING"],
    df[residue_prefix + "_chain_RING"],
    df["Interaction_RING"],
    df["count"],
  )

  current_key = None  # None means "no row seen yet"
  lig_total = res_total = 0
  lig_by_type = [0] * n_types
  res_by_type = [0] * n_types

  for pdb, residue, position, chain, interaction, n_edges in rows:
    key = (pdb, residue, position, chain)
    if key != current_key:
      if current_key is not None:
        emit(current_key, lig_total, res_total, lig_by_type, res_by_type)
      current_key = key
      lig_total = res_total = 0
      lig_by_type = [0] * n_types
      res_by_type = [0] * n_types

    # Any label mentioning "LIG" is a residue-ligand interaction; everything
    # else is residue-residue.
    if "LIG" in interaction:
      lig_total += n_edges
      per_type = lig_by_type
    else:
      res_total += n_edges
      per_type = res_by_type

    # Unknown interaction types still count towards the grand total only.
    slot = slot_of.get(interaction.split(":")[0])
    if slot is not None:
      per_type[slot] += n_edges

  if current_key is not None:
    emit(current_key, lig_total, res_total, lig_by_type, res_by_type)

  return tuple(columns)


In [None]:
#Processing nodes that are source in the edges
l_PDB1,l_Node1,l_pos1,l_chain1,l_type1,l_lig1,l_res1,l_IAC_L1,l_VDW_L1,l_HBOND_L1,l_PIPISTACK_L1,l_IONIC_L1,l_SSBOND_L1,l_PICATION_L1,l_IAC_R1,l_VDW_R1,l_HBOND_R1,l_PIPISTACK_R1,l_IONIC_R1,l_SSBOND_R1,l_PICATION_R1 = process_reg_group(df_groupnode1)

In [None]:
#Resulting size of processing Source nodes
tam = len(l_PDB1)
print(tam)

5162883


In [None]:
# Fill the source-node table column by column, pairing every column name
# with the list produced for it by process_reg_group.
source_node_columns = [
    ('PDB_id_RING', l_PDB1),
    ('Node_RING', l_Node1),
    ('Node_pos_RING', l_pos1),
    ('Node_chain_RING', l_chain1),
    ('Node_type', l_type1),
    ('Inter_Lig_tot', l_lig1),
    ('Inter_Res_tot', l_res1),
    ('Inter_IAC_Lig_tot', l_IAC_L1),
    ('Inter_VDW_Lig_tot', l_VDW_L1),
    ('Inter_HBOND_Lig_tot', l_HBOND_L1),
    ('Inter_PIPISTACK_Lig_tot', l_PIPISTACK_L1),
    ('Inter_IONIC_Lig_tot', l_IONIC_L1),
    ('Inter_SSBOND_Lig_tot', l_SSBOND_L1),
    ('Inter_PICATION_Lig_tot', l_PICATION_L1),
    ('Inter_IAC_Res_tot', l_IAC_R1),
    ('Inter_VDW_Res_tot', l_VDW_R1),
    ('Inter_HBOND_Res_tot', l_HBOND_R1),
    ('Inter_PIPISTACK_Res_tot', l_PIPISTACK_R1),
    ('Inter_IONIC_Res_tot', l_IONIC_R1),
    ('Inter_SSBOND_Res_tot', l_SSBOND_R1),
    ('Inter_PICATION_Res_tot', l_PICATION_R1),
]
for column_name, column_values in source_node_columns:
  df_proc_RING_1[column_name] = column_values

In [None]:
df_proc_RING_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5162883 entries, 0 to 5162882
Data columns (total 21 columns):
 #   Column                   Dtype 
---  ------                   ----- 
 0   PDB_id_RING              object
 1   Node_RING                object
 2   Node_pos_RING            int64 
 3   Node_chain_RING          object
 4   Node_type                object
 5   Inter_Lig_tot            int64 
 6   Inter_Res_tot            int64 
 7   Inter_IAC_Lig_tot        int64 
 8   Inter_VDW_Lig_tot        int64 
 9   Inter_HBOND_Lig_tot      int64 
 10  Inter_PIPISTACK_Lig_tot  int64 
 11  Inter_IONIC_Lig_tot      int64 
 12  Inter_SSBOND_Lig_tot     int64 
 13  Inter_PICATION_Lig_tot   int64 
 14  Inter_IAC_Res_tot        int64 
 15  Inter_VDW_Res_tot        int64 
 16  Inter_HBOND_Res_tot      int64 
 17  Inter_PIPISTACK_Res_tot  int64 
 18  Inter_IONIC_Res_tot      int64 
 19  Inter_SSBOND_Res_tot     int64 
 20  Inter_PICATION_Res_tot   int64 
dtypes: int64(17), object(4)
memory 

In [None]:
df_proc_RING_1.head(10)

Unnamed: 0,PDB_id_RING,Node_RING,Node_pos_RING,Node_chain_RING,Node_type,Inter_Lig_tot,Inter_Res_tot,Inter_IAC_Lig_tot,Inter_VDW_Lig_tot,Inter_HBOND_Lig_tot,Inter_PIPISTACK_Lig_tot,Inter_IONIC_Lig_tot,Inter_SSBOND_Lig_tot,Inter_PICATION_Lig_tot,Inter_IAC_Res_tot,Inter_VDW_Res_tot,Inter_HBOND_Res_tot,Inter_PIPISTACK_Res_tot,Inter_IONIC_Res_tot,Inter_SSBOND_Res_tot,Inter_PICATION_Res_tot
0,10GS,Ala,15,A,S,0,3,0,0,0,0,0,0,0,0,1,2,0,0,0,0
1,10GS,Ala,15,B,S,0,4,0,0,0,0,0,0,0,0,2,2,0,0,0,0
2,10GS,Ala,16,A,S,0,3,0,0,0,0,0,0,0,0,1,2,0,0,0,0
3,10GS,Ala,16,B,S,0,3,0,0,0,0,0,0,0,0,1,2,0,0,0,0
4,10GS,Ala,22,A,S,0,4,0,0,0,0,0,0,0,0,3,1,0,0,0,0
5,10GS,Ala,22,B,S,0,4,0,0,0,0,0,0,0,0,3,1,0,0,0,0
6,10GS,Ala,86,A,S,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0
7,10GS,Ala,86,B,S,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0
8,10GS,Ala,87,A,S,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0
9,10GS,Ala,87,B,S,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0


In [None]:
# Node_pos_RING is an int64 column, so it is compared with an integer literal;
# matching it against the string "7" only works on pandas versions that coerce the value.
df_proc_RING_1.query('PDB_id_RING == "10GS" & Node_RING == "Tyr" & Node_pos_RING == 7 & Node_chain_RING == "A"')

Unnamed: 0,PDB_id_RING,Node_RING,Node_pos_RING,Node_chain_RING,Node_type,Inter_Lig_tot,Inter_Res_tot,Inter_IAC_Lig_tot,Inter_VDW_Lig_tot,Inter_HBOND_Lig_tot,Inter_PIPISTACK_Lig_tot,Inter_IONIC_Lig_tot,Inter_SSBOND_Lig_tot,Inter_PICATION_Lig_tot,Inter_IAC_Res_tot,Inter_VDW_Res_tot,Inter_HBOND_Res_tot,Inter_PIPISTACK_Res_tot,Inter_IONIC_Res_tot,Inter_SSBOND_Res_tot,Inter_PICATION_Res_tot
275,10GS,Tyr,7,A,S,0,15,0,0,0,0,0,0,0,0,12,3,0,0,0,0


###1.1.12 Processing interaction types of Target nodes of edges

In [None]:
#increasing the viewing capacity of columns and rows
import pandas as pd

pd.set_option('display.max_columns', 7000)
pd.set_option('display.max_rows',90000)

In [None]:
import pandas as pd
df_RING_edge_01 = pd.read_csv("drive/My Drive/ProcessaNovaBase/TrataArqsRING/edgesDB_01_sel_proc_PDB_Res1_Res2.csv",delimiter='\t')

In [None]:
df_RING_edge_01.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91917025 entries, 0 to 91917024
Data columns (total 12 columns):
 #   Column               Dtype 
---  ------               ----- 
 0   PDB_id_RING          object
 1   NodeId1_RING         object
 2   Interaction_RING     object
 3   NodeId2_RING         object
 4   interacao_RING       object
 5   subinteracao_RING    object
 6   Residue1_RING        object
 7   Residue1_pos_RING    int64 
 8   Residue1_chain_RING  object
 9   Residue2_RING        object
 10  Residue2_pos_RING    int64 
 11  Residue2_chain_RING  object
dtypes: int64(2), object(10)
memory usage: 8.2+ GB


In [None]:
df_RING_edge_01.head(100)

Unnamed: 0,PDB_id_RING,NodeId1_RING,Interaction_RING,NodeId2_RING,interacao_RING,subinteracao_RING,Residue1_RING,Residue1_pos_RING,Residue1_chain_RING,Residue2_RING,Residue2_pos_RING,Residue2_chain_RING
0,10GS,A:210:_:VWW,VDW:LIG_SC,A:7:_:TYR,VDW,LIG_SC,VWW,210,A,Tyr,7,A
1,10GS,A:210:_:VWW,VDW:LIG_SC,A:7:_:TYR,VDW,LIG_SC,VWW,210,A,Tyr,7,A
2,10GS,A:210:_:VWW,IAC:LIG_SC,A:7:_:TYR,IAC,LIG_SC,VWW,210,A,Tyr,7,A
3,10GS,A:210:_:VWW,IAC:LIG_SC,A:7:_:TYR,IAC,LIG_SC,VWW,210,A,Tyr,7,A
4,10GS,A:210:_:VWW,IAC:LIG_SC,A:7:_:TYR,IAC,LIG_SC,VWW,210,A,Tyr,7,A
5,10GS,A:210:_:VWW,IAC:LIG_SC,A:7:_:TYR,IAC,LIG_SC,VWW,210,A,Tyr,7,A
6,10GS,A:210:_:VWW,IAC:LIG_SC,A:7:_:TYR,IAC,LIG_SC,VWW,210,A,Tyr,7,A
7,10GS,A:210:_:VWW,IAC:LIG_SC,A:7:_:TYR,IAC,LIG_SC,VWW,210,A,Tyr,7,A
8,10GS,A:210:_:VWW,IAC:LIG_SC,A:7:_:TYR,IAC,LIG_SC,VWW,210,A,Tyr,7,A
9,10GS,A:210:_:VWW,IAC:LIG_SC,A:7:_:TYR,IAC,LIG_SC,VWW,210,A,Tyr,7,A


Let's group the edge Target node records by **PDB_id_RING**, **Residue2_RING**, **Residue2_pos_RING** and **Residue2_chain_RING**, and count how many interactions of each type they have

Solution based on: https://stackoverflow.com/questions/29836477/pandas-create-new-column-with-count-from-groupby

In [None]:
# One row per (PDB, target residue, position, chain, interaction label);
# "count" holds the number of non-null interacao_RING values in that group.
target_node_keys = ['PDB_id_RING', 'Residue2_RING', 'Residue2_pos_RING', 'Residue2_chain_RING', 'Interaction_RING']
edges_per_target_interaction = df_RING_edge_01.groupby(target_node_keys)["interacao_RING"].count()
df_groupnode2 = edges_per_target_interaction.reset_index(name="count")

In [None]:
df_groupnode2.head(10)

Unnamed: 0,PDB_id_RING,Residue2_RING,Residue2_pos_RING,Residue2_chain_RING,Interaction_RING,count
0,10GS,Ala,15,A,HBOND:MC_MC,1
1,10GS,Ala,15,A,VDW:SC_SC,1
2,10GS,Ala,15,B,HBOND:MC_MC,1
3,10GS,Ala,15,B,VDW:SC_SC,1
4,10GS,Ala,22,A,HBOND:MC_MC,1
5,10GS,Ala,22,A,IAC:LIG_MC,19
6,10GS,Ala,22,A,IAC:LIG_SC,32
7,10GS,Ala,22,B,HBOND:MC_MC,1
8,10GS,Ala,22,B,IAC:LIG_MC,19
9,10GS,Ala,22,B,IAC:LIG_SC,32


In [None]:
# Residue2_pos_RING is an int64 column, so it is compared with an integer literal;
# matching it against the string "7" only works on pandas versions that coerce the value.
df_groupnode2.query('PDB_id_RING == "10GS" & Residue2_RING == "Tyr" & Residue2_pos_RING == 7 & Residue2_chain_RING == "A"')

Unnamed: 0,PDB_id_RING,Residue2_RING,Residue2_pos_RING,Residue2_chain_RING,Interaction_RING,count
731,10GS,Tyr,7,A,IAC:LIG_SC,120
732,10GS,Tyr,7,A,VDW:LIG_SC,2


Creating a Dataframe that will store the processing of the **df_groupnode2** dataframe.
The Dataframe will have the following attributes:
- **PDB_id_RING**: PDB              
- **Node_RING**: the node can be a residue or a ligand        
- **Node_pos_RING**: position of the node in the chain
- **Node_chain_RING**: chain       
- **Node_type**: node type: *source* (S) or *target* (T)              
- **Inter_Lig_tot**: total residue-ligand interactions            
- **Inter_Res_tot**: total residue-residue interactions        
- **Inter_IAC_Lig_tot**: total interactions of type IAC with ligand
- **Inter_VDW_Lig_tot**: total interactions of type VDW with ligand
- **Inter_HBOND_Lig_tot**: total interactions of type HBOND with ligand       
- **Inter_PIPISTACK_Lig_tot**: total interactions of type PIPISTACK with ligand   
- **Inter_IONIC_Lig_tot**: total interactions of type IONIC with ligand      
- **Inter_SSBOND_Lig_tot**: total interactions of type SSBOND with ligand      
- **Inter_PICATION_Lig_tot**: total interactions of type PICATION with ligand    
- **Inter_IAC_Res_tot**: total interactions of type IAC between residues
- **Inter_VDW_Res_tot**: total interactions of type VDW between residues
- **Inter_HBOND_Res_tot**: total interactions of type HBOND between residues               
- **Inter_PIPISTACK_Res_tot**: total interactions of type PIPISTACK between residues
- **Inter_IONIC_Res_tot**: total interactions of type IONIC between residues       
- **Inter_SSBOND_Res_tot**: total interactions of type SSBOND between residues              
- **Inter_PICATION_Res_tot**: total interactions of type PICATION between residues         


In [None]:
# Column layout of the processed node table: node identity, the two grand
# totals, then one ligand total and one residue-residue total per interaction type.
INTERACTION_TYPES = ['IAC', 'VDW', 'HBOND', 'PIPISTACK', 'IONIC', 'SSBOND', 'PICATION']
COLUMN_NAMES = (
    ['PDB_id_RING', 'Node_RING', 'Node_pos_RING', 'Node_chain_RING', 'Node_type',
     'Inter_Lig_tot', 'Inter_Res_tot']
    + ['Inter_' + kind + '_Lig_tot' for kind in INTERACTION_TYPES]
    + ['Inter_' + kind + '_Res_tot' for kind in INTERACTION_TYPES]
)
# Empty frame that will receive the target-node records.
df_proc_RING_2 = pd.DataFrame(columns=COLUMN_NAMES)

In [None]:
df_proc_RING_2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Data columns (total 21 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   PDB_id_RING              0 non-null      object
 1   Node_RING                0 non-null      object
 2   Node_pos_RING            0 non-null      object
 3   Node_chain_RING          0 non-null      object
 4   Node_type                0 non-null      object
 5   Inter_Lig_tot            0 non-null      object
 6   Inter_Res_tot            0 non-null      object
 7   Inter_IAC_Lig_tot        0 non-null      object
 8   Inter_VDW_Lig_tot        0 non-null      object
 9   Inter_HBOND_Lig_tot      0 non-null      object
 10  Inter_PIPISTACK_Lig_tot  0 non-null      object
 11  Inter_IONIC_Lig_tot      0 non-null      object
 12  Inter_SSBOND_Lig_tot     0 non-null      object
 13  Inter_PICATION_Lig_tot   0 non-null      object
 14  Inter_IAC_Res_tot        0 non-null      object
 15  Inter_

Processing nodes that are Target (nodes present in the **Residue2_RING** attribute) stored in the **df_groupnode2** dataframe

In [None]:

# Module-level lists, one per column of the processed node table.
# process_reg_group (defined below) returns one list per column in this same order.
l_PDB_id_RING = []
l_Node_RING = []
l__Node_pos_RING = []
l_Node_chain_RING = []
l_Node_type = []
l_Inter_Lig_tot = []
l_Inter_Res_tot = []
l_Inter_IAC_Lig_tot = []
l_Inter_VDW_Lig_tot = []
l_Inter_HBOND_Lig_tot = []
l_Inter_PIPISTACK_Lig_tot = []
l_Inter_IONIC_Lig_tot = []
l_Inter_SSBOND_Lig_tot = []
l_Inter_PICATION_Lig_tot = []
l_Inter_IAC_Res_tot = []
l_Inter_VDW_Res_tot = []
l_Inter_HBOND_Res_tot = []
l_Inter_PIPISTACK_Res_tot = []
l_Inter_IONIC_Res_tot = []
l_Inter_SSBOND_Res_tot = []
l_Inter_PICATION_Res_tot = []

# Short-named lists with the same layout; nothing in this cell reads or writes them.
l_PDB = []
l_Node = []
l_pos = []
l_chain = []
l_type = []
l_lig = []
l_res = []
l_IAC_L = []
l_VDW_L = []
l_HBOND_L = []
l_PIPISTACK_L = []
l_IONIC_L = []
l_SSBOND_L = []
l_PICATION_L = []
l_IAC_R = []
l_VDW_R = []
l_HBOND_R = []
l_PIPISTACK_R = []
l_IONIC_R = []
l_SSBOND_R = []
l_PICATION_R = []

def process_reg_group(df, residue_prefix="Residue2", node_type="T"):
  """Collapse per-interaction edge counts into one record per node.

  Consecutive rows of ``df`` that share the same (PDB id, residue, position,
  chain) are treated as one node and their ``count`` values are summed.
  Rows of the same node must therefore be adjacent in ``df``.

  Parameters
  ----------
  df : pandas.DataFrame
      Needs the columns ``PDB_id_RING``, ``<residue_prefix>_RING``,
      ``<residue_prefix>_pos_RING``, ``<residue_prefix>_chain_RING``,
      ``Interaction_RING`` (a ``"<type>:<subtype>"`` label) and ``count``.
  residue_prefix : str, default "Residue2"
      Column prefix of the node being summarised ("Residue2" = edge target).
  node_type : str, default "T"
      Value written to the ``Node_type`` column of every record.

  Returns
  -------
  tuple of 21 lists, one per output column, in this order: PDB id, node,
  position, chain, node type, ligand total, residue total, then the ligand
  totals for IAC, VDW, HBOND, PIPISTACK, IONIC, SSBOND, PICATION, then the
  residue-residue totals for the same seven types.

  Notes
  -----
  * The final node is emitted after the loop; flushing only on a key change
    would silently drop it.
  * Results are returned in fresh lists on every call; the module-level
    ``l_*`` lists are not modified, so calling the function repeatedly never
    duplicates records. Callers should use the return value.
  * The first row is recognised by position, not by ``Index == 0``, so frames
    with a non-default index are handled as well.
  """
  interaction_types = ("IAC", "VDW", "HBOND", "PIPISTACK", "IONIC", "SSBOND", "PICATION")
  slot_of = {name: slot for slot, name in enumerate(interaction_types)}
  n_types = len(interaction_types)

  # 4 key columns + node type + 2 grand totals + per-type ligand + per-type residue
  columns = [[] for _ in range(7 + 2 * n_types)]

  def emit(key, lig_total, res_total, lig_by_type, res_by_type):
    # Append one finished node record across all output columns.
    record = (*key, node_type, lig_total, res_total, *lig_by_type, *res_by_type)
    for column, value in zip(columns, record):
      column.append(value)

  rows = zip(
    df["PDB_id_RING"],
    df[residue_prefix + "_RING"],
    df[residue_prefix + "_pos_RING"],
    df[residue_prefix + "_chain_RING"],
    df["Interaction_RING"],
    df["count"],
  )

  current_key = None  # None means "no row seen yet"
  lig_total = res_total = 0
  lig_by_type = [0] * n_types
  res_by_type = [0] * n_types

  for pdb, residue, position, chain, interaction, n_edges in rows:
    key = (pdb, residue, position, chain)
    if key != current_key:
      if current_key is not None:
        emit(current_key, lig_total, res_total, lig_by_type, res_by_type)
      current_key = key
      lig_total = res_total = 0
      lig_by_type = [0] * n_types
      res_by_type = [0] * n_types

    # Any label mentioning "LIG" is a residue-ligand interaction; everything
    # else is residue-residue.
    if "LIG" in interaction:
      lig_total += n_edges
      per_type = lig_by_type
    else:
      res_total += n_edges
      per_type = res_by_type

    # Unknown interaction types still count towards the grand total only.
    slot = slot_of.get(interaction.split(":")[0])
    if slot is not None:
      per_type[slot] += n_edges

  if current_key is not None:
    emit(current_key, lig_total, res_total, lig_by_type, res_by_type)

  return tuple(columns)


In [None]:
#Processing nodes that are Target in the edges
l_PDB2,l_Node2,l_pos2,l_chain2,l_type2,l_lig2,l_res2,l_IAC_L2,l_VDW_L2,l_HBOND_L2,l_PIPISTACK_L2,l_IONIC_L2,l_SSBOND_L2,l_PICATION_L2,l_IAC_R2,l_VDW_R2,l_HBOND_R2,l_PIPISTACK_R2,l_IONIC_R2,l_SSBOND_R2,l_PICATION_R2 = process_reg_group(df_groupnode2)

In [None]:
#Resulting size of processing Target nodes
tam = len(l_PDB2)
print(tam)

5436515


In [None]:
# Fill the target-node table column by column, pairing every column name
# with the list produced for it by process_reg_group.
target_node_columns = [
    ('PDB_id_RING', l_PDB2),
    ('Node_RING', l_Node2),
    ('Node_pos_RING', l_pos2),
    ('Node_chain_RING', l_chain2),
    ('Node_type', l_type2),
    ('Inter_Lig_tot', l_lig2),
    ('Inter_Res_tot', l_res2),
    ('Inter_IAC_Lig_tot', l_IAC_L2),
    ('Inter_VDW_Lig_tot', l_VDW_L2),
    ('Inter_HBOND_Lig_tot', l_HBOND_L2),
    ('Inter_PIPISTACK_Lig_tot', l_PIPISTACK_L2),
    ('Inter_IONIC_Lig_tot', l_IONIC_L2),
    ('Inter_SSBOND_Lig_tot', l_SSBOND_L2),
    ('Inter_PICATION_Lig_tot', l_PICATION_L2),
    ('Inter_IAC_Res_tot', l_IAC_R2),
    ('Inter_VDW_Res_tot', l_VDW_R2),
    ('Inter_HBOND_Res_tot', l_HBOND_R2),
    ('Inter_PIPISTACK_Res_tot', l_PIPISTACK_R2),
    ('Inter_IONIC_Res_tot', l_IONIC_R2),
    ('Inter_SSBOND_Res_tot', l_SSBOND_R2),
    ('Inter_PICATION_Res_tot', l_PICATION_R2),
]
for column_name, column_values in target_node_columns:
  df_proc_RING_2[column_name] = column_values

In [None]:
df_proc_RING_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5436515 entries, 0 to 5436514
Data columns (total 21 columns):
 #   Column                   Dtype 
---  ------                   ----- 
 0   PDB_id_RING              object
 1   Node_RING                object
 2   Node_pos_RING            int64 
 3   Node_chain_RING          object
 4   Node_type                object
 5   Inter_Lig_tot            int64 
 6   Inter_Res_tot            int64 
 7   Inter_IAC_Lig_tot        int64 
 8   Inter_VDW_Lig_tot        int64 
 9   Inter_HBOND_Lig_tot      int64 
 10  Inter_PIPISTACK_Lig_tot  int64 
 11  Inter_IONIC_Lig_tot      int64 
 12  Inter_SSBOND_Lig_tot     int64 
 13  Inter_PICATION_Lig_tot   int64 
 14  Inter_IAC_Res_tot        int64 
 15  Inter_VDW_Res_tot        int64 
 16  Inter_HBOND_Res_tot      int64 
 17  Inter_PIPISTACK_Res_tot  int64 
 18  Inter_IONIC_Res_tot      int64 
 19  Inter_SSBOND_Res_tot     int64 
 20  Inter_PICATION_Res_tot   int64 
dtypes: int64(17), object(4)
memory 

In [None]:
df_proc_RING_2.head(20)

Unnamed: 0,PDB_id_RING,Node_RING,Node_pos_RING,Node_chain_RING,Node_type,Inter_Lig_tot,Inter_Res_tot,Inter_IAC_Lig_tot,Inter_VDW_Lig_tot,Inter_HBOND_Lig_tot,Inter_PIPISTACK_Lig_tot,Inter_IONIC_Lig_tot,Inter_SSBOND_Lig_tot,Inter_PICATION_Lig_tot,Inter_IAC_Res_tot,Inter_VDW_Res_tot,Inter_HBOND_Res_tot,Inter_PIPISTACK_Res_tot,Inter_IONIC_Res_tot,Inter_SSBOND_Res_tot,Inter_PICATION_Res_tot
0,10GS,Ala,15,A,T,0,2,0,0,0,0,0,0,0,0,1,1,0,0,0,0
1,10GS,Ala,15,B,T,0,2,0,0,0,0,0,0,0,0,1,1,0,0,0,0
2,10GS,Ala,22,A,T,51,1,51,0,0,0,0,0,0,0,0,1,0,0,0,0
3,10GS,Ala,22,B,T,51,1,51,0,0,0,0,0,0,0,0,1,0,0,0,0
4,10GS,Ala,45,A,T,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0
5,10GS,Ala,45,B,T,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0
6,10GS,Ala,86,A,T,0,5,0,0,0,0,0,0,0,0,3,2,0,0,0,0
7,10GS,Ala,86,B,T,0,5,0,0,0,0,0,0,0,0,3,2,0,0,0,0
8,10GS,Ala,87,A,T,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0
9,10GS,Ala,87,B,T,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0


In [None]:
# Node_pos_RING is an int64 column, so it is compared with an integer literal;
# matching it against the string "7" only works on pandas versions that coerce the value.
df_proc_RING_2.query('PDB_id_RING == "10GS" & Node_RING == "Tyr" & Node_pos_RING == 7 & Node_chain_RING == "A"')

Unnamed: 0,PDB_id_RING,Node_RING,Node_pos_RING,Node_chain_RING,Node_type,Inter_Lig_tot,Inter_Res_tot,Inter_IAC_Lig_tot,Inter_VDW_Lig_tot,Inter_HBOND_Lig_tot,Inter_PIPISTACK_Lig_tot,Inter_IONIC_Lig_tot,Inter_SSBOND_Lig_tot,Inter_PICATION_Lig_tot,Inter_IAC_Res_tot,Inter_VDW_Res_tot,Inter_HBOND_Res_tot,Inter_PIPISTACK_Res_tot,Inter_IONIC_Res_tot,Inter_SSBOND_Res_tot,Inter_PICATION_Res_tot
311,10GS,Tyr,7,A,T,122,0,120,2,0,0,0,0,0,0,0,0,0,0,0,0


###1.1.13 Integration of Databases that have the interactions of source and target nodes

In [None]:
# Stack the source-node and target-node tables. pd.concat is used because
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.
df_proc_RING = pd.concat([df_proc_RING_1, df_proc_RING_2], ignore_index=True)

In [None]:
df_proc_RING.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10599398 entries, 0 to 10599397
Data columns (total 21 columns):
 #   Column                   Dtype 
---  ------                   ----- 
 0   PDB_id_RING              object
 1   Node_RING                object
 2   Node_pos_RING            int64 
 3   Node_chain_RING          object
 4   Node_type                object
 5   Inter_Lig_tot            int64 
 6   Inter_Res_tot            int64 
 7   Inter_IAC_Lig_tot        int64 
 8   Inter_VDW_Lig_tot        int64 
 9   Inter_HBOND_Lig_tot      int64 
 10  Inter_PIPISTACK_Lig_tot  int64 
 11  Inter_IONIC_Lig_tot      int64 
 12  Inter_SSBOND_Lig_tot     int64 
 13  Inter_PICATION_Lig_tot   int64 
 14  Inter_IAC_Res_tot        int64 
 15  Inter_VDW_Res_tot        int64 
 16  Inter_HBOND_Res_tot      int64 
 17  Inter_PIPISTACK_Res_tot  int64 
 18  Inter_IONIC_Res_tot      int64 
 19  Inter_SSBOND_Res_tot     int64 
 20  Inter_PICATION_Res_tot   int64 
dtypes: int64(17), object(4)
memor

In [None]:
df_proc_RING.head(20)

Unnamed: 0,PDB_id_RING,Node_RING,Node_pos_RING,Node_chain_RING,Node_type,Inter_Lig_tot,Inter_Res_tot,Inter_IAC_Lig_tot,Inter_VDW_Lig_tot,Inter_HBOND_Lig_tot,Inter_PIPISTACK_Lig_tot,Inter_IONIC_Lig_tot,Inter_SSBOND_Lig_tot,Inter_PICATION_Lig_tot,Inter_IAC_Res_tot,Inter_VDW_Res_tot,Inter_HBOND_Res_tot,Inter_PIPISTACK_Res_tot,Inter_IONIC_Res_tot,Inter_SSBOND_Res_tot,Inter_PICATION_Res_tot
0,10GS,Ala,15,A,S,0,3,0,0,0,0,0,0,0,0,1,2,0,0,0,0
1,10GS,Ala,15,B,S,0,4,0,0,0,0,0,0,0,0,2,2,0,0,0,0
2,10GS,Ala,16,A,S,0,3,0,0,0,0,0,0,0,0,1,2,0,0,0,0
3,10GS,Ala,16,B,S,0,3,0,0,0,0,0,0,0,0,1,2,0,0,0,0
4,10GS,Ala,22,A,S,0,4,0,0,0,0,0,0,0,0,3,1,0,0,0,0
5,10GS,Ala,22,B,S,0,4,0,0,0,0,0,0,0,0,3,1,0,0,0,0
6,10GS,Ala,86,A,S,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0
7,10GS,Ala,86,B,S,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0
8,10GS,Ala,87,A,S,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0
9,10GS,Ala,87,B,S,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0


In [None]:
# Node_pos_RING is an int64 column, so it is compared with an integer literal;
# matching it against the string "7" only works on pandas versions that coerce the value.
df_proc_RING.query('PDB_id_RING == "10GS" & Node_RING == "Tyr" & Node_pos_RING == 7 & Node_chain_RING == "A"')

Unnamed: 0,PDB_id_RING,Node_RING,Node_pos_RING,Node_chain_RING,Node_type,Inter_Lig_tot,Inter_Res_tot,Inter_IAC_Lig_tot,Inter_VDW_Lig_tot,Inter_HBOND_Lig_tot,Inter_PIPISTACK_Lig_tot,Inter_IONIC_Lig_tot,Inter_SSBOND_Lig_tot,Inter_PICATION_Lig_tot,Inter_IAC_Res_tot,Inter_VDW_Res_tot,Inter_HBOND_Res_tot,Inter_PIPISTACK_Res_tot,Inter_IONIC_Res_tot,Inter_SSBOND_Res_tot,Inter_PICATION_Res_tot
275,10GS,Tyr,7,A,S,0,15,0,0,0,0,0,0,0,0,12,3,0,0,0,0
5163194,10GS,Tyr,7,A,T,122,0,120,2,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# Node_pos_RING is an int64 column, so it is compared with an integer literal;
# matching it against the string "8" only works on pandas versions that coerce the value.
df_q1 = df_proc_RING.query('PDB_id_RING == "10GS" & Node_RING == "Phe" & Node_pos_RING == 8 & Node_chain_RING == "A"')

In [None]:
tam = len(df_q1)
print(tam)

2


In [None]:
df_q1

Unnamed: 0,PDB_id_RING,Node_RING,Node_pos_RING,Node_chain_RING,Node_type,Inter_Lig_tot,Inter_Res_tot,Inter_IAC_Lig_tot,Inter_VDW_Lig_tot,Inter_HBOND_Lig_tot,Inter_PIPISTACK_Lig_tot,Inter_IONIC_Lig_tot,Inter_SSBOND_Lig_tot,Inter_PICATION_Lig_tot,Inter_IAC_Res_tot,Inter_VDW_Res_tot,Inter_HBOND_Res_tot,Inter_PIPISTACK_Res_tot,Inter_IONIC_Res_tot,Inter_SSBOND_Res_tot,Inter_PICATION_Res_tot
213,10GS,Phe,8,A,S,0,12,0,0,0,0,0,0,0,0,11,0,1,0,0,0
5163136,10GS,Phe,8,A,T,269,0,266,3,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# Node_pos_RING is an int64 column, so it is compared with an integer literal;
# matching it against the string "11" only works on pandas versions that coerce the value.
df_q1 = df_proc_RING.query('PDB_id_RING == "10GS" & Node_RING == "Arg" & Node_pos_RING == 11 & Node_chain_RING == "A"')

In [None]:
tam = len(df_q1)
print(tam)

1


In [None]:
df_q1

Unnamed: 0,PDB_id_RING,Node_RING,Node_pos_RING,Node_chain_RING,Node_type,Inter_Lig_tot,Inter_Res_tot,Inter_IAC_Lig_tot,Inter_VDW_Lig_tot,Inter_HBOND_Lig_tot,Inter_PIPISTACK_Lig_tot,Inter_IONIC_Lig_tot,Inter_SSBOND_Lig_tot,Inter_PICATION_Lig_tot,Inter_IAC_Res_tot,Inter_VDW_Res_tot,Inter_HBOND_Res_tot,Inter_PIPISTACK_Res_tot,Inter_IONIC_Res_tot,Inter_SSBOND_Res_tot,Inter_PICATION_Res_tot
22,10GS,Arg,11,A,S,0,13,0,0,0,0,0,0,0,0,6,7,0,0,0,0


In [None]:
# Node_pos_RING is an int64 column, so it is compared with an integer literal;
# matching it against the string "98" only works on pandas versions that coerce the value.
df_proc_RING.query('PDB_id_RING == "10GS" & Node_RING == "Asp" & Node_pos_RING == 98 & Node_chain_RING == "A"')

Unnamed: 0,PDB_id_RING,Node_RING,Node_pos_RING,Node_chain_RING,Node_type,Inter_Lig_tot,Inter_Res_tot,Inter_IAC_Lig_tot,Inter_VDW_Lig_tot,Inter_HBOND_Lig_tot,Inter_PIPISTACK_Lig_tot,Inter_IONIC_Lig_tot,Inter_SSBOND_Lig_tot,Inter_PICATION_Lig_tot,Inter_IAC_Res_tot,Inter_VDW_Res_tot,Inter_HBOND_Res_tot,Inter_PIPISTACK_Res_tot,Inter_IONIC_Res_tot,Inter_SSBOND_Res_tot,Inter_PICATION_Res_tot
60,10GS,Asp,98,A,S,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0
5162945,10GS,Asp,98,A,T,32,2,32,0,0,0,0,0,0,0,0,2,0,0,0,0


###1.1.14 Generating an intermediate file with the processed **edgesDB_01.txt** database

In [None]:
df_proc_RING.to_csv("drive/My Drive/ProcessaNovaBase/TrataArqsRING/edgesDB_01_proc.csv",sep='\t',index=False)

##1.2 Processing the *edgesDB_02.txt* database

In [None]:
import pandas as pd

df_RING_edge_02 = pd.read_csv("drive/My Drive/ProcessaNovaBase/TrataArqsRING/edgesDB_02.txt",index_col=False, header=None, delimiter='\t')


  interactivity=interactivity, compiler=compiler, result=result)


In [None]:
df_RING_edge_02.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91917025 entries, 0 to 91917024
Data columns (total 13 columns):
 #   Column  Dtype  
---  ------  -----  
 0   0       object 
 1   1       object 
 2   2       object 
 3   3       object 
 4   4       float64
 5   5       float64
 6   6       float64
 7   7       object 
 8   8       object 
 9   9       object 
 10  10      object 
 11  11      object 
 12  12      object 
dtypes: float64(3), object(10)
memory usage: 8.9+ GB


In [None]:
df_RING_edge_02.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,4x6j,A:307:_:3Y2,IAC:LIG_MC,A:66:_:GLY,4.251,-999.9,0.0,C17,CA,,,,
1,4x6j,A:307:_:3Y2,IAC:LIG_MC,A:66:_:GLY,4.262,-999.9,0.0,C16,CA,,,,
2,4x6j,A:307:_:3Y2,IAC:LIG_MC,A:66:_:GLY,4.264,-999.9,0.0,N21,N,,,,
3,4x6j,A:307:_:3Y2,IAC:LIG_MC,A:66:_:GLY,4.276,-999.9,0.0,O5,CA,,,,
4,4x6j,A:307:_:3Y2,IAC:LIG_MC,A:66:_:GLY,4.337,-999.9,0.0,O5,C,,,,


###1.2.1 Renaming the fields

In [None]:
# Replace the positional column labels 0-12 (the file was read with header=None)
# with descriptive names. inplace=True renames the columns on df_RING_edge_02
# itself instead of returning a new frame.
df_RING_edge_02.rename(columns={0: 'PDB_id_RING',
                       1: 'NodeId1_RING',
                       2: 'Interaction_RING',
                       3: 'NodeId2_RING',
                       4: 'Distance',
                       5: 'Angle',
                       6: 'Energy',
                       7: 'Atom1',
                       8: 'Atom2',
                       9: 'Donor',
                       10: 'Positive',
                       11: 'Cation',
                       12: 'Orientation'}, inplace=True)

In [None]:
df_RING_edge_02.head()

Unnamed: 0,PDB_id_RING,NodeId1_RING,Interaction_RING,NodeId2_RING,Distance,Angle,Energy,Atom1,Atom2,Donor,Positive,Cation,Orientation
0,4x6j,A:307:_:3Y2,IAC:LIG_MC,A:66:_:GLY,4.251,-999.9,0.0,C17,CA,,,,
1,4x6j,A:307:_:3Y2,IAC:LIG_MC,A:66:_:GLY,4.262,-999.9,0.0,C16,CA,,,,
2,4x6j,A:307:_:3Y2,IAC:LIG_MC,A:66:_:GLY,4.264,-999.9,0.0,N21,N,,,,
3,4x6j,A:307:_:3Y2,IAC:LIG_MC,A:66:_:GLY,4.276,-999.9,0.0,O5,CA,,,,
4,4x6j,A:307:_:3Y2,IAC:LIG_MC,A:66:_:GLY,4.337,-999.9,0.0,O5,C,,,,


###1.2.2 Selecting the fields that will be used

In [None]:
# Field selection: keep only the structure id, the two node ids and the
# interaction label; the geometric/atom columns are not used downstream.
df_RING_edge_02 = df_RING_edge_02[['PDB_id_RING', 'NodeId1_RING', 'Interaction_RING', 'NodeId2_RING']]

In [None]:
df_RING_edge_02.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91917025 entries, 0 to 91917024
Data columns (total 4 columns):
 #   Column            Dtype 
---  ------            ----- 
 0   PDB_id_RING       object
 1   NodeId1_RING      object
 2   Interaction_RING  object
 3   NodeId2_RING      object
dtypes: object(4)
memory usage: 2.7+ GB


###1.2.3 Generating an intermediate file with the selected fields from the *edgesDB_02.txt* database  

In [None]:
df_RING_edge_02.to_csv("drive/My Drive/ProcessaNovaBase/TrataArqsRING/edgesDB_02_sel.csv",sep='\t',index=False)

###1.2.4 Dividing the *Interaction* field into two fields

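The **Interaction_RING** field (e.g. `IAC:LIG_MC`) will be split at the colon into two new fields: **interacao_RING** (the interaction type, e.g. `IAC`) and **subinteracao_RING** (the subtype, e.g. `LIG_MC`).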

In [None]:
# Raise the display limits so wide or long frames are shown without truncation
import pandas as pd

pd.options.display.max_columns = 7000
pd.options.display.max_rows = 90000

In [None]:
import pandas as pd
df_RING_edge_02 = pd.read_csv("drive/My Drive/ProcessaNovaBase/TrataArqsRING/edgesDB_02_sel.csv",delimiter='\t')

In [None]:
df_RING_edge_02.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91917025 entries, 0 to 91917024
Data columns (total 4 columns):
 #   Column            Dtype 
---  ------            ----- 
 0   PDB_id_RING       object
 1   NodeId1_RING      object
 2   Interaction_RING  object
 3   NodeId2_RING      object
dtypes: object(4)
memory usage: 2.7+ GB


In [None]:
# "TYPE:SUBTYPE" label -> interaction type (text before the first colon)
df_RING_edge_02["interacao_RING"] = [label.split(":")[0] for label in df_RING_edge_02["Interaction_RING"]]

# "TYPE:SUBTYPE" label -> interaction subtype (text after the first colon)
df_RING_edge_02["subinteracao_RING"] = [label.split(":")[1] for label in df_RING_edge_02["Interaction_RING"]]

In [None]:
df_RING_edge_02.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91917025 entries, 0 to 91917024
Data columns (total 6 columns):
 #   Column             Dtype 
---  ------             ----- 
 0   PDB_id_RING        object
 1   NodeId1_RING       object
 2   Interaction_RING   object
 3   NodeId2_RING       object
 4   interacao_RING     object
 5   subinteracao_RING  object
dtypes: object(6)
memory usage: 4.1+ GB


In [None]:
df_RING_edge_02.head()

Unnamed: 0,PDB_id_RING,NodeId1_RING,Interaction_RING,NodeId2_RING,interacao_RING,subinteracao_RING
0,4x6j,A:307:_:3Y2,IAC:LIG_MC,A:66:_:GLY,IAC,LIG_MC
1,4x6j,A:307:_:3Y2,IAC:LIG_MC,A:66:_:GLY,IAC,LIG_MC
2,4x6j,A:307:_:3Y2,IAC:LIG_MC,A:66:_:GLY,IAC,LIG_MC
3,4x6j,A:307:_:3Y2,IAC:LIG_MC,A:66:_:GLY,IAC,LIG_MC
4,4x6j,A:307:_:3Y2,IAC:LIG_MC,A:66:_:GLY,IAC,LIG_MC


###1.2.5 Processing the *PDB_id* attribute

In [None]:
# Normalise the PDB identifiers to upper case (e.g. "4x6j" -> "4X6J").
df_RING_edge_02["PDB_id_RING"] = [pdb_id.upper() for pdb_id in df_RING_edge_02["PDB_id_RING"]]

###1.2.6 Generating an intermediate file with the processing of the *PDB_id* attribute from the *edgesDB_02.txt* database  

In [None]:
df_RING_edge_02.to_csv("drive/My Drive/ProcessaNovaBase/TrataArqsRING/edgesDB_02_sel_proc_PDB.csv",sep='\t',index=False)

###1.2.7 Extracting the residue, its position and chain in the *NodeId1* attribute

The source residue, its position and its chain contained in **NodeId1** will be extracted:

**Residue1_RING**: source node of the edge

**Residue1_pos_RING**: the position of the node

**Residue1_chain_RING**: the chain where the node is located

In [None]:
# Raise the display limits so wide or long frames are shown without truncation
import pandas as pd

pd.options.display.max_columns = 7000
pd.options.display.max_rows = 90000

In [None]:
import pandas as pd
df_RING_edge_02 = pd.read_csv("drive/My Drive/ProcessaNovaBase/TrataArqsRING/edgesDB_02_sel_proc_PDB.csv",delimiter='\t')

In [None]:
df_RING_edge_02.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91917025 entries, 0 to 91917024
Data columns (total 6 columns):
 #   Column             Dtype 
---  ------             ----- 
 0   PDB_id_RING        object
 1   NodeId1_RING       object
 2   Interaction_RING   object
 3   NodeId2_RING       object
 4   interacao_RING     object
 5   subinteracao_RING  object
dtypes: object(6)
memory usage: 4.1+ GB


In [None]:
df_RING_edge_02["NodeId1_RING"].value_counts()

D:1:_:DG        296202
A:404:_:OGA     250262
A:405:_:EDO     155489
A:407:_:EDO     152354
A:1101:_:MLI    150314
                 ...  
C:1342:_:GLY         1
Q:77:_:ILE           1
X:108:_:LEU          1
M:3:_:SER            1
J:259:_:LYS          1
Name: NodeId1_RING, Length: 253839, dtype: int64

In [None]:
# Node ids look like "A:307:_:3Y2"; the fourth colon-separated field is the residue/ligand name.
df_RING_edge_02["Residue1_RING"] = [node_id.split(":")[3] for node_id in df_RING_edge_02["NodeId1_RING"]]

In [None]:
#Converting to standard: Ala, Arg, Asn,...
# Upper-case three-letter amino-acid codes as they appear in the node ids.
# Any other name (ligands, nucleotides, ions, ...) is left untouched.
Amin = ['ALA', 'ARG', 'ASN', 'ASP', 'CYS', 'GLN', 'GLU', 'GLY', 'HIS', 'ILE', 'LEU', 'LYS',
      'MET', 'PHE', 'PRO', 'PYL', 'SER', 'SEC', 'THR', 'TRP', 'TYR', 'VAL', 'ASX', 'GLX']

# A frozenset gives constant-time membership tests; the callback runs once per
# row, so scanning the 24-item list each time is avoidable work.
amin_codes = frozenset(Amin)
df_RING_edge_02["Residue1_RING"] = df_RING_edge_02["Residue1_RING"].apply(
    lambda code: code.capitalize() if code in amin_codes else code)

In [None]:
df_RING_edge_02["Residue1_RING"].value_counts()

EDO    4658861
GOL    2899127
DG     2607258
Leu    2320503
SO4    2255366
NAG    2210761
Phe    2082717
DC     1973316
Arg    1724259
Tyr    1688755
DA     1686300
DT     1623394
HEM    1495321
Val    1398199
Ile    1287072
Lys    1062053
Glu    1003681
Trp     891313
Ala     886566
Gln     832812
Thr     829189
Pro     799028
Ser     783222
Asn     778752
Asp     757441
MSE     719271
His     714339
Met     674409
PEG     623162
DMS     622078
ZN      619729
FAD     567837
NAP     520186
Cys     501699
Gly     496771
GDP     495149
NAD     468081
PTR     461309
PO4     426995
GNP     423053
SAH     409832
U       398212
SEP     395957
ADP     395505
TPO     359179
ACT     342712
OGA     325043
CL      321849
G       316301
MG      298458
A       297212
MES     282432
PGE     242074
SAM     239094
AR6     232973
DOD     215579
GTP     215331
C       214896
PLP     212083
MLI     203469
ANP     202301
PG4     200266
CSO     200012
CA      195079
CME     193684
NAI     192726
B7G     18

In [None]:
# Second colon-separated field of the node id ("A:307:_:3Y2" -> "307"): position in the chain.
df_RING_edge_02["Residue1_pos_RING"] = [node_id.split(":")[1] for node_id in df_RING_edge_02["NodeId1_RING"]]

In [None]:
# First colon-separated field of the node id ("A:307:_:3Y2" -> "A"): chain id, upper-cased.
df_RING_edge_02["Residue1_chain_RING"] = [node_id.split(":")[0].upper() for node_id in df_RING_edge_02["NodeId1_RING"]]

In [None]:
df_RING_edge_02["Residue1_chain_RING"].value_counts()

A    42761127
B    18590498
C     7955433
D     6663432
E     2617689
H     2002586
F     1994468
T     1555648
G     1415016
P     1394074
I     1194939
J      968868
L      944339
K      312637
N      216694
M      182312
V      167056
X      136097
S      128195
R      124515
O      118441
Q      118327
U      113181
Y       98935
W       64688
Z       64034
1        4740
2        2829
4        2698
3        1908
0        1463
5         158
Name: Residue1_chain_RING, dtype: int64

In [None]:
df_RING_edge_02.head(25)

Unnamed: 0,PDB_id_RING,NodeId1_RING,Interaction_RING,NodeId2_RING,interacao_RING,subinteracao_RING,Residue1_RING,Residue1_pos_RING,Residue1_chain_RING
0,4X6J,A:307:_:3Y2,IAC:LIG_MC,A:66:_:GLY,IAC,LIG_MC,3Y2,307,A
1,4X6J,A:307:_:3Y2,IAC:LIG_MC,A:66:_:GLY,IAC,LIG_MC,3Y2,307,A
2,4X6J,A:307:_:3Y2,IAC:LIG_MC,A:66:_:GLY,IAC,LIG_MC,3Y2,307,A
3,4X6J,A:307:_:3Y2,IAC:LIG_MC,A:66:_:GLY,IAC,LIG_MC,3Y2,307,A
4,4X6J,A:307:_:3Y2,IAC:LIG_MC,A:66:_:GLY,IAC,LIG_MC,3Y2,307,A
5,4X6J,A:307:_:3Y2,IAC:LIG_MC,A:66:_:GLY,IAC,LIG_MC,3Y2,307,A
6,4X6J,A:307:_:3Y2,IAC:LIG_MC,A:66:_:GLY,IAC,LIG_MC,3Y2,307,A
7,4X6J,A:307:_:3Y2,IAC:LIG_SC,A:66:_:GLY,IAC,LIG_SC,3Y2,307,A
8,4X6J,A:307:_:3Y2,IAC:LIG_MC,A:66:_:GLY,IAC,LIG_MC,3Y2,307,A
9,4X6J,A:307:_:3Y2,IAC:LIG_MC,A:66:_:GLY,IAC,LIG_MC,3Y2,307,A


### 1.2.8 Generating an intermediate file with the residue, position and chain extracted from the *NodeId1* attribute of the *edgesDB_02.txt* database

In [None]:
df_RING_edge_02.to_csv("drive/My Drive/ProcessaNovaBase/TrataArqsRING/edgesDB_02_sel_proc_PDB_Res1.csv",sep='\t',index=False)

### 1.2.9 Extracting the residue, its position and chain from the *NodeId2* attribute

The target residue, its position and chain contained in **NodeId2** will be extracted

**Residue2_RING**: target node of the edge

**Residue2_pos_RING**: the position of the node

**Residue2_chain_RING**: the chain where the node is located

In [None]:
# Raise the display limits so wide or long frames are shown without truncation
import pandas as pd

pd.options.display.max_columns = 7000
pd.options.display.max_rows = 90000

In [None]:
import pandas as pd
# Residue names are free text and may collide with pandas' default NA markers
# (e.g. "NA", the PDB name of the sodium ion). With the defaults such values are
# read back as NaN, and the later groupby on Residue1_RING silently drops them.
# Only empty fields are treated as missing here.
df_RING_edge_02 = pd.read_csv("drive/My Drive/ProcessaNovaBase/TrataArqsRING/edgesDB_02_sel_proc_PDB_Res1.csv",
                              delimiter='\t', keep_default_na=False, na_values=[''])

In [None]:
df_RING_edge_02.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91917025 entries, 0 to 91917024
Data columns (total 9 columns):
 #   Column               Dtype 
---  ------               ----- 
 0   PDB_id_RING          object
 1   NodeId1_RING         object
 2   Interaction_RING     object
 3   NodeId2_RING         object
 4   interacao_RING       object
 5   subinteracao_RING    object
 6   Residue1_RING        object
 7   Residue1_pos_RING    int64 
 8   Residue1_chain_RING  object
dtypes: int64(1), object(8)
memory usage: 6.2+ GB


In [None]:
df_RING_edge_02["NodeId2_RING"].value_counts()

A:234:_:LYS     89659
A:146:_:ILE     84306
A:198:_:LEU     80204
A:92:_:LEU      66990
A:283:_:ARG     61221
                ...  
A:2249:_:ARG        1
F:728:_:GLY         1
H:2068:_:GLN        1
C:185:A:PRO         1
F:448:_:PHE         1
Name: NodeId2_RING, Length: 240583, dtype: int64

In [None]:
# Node ids look like "A:66:_:GLY"; the fourth colon-separated field is the residue/ligand name.
df_RING_edge_02["Residue2_RING"] = [node_id.split(":")[3] for node_id in df_RING_edge_02["NodeId2_RING"]]

In [None]:
#Converting to the standard form: Ala, Arg, Asn,...
# Upper-case three-letter amino-acid codes as they appear in the node ids.
# Any other name (ligands, nucleotides, ions, ...) is left untouched.
Amin = ['ALA', 'ARG', 'ASN', 'ASP', 'CYS', 'GLN', 'GLU', 'GLY', 'HIS', 'ILE', 'LEU', 'LYS',
      'MET', 'PHE', 'PRO', 'PYL', 'SER', 'SEC', 'THR', 'TRP', 'TYR', 'VAL', 'ASX', 'GLX']

# A frozenset gives constant-time membership tests; the callback runs once per
# row, so scanning the 24-item list each time is avoidable work.
amin_codes = frozenset(Amin)
df_RING_edge_02["Residue2_RING"] = df_RING_edge_02["Residue2_RING"].apply(
    lambda code: code.capitalize() if code in amin_codes else code)

In [None]:
# Second colon-separated field of the node id ("A:66:_:GLY" -> "66"): position in the chain.
df_RING_edge_02["Residue2_pos_RING"] = [node_id.split(":")[1] for node_id in df_RING_edge_02["NodeId2_RING"]]

In [None]:
df_RING_edge_02.to_csv("drive/My Drive/ProcessaNovaBase/TrataArqsRING/edgesDB_02_sel_proc_PDB_Res2.csv",sep='\t',index=False)

In [None]:
# Raise the display limits so wide or long frames are shown without truncation
import pandas as pd

pd.options.display.max_columns = 7000
pd.options.display.max_rows = 90000

In [None]:
import pandas as pd
# Residue names are free text and may collide with pandas' default NA markers
# (e.g. "NA", the PDB name of the sodium ion). With the defaults such values are
# read back as NaN instead of the literal residue name.
# Only empty fields are treated as missing here.
df_RING_edge_02 = pd.read_csv("drive/My Drive/ProcessaNovaBase/TrataArqsRING/edgesDB_02_sel_proc_PDB_Res2.csv",
                              delimiter='\t', keep_default_na=False, na_values=[''])

In [None]:
df_RING_edge_02.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91917025 entries, 0 to 91917024
Data columns (total 11 columns):
 #   Column               Dtype 
---  ------               ----- 
 0   PDB_id_RING          object
 1   NodeId1_RING         object
 2   Interaction_RING     object
 3   NodeId2_RING         object
 4   interacao_RING       object
 5   subinteracao_RING    object
 6   Residue1_RING        object
 7   Residue1_pos_RING    int64 
 8   Residue1_chain_RING  object
 9   Residue2_RING        object
 10  Residue2_pos_RING    int64 
dtypes: int64(2), object(9)
memory usage: 7.5+ GB


In [None]:
# First colon-separated field of the node id ("A:66:_:GLY" -> "A"): chain id, upper-cased.
df_RING_edge_02["Residue2_chain_RING"] = [node_id.split(":")[0].upper() for node_id in df_RING_edge_02["NodeId2_RING"]]

In [None]:
df_RING_edge_02["Residue2_chain_RING"].value_counts()

A    48388023
B    19039465
C     6821214
D     5383707
H     1934679
E     1797074
I     1513123
J     1410988
F     1388661
G     1082263
L      882974
T      503577
P      315924
K      253937
N      151331
M      130847
S      119053
O      118119
X      116705
V       99362
U       92818
Q       92168
R       81669
Y       77856
Z       62786
W       44908
2        6141
1        5389
3         995
0         682
5         378
4         209
Name: Residue2_chain_RING, dtype: int64

In [None]:
df_RING_edge_02.head(25)

Unnamed: 0,PDB_id_RING,NodeId1_RING,Interaction_RING,NodeId2_RING,interacao_RING,subinteracao_RING,Residue1_RING,Residue1_pos_RING,Residue1_chain_RING,Residue2_RING,Residue2_pos_RING,Residue2_chain_RING
0,4X6J,A:307:_:3Y2,IAC:LIG_MC,A:66:_:GLY,IAC,LIG_MC,3Y2,307,A,Gly,66,A
1,4X6J,A:307:_:3Y2,IAC:LIG_MC,A:66:_:GLY,IAC,LIG_MC,3Y2,307,A,Gly,66,A
2,4X6J,A:307:_:3Y2,IAC:LIG_MC,A:66:_:GLY,IAC,LIG_MC,3Y2,307,A,Gly,66,A
3,4X6J,A:307:_:3Y2,IAC:LIG_MC,A:66:_:GLY,IAC,LIG_MC,3Y2,307,A,Gly,66,A
4,4X6J,A:307:_:3Y2,IAC:LIG_MC,A:66:_:GLY,IAC,LIG_MC,3Y2,307,A,Gly,66,A
5,4X6J,A:307:_:3Y2,IAC:LIG_MC,A:66:_:GLY,IAC,LIG_MC,3Y2,307,A,Gly,66,A
6,4X6J,A:307:_:3Y2,IAC:LIG_MC,A:66:_:GLY,IAC,LIG_MC,3Y2,307,A,Gly,66,A
7,4X6J,A:307:_:3Y2,IAC:LIG_SC,A:66:_:GLY,IAC,LIG_SC,3Y2,307,A,Gly,66,A
8,4X6J,A:307:_:3Y2,IAC:LIG_MC,A:66:_:GLY,IAC,LIG_MC,3Y2,307,A,Gly,66,A
9,4X6J,A:307:_:3Y2,IAC:LIG_MC,A:66:_:GLY,IAC,LIG_MC,3Y2,307,A,Gly,66,A


In [None]:
df_RING_edge_02.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91917025 entries, 0 to 91917024
Data columns (total 12 columns):
 #   Column               Dtype 
---  ------               ----- 
 0   PDB_id_RING          object
 1   NodeId1_RING         object
 2   Interaction_RING     object
 3   NodeId2_RING         object
 4   interacao_RING       object
 5   subinteracao_RING    object
 6   Residue1_RING        object
 7   Residue1_pos_RING    int64 
 8   Residue1_chain_RING  object
 9   Residue2_RING        object
 10  Residue2_pos_RING    int64 
 11  Residue2_chain_RING  object
dtypes: int64(2), object(10)
memory usage: 8.2+ GB


In [None]:

def categories_column(df, cols=('PDB_id_RING', 'Residue1_RING', 'Residue2_RING', 'interacao_RING')):
    """Print the value frequencies of selected columns.

    For every column name in ``cols`` the function prints the column name
    followed by a dict mapping each distinct value to its number of rows
    (most frequent first, as returned by ``value_counts``), then a blank
    separator.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column listed in ``cols``.
    cols : iterable of str, optional
        Columns to summarise. Defaults to the four categorical RING columns
        this notebook inspects.

    Returns
    -------
    None
        The summary is written to standard output only.

    Raises
    ------
    KeyError
        If a name in ``cols`` is not a column of ``df``.
    """
    for col in cols:
        counts = df[col].value_counts().to_dict()
        print(col, counts)
        print('\n')

categories_column(df_RING_edge_02)

PDB_id_RING {'6L9Z': 161519, '5JBY': 104796, '6EN6': 101057, '6EN5': 93647, '5AOX': 92590, '5NHG': 89219, '6JR1': 82405, '6LLC': 81905, '6Z86': 77630, '6JR0': 77515, '6E21': 77174, '6ZEE': 73529, '5JRG': 72332, '6MO6': 71654, '6K1K': 71545, '6JXD': 71089, '6R7D': 70939, '6IQ4': 70530, '6IPU': 69880, '5XF3': 69592, '5AV6': 69374, '5B1L': 69197, '5AV8': 69109, '5AV9': 69095, '5AV5': 69087, '5AVC': 68639, '5AVB': 68584, '6KVD': 68490, '5Y0C': 68410, '5Y0D': 68064, '5OMX': 68002, '5B32': 67890, '5B2J': 67857, '6JOU': 67841, '5B0Y': 67691, '5X7X': 67627, '4Z66': 66990, '5B1M': 66843, '5Z30': 66671, '6V2K': 66471, '5B31': 66236, '5ZBX': 65806, '5B0Z': 65320, '4XZQ': 65317, '6LE9': 64592, '6L9H': 64397, '6VNP': 63564, '6KE9': 63439, '6NIG': 63147, '5LF1': 62943, '5LF3': 61800, '6W89': 60699, '5LF7': 60233, '5LF0': 59685, '6W8B': 59587, '5UYS': 59099, '5LF4': 58993, '5T00': 57946, '5HHD': 57334, '6BB1': 56859, '5LEY': 55660, '5LEZ': 55453, '6C5A': 55300, '6BC9': 54928, '5LHD': 54925, '6Q0D': 5

In [None]:
import pandas as pd
# NOTE: this file is written by the to_csv cell of section 1.2.10 further down,
# so on a fresh runtime that cell has to be executed before this one.
# Residue names are free text and may collide with pandas' default NA markers
# (e.g. "NA", the PDB name of the sodium ion); only empty fields are treated as
# missing so such names are read back literally.
df_RING_edge_02 = pd.read_csv("drive/My Drive/ProcessaNovaBase/TrataArqsRING/edgesDB_02_sel_proc_PDB_Res1_Res2.csv",
                              delimiter='\t', keep_default_na=False, na_values=[''])

In [None]:
df_RING_edge_02.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91917025 entries, 0 to 91917024
Data columns (total 12 columns):
 #   Column               Dtype 
---  ------               ----- 
 0   PDB_id_RING          object
 1   NodeId1_RING         object
 2   Interaction_RING     object
 3   NodeId2_RING         object
 4   interacao_RING       object
 5   subinteracao_RING    object
 6   Residue1_RING        object
 7   Residue1_pos_RING    int64 
 8   Residue1_chain_RING  object
 9   Residue2_RING        object
 10  Residue2_pos_RING    int64 
 11  Residue2_chain_RING  object
dtypes: int64(2), object(10)
memory usage: 8.2+ GB


In [None]:
# Residue1_pos_RING is int64 after the CSV reload, so it is compared with an
# integer literal rather than the string "330" (no reliance on implicit coercion).
df_RING_edge_02.query('PDB_id_RING == "9JDW" & Residue1_RING == "Phe" & Residue1_pos_RING == 330 & Residue1_chain_RING == "A"')

Unnamed: 0,PDB_id_RING,NodeId1_RING,Interaction_RING,NodeId2_RING,interacao_RING,subinteracao_RING,Residue1_RING,Residue1_pos_RING,Residue1_chain_RING,Residue2_RING,Residue2_pos_RING,Residue2_chain_RING
91916830,9JDW,A:330:_:PHE,HBOND:MC_MC,A:333:_:ALA,HBOND,MC_MC,Phe,330,A,Ala,333,A
91916831,9JDW,A:330:_:PHE,HBOND:MC_MC,A:334:_:GLY,HBOND,MC_MC,Phe,330,A,Gly,334,A
91916832,9JDW,A:330:_:PHE,HBOND:MC_MC,A:335:_:TRP,HBOND,MC_MC,Phe,330,A,Trp,335,A
91916833,9JDW,A:330:_:PHE,PIPISTACK:SC_SC,A:335:_:TRP,PIPISTACK,SC_SC,Phe,330,A,Trp,335,A
91916834,9JDW,A:330:_:PHE,VDW:SC_SC,A:335:_:TRP,VDW,SC_SC,Phe,330,A,Trp,335,A
91916835,9JDW,A:330:_:PHE,VDW:SC_SC,A:335:_:TRP,VDW,SC_SC,Phe,330,A,Trp,335,A
91916836,9JDW,A:330:_:PHE,VDW:SC_SC,A:335:_:TRP,VDW,SC_SC,Phe,330,A,Trp,335,A
91916837,9JDW,A:330:_:PHE,VDW:SC_SC,A:335:_:TRP,VDW,SC_SC,Phe,330,A,Trp,335,A


In [None]:
# Residue2_pos_RING is int64 after the CSV reload, so it is compared with an
# integer literal rather than the string "330" (no reliance on implicit coercion).
df_RING_edge_02.query('PDB_id_RING == "9JDW" & Residue2_RING == "Phe" & Residue2_pos_RING == 330 & Residue2_chain_RING == "A"')

Unnamed: 0,PDB_id_RING,NodeId1_RING,Interaction_RING,NodeId2_RING,interacao_RING,subinteracao_RING,Residue1_RING,Residue1_pos_RING,Residue1_chain_RING,Residue2_RING,Residue2_pos_RING,Residue2_chain_RING
91916644,9JDW,A:266:_:PHE,PIPISTACK:SC_SC,A:330:_:PHE,PIPISTACK,SC_SC,Phe,266,A,Phe,330,A
91916645,9JDW,A:266:_:PHE,VDW:SC_SC,A:330:_:PHE,VDW,SC_SC,Phe,266,A,Phe,330,A
91916646,9JDW,A:266:_:PHE,VDW:SC_SC,A:330:_:PHE,VDW,SC_SC,Phe,266,A,Phe,330,A
91916647,9JDW,A:266:_:PHE,VDW:SC_SC,A:330:_:PHE,VDW,SC_SC,Phe,266,A,Phe,330,A
91916648,9JDW,A:266:_:PHE,VDW:SC_SC,A:330:_:PHE,VDW,SC_SC,Phe,266,A,Phe,330,A
91916766,9JDW,A:308:_:PHE,PIPISTACK:SC_SC,A:330:_:PHE,PIPISTACK,SC_SC,Phe,308,A,Phe,330,A
91916767,9JDW,A:308:_:PHE,VDW:SC_SC,A:330:_:PHE,VDW,SC_SC,Phe,308,A,Phe,330,A
91916768,9JDW,A:308:_:PHE,VDW:SC_SC,A:330:_:PHE,VDW,SC_SC,Phe,308,A,Phe,330,A
91916769,9JDW,A:308:_:PHE,VDW:SC_SC,A:330:_:PHE,VDW,SC_SC,Phe,308,A,Phe,330,A
91916770,9JDW,A:308:_:PHE,VDW:SC_SC,A:330:_:PHE,VDW,SC_SC,Phe,308,A,Phe,330,A


### 1.2.10 Generating an intermediate file with the residue, position and chain extracted from the *NodeId2* attribute of the *edgesDB_02.txt* database

In [None]:
df_RING_edge_02.to_csv("drive/My Drive/ProcessaNovaBase/TrataArqsRING/edgesDB_02_sel_proc_PDB_Res1_Res2.csv",sep='\t',index=False)

###1.2.11 Processing interaction types of Source nodes of edges

In [None]:
# Raise the display limits so wide or long frames are shown without truncation
import pandas as pd

pd.options.display.max_columns = 7000
pd.options.display.max_rows = 90000

In [None]:
import pandas as pd
# Residue names are free text and may collide with pandas' default NA markers
# (e.g. "NA", the PDB name of the sodium ion). With the defaults such values are
# read back as NaN, and the groupby on Residue1_RING below silently drops them.
# Only empty fields are treated as missing here.
df_RING_edge_02 = pd.read_csv("drive/My Drive/ProcessaNovaBase/TrataArqsRING/edgesDB_02_sel_proc_PDB_Res1_Res2.csv",
                              delimiter='\t', keep_default_na=False, na_values=[''])

In [None]:
df_RING_edge_02.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91917025 entries, 0 to 91917024
Data columns (total 12 columns):
 #   Column               Dtype 
---  ------               ----- 
 0   PDB_id_RING          object
 1   NodeId1_RING         object
 2   Interaction_RING     object
 3   NodeId2_RING         object
 4   interacao_RING       object
 5   subinteracao_RING    object
 6   Residue1_RING        object
 7   Residue1_pos_RING    int64 
 8   Residue1_chain_RING  object
 9   Residue2_RING        object
 10  Residue2_pos_RING    int64 
 11  Residue2_chain_RING  object
dtypes: int64(2), object(10)
memory usage: 8.2+ GB


In [None]:
df_RING_edge_02.head(100)

Unnamed: 0,PDB_id_RING,NodeId1_RING,Interaction_RING,NodeId2_RING,interacao_RING,subinteracao_RING,Residue1_RING,Residue1_pos_RING,Residue1_chain_RING,Residue2_RING,Residue2_pos_RING,Residue2_chain_RING
0,4X6J,A:307:_:3Y2,IAC:LIG_MC,A:66:_:GLY,IAC,LIG_MC,3Y2,307,A,Gly,66,A
1,4X6J,A:307:_:3Y2,IAC:LIG_MC,A:66:_:GLY,IAC,LIG_MC,3Y2,307,A,Gly,66,A
2,4X6J,A:307:_:3Y2,IAC:LIG_MC,A:66:_:GLY,IAC,LIG_MC,3Y2,307,A,Gly,66,A
3,4X6J,A:307:_:3Y2,IAC:LIG_MC,A:66:_:GLY,IAC,LIG_MC,3Y2,307,A,Gly,66,A
4,4X6J,A:307:_:3Y2,IAC:LIG_MC,A:66:_:GLY,IAC,LIG_MC,3Y2,307,A,Gly,66,A
5,4X6J,A:307:_:3Y2,IAC:LIG_MC,A:66:_:GLY,IAC,LIG_MC,3Y2,307,A,Gly,66,A
6,4X6J,A:307:_:3Y2,IAC:LIG_MC,A:66:_:GLY,IAC,LIG_MC,3Y2,307,A,Gly,66,A
7,4X6J,A:307:_:3Y2,IAC:LIG_SC,A:66:_:GLY,IAC,LIG_SC,3Y2,307,A,Gly,66,A
8,4X6J,A:307:_:3Y2,IAC:LIG_MC,A:66:_:GLY,IAC,LIG_MC,3Y2,307,A,Gly,66,A
9,4X6J,A:307:_:3Y2,IAC:LIG_MC,A:66:_:GLY,IAC,LIG_MC,3Y2,307,A,Gly,66,A


In [None]:
df_RING_edge_02.query('PDB_id_RING == "10GS" & NodeId1_RING == "A:100:_:ARG"')

Unnamed: 0,PDB_id_RING,NodeId1_RING,Interaction_RING,NodeId2_RING,interacao_RING,subinteracao_RING,Residue1_RING,Residue1_pos_RING,Residue1_chain_RING,Residue2_RING,Residue2_pos_RING,Residue2_chain_RING


Let's group the records of the edges' source nodes by **PDB_id_RING**, **Residue1_RING**, **Residue1_pos_RING** and **Residue1_chain_RING**, and count how many interactions of each type they have, generating the **df_groupnode1** dataframe.

Solution based on: https://stackoverflow.com/questions/29836477/pandas-create-new-column-with-count-from-groupby

In [None]:
df_RING_edge_02.groupby(['PDB_id_RING',	'NodeId1_RING', 'interacao_RING'])["Interaction_RING"].count().reset_index(name="count")

Unnamed: 0,PDB_id_RING,NodeId1_RING,interacao_RING,count
0,4X6J,A:100:_:PRO,HBOND,2
1,4X6J,A:103:_:LYS,VDW,1
2,4X6J,A:105:_:ALA,VDW,1
3,4X6J,A:107:_:CYS,VDW,1
4,4X6J,A:108:_:ARG,HBOND,1
...,...,...,...,...
7408156,9JDW,A:97:_:ALA,VDW,2
7408157,9JDW,A:98:_:ASN,HBOND,2
7408158,9JDW,A:98:_:ASN,VDW,5
7408159,9JDW,A:99:_:THR,HBOND,1


In [None]:
# One row per (structure, source node, interaction type) with the number of edges of that type.
df_groupnode1_0 = (
    df_RING_edge_02
    .groupby(['PDB_id_RING', 'Residue1_RING', 'Residue1_pos_RING', 'Residue1_chain_RING', 'interacao_RING'])
    ["Interaction_RING"]
    .count()
    .reset_index(name="count")
)

In [None]:
df_groupnode1_0.head(10)

Unnamed: 0,PDB_id_RING,Residue1_RING,Residue1_pos_RING,Residue1_chain_RING,interacao_RING,count
0,4X6J,3Y2,307,A,IAC,963
1,4X6J,Ala,27,A,HBOND,2
2,4X6J,Ala,27,A,VDW,3
3,4X6J,Ala,33,A,HBOND,2
4,4X6J,Ala,33,A,VDW,2
5,4X6J,Ala,71,A,HBOND,2
6,4X6J,Ala,105,A,VDW,1
7,4X6J,Ala,120,A,HBOND,2
8,4X6J,Ala,124,A,HBOND,2
9,4X6J,Ala,134,A,VDW,2


In [None]:
# One row per (structure, source node, full "TYPE:SUBTYPE" label) with the number of edges carrying that label.
df_groupnode1 = (
    df_RING_edge_02
    .groupby(['PDB_id_RING', 'Residue1_RING', 'Residue1_pos_RING', 'Residue1_chain_RING', 'Interaction_RING'])
    ["interacao_RING"]
    .count()
    .reset_index(name="count")
)

In [None]:
df_groupnode1.head(10)

Unnamed: 0,PDB_id_RING,Residue1_RING,Residue1_pos_RING,Residue1_chain_RING,Interaction_RING,count
0,4X6J,3Y2,307,A,IAC:LIG_MC,307
1,4X6J,3Y2,307,A,IAC:LIG_SC,656
2,4X6J,Ala,27,A,HBOND:MC_MC,2
3,4X6J,Ala,27,A,VDW:SC_SC,3
4,4X6J,Ala,33,A,HBOND:MC_MC,2
5,4X6J,Ala,33,A,VDW:MC_SC,1
6,4X6J,Ala,33,A,VDW:SC_SC,1
7,4X6J,Ala,71,A,HBOND:MC_MC,2
8,4X6J,Ala,105,A,VDW:SC_SC,1
9,4X6J,Ala,120,A,HBOND:MC_MC,2


In [None]:
# Residue1_pos_RING is an int64 column in df_groupnode1, so it is compared with
# an integer literal rather than the string "7".
df_groupnode1.query('PDB_id_RING == "10GS" & Residue1_RING == "Tyr" & Residue1_pos_RING == 7 & Residue1_chain_RING == "A"')

Unnamed: 0,PDB_id_RING,Residue1_RING,Residue1_pos_RING,Residue1_chain_RING,Interaction_RING,count


In [None]:
df_groupnode1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9306180 entries, 0 to 9306179
Data columns (total 6 columns):
 #   Column               Dtype 
---  ------               ----- 
 0   PDB_id_RING          object
 1   Residue1_RING        object
 2   Residue1_pos_RING    int64 
 3   Residue1_chain_RING  object
 4   Interaction_RING     object
 5   count                int64 
dtypes: int64(2), object(4)
memory usage: 426.0+ MB


Creating a Dataframe that will store the processing of the **df_groupnode1** dataframe.
The Dataframe will have the following attributes:
- **PDB_id_RING**: PDB              
- **Node_RING**: the node can be a residue or a ligand        
- **Node_pos_RING**: position of the node in the chain
- **Node_chain_RING**: chain       
- **Node_type**: node type: *source* (S) or *target* (T)              
- **Inter_Lig_tot**: total residue-ligand interactions            
- **Inter_Res_tot**: total residue-residue interactions        
- **Inter_IAC_Lig_tot**: total interactions of type IAC with ligand
- **Inter_VDW_Lig_tot**: total interactions of type VDW with ligand
- **Inter_HBOND_Lig_tot**: total interactions of type HBOND with ligand       
- **Inter_PIPISTACK_Lig_tot**: total interactions of type PIPISTACK with ligand   
- **Inter_IONIC_Lig_tot**: total interactions of type IONIC with ligand      
- **Inter_SSBOND_Lig_tot**: total interactions of type SSBOND with ligand      
- **Inter_PICATION_Lig_tot**: total interactions of type PICATION with ligand    
- **Inter_IAC_Res_tot**: total interactions of type IAC between residues
- **Inter_VDW_Res_tot**: total interactions of type VDW between residues
- **Inter_HBOND_Res_tot**: total interactions of type HBOND between residues               
- **Inter_PIPISTACK_Res_tot**: total interactions of type PIPISTACK between residues           
- **Inter_IONIC_Res_tot**: total interactions of type IONIC between residues       
- **Inter_SSBOND_Res_tot**: total interactions of type SSBOND between residues              
- **Inter_PICATION_Res_tot**: total interactions of type PICATION between residues         


In [None]:
# Node identification and grand totals first, then one "Inter_<TYPE>_<Lig|Res>_tot"
# column per interaction type: all ligand columns, followed by all residue columns.
COLUMN_NAMES = (
    ['PDB_id_RING', 'Node_RING', 'Node_pos_RING', 'Node_chain_RING', 'Node_type',
     'Inter_Lig_tot', 'Inter_Res_tot']
    + [f'Inter_{kind}_{partner}_tot'
       for partner in ('Lig', 'Res')
       for kind in ('IAC', 'VDW', 'HBOND', 'PIPISTACK', 'IONIC', 'SSBOND', 'PICATION')]
)
# Empty frame with the final schema; it is filled column by column further down.
df_proc_RING_1 = pd.DataFrame(columns=COLUMN_NAMES)

In [None]:
df_proc_RING_1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Data columns (total 21 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   PDB_id_RING              0 non-null      object
 1   Node_RING                0 non-null      object
 2   Node_pos_RING            0 non-null      object
 3   Node_chain_RING          0 non-null      object
 4   Node_type                0 non-null      object
 5   Inter_Lig_tot            0 non-null      object
 6   Inter_Res_tot            0 non-null      object
 7   Inter_IAC_Lig_tot        0 non-null      object
 8   Inter_VDW_Lig_tot        0 non-null      object
 9   Inter_HBOND_Lig_tot      0 non-null      object
 10  Inter_PIPISTACK_Lig_tot  0 non-null      object
 11  Inter_IONIC_Lig_tot      0 non-null      object
 12  Inter_SSBOND_Lig_tot     0 non-null      object
 13  Inter_PICATION_Lig_tot   0 non-null      object
 14  Inter_IAC_Res_tot        0 non-null      object
 15  Inter_

Processing the source nodes of the edges (the nodes in the **Residue1_RING** attribute), as stored in the **df_groupnode1** dataframe

In [None]:

# Module-level accumulators, one list per output column. process_reg_group()
# appends one entry per source node to each of them and returns these same
# list objects, so re-running this cell resets them.
l_PDB_id_RING = []
l_Node_RING = []
l__Node_pos_RING = []
l_Node_chain_RING = []
l_Node_type = []
l_Inter_Lig_tot = []
l_Inter_Res_tot = []
l_Inter_IAC_Lig_tot = []
l_Inter_VDW_Lig_tot = []
l_Inter_HBOND_Lig_tot = []
l_Inter_PIPISTACK_Lig_tot = []
l_Inter_IONIC_Lig_tot = []
l_Inter_SSBOND_Lig_tot = []
l_Inter_PICATION_Lig_tot = []
l_Inter_IAC_Res_tot = []
l_Inter_VDW_Res_tot = []
l_Inter_HBOND_Res_tot = []
l_Inter_PIPISTACK_Res_tot = []
l_Inter_IONIC_Res_tot = []
l_Inter_SSBOND_Res_tot = []
l_Inter_PICATION_Res_tot = []

# Second set of (initially empty) lists; process_reg_group() does not touch them.
l_PDB = []
l_Node = []
l_pos = []
l_chain = []
l_type = []
l_lig = []
l_res = []
l_IAC_L = []
l_VDW_L = []
l_HBOND_L = []
l_PIPISTACK_L = []
l_IONIC_L = []
l_SSBOND_L = []
l_PICATION_L = []
l_IAC_R = []
l_VDW_R = []
l_HBOND_R = []
l_PIPISTACK_R = []
l_IONIC_R = []
l_SSBOND_R = []
l_PICATION_R = []

def process_reg_group(df):
  """Collapse per-interaction counts into one record per source node.

  ``df`` must provide the columns ``PDB_id_RING``, ``Residue1_RING``,
  ``Residue1_pos_RING``, ``Residue1_chain_RING``, ``Interaction_RING``
  ("TYPE:SUBTYPE") and ``count``. Rows belonging to the same node (same PDB
  id, residue, position and chain) must be contiguous; a change in any of the
  four key values starts a new node.

  For every node the ``count`` values are summed into:
    * a ligand total and a residue-residue total (a row is a ligand
      interaction when its ``Interaction_RING`` contains "LIG");
    * one total per interaction type (IAC, VDW, HBOND, PIPISTACK, IONIC,
      SSBOND, PICATION) for each of the two groups. Rows of any other type
      only contribute to the ligand/residue totals.

  One entry per node is appended to each module-level ``l_*`` accumulator
  list, with node type "S" (source). The lists are not cleared here, so
  repeated calls keep accumulating.

  Returns a tuple of the 21 accumulator lists, in this order: PDB id, node,
  position, chain, node type, ligand total, residue total, the seven ligand
  totals per type, and the seven residue totals per type.

  Differences from the previous implementation: the last node of ``df`` is now
  emitted too (it used to be dropped because nothing was flushed after the
  loop), and the first row is recognised by position instead of by the index
  label 0.
  """
  kinds = ("IAC", "VDW", "HBOND", "PIPISTACK", "IONIC", "SSBOND", "PICATION")
  slot_of = {kind: slot for slot, kind in enumerate(kinds)}
  # Output lists in the same order as `kinds`, so slot k of the running
  # counters goes to list k.
  lig_columns = (l_Inter_IAC_Lig_tot, l_Inter_VDW_Lig_tot, l_Inter_HBOND_Lig_tot,
                 l_Inter_PIPISTACK_Lig_tot, l_Inter_IONIC_Lig_tot, l_Inter_SSBOND_Lig_tot,
                 l_Inter_PICATION_Lig_tot)
  res_columns = (l_Inter_IAC_Res_tot, l_Inter_VDW_Res_tot, l_Inter_HBOND_Res_tot,
                 l_Inter_PIPISTACK_Res_tot, l_Inter_IONIC_Res_tot, l_Inter_SSBOND_Res_tot,
                 l_Inter_PICATION_Res_tot)

  def flush(node_key, lig_total, res_total, lig_counts, res_counts):
    # Append the finished node as one entry in every accumulator list.
    l_PDB_id_RING.append(node_key[0])
    l_Node_RING.append(node_key[1])
    l__Node_pos_RING.append(node_key[2])
    l_Node_chain_RING.append(node_key[3])
    l_Node_type.append("S")
    l_Inter_Lig_tot.append(lig_total)
    l_Inter_Res_tot.append(res_total)
    for column, value in zip(lig_columns, lig_counts):
      column.append(value)
    for column, value in zip(res_columns, res_counts):
      column.append(value)

  current_key = None          # key of the node being accumulated; None before the first row
  lig_total = 0
  res_total = 0
  lig_counts = [0] * len(kinds)
  res_counts = [0] * len(kinds)

  for row in df.itertuples():
    row_key = (row.PDB_id_RING, row.Residue1_RING, row.Residue1_pos_RING, row.Residue1_chain_RING)
    if current_key is not None and row_key != current_key:
      # The key changed: emit the previous node and start fresh counters.
      flush(current_key, lig_total, res_total, lig_counts, res_counts)
      lig_total = 0
      res_total = 0
      lig_counts = [0] * len(kinds)
      res_counts = [0] * len(kinds)
    current_key = row_key

    # Interaction type is the text before the first colon; None for unknown types.
    slot = slot_of.get(row.Interaction_RING.split(":")[0])
    if "LIG" in row.Interaction_RING:
      lig_total += row.count
      if slot is not None:
        lig_counts[slot] += row.count
    else:    # residue - residue interaction
      res_total += row.count
      if slot is not None:
        res_counts[slot] += row.count

  # Emit the node still being accumulated when the input ends.
  if current_key is not None:
    flush(current_key, lig_total, res_total, lig_counts, res_counts)

  return (l_PDB_id_RING, l_Node_RING, l__Node_pos_RING, l_Node_chain_RING, l_Node_type,
          l_Inter_Lig_tot, l_Inter_Res_tot,
          l_Inter_IAC_Lig_tot, l_Inter_VDW_Lig_tot, l_Inter_HBOND_Lig_tot, l_Inter_PIPISTACK_Lig_tot,
          l_Inter_IONIC_Lig_tot, l_Inter_SSBOND_Lig_tot, l_Inter_PICATION_Lig_tot,
          l_Inter_IAC_Res_tot, l_Inter_VDW_Res_tot, l_Inter_HBOND_Res_tot, l_Inter_PIPISTACK_Res_tot,
          l_Inter_IONIC_Res_tot, l_Inter_SSBOND_Res_tot, l_Inter_PICATION_Res_tot)


In [None]:
#Processing nodes that are source in the edges.
#process_reg_group returns 21 parallel lists, one per output column and in column order;
#they are unpacked here into names carrying the "1" suffix.
l_PDB1,l_Node1,l_pos1,l_chain1,l_type1,l_lig1,l_res1,l_IAC_L1,l_VDW_L1,l_HBOND_L1,l_PIPISTACK_L1,l_IONIC_L1,l_SSBOND_L1,l_PICATION_L1,l_IAC_R1,l_VDW_R1,l_HBOND_R1,l_PIPISTACK_R1,l_IONIC_R1,l_SSBOND_R1,l_PICATION_R1 = process_reg_group(df_groupnode1)

In [None]:
#Resulting size of processing Source nodes
tam = len(l_PDB1)
print(tam)

4679976


In [None]:
# Copy each parallel result list into its column of df_proc_RING_1.
# The pairs are listed in the same order as the original column-by-column assignments.
for column_name, column_values in (
    ('PDB_id_RING', l_PDB1),
    ('Node_RING', l_Node1),
    ('Node_pos_RING', l_pos1),
    ('Node_chain_RING', l_chain1),
    ('Node_type', l_type1),
    ('Inter_Lig_tot', l_lig1),
    ('Inter_Res_tot', l_res1),
    ('Inter_IAC_Lig_tot', l_IAC_L1),
    ('Inter_VDW_Lig_tot', l_VDW_L1),
    ('Inter_HBOND_Lig_tot', l_HBOND_L1),
    ('Inter_PIPISTACK_Lig_tot', l_PIPISTACK_L1),
    ('Inter_IONIC_Lig_tot', l_IONIC_L1),
    ('Inter_SSBOND_Lig_tot', l_SSBOND_L1),
    ('Inter_PICATION_Lig_tot', l_PICATION_L1),
    ('Inter_IAC_Res_tot', l_IAC_R1),
    ('Inter_VDW_Res_tot', l_VDW_R1),
    ('Inter_HBOND_Res_tot', l_HBOND_R1),
    ('Inter_PIPISTACK_Res_tot', l_PIPISTACK_R1),
    ('Inter_IONIC_Res_tot', l_IONIC_R1),
    ('Inter_SSBOND_Res_tot', l_SSBOND_R1),
    ('Inter_PICATION_Res_tot', l_PICATION_R1),
):
    df_proc_RING_1[column_name] = column_values

In [None]:
df_proc_RING_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4679976 entries, 0 to 4679975
Data columns (total 21 columns):
 #   Column                   Dtype 
---  ------                   ----- 
 0   PDB_id_RING              object
 1   Node_RING                object
 2   Node_pos_RING            int64 
 3   Node_chain_RING          object
 4   Node_type                object
 5   Inter_Lig_tot            int64 
 6   Inter_Res_tot            int64 
 7   Inter_IAC_Lig_tot        int64 
 8   Inter_VDW_Lig_tot        int64 
 9   Inter_HBOND_Lig_tot      int64 
 10  Inter_PIPISTACK_Lig_tot  int64 
 11  Inter_IONIC_Lig_tot      int64 
 12  Inter_SSBOND_Lig_tot     int64 
 13  Inter_PICATION_Lig_tot   int64 
 14  Inter_IAC_Res_tot        int64 
 15  Inter_VDW_Res_tot        int64 
 16  Inter_HBOND_Res_tot      int64 
 17  Inter_PIPISTACK_Res_tot  int64 
 18  Inter_IONIC_Res_tot      int64 
 19  Inter_SSBOND_Res_tot     int64 
 20  Inter_PICATION_Res_tot   int64 
dtypes: int64(17), object(4)
memory 

In [None]:
df_proc_RING_1.head(30)

Unnamed: 0,PDB_id_RING,Node_RING,Node_pos_RING,Node_chain_RING,Node_type,Inter_Lig_tot,Inter_Res_tot,Inter_IAC_Lig_tot,Inter_VDW_Lig_tot,Inter_HBOND_Lig_tot,Inter_PIPISTACK_Lig_tot,Inter_IONIC_Lig_tot,Inter_SSBOND_Lig_tot,Inter_PICATION_Lig_tot,Inter_IAC_Res_tot,Inter_VDW_Res_tot,Inter_HBOND_Res_tot,Inter_PIPISTACK_Res_tot,Inter_IONIC_Res_tot,Inter_SSBOND_Res_tot,Inter_PICATION_Res_tot
0,4X6J,3Y2,307,A,S,963,0,963,0,0,0,0,0,0,0,0,0,0,0,0,0
1,4X6J,Ala,27,A,S,0,5,0,0,0,0,0,0,0,0,3,2,0,0,0,0
2,4X6J,Ala,33,A,S,0,4,0,0,0,0,0,0,0,0,2,2,0,0,0,0
3,4X6J,Ala,71,A,S,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0
4,4X6J,Ala,105,A,S,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0
5,4X6J,Ala,120,A,S,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0
6,4X6J,Ala,124,A,S,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0
7,4X6J,Ala,134,A,S,0,2,0,0,0,0,0,0,0,0,2,0,0,0,0,0
8,4X6J,Ala,137,A,S,0,7,0,0,0,0,0,0,0,0,6,1,0,0,0,0
9,4X6J,Ala,166,A,S,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0


In [None]:
df_proc_RING_1.query('PDB_id_RING == "4X6J" & Node_RING == "Asp" & Node_pos_RING == "55" & Node_chain_RING == "A"')

Unnamed: 0,PDB_id_RING,Node_RING,Node_pos_RING,Node_chain_RING,Node_type,Inter_Lig_tot,Inter_Res_tot,Inter_IAC_Lig_tot,Inter_VDW_Lig_tot,Inter_HBOND_Lig_tot,Inter_PIPISTACK_Lig_tot,Inter_IONIC_Lig_tot,Inter_SSBOND_Lig_tot,Inter_PICATION_Lig_tot,Inter_IAC_Res_tot,Inter_VDW_Res_tot,Inter_HBOND_Res_tot,Inter_PIPISTACK_Res_tot,Inter_IONIC_Res_tot,Inter_SSBOND_Res_tot,Inter_PICATION_Res_tot
28,4X6J,Asp,55,A,S,0,3,0,0,0,0,0,0,0,0,1,2,0,0,0,0


In [None]:
df_proc_RING_1.query('PDB_id_RING == "4X6J" & Node_RING == "3Y2" & Node_pos_RING == "307"')

Unnamed: 0,PDB_id_RING,Node_RING,Node_pos_RING,Node_chain_RING,Node_type,Inter_Lig_tot,Inter_Res_tot,Inter_IAC_Lig_tot,Inter_VDW_Lig_tot,Inter_HBOND_Lig_tot,Inter_PIPISTACK_Lig_tot,Inter_IONIC_Lig_tot,Inter_SSBOND_Lig_tot,Inter_PICATION_Lig_tot,Inter_IAC_Res_tot,Inter_VDW_Res_tot,Inter_HBOND_Res_tot,Inter_PIPISTACK_Res_tot,Inter_IONIC_Res_tot,Inter_SSBOND_Res_tot,Inter_PICATION_Res_tot
0,4X6J,3Y2,307,A,S,963,0,963,0,0,0,0,0,0,0,0,0,0,0,0,0


###1.2.12 Processing interaction types of Target nodes of edges

In [None]:
#increasing the viewing capacity of columns and rows
import pandas as pd

pd.set_option('display.max_columns', 7000)
pd.set_option('display.max_rows',90000)

In [None]:
import pandas as pd
df_RING_edge_02 = pd.read_csv("drive/My Drive/ProcessaNovaBase/TrataArqsRING/edgesDB_02_sel_proc_PDB_Res1_Res2.csv",delimiter='\t')

In [None]:
df_RING_edge_02.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91917025 entries, 0 to 91917024
Data columns (total 12 columns):
 #   Column               Dtype 
---  ------               ----- 
 0   PDB_id_RING          object
 1   NodeId1_RING         object
 2   Interaction_RING     object
 3   NodeId2_RING         object
 4   interacao_RING       object
 5   subinteracao_RING    object
 6   Residue1_RING        object
 7   Residue1_pos_RING    int64 
 8   Residue1_chain_RING  object
 9   Residue2_RING        object
 10  Residue2_pos_RING    int64 
 11  Residue2_chain_RING  object
dtypes: int64(2), object(10)
memory usage: 8.2+ GB


In [None]:
df_RING_edge_02.head(100)

Unnamed: 0,PDB_id_RING,NodeId1_RING,Interaction_RING,NodeId2_RING,interacao_RING,subinteracao_RING,Residue1_RING,Residue1_pos_RING,Residue1_chain_RING,Residue2_RING,Residue2_pos_RING,Residue2_chain_RING
0,4X6J,A:307:_:3Y2,IAC:LIG_MC,A:66:_:GLY,IAC,LIG_MC,3Y2,307,A,Gly,66,A
1,4X6J,A:307:_:3Y2,IAC:LIG_MC,A:66:_:GLY,IAC,LIG_MC,3Y2,307,A,Gly,66,A
2,4X6J,A:307:_:3Y2,IAC:LIG_MC,A:66:_:GLY,IAC,LIG_MC,3Y2,307,A,Gly,66,A
3,4X6J,A:307:_:3Y2,IAC:LIG_MC,A:66:_:GLY,IAC,LIG_MC,3Y2,307,A,Gly,66,A
4,4X6J,A:307:_:3Y2,IAC:LIG_MC,A:66:_:GLY,IAC,LIG_MC,3Y2,307,A,Gly,66,A
5,4X6J,A:307:_:3Y2,IAC:LIG_MC,A:66:_:GLY,IAC,LIG_MC,3Y2,307,A,Gly,66,A
6,4X6J,A:307:_:3Y2,IAC:LIG_MC,A:66:_:GLY,IAC,LIG_MC,3Y2,307,A,Gly,66,A
7,4X6J,A:307:_:3Y2,IAC:LIG_SC,A:66:_:GLY,IAC,LIG_SC,3Y2,307,A,Gly,66,A
8,4X6J,A:307:_:3Y2,IAC:LIG_MC,A:66:_:GLY,IAC,LIG_MC,3Y2,307,A,Gly,66,A
9,4X6J,A:307:_:3Y2,IAC:LIG_MC,A:66:_:GLY,IAC,LIG_MC,3Y2,307,A,Gly,66,A


Let's group the edge Target node records by **PDB_id_RING**, **Residue2_RING**, **Residue2_pos_RING**, **Residue2_chain_RING** and **Interaction_RING**, counting how many edges of each interaction type every Target node has. The result is the **df_groupnode2** dataframe.

Solution based on: https://stackoverflow.com/questions/29836477/pandas-create-new-column-with-count-from-groupby

In [None]:
# One row per (PDB, target residue, position, chain, interaction) combination.
# "count" is the number of rows in that group whose interacao_RING value is not null.
df_groupnode2 = df_RING_edge_02.groupby(['PDB_id_RING',	'Residue2_RING', 'Residue2_pos_RING','Residue2_chain_RING','Interaction_RING'])["interacao_RING"].count().reset_index(name="count")

In [None]:
df_groupnode2.head(10)

Unnamed: 0,PDB_id_RING,Residue2_RING,Residue2_pos_RING,Residue2_chain_RING,Interaction_RING,count
0,4X6J,Ala,27,A,HBOND:MC_MC,1
1,4X6J,Ala,33,A,HBOND:MC_MC,2
2,4X6J,Ala,71,A,HBOND:MC_MC,2
3,4X6J,Ala,71,A,VDW:SC_SC,3
4,4X6J,Ala,86,A,HBOND:MC_MC,1
5,4X6J,Ala,86,A,HBOND:SC_MC,1
6,4X6J,Ala,104,A,HBOND:MC_MC,1
7,4X6J,Ala,104,A,VDW:SC_SC,2
8,4X6J,Ala,105,A,HBOND:MC_MC,1
9,4X6J,Ala,105,A,VDW:SC_SC,2


In [None]:
df_groupnode2.query('PDB_id_RING == "4X6J" & Residue2_RING == "Ala" & Residue2_pos_RING == "71" & Residue2_chain_RING == "A"')

Unnamed: 0,PDB_id_RING,Residue2_RING,Residue2_pos_RING,Residue2_chain_RING,Interaction_RING,count
2,4X6J,Ala,71,A,HBOND:MC_MC,2
3,4X6J,Ala,71,A,VDW:SC_SC,3


Creating a Dataframe that will store the processing of the **df_groupnode2** dataframe.
The Dataframe will have the following attributes:
- **PDB_id_RING**: PDB              
- **Node_RING**: the node can be a residue or a ligand        
- **Node_pos_RING**: position of the node in the chain
- **Node_chain_RING**: chain       
- **Node_type**: node type: *source* (S) or *target* (T)              
- **Inter_Lig_tot**: total residue-ligand interactions            
- **Inter_Res_tot**: total residue-residue interactions        
- **Inter_IAC_Lig_tot**: total interactions of type IAC with ligand
- **Inter_VDW_Lig_tot**: total interactions of type VDW with ligand
- **Inter_HBOND_Lig_tot**: total interactions of type HBOND with ligand       
- **Inter_PIPISTACK_Lig_tot**: total interactions of type PIPISTACK with ligand   
- **Inter_IONIC_Lig_tot**: total interactions of type IONIC with ligand      
- **Inter_SSBOND_Lig_tot**: total interactions of type SSBOND with ligand      
- **Inter_PICATION_Lig_tot**: total interactions of type PICATION with ligand    
- **Inter_IAC_Res_tot**: total interactions of type IAC between residues
- **Inter_VDW_Res_tot**: total interactions of type VDW between residues
- **Inter_HBOND_Res_tot**: total interactions of type HBOND between residues               
- **Inter_PIPISTACK_Res_tot**: total interactions of type PIPISTACK between residues
- **Inter_IONIC_Res_tot**: total interactions of type IONIC between residues       
- **Inter_SSBOND_Res_tot**: total interactions of type SSBOND between residues              
- **Inter_PICATION_Res_tot**: total interactions of type PICATION between residues         


In [None]:
# Schema of the per-node result table, assembled from three groups of column names.
COLUMN_NAMES = (
    # node identity, node type tag and the two overall totals
    [
        'PDB_id_RING',
        'Node_RING',
        'Node_pos_RING',
        'Node_chain_RING',
        'Node_type',
        'Inter_Lig_tot',
        'Inter_Res_tot',
    ]
    # per-interaction-type counters for interactions with a ligand
    + [
        'Inter_IAC_Lig_tot',
        'Inter_VDW_Lig_tot',
        'Inter_HBOND_Lig_tot',
        'Inter_PIPISTACK_Lig_tot',
        'Inter_IONIC_Lig_tot',
        'Inter_SSBOND_Lig_tot',
        'Inter_PICATION_Lig_tot',
    ]
    # per-interaction-type counters for interactions between residues
    + [
        'Inter_IAC_Res_tot',
        'Inter_VDW_Res_tot',
        'Inter_HBOND_Res_tot',
        'Inter_PIPISTACK_Res_tot',
        'Inter_IONIC_Res_tot',
        'Inter_SSBOND_Res_tot',
        'Inter_PICATION_Res_tot',
    ]
)

# Empty frame that only carries the schema above.
df_proc_RING_2 = pd.DataFrame(columns=COLUMN_NAMES)

In [None]:
df_proc_RING_2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Data columns (total 21 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   PDB_id_RING              0 non-null      object
 1   Node_RING                0 non-null      object
 2   Node_pos_RING            0 non-null      object
 3   Node_chain_RING          0 non-null      object
 4   Node_type                0 non-null      object
 5   Inter_Lig_tot            0 non-null      object
 6   Inter_Res_tot            0 non-null      object
 7   Inter_IAC_Lig_tot        0 non-null      object
 8   Inter_VDW_Lig_tot        0 non-null      object
 9   Inter_HBOND_Lig_tot      0 non-null      object
 10  Inter_PIPISTACK_Lig_tot  0 non-null      object
 11  Inter_IONIC_Lig_tot      0 non-null      object
 12  Inter_SSBOND_Lig_tot     0 non-null      object
 13  Inter_PICATION_Lig_tot   0 non-null      object
 14  Inter_IAC_Res_tot        0 non-null      object
 15  Inter_

Processing nodes that are Target (nodes present in the **Residue2_RING** attribute) stored in the **df_groupnode2** dataframe

In [None]:

def process_reg_group(df, node_prefix="Residue2", node_type="T"):
    """Collapse per-interaction counts into one record per node.

    ``df`` holds one row per (node, interaction) pair. Rows are read in order and
    consecutive rows sharing the same (PDB, residue, position, chain) key are
    folded into a single record, so rows of one node must be adjacent.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``PDB_id_RING``, ``<node_prefix>_RING``,
        ``<node_prefix>_pos_RING``, ``<node_prefix>_chain_RING``,
        ``Interaction_RING`` (text of the form ``"<TYPE>:<subtype>"``) and
        ``count``.
    node_prefix : str, default "Residue2"
        Prefix of the node columns to read. The default reads the Target node
        columns; pass "Residue1" to read the Source node columns.
    node_type : str, default "T"
        Tag written to the Node_type position of every record.

    Returns
    -------
    tuple of 21 lists
        Parallel lists, one entry per node, in this order: PDB id, node, node
        position, node chain, node type, ligand total, residue total, then the
        IAC, VDW, HBOND, PIPISTACK, IONIC, SSBOND and PICATION counters for
        ligand interactions, followed by the same seven counters for
        residue-residue interactions. All lists are empty when ``df`` has no rows.
    """
    interaction_types = ("IAC", "VDW", "HBOND", "PIPISTACK", "IONIC", "SSBOND", "PICATION")
    n_types = len(interaction_types)
    slot_of = {name: slot for slot, name in enumerate(interaction_types)}

    # Fresh output lists on every call, so calling the function again cannot
    # pile new records on top of those of a previous call.
    columns = [[] for _ in range(7 + 2 * n_types)]

    def emit(key, totals):
        # One finished node: 4 key fields + type tag + 16 counters = 21 values.
        record = key + (node_type,) + tuple(totals)
        for column, value in zip(columns, record):
            column.append(value)

    current_key = None  # None means "no node seen yet"; independent of the frame's index
    totals = None       # [ligand total, residue total, 7 ligand counters, 7 residue counters]

    rows = zip(
        df["PDB_id_RING"],
        df[node_prefix + "_RING"],
        df[node_prefix + "_pos_RING"],
        df[node_prefix + "_chain_RING"],
        df["Interaction_RING"],
        df["count"],
    )
    for pdb, node, pos, chain, interaction, count in rows:
        key = (pdb, node, pos, chain)
        if key != current_key:
            # Key changed: store the node accumulated so far and start a new one.
            if current_key is not None:
                emit(current_key, totals)
            current_key = key
            totals = [0] * (2 + 2 * n_types)

        # An interaction involves a ligand when "LIG" appears anywhere in its label;
        # otherwise it is a residue-residue interaction.
        is_ligand = "LIG" in interaction
        totals[0 if is_ligand else 1] += count

        # Interaction types outside the seven known ones only feed the overall totals.
        slot = slot_of.get(interaction.split(":")[0])
        if slot is not None:
            totals[2 + (0 if is_ligand else n_types) + slot] += count

    # Flush the final node: no later row exists to trigger the key change for it.
    if current_key is not None:
        emit(current_key, totals)

    return tuple(columns)


In [None]:
#Processing nodes that are Target in the edges.
#process_reg_group returns 21 parallel lists, one per output column and in column order;
#they are unpacked here into names carrying the "2" suffix.
l_PDB2,l_Node2,l_pos2,l_chain2,l_type2,l_lig2,l_res2,l_IAC_L2,l_VDW_L2,l_HBOND_L2,l_PIPISTACK_L2,l_IONIC_L2,l_SSBOND_L2,l_PICATION_L2,l_IAC_R2,l_VDW_R2,l_HBOND_R2,l_PIPISTACK_R2,l_IONIC_R2,l_SSBOND_R2,l_PICATION_R2 = process_reg_group(df_groupnode2)

In [None]:
#Resulting size of processing Target nodes (number of node records returned)
tam = len(l_PDB2)
print(tam)

4940166


In [None]:
# Copy each parallel result list into its column of df_proc_RING_2.
# The pairs are listed in the same order as the original column-by-column assignments.
for column_name, column_values in (
    ('PDB_id_RING', l_PDB2),
    ('Node_RING', l_Node2),
    ('Node_pos_RING', l_pos2),
    ('Node_chain_RING', l_chain2),
    ('Node_type', l_type2),
    ('Inter_Lig_tot', l_lig2),
    ('Inter_Res_tot', l_res2),
    ('Inter_IAC_Lig_tot', l_IAC_L2),
    ('Inter_VDW_Lig_tot', l_VDW_L2),
    ('Inter_HBOND_Lig_tot', l_HBOND_L2),
    ('Inter_PIPISTACK_Lig_tot', l_PIPISTACK_L2),
    ('Inter_IONIC_Lig_tot', l_IONIC_L2),
    ('Inter_SSBOND_Lig_tot', l_SSBOND_L2),
    ('Inter_PICATION_Lig_tot', l_PICATION_L2),
    ('Inter_IAC_Res_tot', l_IAC_R2),
    ('Inter_VDW_Res_tot', l_VDW_R2),
    ('Inter_HBOND_Res_tot', l_HBOND_R2),
    ('Inter_PIPISTACK_Res_tot', l_PIPISTACK_R2),
    ('Inter_IONIC_Res_tot', l_IONIC_R2),
    ('Inter_SSBOND_Res_tot', l_SSBOND_R2),
    ('Inter_PICATION_Res_tot', l_PICATION_R2),
):
    df_proc_RING_2[column_name] = column_values

In [None]:
df_proc_RING_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4940166 entries, 0 to 4940165
Data columns (total 21 columns):
 #   Column                   Dtype 
---  ------                   ----- 
 0   PDB_id_RING              object
 1   Node_RING                object
 2   Node_pos_RING            int64 
 3   Node_chain_RING          object
 4   Node_type                object
 5   Inter_Lig_tot            int64 
 6   Inter_Res_tot            int64 
 7   Inter_IAC_Lig_tot        int64 
 8   Inter_VDW_Lig_tot        int64 
 9   Inter_HBOND_Lig_tot      int64 
 10  Inter_PIPISTACK_Lig_tot  int64 
 11  Inter_IONIC_Lig_tot      int64 
 12  Inter_SSBOND_Lig_tot     int64 
 13  Inter_PICATION_Lig_tot   int64 
 14  Inter_IAC_Res_tot        int64 
 15  Inter_VDW_Res_tot        int64 
 16  Inter_HBOND_Res_tot      int64 
 17  Inter_PIPISTACK_Res_tot  int64 
 18  Inter_IONIC_Res_tot      int64 
 19  Inter_SSBOND_Res_tot     int64 
 20  Inter_PICATION_Res_tot   int64 
dtypes: int64(17), object(4)
memory 

In [None]:
df_proc_RING_2.head(20)

Unnamed: 0,PDB_id_RING,Node_RING,Node_pos_RING,Node_chain_RING,Node_type,Inter_Lig_tot,Inter_Res_tot,Inter_IAC_Lig_tot,Inter_VDW_Lig_tot,Inter_HBOND_Lig_tot,Inter_PIPISTACK_Lig_tot,Inter_IONIC_Lig_tot,Inter_SSBOND_Lig_tot,Inter_PICATION_Lig_tot,Inter_IAC_Res_tot,Inter_VDW_Res_tot,Inter_HBOND_Res_tot,Inter_PIPISTACK_Res_tot,Inter_IONIC_Res_tot,Inter_SSBOND_Res_tot,Inter_PICATION_Res_tot
0,4X6J,Ala,27,A,T,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
1,4X6J,Ala,33,A,T,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0
2,4X6J,Ala,71,A,T,0,5,0,0,0,0,0,0,0,0,3,2,0,0,0,0
3,4X6J,Ala,86,A,T,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0
4,4X6J,Ala,104,A,T,0,3,0,0,0,0,0,0,0,0,2,1,0,0,0,0
5,4X6J,Ala,105,A,T,0,3,0,0,0,0,0,0,0,0,2,1,0,0,0,0
6,4X6J,Ala,120,A,T,0,7,0,0,0,0,0,0,0,0,5,2,0,0,0,0
7,4X6J,Ala,124,A,T,0,3,0,0,0,0,0,0,0,0,1,2,0,0,0,0
8,4X6J,Ala,126,A,T,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0
9,4X6J,Ala,134,A,T,48,0,48,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
df_proc_RING_2.query('PDB_id_RING == "4X6J" & Node_RING == "Ala" & Node_pos_RING == "104" & Node_chain_RING == "A"')

Unnamed: 0,PDB_id_RING,Node_RING,Node_pos_RING,Node_chain_RING,Node_type,Inter_Lig_tot,Inter_Res_tot,Inter_IAC_Lig_tot,Inter_VDW_Lig_tot,Inter_HBOND_Lig_tot,Inter_PIPISTACK_Lig_tot,Inter_IONIC_Lig_tot,Inter_SSBOND_Lig_tot,Inter_PICATION_Lig_tot,Inter_IAC_Res_tot,Inter_VDW_Res_tot,Inter_HBOND_Res_tot,Inter_PIPISTACK_Res_tot,Inter_IONIC_Res_tot,Inter_SSBOND_Res_tot,Inter_PICATION_Res_tot
4,4X6J,Ala,104,A,T,0,3,0,0,0,0,0,0,0,0,2,1,0,0,0,0


###1.2.13 Integration of the databases that hold the interactions of source and target nodes

In [None]:
# Stack the records of df_proc_RING_1 and df_proc_RING_2 into one frame with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and removed in pandas 2.0.
df_proc_RING = pd.concat([df_proc_RING_1, df_proc_RING_2], ignore_index=True)

In [None]:
df_proc_RING.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9620142 entries, 0 to 9620141
Data columns (total 21 columns):
 #   Column                   Dtype 
---  ------                   ----- 
 0   PDB_id_RING              object
 1   Node_RING                object
 2   Node_pos_RING            int64 
 3   Node_chain_RING          object
 4   Node_type                object
 5   Inter_Lig_tot            int64 
 6   Inter_Res_tot            int64 
 7   Inter_IAC_Lig_tot        int64 
 8   Inter_VDW_Lig_tot        int64 
 9   Inter_HBOND_Lig_tot      int64 
 10  Inter_PIPISTACK_Lig_tot  int64 
 11  Inter_IONIC_Lig_tot      int64 
 12  Inter_SSBOND_Lig_tot     int64 
 13  Inter_PICATION_Lig_tot   int64 
 14  Inter_IAC_Res_tot        int64 
 15  Inter_VDW_Res_tot        int64 
 16  Inter_HBOND_Res_tot      int64 
 17  Inter_PIPISTACK_Res_tot  int64 
 18  Inter_IONIC_Res_tot      int64 
 19  Inter_SSBOND_Res_tot     int64 
 20  Inter_PICATION_Res_tot   int64 
dtypes: int64(17), object(4)
memory 

In [None]:
df_proc_RING.head(20)

Unnamed: 0,PDB_id_RING,Node_RING,Node_pos_RING,Node_chain_RING,Node_type,Inter_Lig_tot,Inter_Res_tot,Inter_IAC_Lig_tot,Inter_VDW_Lig_tot,Inter_HBOND_Lig_tot,Inter_PIPISTACK_Lig_tot,Inter_IONIC_Lig_tot,Inter_SSBOND_Lig_tot,Inter_PICATION_Lig_tot,Inter_IAC_Res_tot,Inter_VDW_Res_tot,Inter_HBOND_Res_tot,Inter_PIPISTACK_Res_tot,Inter_IONIC_Res_tot,Inter_SSBOND_Res_tot,Inter_PICATION_Res_tot
0,4X6J,3Y2,307,A,S,963,0,963,0,0,0,0,0,0,0,0,0,0,0,0,0
1,4X6J,Ala,27,A,S,0,5,0,0,0,0,0,0,0,0,3,2,0,0,0,0
2,4X6J,Ala,33,A,S,0,4,0,0,0,0,0,0,0,0,2,2,0,0,0,0
3,4X6J,Ala,71,A,S,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0
4,4X6J,Ala,105,A,S,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0
5,4X6J,Ala,120,A,S,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0
6,4X6J,Ala,124,A,S,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0
7,4X6J,Ala,134,A,S,0,2,0,0,0,0,0,0,0,0,2,0,0,0,0,0
8,4X6J,Ala,137,A,S,0,7,0,0,0,0,0,0,0,0,6,1,0,0,0,0
9,4X6J,Ala,166,A,S,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0


In [None]:
df_proc_RING.query('PDB_id_RING == "4X6J" & Node_RING == "Arg" & Node_pos_RING == "111" & Node_chain_RING == "A"')

Unnamed: 0,PDB_id_RING,Node_RING,Node_pos_RING,Node_chain_RING,Node_type,Inter_Lig_tot,Inter_Res_tot,Inter_IAC_Lig_tot,Inter_VDW_Lig_tot,Inter_HBOND_Lig_tot,Inter_PIPISTACK_Lig_tot,Inter_IONIC_Lig_tot,Inter_SSBOND_Lig_tot,Inter_PICATION_Lig_tot,Inter_IAC_Res_tot,Inter_VDW_Res_tot,Inter_HBOND_Res_tot,Inter_PIPISTACK_Res_tot,Inter_IONIC_Res_tot,Inter_SSBOND_Res_tot,Inter_PICATION_Res_tot
14,4X6J,Arg,111,A,S,0,8,0,0,0,0,0,0,0,0,6,2,0,0,0,0


In [None]:
df_q1 = df_proc_RING.query('PDB_id_RING == "4X6J" & Node_RING == "Asn" & Node_pos_RING == "60" & Node_chain_RING == "A"')

In [None]:
tam = len(df_q1)
print(tam)

2


In [None]:
df_q1

Unnamed: 0,PDB_id_RING,Node_RING,Node_pos_RING,Node_chain_RING,Node_type,Inter_Lig_tot,Inter_Res_tot,Inter_IAC_Lig_tot,Inter_VDW_Lig_tot,Inter_HBOND_Lig_tot,Inter_PIPISTACK_Lig_tot,Inter_IONIC_Lig_tot,Inter_SSBOND_Lig_tot,Inter_PICATION_Lig_tot,Inter_IAC_Res_tot,Inter_VDW_Res_tot,Inter_HBOND_Res_tot,Inter_PIPISTACK_Res_tot,Inter_IONIC_Res_tot,Inter_SSBOND_Res_tot,Inter_PICATION_Res_tot
19,4X6J,Asn,60,A,S,0,8,0,0,0,0,0,0,0,0,6,2,0,0,0,0
4679999,4X6J,Asn,60,A,T,0,7,0,0,0,0,0,0,0,0,6,1,0,0,0,0


In [None]:
df_q1 = df_proc_RING.query('PDB_id_RING == "4X6J" & Node_RING == "Arg" & Node_pos_RING == "111" & Node_chain_RING == "A"')

In [None]:
tam = len(df_q1)
print(tam)

1


In [None]:
df_q1

Unnamed: 0,PDB_id_RING,Node_RING,Node_pos_RING,Node_chain_RING,Node_type,Inter_Lig_tot,Inter_Res_tot,Inter_IAC_Lig_tot,Inter_VDW_Lig_tot,Inter_HBOND_Lig_tot,Inter_PIPISTACK_Lig_tot,Inter_IONIC_Lig_tot,Inter_SSBOND_Lig_tot,Inter_PICATION_Lig_tot,Inter_IAC_Res_tot,Inter_VDW_Res_tot,Inter_HBOND_Res_tot,Inter_PIPISTACK_Res_tot,Inter_IONIC_Res_tot,Inter_SSBOND_Res_tot,Inter_PICATION_Res_tot
14,4X6J,Arg,111,A,S,0,8,0,0,0,0,0,0,0,0,6,2,0,0,0,0


In [None]:
df_proc_RING.query('PDB_id_RING == "9JDW" & Node_RING == "Phe" & Node_pos_RING == "330" & Node_chain_RING == "A"')

Unnamed: 0,PDB_id_RING,Node_RING,Node_pos_RING,Node_chain_RING,Node_type,Inter_Lig_tot,Inter_Res_tot,Inter_IAC_Lig_tot,Inter_VDW_Lig_tot,Inter_HBOND_Lig_tot,Inter_PIPISTACK_Lig_tot,Inter_IONIC_Lig_tot,Inter_SSBOND_Lig_tot,Inter_PICATION_Lig_tot,Inter_IAC_Res_tot,Inter_VDW_Res_tot,Inter_HBOND_Res_tot,Inter_PIPISTACK_Res_tot,Inter_IONIC_Res_tot,Inter_SSBOND_Res_tot,Inter_PICATION_Res_tot
4679899,9JDW,Phe,330,A,S,0,8,0,0,0,0,0,0,0,0,4,3,1,0,0,0
9620072,9JDW,Phe,330,A,T,0,12,0,0,0,0,0,0,0,0,9,1,2,0,0,0


###1.2.14 Generating an intermediate file with the processed **edgesDB_02.txt** database

In [None]:
df_proc_RING.to_csv("drive/My Drive/ProcessaNovaBase/TrataArqsRING/edgesDB_02_proc.csv",sep='\t',index=False)

##1.3 Reading and integrating the *edges_DB_01_proc* and *edgesDB_02_proc* Databases

In [None]:
#increasing the viewing capacity of columns and rows
import pandas as pd

pd.set_option('display.max_columns', 7000)
pd.set_option('display.max_rows',90000)

In [None]:
import pandas as pd
df_RING_edge_01 = pd.read_csv("drive/My Drive/ProcessaNovaBase/TrataArqsRING/edgesDB_01_proc.csv",delimiter='\t')

In [None]:
import pandas as pd
df_RING_edge_02 = pd.read_csv("drive/My Drive/ProcessaNovaBase/TrataArqsRING/edgesDB_02_proc.csv",delimiter='\t')

In [None]:
df_RING_edge_01.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10599398 entries, 0 to 10599397
Data columns (total 21 columns):
 #   Column                   Dtype 
---  ------                   ----- 
 0   PDB_id_RING              object
 1   Node_RING                object
 2   Node_pos_RING            int64 
 3   Node_chain_RING          object
 4   Node_type                object
 5   Inter_Lig_tot            int64 
 6   Inter_Res_tot            int64 
 7   Inter_IAC_Lig_tot        int64 
 8   Inter_VDW_Lig_tot        int64 
 9   Inter_HBOND_Lig_tot      int64 
 10  Inter_PIPISTACK_Lig_tot  int64 
 11  Inter_IONIC_Lig_tot      int64 
 12  Inter_SSBOND_Lig_tot     int64 
 13  Inter_PICATION_Lig_tot   int64 
 14  Inter_IAC_Res_tot        int64 
 15  Inter_VDW_Res_tot        int64 
 16  Inter_HBOND_Res_tot      int64 
 17  Inter_PIPISTACK_Res_tot  int64 
 18  Inter_IONIC_Res_tot      int64 
 19  Inter_SSBOND_Res_tot     int64 
 20  Inter_PICATION_Res_tot   int64 
dtypes: int64(17), object(4)
memor

In [None]:
df_RING_edge_02.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9620142 entries, 0 to 9620141
Data columns (total 21 columns):
 #   Column                   Dtype 
---  ------                   ----- 
 0   PDB_id_RING              object
 1   Node_RING                object
 2   Node_pos_RING            int64 
 3   Node_chain_RING          object
 4   Node_type                object
 5   Inter_Lig_tot            int64 
 6   Inter_Res_tot            int64 
 7   Inter_IAC_Lig_tot        int64 
 8   Inter_VDW_Lig_tot        int64 
 9   Inter_HBOND_Lig_tot      int64 
 10  Inter_PIPISTACK_Lig_tot  int64 
 11  Inter_IONIC_Lig_tot      int64 
 12  Inter_SSBOND_Lig_tot     int64 
 13  Inter_PICATION_Lig_tot   int64 
 14  Inter_IAC_Res_tot        int64 
 15  Inter_VDW_Res_tot        int64 
 16  Inter_HBOND_Res_tot      int64 
 17  Inter_PIPISTACK_Res_tot  int64 
 18  Inter_IONIC_Res_tot      int64 
 19  Inter_SSBOND_Res_tot     int64 
 20  Inter_PICATION_Res_tot   int64 
dtypes: int64(17), object(4)
memory 

In [None]:
# Stack the processed edgesDB_01 and edgesDB_02 records into one frame with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and removed in pandas 2.0.
df_RING_edge = pd.concat([df_RING_edge_01, df_RING_edge_02], ignore_index=True)

In [None]:
df_RING_edge.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20219540 entries, 0 to 20219539
Data columns (total 21 columns):
 #   Column                   Dtype 
---  ------                   ----- 
 0   PDB_id_RING              object
 1   Node_RING                object
 2   Node_pos_RING            int64 
 3   Node_chain_RING          object
 4   Node_type                object
 5   Inter_Lig_tot            int64 
 6   Inter_Res_tot            int64 
 7   Inter_IAC_Lig_tot        int64 
 8   Inter_VDW_Lig_tot        int64 
 9   Inter_HBOND_Lig_tot      int64 
 10  Inter_PIPISTACK_Lig_tot  int64 
 11  Inter_IONIC_Lig_tot      int64 
 12  Inter_SSBOND_Lig_tot     int64 
 13  Inter_PICATION_Lig_tot   int64 
 14  Inter_IAC_Res_tot        int64 
 15  Inter_VDW_Res_tot        int64 
 16  Inter_HBOND_Res_tot      int64 
 17  Inter_PIPISTACK_Res_tot  int64 
 18  Inter_IONIC_Res_tot      int64 
 19  Inter_SSBOND_Res_tot     int64 
 20  Inter_PICATION_Res_tot   int64 
dtypes: int64(17), object(4)
memor

In [None]:
#checking for the existence of 'missing' values
df_RING_edge.isna().sum()

PDB_id_RING                0
Node_RING                  0
Node_pos_RING              0
Node_chain_RING            0
Node_type                  0
Inter_Lig_tot              0
Inter_Res_tot              0
Inter_IAC_Lig_tot          0
Inter_VDW_Lig_tot          0
Inter_HBOND_Lig_tot        0
Inter_PIPISTACK_Lig_tot    0
Inter_IONIC_Lig_tot        0
Inter_SSBOND_Lig_tot       0
Inter_PICATION_Lig_tot     0
Inter_IAC_Res_tot          0
Inter_VDW_Res_tot          0
Inter_HBOND_Res_tot        0
Inter_PIPISTACK_Res_tot    0
Inter_IONIC_Res_tot        0
Inter_SSBOND_Res_tot       0
Inter_PICATION_Res_tot     0
dtype: int64

###1.3.1 Generating a file with the RING edges databases integrated characterizing the node as source and target

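In this file each node is characterized by up to two records: one counts its interactions when it is the source of an edge (Node_type "S") and the other counts its interactions when it is the target (Node_type "T"). A node that only ever appears on one side of an edge has a single record.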

In [None]:
df_RING_edge.to_csv("drive/My Drive/ProcessaNovaBase/TrataArqsRING/edgesDB_proc_ST.csv",sep='\t',index=False)

##1.4 Generating an edge file that does not distinguish whether a node is source or target

In [None]:
#increasing the viewing capacity of columns and rows
import pandas as pd

pd.set_option('display.max_columns', 7000)
pd.set_option('display.max_rows',90000)

In [None]:
import pandas as pd
df_RING_edge = pd.read_csv("drive/My Drive/ProcessaNovaBase/TrataArqsRING/edgesDB_proc_ST.csv",delimiter='\t')

In [None]:
df_RING_edge.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20219540 entries, 0 to 20219539
Data columns (total 21 columns):
 #   Column                   Dtype 
---  ------                   ----- 
 0   PDB_id_RING              object
 1   Node_RING                object
 2   Node_pos_RING            int64 
 3   Node_chain_RING          object
 4   Node_type                object
 5   Inter_Lig_tot            int64 
 6   Inter_Res_tot            int64 
 7   Inter_IAC_Lig_tot        int64 
 8   Inter_VDW_Lig_tot        int64 
 9   Inter_HBOND_Lig_tot      int64 
 10  Inter_PIPISTACK_Lig_tot  int64 
 11  Inter_IONIC_Lig_tot      int64 
 12  Inter_SSBOND_Lig_tot     int64 
 13  Inter_PICATION_Lig_tot   int64 
 14  Inter_IAC_Res_tot        int64 
 15  Inter_VDW_Res_tot        int64 
 16  Inter_HBOND_Res_tot      int64 
 17  Inter_PIPISTACK_Res_tot  int64 
 18  Inter_IONIC_Res_tot      int64 
 19  Inter_SSBOND_Res_tot     int64 
 20  Inter_PICATION_Res_tot   int64 
dtypes: int64(17), object(4)
memor

Let's sort the file by the following attributes to facilitate our processing:
- PDB_id_RING              
- Node_RING
- Node_pos_RING
- Node_chain_RING  

The dataframe **df_ord** will be generated

In [None]:
df_RING_edge.query('PDB_id_RING == "9JDW" & Node_RING == "Phe" & Node_pos_RING == "330" & Node_chain_RING == "A"')

Unnamed: 0,PDB_id_RING,Node_RING,Node_pos_RING,Node_chain_RING,Node_type,Inter_Lig_tot,Inter_Res_tot,Inter_IAC_Lig_tot,Inter_VDW_Lig_tot,Inter_HBOND_Lig_tot,Inter_PIPISTACK_Lig_tot,Inter_IONIC_Lig_tot,Inter_SSBOND_Lig_tot,Inter_PICATION_Lig_tot,Inter_IAC_Res_tot,Inter_VDW_Res_tot,Inter_HBOND_Res_tot,Inter_PIPISTACK_Res_tot,Inter_IONIC_Res_tot,Inter_SSBOND_Res_tot,Inter_PICATION_Res_tot
15279297,9JDW,Phe,330,A,S,0,8,0,0,0,0,0,0,0,0,4,3,1,0,0,0
20219470,9JDW,Phe,330,A,T,0,12,0,0,0,0,0,0,0,0,9,1,2,0,0,0


In [None]:
df_RING_edge.query('PDB_id_RING == "10GS" & Node_RING == "Ala" & Node_pos_RING == "16"')

Unnamed: 0,PDB_id_RING,Node_RING,Node_pos_RING,Node_chain_RING,Node_type,Inter_Lig_tot,Inter_Res_tot,Inter_IAC_Lig_tot,Inter_VDW_Lig_tot,Inter_HBOND_Lig_tot,Inter_PIPISTACK_Lig_tot,Inter_IONIC_Lig_tot,Inter_SSBOND_Lig_tot,Inter_PICATION_Lig_tot,Inter_IAC_Res_tot,Inter_VDW_Res_tot,Inter_HBOND_Res_tot,Inter_PIPISTACK_Res_tot,Inter_IONIC_Res_tot,Inter_SSBOND_Res_tot,Inter_PICATION_Res_tot
2,10GS,Ala,16,A,S,0,3,0,0,0,0,0,0,0,0,1,2,0,0,0,0
3,10GS,Ala,16,B,S,0,3,0,0,0,0,0,0,0,0,1,2,0,0,0,0


In [None]:
df_ord = df_RING_edge.sort_values(by=['PDB_id_RING',	'Node_RING', 'Node_pos_RING', 'Node_chain_RING'], ignore_index=True)

In [None]:
df_ord.head(10)

Unnamed: 0,PDB_id_RING,Node_RING,Node_pos_RING,Node_chain_RING,Node_type,Inter_Lig_tot,Inter_Res_tot,Inter_IAC_Lig_tot,Inter_VDW_Lig_tot,Inter_HBOND_Lig_tot,Inter_PIPISTACK_Lig_tot,Inter_IONIC_Lig_tot,Inter_SSBOND_Lig_tot,Inter_PICATION_Lig_tot,Inter_IAC_Res_tot,Inter_VDW_Res_tot,Inter_HBOND_Res_tot,Inter_PIPISTACK_Res_tot,Inter_IONIC_Res_tot,Inter_SSBOND_Res_tot,Inter_PICATION_Res_tot
0,10GS,Ala,15,A,S,0,3,0,0,0,0,0,0,0,0,1,2,0,0,0,0
1,10GS,Ala,15,A,T,0,2,0,0,0,0,0,0,0,0,1,1,0,0,0,0
2,10GS,Ala,15,B,S,0,4,0,0,0,0,0,0,0,0,2,2,0,0,0,0
3,10GS,Ala,15,B,T,0,2,0,0,0,0,0,0,0,0,1,1,0,0,0,0
4,10GS,Ala,16,A,S,0,3,0,0,0,0,0,0,0,0,1,2,0,0,0,0
5,10GS,Ala,16,B,S,0,3,0,0,0,0,0,0,0,0,1,2,0,0,0,0
6,10GS,Ala,22,A,S,0,4,0,0,0,0,0,0,0,0,3,1,0,0,0,0
7,10GS,Ala,22,A,T,51,1,51,0,0,0,0,0,0,0,0,1,0,0,0,0
8,10GS,Ala,22,B,S,0,4,0,0,0,0,0,0,0,0,3,1,0,0,0,0
9,10GS,Ala,22,B,T,51,1,51,0,0,0,0,0,0,0,0,1,0,0,0,0


In [None]:
df_ord.query('PDB_id_RING == "9JDW" & Node_RING == "Phe" & Node_pos_RING == "330" & Node_chain_RING == "A"')

Unnamed: 0,PDB_id_RING,Node_RING,Node_pos_RING,Node_chain_RING,Node_type,Inter_Lig_tot,Inter_Res_tot,Inter_IAC_Lig_tot,Inter_VDW_Lig_tot,Inter_HBOND_Lig_tot,Inter_PIPISTACK_Lig_tot,Inter_IONIC_Lig_tot,Inter_SSBOND_Lig_tot,Inter_PICATION_Lig_tot,Inter_IAC_Res_tot,Inter_VDW_Res_tot,Inter_HBOND_Res_tot,Inter_PIPISTACK_Res_tot,Inter_IONIC_Res_tot,Inter_SSBOND_Res_tot,Inter_PICATION_Res_tot
20219393,9JDW,Phe,330,A,S,0,8,0,0,0,0,0,0,0,0,4,3,1,0,0,0
20219394,9JDW,Phe,330,A,T,0,12,0,0,0,0,0,0,0,0,9,1,2,0,0,0


In [None]:
df_ord.query('PDB_id_RING == "10GS" & Node_RING == "Phe" & Node_pos_RING == "8" & Node_chain_RING == "A"')

Unnamed: 0,PDB_id_RING,Node_RING,Node_pos_RING,Node_chain_RING,Node_type,Inter_Lig_tot,Inter_Res_tot,Inter_IAC_Lig_tot,Inter_VDW_Lig_tot,Inter_HBOND_Lig_tot,Inter_PIPISTACK_Lig_tot,Inter_IONIC_Lig_tot,Inter_SSBOND_Lig_tot,Inter_PICATION_Lig_tot,Inter_IAC_Res_tot,Inter_VDW_Res_tot,Inter_HBOND_Res_tot,Inter_PIPISTACK_Res_tot,Inter_IONIC_Res_tot,Inter_SSBOND_Res_tot,Inter_PICATION_Res_tot
466,10GS,Phe,8,A,S,0,12,0,0,0,0,0,0,0,0,11,0,1,0,0,0
467,10GS,Phe,8,A,T,269,0,266,3,0,0,0,0,0,0,0,0,0,0,0,0


Creating the **df_proc_edge** dataframe, which will store the result of processing the ordered dataframe **df_ord**.
The Dataframe will have the following attributes:
- **PDB_id_RING**: PDB              
- **Node_RING**: the node can be a residue or a ligand        
- **Node_pos_RING**: position of the node in the chain
- **Node_chain_RING**: chain                     
- **Inter_Lig_tot**: total residue-ligand interactions            
- **Inter_Res_tot**: total residue-residue interactions        
- **Inter_IAC_Lig_tot**: total interactions of type IAC with ligand
- **Inter_VDW_Lig_tot**: total interactions of type VDW with ligand
- **Inter_HBOND_Lig_tot**: total interactions of type HBOND with ligand       
- **Inter_PIPISTACK_Lig_tot**: total interactions of type PIPISTACK with ligand   
- **Inter_IONIC_Lig_tot**: total interactions of type IONIC with ligand      
- **Inter_SSBOND_Lig_tot**: total interactions of type SSBOND with ligand      
- **Inter_PICATION_Lig_tot**: total interactions of type PICATION with ligand    
- **Inter_IAC_Res_tot**: total interactions of type IAC between residues
- **Inter_VDW_Res_tot**: total interactions of type VDW between residues
- **Inter_HBOND_Res_tot**: total interactions of type HBOND between residues               
- **Inter_PIPISTACK_Res_tot**: total interactions of type PIPISTACK between residues           
- **Inter_IONIC_Res_tot**: total interactions of type IONIC between residues       
- **Inter_SSBOND_Res_tot**: total interactions of type SSBOND between residues              
- **Inter_PICATION_Res_tot**: total interactions of type PICATION between residues         


In [None]:
# Schema of df_proc_edge: the four columns that identify a node, the two grand totals,
# then one counter per interaction type for ligand contacts followed by the same
# counters for residue-residue contacts.
_node_key_columns = ['PDB_id_RING', 'Node_RING', 'Node_pos_RING', 'Node_chain_RING']
_interaction_types = ['IAC', 'VDW', 'HBOND', 'PIPISTACK', 'IONIC', 'SSBOND', 'PICATION']

COLUMN_NAMES = (
    _node_key_columns
    + ['Inter_Lig_tot', 'Inter_Res_tot']
    + ['Inter_' + kind + '_Lig_tot' for kind in _interaction_types]
    + ['Inter_' + kind + '_Res_tot' for kind in _interaction_types]
)

# Empty frame with the final column layout; the columns are filled in further below.
df_proc_edge = pd.DataFrame(columns=COLUMN_NAMES)

In [None]:
df_proc_edge.info()

<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Data columns (total 20 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   PDB_id_RING              0 non-null      object
 1   Node_RING                0 non-null      object
 2   Node_pos_RING            0 non-null      object
 3   Node_chain_RING          0 non-null      object
 4   Inter_Lig_tot            0 non-null      object
 5   Inter_Res_tot            0 non-null      object
 6   Inter_IAC_Lig_tot        0 non-null      object
 7   Inter_VDW_Lig_tot        0 non-null      object
 8   Inter_HBOND_Lig_tot      0 non-null      object
 9   Inter_PIPISTACK_Lig_tot  0 non-null      object
 10  Inter_IONIC_Lig_tot      0 non-null      object
 11  Inter_SSBOND_Lig_tot     0 non-null      object
 12  Inter_PICATION_Lig_tot   0 non-null      object
 13  Inter_IAC_Res_tot        0 non-null      object
 14  Inter_VDW_Res_tot        0 non-null      object
 15  Inter_

The processing sums the interaction counts of each node (identified by its PDB, node name, position and chain) over its Source and Target records, so that each node ends up with a single record. The result is stored in the **df_proc_edge** dataframe.

In [None]:

# Module-level accumulators filled by process_reg_ord(): after a call, position k of
# every list describes the k-th merged node. They are reset whenever this cell is run.
l_PDB_id_RING = []
l_Node_RING = []
l__Node_pos_RING = []
l_Node_chain_RING = []
l_Inter_Lig_tot = []
l_Inter_Res_tot = []
l_Inter_IAC_Lig_tot = []
l_Inter_VDW_Lig_tot = []
l_Inter_HBOND_Lig_tot = []
l_Inter_PIPISTACK_Lig_tot = []
l_Inter_IONIC_Lig_tot = []
l_Inter_SSBOND_Lig_tot = []
l_Inter_PICATION_Lig_tot = []
l_Inter_IAC_Res_tot = []
l_Inter_VDW_Res_tot = []
l_Inter_HBOND_Res_tot = []
l_Inter_PIPISTACK_Res_tot = []
l_Inter_IONIC_Res_tot = []
l_Inter_SSBOND_Res_tot = []
l_Inter_PICATION_Res_tot = []

# Second set of lists reset by this cell; process_reg_ord() does not touch them.
l_PDB = []
l_Node = []
l_pos = []
l_chain = []
l_lig = []
l_res = []
l_IAC_L = []
l_VDW_L = []
l_HBOND_L = []
l_PIPISTACK_L = []
l_IONIC_L = []
l_SSBOND_L = []
l_PICATION_L = []
l_IAC_R = []
l_VDW_R = []
l_HBOND_R = []
l_PIPISTACK_R = []
l_IONIC_R = []
l_SSBOND_R = []
l_PICATION_R = []

def process_reg_ord(df):
  """Merge consecutive records of the same node by summing their interaction counters.

  Rows are read in order with ``df.itertuples()``. Consecutive rows that share the same
  (PDB_id_RING, Node_RING, Node_pos_RING, Node_chain_RING) key are collapsed into one
  output record whose 16 ``Inter_*`` counters are the sums over those rows. Only
  *consecutive* rows are merged, so ``df`` must already be sorted by that key.

  Every group is emitted, including the last one in ``df`` (it is flushed after the
  loop), and the first row is recognised by position rather than by its index label,
  so the frame's index does not need to start at 0.

  The output records are appended to the module-level ``l_*`` lists defined above and
  those same list objects are returned. Because the lists live at module level, calling
  the function again without re-running this cell appends to the earlier results.

  Args:
    df: DataFrame holding the four key columns and the 16 ``Inter_*`` counter columns.

  Returns:
    A tuple of 20 lists in this order: PDB id, node, position, chain, Inter_Lig_tot,
    Inter_Res_tot, the seven ``Inter_*_Lig_tot`` counters and the seven
    ``Inter_*_Res_tot`` counters.
  """
  key_fields = ('PDB_id_RING', 'Node_RING', 'Node_pos_RING', 'Node_chain_RING')
  counter_fields = (
      'Inter_Lig_tot', 'Inter_Res_tot',
      'Inter_IAC_Lig_tot', 'Inter_VDW_Lig_tot', 'Inter_HBOND_Lig_tot', 'Inter_PIPISTACK_Lig_tot',
      'Inter_IONIC_Lig_tot', 'Inter_SSBOND_Lig_tot', 'Inter_PICATION_Lig_tot',
      'Inter_IAC_Res_tot', 'Inter_VDW_Res_tot', 'Inter_HBOND_Res_tot', 'Inter_PIPISTACK_Res_tot',
      'Inter_IONIC_Res_tot', 'Inter_SSBOND_Res_tot', 'Inter_PICATION_Res_tot',
  )
  # Output lists, in the same order as key_fields followed by counter_fields.
  outputs = (
      l_PDB_id_RING, l_Node_RING, l__Node_pos_RING, l_Node_chain_RING,
      l_Inter_Lig_tot, l_Inter_Res_tot,
      l_Inter_IAC_Lig_tot, l_Inter_VDW_Lig_tot, l_Inter_HBOND_Lig_tot, l_Inter_PIPISTACK_Lig_tot,
      l_Inter_IONIC_Lig_tot, l_Inter_SSBOND_Lig_tot, l_Inter_PICATION_Lig_tot,
      l_Inter_IAC_Res_tot, l_Inter_VDW_Res_tot, l_Inter_HBOND_Res_tot, l_Inter_PIPISTACK_Res_tot,
      l_Inter_IONIC_Res_tot, l_Inter_SSBOND_Res_tot, l_Inter_PICATION_Res_tot,
  )

  def _emit(key, totals):
    # Append one finished group: one value per output list.
    for out, value in zip(outputs, list(key) + list(totals)):
      out.append(value)

  current_key = None  # key of the group being accumulated; None until the first row
  totals = None       # running sums of counter_fields for current_key

  for row in df.itertuples():
    key = [getattr(row, name) for name in key_fields]
    values = [getattr(row, name) for name in counter_fields]

    if current_key is None:  # First record
      print("e o primeiro")
      current_key, totals = key, values
    elif all(new == old for new, old in zip(key, current_key)):
      # Same node as the previous row (e.g. its Source and Target records): accumulate.
      totals = [acc + value for acc, value in zip(totals, values)]
    else:
      # Key changed: store the finished group and start a new one with this row.
      _emit(current_key, totals)
      current_key, totals = key, values

  # The loop only emits a group when the *next* key shows up, so the final group
  # has to be emitted explicitly (nothing to emit when df has no rows).
  if current_key is not None:
    _emit(current_key, totals)

  return outputs


In [None]:
#Processing the edges and joining the information when it is source and target
l_PDB1,l_Node1,l_pos1,l_chain1,l_lig1,l_res1,l_IAC_L1,l_VDW_L1,l_HBOND_L1,l_PIPISTACK_L1,l_IONIC_L1,l_SSBOND_L1,l_PICATION_L1,l_IAC_R1,l_VDW_R1,l_HBOND_R1,l_PIPISTACK_R1,l_IONIC_R1,l_SSBOND_R1,l_PICATION_R1 = process_reg_ord(df_ord)

e o primeiro


In [None]:
#Number of records produced by merging the Source and Target records
tam = len(l_PDB1)
print(tam)

12871779


In [None]:
# Column name -> list returned by process_reg_ord(), in the column order of df_proc_edge.
merged_columns = {
    'PDB_id_RING': l_PDB1,
    'Node_RING': l_Node1,
    'Node_pos_RING': l_pos1,
    'Node_chain_RING': l_chain1,
    'Inter_Lig_tot': l_lig1,
    'Inter_Res_tot': l_res1,
    'Inter_IAC_Lig_tot': l_IAC_L1,
    'Inter_VDW_Lig_tot': l_VDW_L1,
    'Inter_HBOND_Lig_tot': l_HBOND_L1,
    'Inter_PIPISTACK_Lig_tot': l_PIPISTACK_L1,
    'Inter_IONIC_Lig_tot': l_IONIC_L1,
    'Inter_SSBOND_Lig_tot': l_SSBOND_L1,
    'Inter_PICATION_Lig_tot': l_PICATION_L1,
    'Inter_IAC_Res_tot': l_IAC_R1,
    'Inter_VDW_Res_tot': l_VDW_R1,
    'Inter_HBOND_Res_tot': l_HBOND_R1,
    'Inter_PIPISTACK_Res_tot': l_PIPISTACK_R1,
    'Inter_IONIC_Res_tot': l_IONIC_R1,
    'Inter_SSBOND_Res_tot': l_SSBOND_R1,
    'Inter_PICATION_Res_tot': l_PICATION_R1,
}

# Fill the (initially empty) frame one column at a time.
for column_name, column_values in merged_columns.items():
    df_proc_edge[column_name] = column_values

In [None]:
df_proc_edge.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12871779 entries, 0 to 12871778
Data columns (total 20 columns):
 #   Column                   Dtype 
---  ------                   ----- 
 0   PDB_id_RING              object
 1   Node_RING                object
 2   Node_pos_RING            int64 
 3   Node_chain_RING          object
 4   Inter_Lig_tot            int64 
 5   Inter_Res_tot            int64 
 6   Inter_IAC_Lig_tot        int64 
 7   Inter_VDW_Lig_tot        int64 
 8   Inter_HBOND_Lig_tot      int64 
 9   Inter_PIPISTACK_Lig_tot  int64 
 10  Inter_IONIC_Lig_tot      int64 
 11  Inter_SSBOND_Lig_tot     int64 
 12  Inter_PICATION_Lig_tot   int64 
 13  Inter_IAC_Res_tot        int64 
 14  Inter_VDW_Res_tot        int64 
 15  Inter_HBOND_Res_tot      int64 
 16  Inter_PIPISTACK_Res_tot  int64 
 17  Inter_IONIC_Res_tot      int64 
 18  Inter_SSBOND_Res_tot     int64 
 19  Inter_PICATION_Res_tot   int64 
dtypes: int64(17), object(3)
memory usage: 1.9+ GB


Comparing the result (**df_proc_edge**) with the ordered edges database (**df_ord**)

In [None]:
df_ord.head(10)

Unnamed: 0,PDB_id_RING,Node_RING,Node_pos_RING,Node_chain_RING,Node_type,Inter_Lig_tot,Inter_Res_tot,Inter_IAC_Lig_tot,Inter_VDW_Lig_tot,Inter_HBOND_Lig_tot,Inter_PIPISTACK_Lig_tot,Inter_IONIC_Lig_tot,Inter_SSBOND_Lig_tot,Inter_PICATION_Lig_tot,Inter_IAC_Res_tot,Inter_VDW_Res_tot,Inter_HBOND_Res_tot,Inter_PIPISTACK_Res_tot,Inter_IONIC_Res_tot,Inter_SSBOND_Res_tot,Inter_PICATION_Res_tot
0,10GS,Ala,15,A,S,0,3,0,0,0,0,0,0,0,0,1,2,0,0,0,0
1,10GS,Ala,15,A,T,0,2,0,0,0,0,0,0,0,0,1,1,0,0,0,0
2,10GS,Ala,15,B,S,0,4,0,0,0,0,0,0,0,0,2,2,0,0,0,0
3,10GS,Ala,15,B,T,0,2,0,0,0,0,0,0,0,0,1,1,0,0,0,0
4,10GS,Ala,16,A,S,0,3,0,0,0,0,0,0,0,0,1,2,0,0,0,0
5,10GS,Ala,16,B,S,0,3,0,0,0,0,0,0,0,0,1,2,0,0,0,0
6,10GS,Ala,22,A,S,0,4,0,0,0,0,0,0,0,0,3,1,0,0,0,0
7,10GS,Ala,22,A,T,51,1,51,0,0,0,0,0,0,0,0,1,0,0,0,0
8,10GS,Ala,22,B,S,0,4,0,0,0,0,0,0,0,0,3,1,0,0,0,0
9,10GS,Ala,22,B,T,51,1,51,0,0,0,0,0,0,0,0,1,0,0,0,0


In [None]:
df_proc_edge.head(10)

Unnamed: 0,PDB_id_RING,Node_RING,Node_pos_RING,Node_chain_RING,Inter_Lig_tot,Inter_Res_tot,Inter_IAC_Lig_tot,Inter_VDW_Lig_tot,Inter_HBOND_Lig_tot,Inter_PIPISTACK_Lig_tot,Inter_IONIC_Lig_tot,Inter_SSBOND_Lig_tot,Inter_PICATION_Lig_tot,Inter_IAC_Res_tot,Inter_VDW_Res_tot,Inter_HBOND_Res_tot,Inter_PIPISTACK_Res_tot,Inter_IONIC_Res_tot,Inter_SSBOND_Res_tot,Inter_PICATION_Res_tot
0,10GS,Ala,15,A,0,5,0,0,0,0,0,0,0,0,2,3,0,0,0,0
1,10GS,Ala,15,B,0,6,0,0,0,0,0,0,0,0,3,3,0,0,0,0
2,10GS,Ala,16,A,0,3,0,0,0,0,0,0,0,0,1,2,0,0,0,0
3,10GS,Ala,16,B,0,3,0,0,0,0,0,0,0,0,1,2,0,0,0,0
4,10GS,Ala,22,A,51,5,51,0,0,0,0,0,0,0,3,2,0,0,0,0
5,10GS,Ala,22,B,51,5,51,0,0,0,0,0,0,0,3,2,0,0,0,0
6,10GS,Ala,45,A,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0
7,10GS,Ala,45,B,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0
8,10GS,Ala,86,A,0,7,0,0,0,0,0,0,0,0,3,4,0,0,0,0
9,10GS,Ala,86,B,0,7,0,0,0,0,0,0,0,0,3,4,0,0,0,0


In [None]:
df_ord.query('PDB_id_RING == "9JDW" & Node_RING == "Phe" & Node_pos_RING == "330" & Node_chain_RING == "A"')

Unnamed: 0,PDB_id_RING,Node_RING,Node_pos_RING,Node_chain_RING,Node_type,Inter_Lig_tot,Inter_Res_tot,Inter_IAC_Lig_tot,Inter_VDW_Lig_tot,Inter_HBOND_Lig_tot,Inter_PIPISTACK_Lig_tot,Inter_IONIC_Lig_tot,Inter_SSBOND_Lig_tot,Inter_PICATION_Lig_tot,Inter_IAC_Res_tot,Inter_VDW_Res_tot,Inter_HBOND_Res_tot,Inter_PIPISTACK_Res_tot,Inter_IONIC_Res_tot,Inter_SSBOND_Res_tot,Inter_PICATION_Res_tot
20219393,9JDW,Phe,330,A,S,0,8,0,0,0,0,0,0,0,0,4,3,1,0,0,0
20219394,9JDW,Phe,330,A,T,0,12,0,0,0,0,0,0,0,0,9,1,2,0,0,0


In [None]:
df_proc_edge.query('PDB_id_RING == "9JDW" & Node_RING == "Phe" & Node_pos_RING == "330" & Node_chain_RING == "A"')

Unnamed: 0,PDB_id_RING,Node_RING,Node_pos_RING,Node_chain_RING,Inter_Lig_tot,Inter_Res_tot,Inter_IAC_Lig_tot,Inter_VDW_Lig_tot,Inter_HBOND_Lig_tot,Inter_PIPISTACK_Lig_tot,Inter_IONIC_Lig_tot,Inter_SSBOND_Lig_tot,Inter_PICATION_Lig_tot,Inter_IAC_Res_tot,Inter_VDW_Res_tot,Inter_HBOND_Res_tot,Inter_PIPISTACK_Res_tot,Inter_IONIC_Res_tot,Inter_SSBOND_Res_tot,Inter_PICATION_Res_tot
12871677,9JDW,Phe,330,A,0,20,0,0,0,0,0,0,0,0,13,4,3,0,0,0


In [None]:
df_proc_edge.to_csv("drive/My Drive/ProcessaNovaBase/TrataArqsRING/edgesDB_proc.csv",sep='\t',index=False)

#2 - Reading and processing the RING nodes file

##2.1 Processing the *nodesDB.txt* database

In [None]:
#increasing the viewing capacity of columns and rows
import pandas as pd

pd.set_option('display.max_columns', 7000)
pd.set_option('display.max_rows',1000000)
pd.set_option('display.width', 7000)

In [None]:
import pandas as pd

df_RING_nodes = pd.read_csv("drive/My Drive/ProcessaNovaBase/TrataArqsRING/nodesDB.txt",index_col=False, header=None, delimiter='\t')


In [None]:
df_RING_nodes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12883771 entries, 0 to 12883770
Data columns (total 14 columns):
 #   Column  Dtype  
---  ------  -----  
 0   0       object 
 1   1       object 
 2   2       object 
 3   3       int64  
 4   4       object 
 5   5       object 
 6   6       int64  
 7   7       float64
 8   8       float64
 9   9       float64
 10  10      float64
 11  11      object 
 12  12      float64
 13  13      float64
dtypes: float64(6), int64(2), object(6)
memory usage: 1.3+ GB


In [None]:
df_RING_nodes.head(100)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,10gs,A:2:_:PRO,A,2,PRO,,1,31.96,31.195,2.392,37.963,10GS.pdb#2.A,-15.115,-0.0
1,10gs,A:3:_:TYR,A,3,TYR,E,12,18.42,28.011,2.405,35.85,10GS.pdb#3.A,-82.695,0.534
2,10gs,A:4:_:THR,A,4,THR,E,3,19.34,24.361,2.547,36.835,10GS.pdb#4.A,-95.302,-0.315
3,10gs,A:5:_:VAL,A,5,VAL,E,4,17.96,21.42,2.437,34.438,10GS.pdb#5.A,-124.668,0.894
4,10gs,A:6:_:VAL,A,6,VAL,E,5,19.18,18.059,3.585,35.781,10GS.pdb#6.A,-140.144,0.636
5,10gs,A:7:_:TYR,A,7,TYR,E,137,17.51,15.226,2.331,33.585,10GS.pdb#7.A,-39.65,0.393
6,10gs,A:8:_:PHE,A,8,PHE,,281,20.97,11.87,0.551,33.384,10GS.pdb#8.A,-46.54,-0.573
7,10gs,A:9:_:PRO,A,9,PRO,S,12,25.63,11.81,-3.285,33.365,10GS.pdb#9.A,-15.709,0.81
8,10gs,A:10:_:VAL,A,10,VAL,S,84,19.23,11.969,-3.489,29.541,10GS.pdb#10.A,16.02,-0.032
9,10gs,A:11:_:ARG,A,11,ARG,,13,18.17,14.701,-3.694,26.898,10GS.pdb#11.A,-45.97,0.177


###2.1.1 Renaming the fields

In [None]:
# The nodes file is read without a header row, so its columns are labelled 0, 1, 2, ...
# Descriptive names are given to labels 0-11, in order; columns 12 and 13 are not
# renamed and keep their numeric labels.
ring_node_column_names = dict(enumerate([
    'PDB_id_RING',
    'NodeId_RING',
    'Chain_RING',
    'Position_RING',
    'Residue_RING',
    'Dssp_RING',
    'Degree_RING',
    'Bfactor_CA_RING',
    'x',
    'y',
    'z',
    'pdbFileName',
]))
df_RING_nodes.rename(columns=ring_node_column_names, inplace=True)

In [None]:
df_RING_nodes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12883771 entries, 0 to 12883770
Data columns (total 14 columns):
 #   Column           Dtype  
---  ------           -----  
 0   PDB_id_RING      object 
 1   NodeId_RING      object 
 2   Chain_RING       object 
 3   Position_RING    int64  
 4   Residue_RING     object 
 5   Dssp_RING        object 
 6   Degree_RING      int64  
 7   Bfactor_CA_RING  float64
 8   x                float64
 9   y                float64
 10  z                float64
 11  pdbFileName      object 
 12  12               float64
 13  13               float64
dtypes: float64(6), int64(2), object(6)
memory usage: 1.3+ GB


In [None]:
df_RING_nodes.head()

Unnamed: 0,PDB_id_RING,NodeId_RING,Chain_RING,Position_RING,Residue_RING,Dssp_RING,Degree_RING,Bfactor_CA_RING,x,y,z,pdbFileName,12,13
0,10gs,A:2:_:PRO,A,2,PRO,,1,31.96,31.195,2.392,37.963,10GS.pdb#2.A,-15.115,-0.0
1,10gs,A:3:_:TYR,A,3,TYR,E,12,18.42,28.011,2.405,35.85,10GS.pdb#3.A,-82.695,0.534
2,10gs,A:4:_:THR,A,4,THR,E,3,19.34,24.361,2.547,36.835,10GS.pdb#4.A,-95.302,-0.315
3,10gs,A:5:_:VAL,A,5,VAL,E,4,17.96,21.42,2.437,34.438,10GS.pdb#5.A,-124.668,0.894
4,10gs,A:6:_:VAL,A,6,VAL,E,5,19.18,18.059,3.585,35.781,10GS.pdb#6.A,-140.144,0.636


###2.1.2 Selection of fields that will be used

In [None]:
#Selection of fields
df_RING_nodes = df_RING_nodes.loc[:,['PDB_id_RING','NodeId_RING', 'Chain_RING','Position_RING', 'Residue_RING','Dssp_RING','Degree_RING','Bfactor_CA_RING']]

In [None]:
df_RING_nodes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12883771 entries, 0 to 12883770
Data columns (total 8 columns):
 #   Column           Dtype  
---  ------           -----  
 0   PDB_id_RING      object 
 1   NodeId_RING      object 
 2   Chain_RING       object 
 3   Position_RING    int64  
 4   Residue_RING     object 
 5   Dssp_RING        object 
 6   Degree_RING      int64  
 7   Bfactor_CA_RING  float64
dtypes: float64(1), int64(2), object(5)
memory usage: 786.4+ MB


In [None]:
df_RING_nodes.head()

Unnamed: 0,PDB_id_RING,NodeId_RING,Chain_RING,Position_RING,Residue_RING,Dssp_RING,Degree_RING,Bfactor_CA_RING
0,10gs,A:2:_:PRO,A,2,PRO,,1,31.96
1,10gs,A:3:_:TYR,A,3,TYR,E,12,18.42
2,10gs,A:4:_:THR,A,4,THR,E,3,19.34
3,10gs,A:5:_:VAL,A,5,VAL,E,4,17.96
4,10gs,A:6:_:VAL,A,6,VAL,E,5,19.18


In [None]:
#Except NodeId
def categories_column(df):
    """Print the value frequencies of every node attribute except NodeId_RING.

    For each inspected column, the column name and a ``{value: count}`` dict (built
    from ``value_counts()``) are printed, followed by blank lines as a separator.
    Nothing is returned.
    """
    inspected_columns = (
        'PDB_id_RING',
        'Chain_RING',
        'Position_RING',
        'Residue_RING',
        'Dssp_RING',
        'Degree_RING',
        'Bfactor_CA_RING',
    )
    for column_name in inspected_columns:
        frequencies = df[column_name].value_counts().to_dict()
        print(column_name, frequencies)
        print('\n')

categories_column(df_RING_nodes)

PDB_id_RING {'1qo5': 6046, '5le5': 6046, '5l5u': 6039, '5l5f': 6018, '5lf7': 6018, '5lf0': 6018, '5lf3': 6016, '5lf4': 6013, '3lk4': 6005, '5l5h': 6005, '5lf6': 6004, '5l5a': 6002, '5l5s': 6002, '5ley': 6000, '5lf1': 5999, '5l5o': 5985, '6htr': 5975, '5lez': 5927, '5lex': 5914, '4r3o': 5799, '5dou': 5521, '2q3e': 5401, '4dvq': 5317, '5k9q': 5163, '4xgz': 4786, '2f5z': 4751, '1zy8': 4558, '3b2u': 4478, '4dl1': 4387, '4ay1': 4175, '4zul': 3964, '2a3w': 3944, '4zuk': 3943, '2j6l': 3932, '3n80': 3911, '3sz9': 3893, '1yde': 3887, '1o02': 3885, '5l13': 3879, '1o01': 3877, '3inj': 3873, '5w08': 3871, '1cw3': 3860, '1o00': 3859, '1nzz': 3856, '4kwg': 3849, '1nzx': 3845, '1o05': 3841, '2vle': 3836, '5l2o': 3832, '4zvw': 3826, '1n4s': 3819, '4kwf': 3814, '3pvn': 3813, '1n4q': 3803, '6vr6': 3793, '3pnw': 3765, '4cqm': 3759, '6z86': 3754, '5z2c': 3690, '6i34': 3690, '6i35': 3689, '1zmd': 3664, '1zmc': 3663, '5lhd': 3629, '6qak': 3614, '6x5t': 3602, '4bl5': 3598, '2qg4': 3594, '5nhg': 3589, '3som':

###2.1.3 Processing the *PDB_id* and *Residue* attributes

All letters in the PDB code will be uppercase

In [None]:
# Upper-case the PDB codes with the vectorised .str accessor instead of a per-row lambda;
# unlike x.upper() inside apply(), this leaves a missing value as NaN instead of raising.
df_RING_nodes["PDB_id_RING"] = df_RING_nodes["PDB_id_RING"].str.upper()

In [None]:
df_RING_nodes.head(10)

Unnamed: 0,PDB_id_RING,NodeId_RING,Chain_RING,Position_RING,Residue_RING,Dssp_RING,Degree_RING,Bfactor_CA_RING
0,10GS,A:2:_:PRO,A,2,PRO,,1,31.96
1,10GS,A:3:_:TYR,A,3,TYR,E,12,18.42
2,10GS,A:4:_:THR,A,4,THR,E,3,19.34
3,10GS,A:5:_:VAL,A,5,VAL,E,4,17.96
4,10GS,A:6:_:VAL,A,6,VAL,E,5,19.18
5,10GS,A:7:_:TYR,A,7,TYR,E,137,17.51
6,10GS,A:8:_:PHE,A,8,PHE,,281,20.97
7,10GS,A:9:_:PRO,A,9,PRO,S,12,25.63
8,10GS,A:10:_:VAL,A,10,VAL,S,84,19.23
9,10GS,A:11:_:ARG,A,11,ARG,,13,18.17


In [None]:
df_RING_nodes["Residue_RING"].value_counts()

LEU    1290746
VAL     914387
ALA     824327
GLU     802038
SER     798047
LYS     778419
GLY     755750
ILE     693420
THR     690233
ASP     653861
ARG     650060
PHE     584169
PRO     551878
GLN     537589
TYR     512256
ASN     497734
HIS     350684
MET     289292
CYS     268775
TRP     207973
EDO      18492
SO4      14500
DG       13703
DC       13093
DT       11510
DA       11406
GOL      10735
ZN       10352
NAG       9295
CL        6763
MG        6001
CA        5663
UNX       5266
MSE       4301
PO4       2359
DMS       2209
ACT       2102
MN        2045
G         2030
C         1836
U         1706
A         1641
DOD       1592
PEG       1495
MAN       1366
IOD       1366
HEM       1348
NI        1132
SEP       1007
FMT        937
PTR        818
TPO        787
K          784
ACE        750
BMA        742
FUC        669
CD         636
MES        548
ADP        488
CSO        484
MPD        483
HYP        458
FAD        441
MLI        441
NAP        440
GLC        425
NH2       

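Amino-acid residue names will be written with only the first letter capitalized (e.g. `ALA` → `Ala`); ligand codes and any other residue names are left unchanged.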

In [None]:
# Three-letter amino-acid codes whose spelling is normalised below.
Amin = [
    'ALA', 'ARG', 'ASN', 'ASP', 'CYS', 'GLN', 'GLU', 'GLY', 'HIS', 'ILE', 'LEU', 'LYS',
    'MET', 'PHE', 'PRO', 'PYL', 'SER', 'SEC', 'THR', 'TRP', 'TYR', 'VAL', 'ASX', 'GLX',
]

# Only residues listed in Amin are rewritten ('ALA' -> 'Ala'); every other value
# (ligand codes, missing values, ...) is kept exactly as it is.
residue_is_amino_acid = df_RING_nodes["Residue_RING"].isin(Amin)
df_RING_nodes["Residue_RING"] = df_RING_nodes["Residue_RING"].where(
    ~residue_is_amino_acid,
    df_RING_nodes["Residue_RING"].str.capitalize(),
)


In [None]:
df_RING_nodes["Residue_RING"].value_counts()

Leu    1290746
Val     914387
Ala     824327
Glu     802038
Ser     798047
Lys     778419
Gly     755750
Ile     693420
Thr     690233
Asp     653861
Arg     650060
Phe     584169
Pro     551878
Gln     537589
Tyr     512256
Asn     497734
His     350684
Met     289292
Cys     268775
Trp     207973
EDO      18492
SO4      14500
DG       13703
DC       13093
DT       11510
DA       11406
GOL      10735
ZN       10352
NAG       9295
CL        6763
MG        6001
CA        5663
UNX       5266
MSE       4301
PO4       2359
DMS       2209
ACT       2102
MN        2045
G         2030
C         1836
U         1706
A         1641
DOD       1592
PEG       1495
MAN       1366
IOD       1366
HEM       1348
NI        1132
SEP       1007
FMT        937
PTR        818
TPO        787
K          784
ACE        750
BMA        742
FUC        669
CD         636
MES        548
ADP        488
CSO        484
MPD        483
HYP        458
FAD        441
MLI        441
NAP        440
GLC        425
NH2       

In [None]:
df_RING_nodes.head(100)

Unnamed: 0,PDB_id_RING,NodeId_RING,Chain_RING,Position_RING,Residue_RING,Dssp_RING,Degree_RING,Bfactor_CA_RING
0,10GS,A:2:_:PRO,A,2,Pro,,1,31.96
1,10GS,A:3:_:TYR,A,3,Tyr,E,12,18.42
2,10GS,A:4:_:THR,A,4,Thr,E,3,19.34
3,10GS,A:5:_:VAL,A,5,Val,E,4,17.96
4,10GS,A:6:_:VAL,A,6,Val,E,5,19.18
5,10GS,A:7:_:TYR,A,7,Tyr,E,137,17.51
6,10GS,A:8:_:PHE,A,8,Phe,,281,20.97
7,10GS,A:9:_:PRO,A,9,Pro,S,12,25.63
8,10GS,A:10:_:VAL,A,10,Val,S,84,19.23
9,10GS,A:11:_:ARG,A,11,Arg,,13,18.17


In [None]:
df_RING_nodes.tail(30)

Unnamed: 0,PDB_id_RING,NodeId_RING,Chain_RING,Position_RING,Residue_RING,Dssp_RING,Degree_RING,Bfactor_CA_RING
12883741,9JDW,A:393:_:VAL,A,393,Val,E,9,14.98
12883742,9JDW,A:394:_:ASN,A,394,Asn,,3,20.94
12883743,9JDW,A:395:_:ILE,A,395,Ile,,14,17.35
12883744,9JDW,A:396:_:ARG,A,396,Arg,H,10,19.2
12883745,9JDW,A:397:_:ASN,A,397,Asn,H,13,25.06
12883746,9JDW,A:398:_:ALA,A,398,Ala,H,8,17.14
12883747,9JDW,A:399:_:ASN,A,399,Asn,H,9,20.99
12883748,9JDW,A:400:_:SER,A,400,Ser,H,9,22.11
12883749,9JDW,A:401:_:LEU,A,401,Leu,H,7,21.36
12883750,9JDW,A:402:_:GLY,A,402,Gly,H,17,15.12


In [None]:
#Checking for the existence of 'missing' values
df_RING_nodes.isna().sum()

PDB_id_RING             0
NodeId_RING             0
Chain_RING              0
Position_RING           0
Residue_RING         3557
Dssp_RING          232571
Degree_RING             0
Bfactor_CA_RING         0
dtype: int64

In [None]:
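#The residue is null (NaN) when the residue code in NodeId is "NA"; these rows also have Bfactor_CA = -999.9
#NOTE(review): "NA" is one of the strings pandas.read_csv treats as missing by default, so the literal
#residue code NA (presumably the sodium ion) was most likely parsed as NaN rather than being absent from
#the file - confirm, and if so read nodesDB.txt with keep_default_na=False (plus explicit na_values)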
q1 = df_RING_nodes.query('Residue_RING.isnull()', engine='python')

In [None]:
q1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3557 entries, 10838 to 12883299
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   PDB_id_RING      3557 non-null   object 
 1   NodeId_RING      3557 non-null   object 
 2   Chain_RING       3557 non-null   object 
 3   Position_RING    3557 non-null   int64  
 4   Residue_RING     0 non-null      object 
 5   Dssp_RING        0 non-null      object 
 6   Degree_RING      3557 non-null   int64  
 7   Bfactor_CA_RING  3557 non-null   float64
dtypes: float64(1), int64(2), object(5)
memory usage: 250.1+ KB


In [None]:
q1

Unnamed: 0,PDB_id_RING,NodeId_RING,Chain_RING,Position_RING,Residue_RING,Dssp_RING,Degree_RING,Bfactor_CA_RING
10838,1A2C,H:626:_:NA,H,626,,,126,-999.9
14638,1A46,H:548:_:NA,H,548,,,126,-999.9
14639,1A46,H:549:_:NA,H,549,,,83,-999.9
15745,1A4W,H:541:_:NA,H,541,,,128,-999.9
15746,1A4W,H:542:_:NA,H,542,,,80,-999.9
17127,1A5G,H:391:_:NA,H,391,,,133,-999.9
17128,1A5G,H:392:_:NA,H,392,,,78,-999.9
17408,1A61,H:650:_:NA,H,650,,,131,-999.9
17409,1A61,H:651:_:NA,H,651,,,81,-999.9
24636,1AD8,H:1:_:NA,H,1,,,77,-999.9


In [None]:
#Dssp is null (NaN) in the situation of q1 and when the residue exists, but Bfactor_CA is -999.9
q2 = df_RING_nodes.query('Dssp_RING.isnull()', engine='python')

In [None]:
q2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 232571 entries, 201 to 12883770
Data columns (total 8 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   PDB_id_RING      232571 non-null  object 
 1   NodeId_RING      232571 non-null  object 
 2   Chain_RING       232571 non-null  object 
 3   Position_RING    232571 non-null  int64  
 4   Residue_RING     229014 non-null  object 
 5   Dssp_RING        0 non-null       object 
 6   Degree_RING      232571 non-null  int64  
 7   Bfactor_CA_RING  232571 non-null  float64
dtypes: float64(1), int64(2), object(5)
memory usage: 16.0+ MB


In [None]:
q2.head(100)

Unnamed: 0,PDB_id_RING,NodeId_RING,Chain_RING,Position_RING,Residue_RING,Dssp_RING,Degree_RING,Bfactor_CA_RING
201,10GS,A:210:_:VWW,A,210,VWW,,2253,-999.9
202,10GS,A:211:_:MES,A,211,MES,,656,-999.9
404,10GS,B:210:_:VWW,B,210,VWW,,2257,-999.9
405,10GS,B:211:_:MES,B,211,MES,,656,-999.9
608,11GS,A:210:_:GSH,A,210,GSH,,1542,-999.9
609,11GS,A:211:_:EAA,A,211,EAA,,1145,-999.9
610,11GS,A:212:_:MES,A,212,MES,,692,-999.9
813,11GS,B:210:_:GSH,B,210,GSH,,1555,-999.9
814,11GS,B:211:_:EAA,B,211,EAA,,1135,-999.9
815,11GS,B:212:_:MES,B,212,MES,,688,-999.9


In [None]:
# Rows whose DSSP code is a single blank (" ") although the residue name and
# Bfactor_CA are both valid.
q3 = df_RING_nodes.loc[df_RING_nodes['Dssp_RING'] == " "]

In [None]:
# Print q3's index range, column dtypes and memory usage.
q3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1972711 entries, 0 to 12883766
Data columns (total 8 columns):
 #   Column           Dtype  
---  ------           -----  
 0   PDB_id_RING      object 
 1   NodeId_RING      object 
 2   Chain_RING       object 
 3   Position_RING    int64  
 4   Residue_RING     object 
 5   Dssp_RING        object 
 6   Degree_RING      int64  
 7   Bfactor_CA_RING  float64
dtypes: float64(1), int64(2), object(5)
memory usage: 135.5+ MB


In [None]:
# Show the first 100 rows of q3 (rows where Dssp_RING == " ").
q3.head(100)

Unnamed: 0,PDB_id_RING,NodeId_RING,Chain_RING,Position_RING,Residue_RING,Dssp_RING,Degree_RING,Bfactor_CA_RING
0,10GS,A:2:_:PRO,A,2,Pro,,1,31.96
6,10GS,A:8:_:PHE,A,8,Phe,,281,20.97
9,10GS,A:11:_:ARG,A,11,Arg,,13,18.17
24,10GS,A:26:_:GLN,A,26,Gln,,21,28.99
25,10GS,A:27:_:SER,A,27,Ser,,37,26.82
31,10GS,A:33:_:VAL,A,33,Val,,3,20.59
32,10GS,A:34:_:THR,A,34,Thr,,4,36.01
49,10GS,A:51:_:GLN,A,51,Gln,,302,15.75
51,10GS,A:53:_:PRO,A,53,Pro,,113,19.9
75,10GS,A:78:_:LEU,A,78,Leu,,9,20.3


In [None]:
# Look for rows whose residue name is a single blank (" ").
q4 = df_RING_nodes.loc[df_RING_nodes['Residue_RING'] == " "]

In [None]:
# Display q4; an empty frame means no Residue_RING value equals " ".
q4

Unnamed: 0,PDB_id_RING,NodeId_RING,Chain_RING,Position_RING,Residue_RING,Dssp_RING,Degree_RING,Bfactor_CA_RING


In [None]:
#Converting NaN values to '.'
# fillna is the dedicated API for filling missing values; a regex flag has no
# meaning when the thing being replaced is NaN rather than a text pattern.
# The source frame df_RING_nodes is left untouched; the result is a new frame.
import numpy as np  # no longer used by this cell; kept in case later cells rely on `np`
df_RING_nodes_ok = df_RING_nodes.fillna('.')

In [None]:
# Sanity check: per-column count of values still missing in df_RING_nodes_ok.
# Every count is expected to be 0 after the NaN -> '.' conversion.
df_RING_nodes_ok.isna().sum()

PDB_id_RING        0
NodeId_RING        0
Chain_RING         0
Position_RING      0
Residue_RING       0
Dssp_RING          0
Degree_RING        0
Bfactor_CA_RING    0
dtype: int64

In [None]:
# Rows whose residue name is now the '.' placeholder.
res = df_RING_nodes_ok.loc[df_RING_nodes_ok["Residue_RING"] == '.']

In [None]:
# Show the last 30 rows of res (rows where Residue_RING == '.').
res.tail(30)

Unnamed: 0,PDB_id_RING,NodeId_RING,Chain_RING,Position_RING,Residue_RING,Dssp_RING,Degree_RING,Bfactor_CA_RING
12810773,7K96,P:101:_:NA,P,101,.,.,52,-999.9
12811089,7K96,A:505:_:NA,A,505,.,.,97,-999.9
12811090,7K96,A:506:_:NA,A,506,.,.,94,-999.9
12811091,7K96,A:507:_:NA,A,507,.,.,51,-999.9
12811414,7K97,A:406:_:NA,A,406,.,.,96,-999.9
12811415,7K97,A:407:_:NA,A,407,.,.,102,-999.9
12814198,7KBU,B:708:_:NA,B,708,.,.,128,-999.9
12817994,7KGP,A:504:_:NA,A,504,.,.,64,-999.9
12817995,7KGP,A:505:_:NA,A,505,.,.,92,-999.9
12817996,7KGP,A:506:_:NA,A,506,.,.,72,-999.9


In [None]:
# Rows whose DSSP code is now the '.' placeholder.
res1 = df_RING_nodes_ok.loc[df_RING_nodes_ok["Dssp_RING"] == '.']

In [None]:
# Print res1's row count, per-column non-null counts and dtypes.
res1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 232571 entries, 201 to 12883770
Data columns (total 8 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   PDB_id_RING      232571 non-null  object 
 1   NodeId_RING      232571 non-null  object 
 2   Chain_RING       232571 non-null  object 
 3   Position_RING    232571 non-null  int64  
 4   Residue_RING     232571 non-null  object 
 5   Dssp_RING        232571 non-null  object 
 6   Degree_RING      232571 non-null  int64  
 7   Bfactor_CA_RING  232571 non-null  float64
dtypes: float64(1), int64(2), object(5)
memory usage: 16.0+ MB


In [None]:
# Show the first 100 rows of res1 (rows where Dssp_RING == '.').
res1.head(100)

Unnamed: 0,PDB_id_RING,NodeId_RING,Chain_RING,Position_RING,Residue_RING,Dssp_RING,Degree_RING,Bfactor_CA_RING
201,10GS,A:210:_:VWW,A,210,VWW,.,2253,-999.9
202,10GS,A:211:_:MES,A,211,MES,.,656,-999.9
404,10GS,B:210:_:VWW,B,210,VWW,.,2257,-999.9
405,10GS,B:211:_:MES,B,211,MES,.,656,-999.9
608,11GS,A:210:_:GSH,A,210,GSH,.,1542,-999.9
609,11GS,A:211:_:EAA,A,211,EAA,.,1145,-999.9
610,11GS,A:212:_:MES,A,212,MES,.,692,-999.9
813,11GS,B:210:_:GSH,B,210,GSH,.,1555,-999.9
814,11GS,B:211:_:EAA,B,211,EAA,.,1135,-999.9
815,11GS,B:212:_:MES,B,212,MES,.,688,-999.9


In [None]:
# Give the '.' placeholder rows their residue name back. The rows inspected via
# `res` all have a NodeId_RING ending in ':NA', so the name is the literal
# string "NA" (presumably the sodium ion, read as missing when the table was
# loaded - TODO confirm).
# Series.replace does an exact-value, vectorized substitution, which avoids
# calling a Python lambda once per row on this very large column.
df_RING_nodes_ok["Residue_RING"] = df_RING_nodes_ok["Residue_RING"].replace('.', 'NA')


In [None]:
# Rows whose residue name is the literal string "NA".
res3 = df_RING_nodes_ok.loc[df_RING_nodes_ok['Residue_RING'] == "NA"]

In [None]:
# Print res3's row count, per-column non-null counts and dtypes.
res3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3557 entries, 10838 to 12883299
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   PDB_id_RING      3557 non-null   object 
 1   NodeId_RING      3557 non-null   object 
 2   Chain_RING       3557 non-null   object 
 3   Position_RING    3557 non-null   int64  
 4   Residue_RING     3557 non-null   object 
 5   Dssp_RING        3557 non-null   object 
 6   Degree_RING      3557 non-null   int64  
 7   Bfactor_CA_RING  3557 non-null   float64
dtypes: float64(1), int64(2), object(5)
memory usage: 250.1+ KB


In [None]:
# Show the last 30 rows of res3 (rows where Residue_RING == "NA").
res3.tail(30)

Unnamed: 0,PDB_id_RING,NodeId_RING,Chain_RING,Position_RING,Residue_RING,Dssp_RING,Degree_RING,Bfactor_CA_RING
12810773,7K96,P:101:_:NA,P,101,,.,52,-999.9
12811089,7K96,A:505:_:NA,A,505,,.,97,-999.9
12811090,7K96,A:506:_:NA,A,506,,.,94,-999.9
12811091,7K96,A:507:_:NA,A,507,,.,51,-999.9
12811414,7K97,A:406:_:NA,A,406,,.,96,-999.9
12811415,7K97,A:407:_:NA,A,407,,.,102,-999.9
12814198,7KBU,B:708:_:NA,B,708,,.,128,-999.9
12817994,7KGP,A:504:_:NA,A,504,,.,64,-999.9
12817995,7KGP,A:505:_:NA,A,505,,.,92,-999.9
12817996,7KGP,A:506:_:NA,A,506,,.,72,-999.9


In [None]:
# Count how many rows carry each Residue_RING value, most frequent first.
df_RING_nodes_ok["Residue_RING"].value_counts()

Leu    1290746
Val     914387
Ala     824327
Glu     802038
Ser     798047
Lys     778419
Gly     755750
Ile     693420
Thr     690233
Asp     653861
Arg     650060
Phe     584169
Pro     551878
Gln     537589
Tyr     512256
Asn     497734
His     350684
Met     289292
Cys     268775
Trp     207973
EDO      18492
SO4      14500
DG       13703
DC       13093
DT       11510
DA       11406
GOL      10735
ZN       10352
NAG       9295
CL        6763
MG        6001
CA        5663
UNX       5266
MSE       4301
NA        3557
PO4       2359
DMS       2209
ACT       2102
MN        2045
G         2030
C         1836
U         1706
A         1641
DOD       1592
PEG       1495
MAN       1366
IOD       1366
HEM       1348
NI        1132
SEP       1007
FMT        937
PTR        818
TPO        787
K          784
ACE        750
BMA        742
FUC        669
CD         636
MES        548
ADP        488
CSO        484
MPD        483
HYP        458
MLI        441
FAD        441
NAP        440
GLC       

In [None]:
# Rows of the cleaned frame whose DSSP code is a single blank (" ").
res_dssp = df_RING_nodes_ok.loc[df_RING_nodes_ok['Dssp_RING'] == " "]

In [None]:
# Print res_dssp's index range, column dtypes and memory usage.
res_dssp.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1972711 entries, 0 to 12883766
Data columns (total 8 columns):
 #   Column           Dtype  
---  ------           -----  
 0   PDB_id_RING      object 
 1   NodeId_RING      object 
 2   Chain_RING       object 
 3   Position_RING    int64  
 4   Residue_RING     object 
 5   Dssp_RING        object 
 6   Degree_RING      int64  
 7   Bfactor_CA_RING  float64
dtypes: float64(1), int64(2), object(5)
memory usage: 135.5+ MB


In [None]:
# Show the first 100 rows of res_dssp (rows where Dssp_RING == " ").
res_dssp.head(100)

Unnamed: 0,PDB_id_RING,NodeId_RING,Chain_RING,Position_RING,Residue_RING,Dssp_RING,Degree_RING,Bfactor_CA_RING
0,10GS,A:2:_:PRO,A,2,Pro,,1,31.96
6,10GS,A:8:_:PHE,A,8,Phe,,281,20.97
9,10GS,A:11:_:ARG,A,11,Arg,,13,18.17
24,10GS,A:26:_:GLN,A,26,Gln,,21,28.99
25,10GS,A:27:_:SER,A,27,Ser,,37,26.82
31,10GS,A:33:_:VAL,A,33,Val,,3,20.59
32,10GS,A:34:_:THR,A,34,Thr,,4,36.01
49,10GS,A:51:_:GLN,A,51,Gln,,302,15.75
51,10GS,A:53:_:PRO,A,53,Pro,,113,19.9
75,10GS,A:78:_:LEU,A,78,Leu,,9,20.3


In [None]:
# Among the blank-DSSP rows, show any whose residue name is "NA".
res_dssp.loc[res_dssp['Residue_RING'] == "NA"]

Unnamed: 0,PDB_id_RING,NodeId_RING,Chain_RING,Position_RING,Residue_RING,Dssp_RING,Degree_RING,Bfactor_CA_RING


In [None]:
# Print the cleaned frame's index range, column dtypes and memory usage before it is saved.
df_RING_nodes_ok.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12883771 entries, 0 to 12883770
Data columns (total 8 columns):
 #   Column           Dtype  
---  ------           -----  
 0   PDB_id_RING      object 
 1   NodeId_RING      object 
 2   Chain_RING       object 
 3   Position_RING    int64  
 4   Residue_RING     object 
 5   Dssp_RING        object 
 6   Degree_RING      int64  
 7   Bfactor_CA_RING  float64
dtypes: float64(1), int64(2), object(5)
memory usage: 786.4+ MB


### 2.1.4 Generating an intermediate file with the processed *nodesDB.txt* database

In [None]:
# Save the cleaned node table to Drive as a tab-separated file without the index column.
# NOTE(review): Residue_RING holds the literal string "NA"; pandas.read_csv treats
# "NA" as missing by default, so whoever reads this file back likely needs
# keep_default_na=False (or an explicit na_values list) - confirm downstream.
df_RING_nodes_ok.to_csv(
    "drive/My Drive/ProcessaNovaBase/TrataArqsRING/nodesDB_proc.csv",
    sep='\t',
    index=False,
)