#0 - Basic Settings

In [None]:
# Mount Google Drive so the notebook can read and write files stored there.
# The drive is mounted at the absolute path /content/drive, while later cells
# use relative "drive/My Drive/..." paths, so they presumably rely on the
# working directory being /content — confirm before running elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Raise pandas' display limits so wide and long frames are not truncated.
import pandas as pd

pd.set_option(
    'display.max_columns', 7000,
    'display.max_rows', 1000000,
    'display.width', 7000,
)

#1 - Reading and processing RING edge files



In this section, the file **edgesDB.txt** will be read, which contains the content of the edges files of all wild AlphaFold structures that have notation in RING. The joining of all edges files was processed through the **JuntaEdges** script, which is located in the **TrataArqsRINGAlphaFold** folder of this drive.

The attributes that will be used from the edge files are:

- **FileName**: The name of the file generated by RING. The UniProt accession and the AlphaFold fragment identifier (the `F` field, e.g. `F1`) are extracted from this name.
- **NodeId1_RING**: The source node of the interaction. The node can be an amino acid or a ligand molecule. It contains the following information: Chain, Node Position and the node itself.
- **Interaction_RING**: Contains the interaction type and the subtypes of node 1 and node 2. The subtype values are: *main chain* (MC), *side chain* (SC) and *ligand* (LIG).
- **NodeId2_RING**: The target node of the interaction. The node can be an amino acid or a ligand molecule. It contains the following information: chain, node position and the node itself.

##1.1 Processing the *edgesDB.txt* database

In [None]:
import pandas as pd

# Load the concatenated RING edge table: tab-separated with no header row,
# so the columns receive the default integer labels.
df_RING_edge = pd.read_csv(
    "drive/My Drive/ProcessaNovaBase/TrataArqsRINGAlphaFold/edgesDB.txt",
    sep='\t',
    header=None,
    index_col=False,
)


In [None]:
df_RING_edge.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22043890 entries, 0 to 22043889
Data columns (total 13 columns):
 #   Column  Dtype  
---  ------  -----  
 0   0       object 
 1   1       object 
 2   2       object 
 3   3       object 
 4   4       float64
 5   5       float64
 6   6       float64
 7   7       object 
 8   8       object 
 9   9       object 
 10  10      object 
 11  11      object 
 12  12      object 
dtypes: float64(3), object(10)
memory usage: 2.1+ GB


In [None]:
df_RING_edge.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,AF-A0AUZ9-F1-model_v4.pdb.edges,A:38:_:GLU,VDW:SC_SC,A:522:_:LEU,3.738,-999.9,6.0,CD,CD2,,,,
1,AF-A0AUZ9-F1-model_v4.pdb.edges,A:118:_:ASN,VDW:SC_SC,A:388:_:GLN,3.482,-999.9,6.0,CG,NE2,,,,
2,AF-A0AUZ9-F1-model_v4.pdb.edges,A:118:_:ASN,VDW:SC_SC,A:391:_:ASP,3.836,-999.9,6.0,ND2,CG,,,,
3,AF-A0AUZ9-F1-model_v4.pdb.edges,A:131:_:GLU,VDW:SC_SC,A:927:_:MET,3.684,-999.9,6.0,CD,CG,,,,
4,AF-A0AUZ9-F1-model_v4.pdb.edges,A:131:_:GLU,VDW:SC_SC,A:927:_:MET,3.98,-999.9,6.0,CD,SD,,,,


###1.1.1 Renaming the fields

In [None]:
# Replace the positional column labels 0..12 with descriptive field names.
# The list order is the column order of the edge file.
df_RING_edge.rename(
    columns=dict(enumerate([
        'FileName', 'NodeId1_RING', 'Interaction_RING', 'NodeId2_RING',
        'Distance', 'Angle', 'Energy', 'Atom1', 'Atom2',
        'Donor', 'Positive', 'Cation', 'Orientation',
    ])),
    inplace=True,
)

In [None]:
df_RING_edge.head()

Unnamed: 0,FileName,NodeId1_RING,Interaction_RING,NodeId2_RING,Distance,Angle,Energy,Atom1,Atom2,Donor,Positive,Cation,Orientation
0,AF-A0AUZ9-F1-model_v4.pdb.edges,A:38:_:GLU,VDW:SC_SC,A:522:_:LEU,3.738,-999.9,6.0,CD,CD2,,,,
1,AF-A0AUZ9-F1-model_v4.pdb.edges,A:118:_:ASN,VDW:SC_SC,A:388:_:GLN,3.482,-999.9,6.0,CG,NE2,,,,
2,AF-A0AUZ9-F1-model_v4.pdb.edges,A:118:_:ASN,VDW:SC_SC,A:391:_:ASP,3.836,-999.9,6.0,ND2,CG,,,,
3,AF-A0AUZ9-F1-model_v4.pdb.edges,A:131:_:GLU,VDW:SC_SC,A:927:_:MET,3.684,-999.9,6.0,CD,CG,,,,
4,AF-A0AUZ9-F1-model_v4.pdb.edges,A:131:_:GLU,VDW:SC_SC,A:927:_:MET,3.98,-999.9,6.0,CD,SD,,,,


In [None]:
# Count fully duplicated rows. Series.sum() adds up the boolean mask inside
# pandas; the built-in sum() would iterate over every row in Python.
dupes = df_RING_edge.duplicated()
int(dupes.sum())

1

###1.1.2 Selecting the fields that will be used

In [None]:
# Keep only the four columns used downstream (file name, both node ids and
# the interaction label); the remaining columns are discarded here.
df_RING_edge = df_RING_edge.loc[:,['FileName','NodeId1_RING', 'Interaction_RING', 'NodeId2_RING']]

In [None]:
df_RING_edge.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22043890 entries, 0 to 22043889
Data columns (total 4 columns):
 #   Column            Dtype 
---  ------            ----- 
 0   FileName          object
 1   NodeId1_RING      object
 2   Interaction_RING  object
 3   NodeId2_RING      object
dtypes: object(4)
memory usage: 672.7+ MB


In [None]:
#Checking for missing values
df_RING_edge.isna().sum()

FileName            0
NodeId1_RING        0
Interaction_RING    0
NodeId2_RING        0
dtype: int64

###1.1.3 Generating the Uniprot_AF_id_RING attribute extracted from the *FileName* attribute.

In [None]:
from re import U
def get_uniprot(filename):
  """Return the UniProt accession embedded in a RING edge file name.

  The name is split on "-" and the second field is returned, e.g.
  "AF-A0AUZ9-F1-model_v4.pdb.edges" -> "A0AUZ9".
  """
  return filename.split("-", 2)[1]

In [None]:
# Derive the UniProt accession of every edge from its source file name.
df_RING_edge['Uniprot_AF_id_RING'] = df_RING_edge['FileName'].map(get_uniprot)

In [None]:
df_RING_edge.head(10)

Unnamed: 0,FileName,NodeId1_RING,Interaction_RING,NodeId2_RING,Uniprot_AF_id_RING
0,AF-A0AUZ9-F1-model_v4.pdb.edges,A:38:_:GLU,VDW:SC_SC,A:522:_:LEU,A0AUZ9
1,AF-A0AUZ9-F1-model_v4.pdb.edges,A:118:_:ASN,VDW:SC_SC,A:388:_:GLN,A0AUZ9
2,AF-A0AUZ9-F1-model_v4.pdb.edges,A:118:_:ASN,VDW:SC_SC,A:391:_:ASP,A0AUZ9
3,AF-A0AUZ9-F1-model_v4.pdb.edges,A:131:_:GLU,VDW:SC_SC,A:927:_:MET,A0AUZ9
4,AF-A0AUZ9-F1-model_v4.pdb.edges,A:131:_:GLU,VDW:SC_SC,A:927:_:MET,A0AUZ9
5,AF-A0AUZ9-F1-model_v4.pdb.edges,A:132:_:PHE,VDW:SC_SC,A:921:_:PRO,A0AUZ9
6,AF-A0AUZ9-F1-model_v4.pdb.edges,A:132:_:PHE,VDW:SC_SC,A:927:_:MET,A0AUZ9
7,AF-A0AUZ9-F1-model_v4.pdb.edges,A:132:_:PHE,VDW:SC_SC,A:927:_:MET,A0AUZ9
8,AF-A0AUZ9-F1-model_v4.pdb.edges,A:132:_:PHE,VDW:SC_SC,A:927:_:MET,A0AUZ9
9,AF-A0AUZ9-F1-model_v4.pdb.edges,A:211:_:SER,HBOND:MC_SC,A:216:_:LYS,A0AUZ9


###1.1.4 Generating the F_AF_RING attribute extracted from the *FileName* attribute

In [None]:
def get_F(filename):
  """Return the fragment field (e.g. "F1") of a RING edge file name.

  The name is split on "-" and the third field is returned, e.g.
  "AF-A0AUZ9-F1-model_v4.pdb.edges" -> "F1".
  """
  return filename.split("-", 3)[2]

In [None]:
# Derive the fragment field (F1, F2, ...) of every edge from its source file name.
df_RING_edge['F_AF_RING'] = df_RING_edge['FileName'].map(get_F)

In [None]:
df_RING_edge.tail(50)

Unnamed: 0,FileName,NodeId1_RING,Interaction_RING,NodeId2_RING,Uniprot_AF_id_RING,F_AF_RING
22043840,AF-Q9Y6Z7-F1-model_v4.pdb.edges,A:234:_:TRP,VDW:SC_SC,A:249:_:VAL,Q9Y6Z7,F1
22043841,AF-Q9Y6Z7-F1-model_v4.pdb.edges,A:234:_:TRP,HBOND:MC_SC,A:256:_:ARG,Q9Y6Z7,F1
22043842,AF-Q9Y6Z7-F1-model_v4.pdb.edges,A:234:_:TRP,PIPISTACK:SC_SC,A:257:_:TRP,Q9Y6Z7,F1
22043843,AF-Q9Y6Z7-F1-model_v4.pdb.edges,A:234:_:TRP,VDW:SC_SC,A:257:_:TRP,Q9Y6Z7,F1
22043844,AF-Q9Y6Z7-F1-model_v4.pdb.edges,A:234:_:TRP,VDW:SC_MC,A:257:_:TRP,Q9Y6Z7,F1
22043845,AF-Q9Y6Z7-F1-model_v4.pdb.edges,A:234:_:TRP,VDW:SC_SC,A:257:_:TRP,Q9Y6Z7,F1
22043846,AF-Q9Y6Z7-F1-model_v4.pdb.edges,A:234:_:TRP,VDW:SC_MC,A:258:_:ASN,Q9Y6Z7,F1
22043847,AF-Q9Y6Z7-F1-model_v4.pdb.edges,A:235:_:ASN,HBOND:MC_MC,A:238:_:GLU,Q9Y6Z7,F1
22043848,AF-Q9Y6Z7-F1-model_v4.pdb.edges,A:235:_:ASN,VDW:SC_SC,A:238:_:GLU,Q9Y6Z7,F1
22043849,AF-Q9Y6Z7-F1-model_v4.pdb.edges,A:235:_:ASN,VDW:SC_SC,A:256:_:ARG,Q9Y6Z7,F1


The **FileName** attribute will no longer be needed, so it will be removed

In [None]:
# FileName has been decomposed into Uniprot_AF_id_RING and F_AF_RING; drop it in place.
df_RING_edge.drop(columns=['FileName'], inplace=True)

In [None]:
df_RING_edge.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22043890 entries, 0 to 22043889
Data columns (total 5 columns):
 #   Column              Dtype 
---  ------              ----- 
 0   NodeId1_RING        object
 1   Interaction_RING    object
 2   NodeId2_RING        object
 3   Uniprot_AF_id_RING  object
 4   F_AF_RING           object
dtypes: object(5)
memory usage: 840.9+ MB


In [None]:
# Count rows that became duplicates once only the selected columns remain.
# Series.sum() adds up the boolean mask inside pandas; the built-in sum()
# would iterate over every row in Python.
dupes = df_RING_edge.duplicated()
int(dupes.sum())

5696034

In [None]:
# Remove exact duplicate rows; the first occurrence of each row is the one kept
# (the default behaviour of drop_duplicates).
df_RING_edge_ok = df_RING_edge.drop_duplicates()

In [None]:
# Sanity check: the de-duplicated frame must contain no duplicated rows.
# Series.sum() adds up the boolean mask inside pandas; the built-in sum()
# would iterate over every row in Python.
dupes = df_RING_edge_ok.duplicated()
int(dupes.sum())

0

In [None]:
df_RING_edge_ok.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16347856 entries, 0 to 22043889
Data columns (total 5 columns):
 #   Column              Dtype 
---  ------              ----- 
 0   NodeId1_RING        object
 1   Interaction_RING    object
 2   NodeId2_RING        object
 3   Uniprot_AF_id_RING  object
 4   F_AF_RING           object
dtypes: object(5)
memory usage: 748.3+ MB


###1.1.5 Generating an intermediate file with the selected fields from the *edgesDB.txt* database  

In [None]:
df_RING_edge_ok.to_csv("drive/My Drive/ProcessaNovaBase/TrataArqsRINGAlphaFold/edgesDB_sel.csv",sep='\t',index=False)

###1.1.6 Splitting the *Interaction* field into two fields.

The **Interaction_RING** field will be split into two: **interacao_RING** (the interaction type, e.g. `VDW`) and **subinteracao_RING** (the subtype pair, e.g. `SC_SC`).

In [None]:
# Raise pandas' display limits so wide and long frames are not truncated.
import pandas as pd

pd.set_option(
    'display.max_columns', 7000,
    'display.max_rows', 1000000,
    'display.width', 7000,
)

In [None]:
import pandas as pd

# Reload the de-duplicated selection written in the previous step (tab-separated).
df_RING_edge = pd.read_csv(
    "drive/My Drive/ProcessaNovaBase/TrataArqsRINGAlphaFold/edgesDB_sel.csv",
    sep='\t',
)

In [None]:
df_RING_edge.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16347856 entries, 0 to 16347855
Data columns (total 5 columns):
 #   Column              Dtype 
---  ------              ----- 
 0   NodeId1_RING        object
 1   Interaction_RING    object
 2   NodeId2_RING        object
 3   Uniprot_AF_id_RING  object
 4   F_AF_RING           object
dtypes: object(5)
memory usage: 623.6+ MB


In [None]:
# Split each "TYPE:SUBTYPE" label (e.g. "VDW:SC_SC") once and reuse the pieces
# for both derived columns.
_interaction_parts = [label.split(":") for label in df_RING_edge["Interaction_RING"]]

df_RING_edge["interacao_RING"] = [parts[0] for parts in _interaction_parts]        # e.g. "VDW"
df_RING_edge["subinteracao_RING"] = [parts[1] for parts in _interaction_parts]     # e.g. "SC_SC"

del _interaction_parts  # release the temporary list of splits

In [None]:
df_RING_edge.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16347856 entries, 0 to 16347855
Data columns (total 7 columns):
 #   Column              Dtype 
---  ------              ----- 
 0   NodeId1_RING        object
 1   Interaction_RING    object
 2   NodeId2_RING        object
 3   Uniprot_AF_id_RING  object
 4   F_AF_RING           object
 5   interacao_RING      object
 6   subinteracao_RING   object
dtypes: object(7)
memory usage: 873.1+ MB


In [None]:
df_RING_edge.head()

Unnamed: 0,NodeId1_RING,Interaction_RING,NodeId2_RING,Uniprot_AF_id_RING,F_AF_RING,interacao_RING,subinteracao_RING
0,A:38:_:GLU,VDW:SC_SC,A:522:_:LEU,A0AUZ9,F1,VDW,SC_SC
1,A:118:_:ASN,VDW:SC_SC,A:388:_:GLN,A0AUZ9,F1,VDW,SC_SC
2,A:118:_:ASN,VDW:SC_SC,A:391:_:ASP,A0AUZ9,F1,VDW,SC_SC
3,A:131:_:GLU,VDW:SC_SC,A:927:_:MET,A0AUZ9,F1,VDW,SC_SC
4,A:132:_:PHE,VDW:SC_SC,A:921:_:PRO,A0AUZ9,F1,VDW,SC_SC


###1.1.7 Generating an intermediate file with the processing of the *Interaction_RING* attribute from the *edgesDB.txt* database  

In [None]:
df_RING_edge.to_csv("drive/My Drive/ProcessaNovaBase/TrataArqsRINGAlphaFold/edgesDB_sel_proc_Interaction.csv",sep='\t',index=False)

###1.1.8 Extracting the residue, its position and chain in the *NodeId1* attribute

The source residue, its position and its chain contained in **NodeId1** will be extracted:

**Residue1_RING**: source node of the edge

**Residue1_pos_RING**: the position of the node

**Residue1_chain_RING**: the chain where the node is located

In [None]:
# Raise pandas' display limits so wide and long frames are not truncated.
import pandas as pd

pd.set_option(
    'display.max_columns', 7000,
    'display.max_rows', 1000000,
    'display.width', 7000,
)

In [None]:
import pandas as pd

# Reload the table with the split interaction columns (tab-separated).
df_RING_edge = pd.read_csv(
    "drive/My Drive/ProcessaNovaBase/TrataArqsRINGAlphaFold/edgesDB_sel_proc_Interaction.csv",
    sep='\t',
)

In [None]:
df_RING_edge.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16347856 entries, 0 to 16347855
Data columns (total 7 columns):
 #   Column              Dtype 
---  ------              ----- 
 0   NodeId1_RING        object
 1   Interaction_RING    object
 2   NodeId2_RING        object
 3   Uniprot_AF_id_RING  object
 4   F_AF_RING           object
 5   interacao_RING      object
 6   subinteracao_RING   object
dtypes: object(7)
memory usage: 873.1+ MB


In [None]:
df_RING_edge["NodeId1_RING"].value_counts()

A:1:_:MET       5975
A:66:_:LEU      4796
A:48:_:LEU      4467
A:115:_:LEU     4356
A:63:_:LEU      4349
A:14:_:LEU      4348
A:114:_:LEU     4340
A:44:_:LEU      4274
A:55:_:LEU      4240
A:62:_:LEU      4237
A:45:_:LEU      4216
A:67:_:LEU      4201
A:39:_:LEU      4191
A:82:_:LEU      4187
A:15:_:LEU      4170
A:46:_:LEU      4162
A:64:_:LEU      4161
A:30:_:LEU      4153
A:51:_:LEU      4141
A:65:_:LEU      4135
A:79:_:LEU      4129
A:86:_:LEU      4127
A:13:_:LEU      4120
A:58:_:LEU      4110
A:101:_:LEU     4106
A:96:_:LEU      4104
A:53:_:LEU      4093
A:42:_:LEU      4087
A:47:_:LEU      4081
A:105:_:LEU     4066
A:68:_:LEU      4052
A:43:_:LEU      4048
A:56:_:LEU      4043
A:90:_:LEU      4033
A:31:_:LEU      4028
A:106:_:LEU     4016
A:57:_:LEU      4014
A:37:_:LEU      4008
A:107:_:LEU     4007
A:50:_:LEU      4004
A:83:_:LEU      4003
A:116:_:LEU     4003
A:61:_:LEU      3999
A:81:_:LEU      3998
A:75:_:LEU      3994
A:98:_:LEU      3994
A:87:_:LEU      3994
A:104:_:LEU  

In [None]:
# The residue (or ligand) name is the fourth ":"-separated field of the node id,
# e.g. "A:38:_:GLU" -> "GLU".
df_RING_edge["Residue1_RING"] = [node_id.split(":")[3] for node_id in df_RING_edge["NodeId1_RING"]]

In [None]:
# Convert the standard residue codes to the capitalized form (Ala, Arg, Asn, ...);
# any other node label is left exactly as it is.
Amin = ['ALA', 'ARG', 'ASN', 'ASP', 'CYS', 'GLN', 'GLU', 'GLY', 'HIS', 'ILE', 'LEU', 'LYS',
      'MET', 'PHE', 'PRO', 'PYL', 'SER', 'SEC', 'THR', 'TRP', 'TYR', 'VAL', 'ASX', 'GLX']

# Lookup table from upper-case code to capitalized code, built once.
_capitalized_residue = {code: code.capitalize() for code in Amin}
df_RING_edge["Residue1_RING"] = df_RING_edge["Residue1_RING"].map(
    lambda code: _capitalized_residue.get(code, code)
)

In [None]:
df_RING_edge["Residue1_RING"].value_counts()

Leu    2239914
Phe    1173952
Val    1138271
Glu    1097559
Ile    1040286
Ala     883518
Arg     880470
Lys     817120
Ser     792503
Tyr     768217
Asp     759281
Gln     701915
Thr     693999
Asn     610003
Cys     548481
His     482982
Pro     451237
Met     449811
Gly     412988
Trp     405349
Name: Residue1_RING, dtype: int64

In [None]:
# The residue position is the second ":"-separated field of the node id
# (e.g. "A:38:_:GLU" -> 38). Cast it to int64 so the in-memory column has the
# same dtype it gets when the intermediate CSV is read back, instead of
# holding strings until the next reload.
df_RING_edge["Residue1_pos_RING"] = df_RING_edge["NodeId1_RING"].apply(lambda x: x.split(":")[1]).astype("int64")

In [None]:
# The chain id is the first ":"-separated field of the node id, upper-cased.
df_RING_edge["Residue1_chain_RING"] = [node_id.split(":")[0].upper() for node_id in df_RING_edge["NodeId1_RING"]]

In [None]:
df_RING_edge["Residue1_chain_RING"].value_counts()

A    16347856
Name: Residue1_chain_RING, dtype: int64

In [None]:
df_RING_edge.head(25)

Unnamed: 0,NodeId1_RING,Interaction_RING,NodeId2_RING,Uniprot_AF_id_RING,F_AF_RING,interacao_RING,subinteracao_RING,Residue1_RING,Residue1_pos_RING,Residue1_chain_RING
0,A:38:_:GLU,VDW:SC_SC,A:522:_:LEU,A0AUZ9,F1,VDW,SC_SC,Glu,38,A
1,A:118:_:ASN,VDW:SC_SC,A:388:_:GLN,A0AUZ9,F1,VDW,SC_SC,Asn,118,A
2,A:118:_:ASN,VDW:SC_SC,A:391:_:ASP,A0AUZ9,F1,VDW,SC_SC,Asn,118,A
3,A:131:_:GLU,VDW:SC_SC,A:927:_:MET,A0AUZ9,F1,VDW,SC_SC,Glu,131,A
4,A:132:_:PHE,VDW:SC_SC,A:921:_:PRO,A0AUZ9,F1,VDW,SC_SC,Phe,132,A
5,A:132:_:PHE,VDW:SC_SC,A:927:_:MET,A0AUZ9,F1,VDW,SC_SC,Phe,132,A
6,A:211:_:SER,HBOND:MC_SC,A:216:_:LYS,A0AUZ9,F1,HBOND,MC_SC,Ser,211,A
7,A:211:_:SER,HBOND:SC_SC,A:216:_:LYS,A0AUZ9,F1,HBOND,SC_SC,Ser,211,A
8,A:212:_:SER,HBOND:MC_SC,A:215:_:GLU,A0AUZ9,F1,HBOND,MC_SC,Ser,212,A
9,A:212:_:SER,HBOND:SC_MC,A:215:_:GLU,A0AUZ9,F1,HBOND,SC_MC,Ser,212,A


###1.1.9 Generating an intermediate file with the extraction of the residue from the *NodeId1_RING* attribute of the *edgesDB.txt* database

In [None]:
df_RING_edge.to_csv("drive/My Drive/ProcessaNovaBase/TrataArqsRINGAlphaFold/edgesDB_sel_proc_Interaction_Res1.csv",sep='\t',index=False)

###1.1.10 Extracting the residue, its position and chain in the *NodeId2* attribute

The target residue, its position and chain contained in **NodeId2** will be extracted

**Residue2_RING**: target node of the edge

**Residue2_pos_RING**: the position of the node

**Residue2_chain_RING**: the chain where the node is located

In [None]:
# Raise pandas' display limits so wide and long frames are not truncated.
import pandas as pd

pd.set_option(
    'display.max_columns', 7000,
    'display.max_rows', 1000000,
    'display.width', 7000,
)

In [None]:
import pandas as pd

# Reload the table that already carries the source-node (Residue1_*) columns.
df_RING_edge = pd.read_csv(
    "drive/My Drive/ProcessaNovaBase/TrataArqsRINGAlphaFold/edgesDB_sel_proc_Interaction_Res1.csv",
    sep='\t',
)

In [None]:
df_RING_edge.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16347856 entries, 0 to 16347855
Data columns (total 10 columns):
 #   Column               Dtype 
---  ------               ----- 
 0   NodeId1_RING         object
 1   Interaction_RING     object
 2   NodeId2_RING         object
 3   Uniprot_AF_id_RING   object
 4   F_AF_RING            object
 5   interacao_RING       object
 6   subinteracao_RING    object
 7   Residue1_RING        object
 8   Residue1_pos_RING    int64 
 9   Residue1_chain_RING  object
dtypes: int64(1), object(9)
memory usage: 1.2+ GB


In [None]:
df_RING_edge["NodeId2_RING"].value_counts()

A:114:_:LEU     4145
A:185:_:LEU     3996
A:107:_:LEU     3934
A:187:_:LEU     3910
A:115:_:LEU     3899
A:86:_:LEU      3806
A:82:_:LEU      3768
A:84:_:LEU      3761
A:144:_:LEU     3741
A:112:_:LEU     3721
A:213:_:LEU     3713
A:106:_:LEU     3710
A:96:_:LEU      3707
A:104:_:LEU     3702
A:164:_:LEU     3693
A:184:_:LEU     3686
A:113:_:LEU     3676
A:97:_:LEU      3675
A:135:_:LEU     3675
A:119:_:LEU     3668
A:117:_:LEU     3665
A:133:_:LEU     3662
A:105:_:LEU     3660
A:101:_:LEU     3647
A:116:_:LEU     3643
A:159:_:LEU     3636
A:153:_:LEU     3631
A:183:_:LEU     3631
A:212:_:LEU     3630
A:130:_:LEU     3605
A:134:_:LEU     3605
A:142:_:LEU     3596
A:124:_:LEU     3590
A:121:_:LEU     3586
A:98:_:LEU      3584
A:204:_:LEU     3581
A:152:_:LEU     3573
A:143:_:LEU     3566
A:127:_:LEU     3565
A:148:_:LEU     3557
A:81:_:LEU      3556
A:180:_:LEU     3548
A:166:_:LEU     3548
A:110:_:LEU     3546
A:99:_:LEU      3539
A:155:_:LEU     3537
A:162:_:LEU     3535
A:165:_:LEU  

In [None]:
# The residue (or ligand) name is the fourth ":"-separated field of the node id,
# e.g. "A:522:_:LEU" -> "LEU".
df_RING_edge["Residue2_RING"] = [node_id.split(":")[3] for node_id in df_RING_edge["NodeId2_RING"]]

In [None]:
# Convert the standard residue codes to the capitalized form (Ala, Arg, Asn, ...);
# any other node label is left exactly as it is.
Amin = ['ALA', 'ARG', 'ASN', 'ASP', 'CYS', 'GLN', 'GLU', 'GLY', 'HIS', 'ILE', 'LEU', 'LYS',
      'MET', 'PHE', 'PRO', 'PYL', 'SER', 'SEC', 'THR', 'TRP', 'TYR', 'VAL', 'ASX', 'GLX']

# Lookup table from upper-case code to capitalized code, built once.
_capitalized_residue = {code: code.capitalize() for code in Amin}
df_RING_edge["Residue2_RING"] = df_RING_edge["Residue2_RING"].map(
    lambda code: _capitalized_residue.get(code, code)
)

In [None]:
df_RING_edge["Residue2_RING"].value_counts()

Leu    2273459
Phe    1123513
Val    1092723
Ile    1044746
Arg    1033884
Glu     966388
Lys     959856
Ala     939620
Ser     823336
Gln     785423
Tyr     755345
Thr     719498
Asn     601162
Asp     560720
His     547080
Cys     522792
Met     478510
Gly     457181
Trp     400506
Pro     262114
Name: Residue2_RING, dtype: int64

In [None]:
# The residue position is the second ":"-separated field of the node id
# (e.g. "A:522:_:LEU" -> 522). Cast it to int64 so it matches the dtype of
# Residue1_pos_RING in the same frame, rather than being stored as strings
# until the next CSV reload.
df_RING_edge["Residue2_pos_RING"] = df_RING_edge["NodeId2_RING"].apply(lambda x: x.split(":")[1]).astype("int64")

In [None]:
# The chain id is the first ":"-separated field of the node id, upper-cased.
df_RING_edge["Residue2_chain_RING"] = [node_id.split(":")[0].upper() for node_id in df_RING_edge["NodeId2_RING"]]

In [None]:
df_RING_edge["Residue2_chain_RING"].value_counts()

A    16347856
Name: Residue2_chain_RING, dtype: int64

In [None]:
df_RING_edge.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16347856 entries, 0 to 16347855
Data columns (total 13 columns):
 #   Column               Dtype 
---  ------               ----- 
 0   NodeId1_RING         object
 1   Interaction_RING     object
 2   NodeId2_RING         object
 3   Uniprot_AF_id_RING   object
 4   F_AF_RING            object
 5   interacao_RING       object
 6   subinteracao_RING    object
 7   Residue1_RING        object
 8   Residue1_pos_RING    int64 
 9   Residue1_chain_RING  object
 10  Residue2_RING        object
 11  Residue2_pos_RING    object
 12  Residue2_chain_RING  object
dtypes: int64(1), object(12)
memory usage: 1.6+ GB


In [None]:
df_RING_edge.head(25)

Unnamed: 0,NodeId1_RING,Interaction_RING,NodeId2_RING,Uniprot_AF_id_RING,F_AF_RING,interacao_RING,subinteracao_RING,Residue1_RING,Residue1_pos_RING,Residue1_chain_RING,Residue2_RING,Residue2_pos_RING,Residue2_chain_RING
0,A:38:_:GLU,VDW:SC_SC,A:522:_:LEU,A0AUZ9,F1,VDW,SC_SC,Glu,38,A,Leu,522,A
1,A:118:_:ASN,VDW:SC_SC,A:388:_:GLN,A0AUZ9,F1,VDW,SC_SC,Asn,118,A,Gln,388,A
2,A:118:_:ASN,VDW:SC_SC,A:391:_:ASP,A0AUZ9,F1,VDW,SC_SC,Asn,118,A,Asp,391,A
3,A:131:_:GLU,VDW:SC_SC,A:927:_:MET,A0AUZ9,F1,VDW,SC_SC,Glu,131,A,Met,927,A
4,A:132:_:PHE,VDW:SC_SC,A:921:_:PRO,A0AUZ9,F1,VDW,SC_SC,Phe,132,A,Pro,921,A
5,A:132:_:PHE,VDW:SC_SC,A:927:_:MET,A0AUZ9,F1,VDW,SC_SC,Phe,132,A,Met,927,A
6,A:211:_:SER,HBOND:MC_SC,A:216:_:LYS,A0AUZ9,F1,HBOND,MC_SC,Ser,211,A,Lys,216,A
7,A:211:_:SER,HBOND:SC_SC,A:216:_:LYS,A0AUZ9,F1,HBOND,SC_SC,Ser,211,A,Lys,216,A
8,A:212:_:SER,HBOND:MC_SC,A:215:_:GLU,A0AUZ9,F1,HBOND,MC_SC,Ser,212,A,Glu,215,A
9,A:212:_:SER,HBOND:SC_MC,A:215:_:GLU,A0AUZ9,F1,HBOND,SC_MC,Ser,212,A,Glu,215,A


In [None]:

def categories_column(df, columns=None, max_items=None):
    """Print the value counts of selected categorical columns.

    For each column, prints the column name followed by a dict that maps each
    distinct value to its number of occurrences (most frequent first), then a
    blank separator.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns to summarise.
    columns : iterable of str, optional
        Columns to report. Defaults to 'Uniprot_AF_id_RING', 'F_AF_RING',
        'Residue1_RING', 'Residue2_RING' and 'interacao_RING'.
    max_items : int, optional
        If given, only the ``max_items`` most frequent values of each column
        are printed, which keeps the output readable for high-cardinality
        columns. By default every value is printed.

    Raises
    ------
    KeyError
        If a requested column is not present in ``df``.
    """
    if columns is None:
        columns = ['Uniprot_AF_id_RING', 'F_AF_RING', 'Residue1_RING', 'Residue2_RING', 'interacao_RING']
    for col in columns:
        counts = df[col].value_counts()
        if max_items is not None:
            # value_counts() is sorted by frequency, so head() keeps the top values.
            counts = counts.head(max_items)
        print(col, counts.to_dict())
        print('\n')

categories_column(df_RING_edge)

Uniprot_AF_id_RING {'Q8NF91': 122408, 'Q8WXH0': 89131, 'Q9UPN3': 82599, 'Q03001': 80787, 'Q9NU22': 59305, 'Q8TE73': 54148, 'Q14204': 53988, 'Q9NYC9': 53109, 'Q8WXG9': 52861, 'Q96DT5': 52566, 'Q96JB1': 52426, 'Q8IVF4': 52399, 'Q9P225': 51970, 'Q15413': 51809, 'Q5T4S7': 51451, 'P21817': 50968, 'Q92736': 50590, 'Q15149': 48991, 'Q8NCM8': 46882, 'Q9NZJ4': 46815, 'Q8TD57': 46360, 'Q8WXX0': 46146, 'Q96M86': 45959, 'Q9C0G6': 45477, 'Q96RW7': 45071, 'O95714': 43691, 'P78527': 43662, 'O75445': 41585, 'Q9Y4A5': 41324, 'P98164': 40978, 'Q6V0I7': 40419, 'Q15751': 40320, 'Q07954': 40288, 'Q9NZR2': 38995, 'Q9NRC6': 38595, 'Q86WI1': 38234, 'Q4G0P3': 37368, 'Q96Q15': 36926, 'Q14517': 36513, 'Q8TDW7': 36098, 'P46939': 35914, 'Q9NYQ8': 34245, 'Q99698': 32684, 'Q70CQ2': 32088, 'P08F94': 31729, 'P98161': 30913, 'P98160': 30471, 'Q13315': 29977, 'Q8IZQ1': 29686, 'Q99996': 29478, 'Q8IZT6': 28543, 'Q0VDD8': 28387, 'O15230': 28307, 'O60494': 27998, 'P42858': 27401, 'Q7Z407': 26938, 'P78509': 26638, 'Q14789': 

###1.1.11 Generating an intermediate file with the extraction of the residue from the *NodeId2_RING* attribute of the *edgesDB.txt* database

In [None]:
df_RING_edge.to_csv("drive/My Drive/ProcessaNovaBase/TrataArqsRINGAlphaFold/edgesDB_sel_proc_Interaction_Res1_Res2.csv",sep='\t',index=False)

###1.1.12 Processing interaction types of Source nodes of edges

In [None]:
# Raise pandas' display limits so wide and long frames are not truncated.
import pandas as pd

pd.set_option(
    'display.max_columns', 7000,
    'display.max_rows', 1000000,
    'display.width', 7000,
)

In [None]:
import pandas as pd

# Reload the table that carries both the source (Residue1_*) and target
# (Residue2_*) node columns.
df_RING_edge = pd.read_csv(
    "drive/My Drive/ProcessaNovaBase/TrataArqsRINGAlphaFold/edgesDB_sel_proc_Interaction_Res1_Res2.csv",
    sep='\t',
)

In [None]:
df_RING_edge.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16347856 entries, 0 to 16347855
Data columns (total 13 columns):
 #   Column               Dtype 
---  ------               ----- 
 0   NodeId1_RING         object
 1   Interaction_RING     object
 2   NodeId2_RING         object
 3   Uniprot_AF_id_RING   object
 4   F_AF_RING            object
 5   interacao_RING       object
 6   subinteracao_RING    object
 7   Residue1_RING        object
 8   Residue1_pos_RING    int64 
 9   Residue1_chain_RING  object
 10  Residue2_RING        object
 11  Residue2_pos_RING    int64 
 12  Residue2_chain_RING  object
dtypes: int64(2), object(11)
memory usage: 1.6+ GB


In [None]:
df_RING_edge.head(100)

Unnamed: 0,NodeId1_RING,Interaction_RING,NodeId2_RING,Uniprot_AF_id_RING,F_AF_RING,interacao_RING,subinteracao_RING,Residue1_RING,Residue1_pos_RING,Residue1_chain_RING,Residue2_RING,Residue2_pos_RING,Residue2_chain_RING
0,A:38:_:GLU,VDW:SC_SC,A:522:_:LEU,A0AUZ9,F1,VDW,SC_SC,Glu,38,A,Leu,522,A
1,A:118:_:ASN,VDW:SC_SC,A:388:_:GLN,A0AUZ9,F1,VDW,SC_SC,Asn,118,A,Gln,388,A
2,A:118:_:ASN,VDW:SC_SC,A:391:_:ASP,A0AUZ9,F1,VDW,SC_SC,Asn,118,A,Asp,391,A
3,A:131:_:GLU,VDW:SC_SC,A:927:_:MET,A0AUZ9,F1,VDW,SC_SC,Glu,131,A,Met,927,A
4,A:132:_:PHE,VDW:SC_SC,A:921:_:PRO,A0AUZ9,F1,VDW,SC_SC,Phe,132,A,Pro,921,A
5,A:132:_:PHE,VDW:SC_SC,A:927:_:MET,A0AUZ9,F1,VDW,SC_SC,Phe,132,A,Met,927,A
6,A:211:_:SER,HBOND:MC_SC,A:216:_:LYS,A0AUZ9,F1,HBOND,MC_SC,Ser,211,A,Lys,216,A
7,A:211:_:SER,HBOND:SC_SC,A:216:_:LYS,A0AUZ9,F1,HBOND,SC_SC,Ser,211,A,Lys,216,A
8,A:212:_:SER,HBOND:MC_SC,A:215:_:GLU,A0AUZ9,F1,HBOND,MC_SC,Ser,212,A,Glu,215,A
9,A:212:_:SER,HBOND:SC_MC,A:215:_:GLU,A0AUZ9,F1,HBOND,SC_MC,Ser,212,A,Glu,215,A


Let's group the edge source-node records by **Uniprot_AF_id_RING**, **F_AF_RING**, **Residue1_RING**, **Residue1_pos_RING** and **Residue1_chain_RING**, and count how many interactions of each type they have.

Solution based on: https://stackoverflow.com/questions/29836477/pandas-create-new-column-with-count-from-groupby

In [None]:
df_RING_edge.groupby(['Uniprot_AF_id_RING', 'F_AF_RING',	'NodeId1_RING', 'interacao_RING'])["Interaction_RING"].count().reset_index(name="count")

Unnamed: 0,Uniprot_AF_id_RING,F_AF_RING,NodeId1_RING,interacao_RING,count
0,A0AUZ9,F1,A:118:_:ASN,VDW,2
1,A0AUZ9,F1,A:131:_:GLU,VDW,1
2,A0AUZ9,F1,A:132:_:PHE,VDW,2
3,A0AUZ9,F1,A:211:_:SER,HBOND,2
4,A0AUZ9,F1,A:212:_:SER,HBOND,4
...,...,...,...,...,...
9031204,Q9Y6Z7,F1,A:5:_:ALA,HBOND,1
9031205,Q9Y6Z7,F1,A:6:_:SER,HBOND,2
9031206,Q9Y6Z7,F1,A:7:_:LEU,HBOND,2
9031207,Q9Y6Z7,F1,A:8:_:LEU,HBOND,2


In [None]:
# Number of edges per source residue and interaction type (HBOND, VDW, ...).
df_groupnode1_0 = (
    df_RING_edge
    .groupby([
        'Uniprot_AF_id_RING',
        'F_AF_RING',
        'Residue1_RING',
        'Residue1_pos_RING',
        'Residue1_chain_RING',
        'interacao_RING',
    ])["Interaction_RING"]
    .count()
    .reset_index(name="count")
)

In [None]:
df_groupnode1_0.head(10)

Unnamed: 0,Uniprot_AF_id_RING,F_AF_RING,Residue1_RING,Residue1_pos_RING,Residue1_chain_RING,interacao_RING,count
0,A0AUZ9,F1,Ala,213,A,HBOND,2
1,A0AUZ9,F1,Ala,214,A,HBOND,2
2,A0AUZ9,F1,Ala,222,A,HBOND,2
3,A0AUZ9,F1,Ala,238,A,HBOND,2
4,A0AUZ9,F1,Ala,250,A,HBOND,2
5,A0AUZ9,F1,Ala,318,A,HBOND,2
6,A0AUZ9,F1,Ala,319,A,HBOND,2
7,A0AUZ9,F1,Ala,325,A,HBOND,1
8,A0AUZ9,F1,Ala,328,A,HBOND,1
9,A0AUZ9,F1,Ala,328,A,VDW,1


In [None]:
# Number of edges per source residue and full interaction label
# (type plus subtype, e.g. "HBOND:MC_SC"). The result has one row per
# (protein, fragment, residue, position, chain, label) with its "count".
df_groupnode1 = (
    df_RING_edge
    .groupby([
        'Uniprot_AF_id_RING',
        'F_AF_RING',
        'Residue1_RING',
        'Residue1_pos_RING',
        'Residue1_chain_RING',
        'Interaction_RING',
    ])["interacao_RING"]
    .count()
    .reset_index(name="count")
)

In [None]:
df_groupnode1.head(10)

Unnamed: 0,Uniprot_AF_id_RING,F_AF_RING,Residue1_RING,Residue1_pos_RING,Residue1_chain_RING,Interaction_RING,count
0,A0AUZ9,F1,Ala,213,A,HBOND:MC_MC,2
1,A0AUZ9,F1,Ala,214,A,HBOND:MC_MC,2
2,A0AUZ9,F1,Ala,222,A,HBOND:MC_MC,2
3,A0AUZ9,F1,Ala,238,A,HBOND:MC_MC,1
4,A0AUZ9,F1,Ala,238,A,HBOND:MC_SC,1
5,A0AUZ9,F1,Ala,250,A,HBOND:MC_MC,2
6,A0AUZ9,F1,Ala,318,A,HBOND:MC_MC,2
7,A0AUZ9,F1,Ala,319,A,HBOND:MC_MC,1
8,A0AUZ9,F1,Ala,319,A,HBOND:MC_SC,1
9,A0AUZ9,F1,Ala,325,A,HBOND:MC_MC,1


In [None]:
df_groupnode1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10976787 entries, 0 to 10976786
Data columns (total 7 columns):
 #   Column               Dtype 
---  ------               ----- 
 0   Uniprot_AF_id_RING   object
 1   F_AF_RING            object
 2   Residue1_RING        object
 3   Residue1_pos_RING    int64 
 4   Residue1_chain_RING  object
 5   Interaction_RING     object
 6   count                int64 
dtypes: int64(2), object(5)
memory usage: 586.2+ MB


Creating a DataFrame to store the processing of the DataFrame **df_groupnode1.**
The DataFrame will have the following attributes:
- **Uniprot_AF_id_RING**: Uniprot
- **F_AF_RING**: F AlphaFold  
- **Node_RING**: the node can be a residue or a ligand        
- **Node_pos_RING**: position of the node in the chain
- **Node_chain_RING**: chain       
- **Node_type**: node type: *source* (S) or *target* (T)              
- **Inter_Lig_tot**: total residue-ligand interactions            
- **Inter_Res_tot**: total residue-residue interactions        
- **Inter_IAC_Lig_tot**: total interactions of type IAC with ligand
- **Inter_VDW_Lig_tot**: total interactions of type VDW with ligand
- **Inter_HBOND_Lig_tot**: total interactions of type HBOND with ligand       
- **Inter_PIPISTACK_Lig_tot**: total interactions of type PIPISTACK with ligand   
- **Inter_IONIC_Lig_tot**: total interactions of type IONIC with ligand      
- **Inter_SSBOND_Lig_tot**: total interactions of type SSBOND with ligand      
- **Inter_PICATION_Lig_tot**: total interactions of type PICATION with ligand    
- **Inter_IAC_Res_tot**: total interactions of type IAC between residues
- **Inter_VDW_Res_tot**: total interactions of type VDW between residues
- **Inter_HBOND_Res_tot**: total interactions of type HBOND between residues               
- **Inter_PIPISTACK_Res_tot**: total interactions of type PIPISTACK between residues
- **Inter_IONIC_Res_tot**: total interactions of type IONIC between residues       
- **Inter_SSBOND_Res_tot**: total interactions of type SSBOND between residues              
- **Inter_PICATION_Res_tot**: total interactions of type PICATION between residues

In [None]:
# Interaction types, in the order their per-type total columns appear.
_INTERACTION_TYPES = ['IAC', 'VDW', 'HBOND', 'PIPISTACK', 'IONIC', 'SSBOND', 'PICATION']

# Node identification and overall totals first, then one ligand total and one
# residue total per interaction type.
COLUMN_NAMES = (
    ['Uniprot_AF_id_RING', 'F_AF_RING', 'Node_RING', 'Node_pos_RING',
     'Node_chain_RING', 'Node_type', 'Inter_Lig_tot', 'Inter_Res_tot']
    + ['Inter_' + kind + '_Lig_tot' for kind in _INTERACTION_TYPES]
    + ['Inter_' + kind + '_Res_tot' for kind in _INTERACTION_TYPES]
)

# Empty frame with that column layout.
df_proc_RING_1 = pd.DataFrame(columns=COLUMN_NAMES)

In [None]:
df_proc_RING_1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Data columns (total 22 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Uniprot_AF_id_RING       0 non-null      object
 1   F_AF_RING                0 non-null      object
 2   Node_RING                0 non-null      object
 3   Node_pos_RING            0 non-null      object
 4   Node_chain_RING          0 non-null      object
 5   Node_type                0 non-null      object
 6   Inter_Lig_tot            0 non-null      object
 7   Inter_Res_tot            0 non-null      object
 8   Inter_IAC_Lig_tot        0 non-null      object
 9   Inter_VDW_Lig_tot        0 non-null      object
 10  Inter_HBOND_Lig_tot      0 non-null      object
 11  Inter_PIPISTACK_Lig_tot  0 non-null      object
 12  Inter_IONIC_Lig_tot      0 non-null      object
 13  Inter_SSBOND_Lig_tot     0 non-null      object
 14  Inter_PICATION_Lig_tot   0 non-null      object
 15  Inter_

Processing nodes that are source (nodes present in the **Residue1_RING** attribute) stored in the **df_groupnode1** dataframe

In [None]:

# Empty accumulator lists named after the 22 entries of COLUMN_NAMES, in the
# same order. NOTE(review): the code that fills them is not visible in this
# part of the notebook — presumably the node-processing loop; confirm.
l_Uniprot_AF_id_RING = []
l_F_AF_RING = []
l_Node_RING = []
# NOTE(review): double underscore, unlike its siblings; left unchanged because
# later cells may reference this exact name.
l__Node_pos_RING = []
l_Node_chain_RING = []
l_Node_type = []
l_Inter_Lig_tot = []
l_Inter_Res_tot = []
l_Inter_IAC_Lig_tot = []
l_Inter_VDW_Lig_tot = []
l_Inter_HBOND_Lig_tot = []
l_Inter_PIPISTACK_Lig_tot = []
l_Inter_IONIC_Lig_tot = []
l_Inter_SSBOND_Lig_tot = []
l_Inter_PICATION_Lig_tot = []
l_Inter_IAC_Res_tot = []
l_Inter_VDW_Res_tot = []
l_Inter_HBOND_Res_tot = []
l_Inter_PIPISTACK_Res_tot = []
l_Inter_IONIC_Res_tot = []
l_Inter_SSBOND_Res_tot = []
l_Inter_PICATION_Res_tot = []

# A second set of 22 empty lists with shorter names and the same layout
# (identification fields, ligand/residue totals, then per-type ligand "_L" and
# residue "_R" totals). NOTE(review): how this set relates to the one above is
# not visible here — confirm whether both are needed.
l_Uniprot = []
l_F = []
l_Node = []
l_pos = []
l_chain = []
l_type = []
l_lig = []
l_res = []
l_IAC_L = []
l_VDW_L = []
l_HBOND_L = []
l_PIPISTACK_L = []
l_IONIC_L = []
l_SSBOND_L = []
l_PICATION_L = []
l_IAC_R = []
l_VDW_R = []
l_HBOND_R = []
l_PIPISTACK_R = []
l_IONIC_R = []
l_SSBOND_R = []
l_PICATION_R = []

def process_reg_group(df):
  """Aggregate grouped edge counts into one record per source node.

  Parameters
  ----------
  df : pandas.DataFrame
      Must provide the columns Uniprot_AF_id_RING, F_AF_RING, Residue1_RING,
      Residue1_pos_RING, Residue1_chain_RING, Interaction_RING and count.
      Rows belonging to the same node must be adjacent: a new record is started
      whenever the (Uniprot, F, residue, position, chain) key differs from the
      key of the previous row.

  Returns
  -------
  tuple of 22 lists
      The module-level ``l_*`` accumulator lists, in output-column order
      (identity columns, node type, ligand/residue totals, seven per-type ligand
      totals, seven per-type residue totals). The function appends to those
      module-level lists, so they must be reset before calling it again.

  Notes
  -----
  The interaction type is the text before the first ":" in Interaction_RING.
  A row whose Interaction_RING contains "LIG" is added to the ligand totals,
  any other row to the residue totals. Types outside the seven known ones
  still count toward the overall totals but have no per-type column.
  Every record is tagged with node type "S".
  """
  tipos = ("IAC", "VDW", "HBOND", "PIPISTACK", "IONIC", "SSBOND", "PICATION")
  # Per-type output lists, in the same order as `tipos`.
  lig_out = (l_Inter_IAC_Lig_tot, l_Inter_VDW_Lig_tot, l_Inter_HBOND_Lig_tot,
             l_Inter_PIPISTACK_Lig_tot, l_Inter_IONIC_Lig_tot, l_Inter_SSBOND_Lig_tot,
             l_Inter_PICATION_Lig_tot)
  res_out = (l_Inter_IAC_Res_tot, l_Inter_VDW_Res_tot, l_Inter_HBOND_Res_tot,
             l_Inter_PIPISTACK_Res_tot, l_Inter_IONIC_Res_tot, l_Inter_SSBOND_Res_tot,
             l_Inter_PICATION_Res_tot)

  def flush(key, totlig, totres, lig_by_type, res_by_type):
    # Write the finished node's totals to the output lists.
    uniprot, F, no, pos, chain = key
    l_Uniprot_AF_id_RING.append(uniprot)
    l_F_AF_RING.append(F)
    l_Node_RING.append(no)
    l__Node_pos_RING.append(pos)
    l_Node_chain_RING.append(chain)
    l_Node_type.append("S")
    l_Inter_Lig_tot.append(totlig)
    l_Inter_Res_tot.append(totres)
    for tipo, destino in zip(tipos, lig_out):
      destino.append(lig_by_type[tipo])
    for tipo, destino in zip(tipos, res_out):
      destino.append(res_by_type[tipo])

  # None until the first row is seen. This replaces the old `Index == 0` test,
  # which emitted a bogus all-zero record when the index did not start at 0.
  current_key = None
  totlig = 0
  totres = 0
  lig_by_type = dict.fromkeys(tipos, 0)
  res_by_type = dict.fromkeys(tipos, 0)

  for row in df.itertuples():
    key = (row.Uniprot_AF_id_RING, row.F_AF_RING, row.Residue1_RING,
           row.Residue1_pos_RING, row.Residue1_chain_RING)
    if key != current_key:
      if current_key is not None:
        flush(current_key, totlig, totres, lig_by_type, res_by_type)
      current_key = key
      totlig = 0
      totres = 0
      lig_by_type = dict.fromkeys(tipos, 0)
      res_by_type = dict.fromkeys(tipos, 0)

    interacao = row.Interaction_RING.split(":")[0]
    if "LIG" in row.Interaction_RING:
      totlig = totlig + row.count
      if interacao in lig_by_type:
        lig_by_type[interacao] += row.count
    else:    # residue - residue interaction
      totres = totres + row.count
      if interacao in res_by_type:
        res_by_type[interacao] += row.count

  # Flush the last node: no key change follows it, so the loop never emits it.
  if current_key is not None:
    flush(current_key, totlig, totres, lig_by_type, res_by_type)

  return l_Uniprot_AF_id_RING, l_F_AF_RING, l_Node_RING,l__Node_pos_RING,l_Node_chain_RING,l_Node_type,l_Inter_Lig_tot,l_Inter_Res_tot,l_Inter_IAC_Lig_tot,l_Inter_VDW_Lig_tot,l_Inter_HBOND_Lig_tot,l_Inter_PIPISTACK_Lig_tot,l_Inter_IONIC_Lig_tot,l_Inter_SSBOND_Lig_tot,l_Inter_PICATION_Lig_tot,l_Inter_IAC_Res_tot,l_Inter_VDW_Res_tot,l_Inter_HBOND_Res_tot,l_Inter_PIPISTACK_Res_tot,l_Inter_IONIC_Res_tot,l_Inter_SSBOND_Res_tot,l_Inter_PICATION_Res_tot


In [None]:
# Aggregate the source-node groups. process_reg_group returns 22 lists,
# unpacked here in output-column order.
(
    l_Uniprot1, l_F1, l_Node1, l_pos1, l_chain1, l_type1,
    l_lig1, l_res1,
    l_IAC_L1, l_VDW_L1, l_HBOND_L1, l_PIPISTACK_L1, l_IONIC_L1, l_SSBOND_L1, l_PICATION_L1,
    l_IAC_R1, l_VDW_R1, l_HBOND_R1, l_PIPISTACK_R1, l_IONIC_R1, l_SSBOND_R1, l_PICATION_R1,
) = process_reg_group(df_groupnode1)

In [None]:
# Number of source-node records produced: the length of the first list
# returned by process_reg_group.
tam = len(l_Uniprot1)
print(tam)

6019614


In [None]:
# Fill df_proc_RING_1 column by column from the lists returned for the source nodes.
# The dict keeps the original assignment order.
for column, values in {
    'Uniprot_AF_id_RING': l_Uniprot1,
    'F_AF_RING': l_F1,
    'Node_RING': l_Node1,
    'Node_pos_RING': l_pos1,
    'Node_chain_RING': l_chain1,
    'Node_type': l_type1,
    'Inter_Lig_tot': l_lig1,
    'Inter_Res_tot': l_res1,
    'Inter_IAC_Lig_tot': l_IAC_L1,
    'Inter_VDW_Lig_tot': l_VDW_L1,
    'Inter_HBOND_Lig_tot': l_HBOND_L1,
    'Inter_PIPISTACK_Lig_tot': l_PIPISTACK_L1,
    'Inter_IONIC_Lig_tot': l_IONIC_L1,
    'Inter_SSBOND_Lig_tot': l_SSBOND_L1,
    'Inter_PICATION_Lig_tot': l_PICATION_L1,
    'Inter_IAC_Res_tot': l_IAC_R1,
    'Inter_VDW_Res_tot': l_VDW_R1,
    'Inter_HBOND_Res_tot': l_HBOND_R1,
    'Inter_PIPISTACK_Res_tot': l_PIPISTACK_R1,
    'Inter_IONIC_Res_tot': l_IONIC_R1,
    'Inter_SSBOND_Res_tot': l_SSBOND_R1,
    'Inter_PICATION_Res_tot': l_PICATION_R1,
}.items():
  df_proc_RING_1[column] = values

In [None]:
df_proc_RING_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6019614 entries, 0 to 6019613
Data columns (total 22 columns):
 #   Column                   Dtype 
---  ------                   ----- 
 0   Uniprot_AF_id_RING       object
 1   F_AF_RING                object
 2   Node_RING                object
 3   Node_pos_RING            int64 
 4   Node_chain_RING          object
 5   Node_type                object
 6   Inter_Lig_tot            int64 
 7   Inter_Res_tot            int64 
 8   Inter_IAC_Lig_tot        int64 
 9   Inter_VDW_Lig_tot        int64 
 10  Inter_HBOND_Lig_tot      int64 
 11  Inter_PIPISTACK_Lig_tot  int64 
 12  Inter_IONIC_Lig_tot      int64 
 13  Inter_SSBOND_Lig_tot     int64 
 14  Inter_PICATION_Lig_tot   int64 
 15  Inter_IAC_Res_tot        int64 
 16  Inter_VDW_Res_tot        int64 
 17  Inter_HBOND_Res_tot      int64 
 18  Inter_PIPISTACK_Res_tot  int64 
 19  Inter_IONIC_Res_tot      int64 
 20  Inter_SSBOND_Res_tot     int64 
 21  Inter_PICATION_Res_tot   int64 

In [None]:
df_proc_RING_1.head(10)

Unnamed: 0,Uniprot_AF_id_RING,F_AF_RING,Node_RING,Node_pos_RING,Node_chain_RING,Node_type,Inter_Lig_tot,Inter_Res_tot,Inter_IAC_Lig_tot,Inter_VDW_Lig_tot,Inter_HBOND_Lig_tot,Inter_PIPISTACK_Lig_tot,Inter_IONIC_Lig_tot,Inter_SSBOND_Lig_tot,Inter_PICATION_Lig_tot,Inter_IAC_Res_tot,Inter_VDW_Res_tot,Inter_HBOND_Res_tot,Inter_PIPISTACK_Res_tot,Inter_IONIC_Res_tot,Inter_SSBOND_Res_tot,Inter_PICATION_Res_tot
0,A0AUZ9,F1,Ala,213,A,S,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0
1,A0AUZ9,F1,Ala,214,A,S,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0
2,A0AUZ9,F1,Ala,222,A,S,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0
3,A0AUZ9,F1,Ala,238,A,S,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0
4,A0AUZ9,F1,Ala,250,A,S,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0
5,A0AUZ9,F1,Ala,318,A,S,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0
6,A0AUZ9,F1,Ala,319,A,S,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0
7,A0AUZ9,F1,Ala,325,A,S,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
8,A0AUZ9,F1,Ala,328,A,S,0,2,0,0,0,0,0,0,0,0,1,1,0,0,0,0
9,A0AUZ9,F1,Ala,376,A,S,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0


###1.1.13  Processing interaction types of Target nodes of edges


Let's group the edge target-node records by **Uniprot_AF_id_RING**, **F_AF_RING**, **Residue2_RING**, **Residue2_pos_RING**, **Residue2_chain_RING** and **Interaction_RING**, counting how many interactions of each type every target node has.

Solution based on: https://stackoverflow.com/questions/29836477/pandas-create-new-column-with-count-from-groupby

In [None]:
# One row per (target node, interaction label); "count" is the number of
# non-null interacao_RING values in that group.
df_groupnode2 = (
    df_RING_edge
    .groupby([
        'Uniprot_AF_id_RING',
        'F_AF_RING',
        'Residue2_RING',
        'Residue2_pos_RING',
        'Residue2_chain_RING',
        'Interaction_RING',
    ])["interacao_RING"]
    .count()
    .reset_index(name="count")
)

In [None]:
df_groupnode2.head(10)

Unnamed: 0,Uniprot_AF_id_RING,F_AF_RING,Residue2_RING,Residue2_pos_RING,Residue2_chain_RING,Interaction_RING,count
0,A0AUZ9,F1,Ala,222,A,HBOND:MC_MC,2
1,A0AUZ9,F1,Ala,238,A,HBOND:MC_MC,1
2,A0AUZ9,F1,Ala,250,A,HBOND:MC_MC,2
3,A0AUZ9,F1,Ala,325,A,HBOND:MC_MC,2
4,A0AUZ9,F1,Ala,328,A,HBOND:MC_MC,1
5,A0AUZ9,F1,Ala,343,A,HBOND:SC_MC,1
6,A0AUZ9,F1,Ala,343,A,VDW:SC_SC,1
7,A0AUZ9,F1,Ala,376,A,HBOND:MC_MC,2
8,A0AUZ9,F1,Ala,387,A,HBOND:MC_MC,2
9,A0AUZ9,F1,Ala,408,A,HBOND:MC_MC,2


Creating a DataFrame to store the processing of the DataFrame **df_groupnode2.**
The DataFrame will have the following attributes:
- **Uniprot_AF_id_RING**: Uniprot
- **F_AF_RING**: F AlphaFold  
- **Node_RING**: the node can be a residue or a ligand        
- **Node_pos_RING**: position of the node in the chain
- **Node_chain_RING**: chain       
- **Node_type**: node type: *source* (S) or *target* (T)              
- **Inter_Lig_tot**: total residue-ligand interactions            
- **Inter_Res_tot**: total residue-residue interactions        
- **Inter_IAC_Lig_tot**: total interactions of type IAC with ligand
- **Inter_VDW_Lig_tot**: total interactions of type VDW with ligand
- **Inter_HBOND_Lig_tot**: total interactions of type HBOND with ligand       
- **Inter_PIPISTACK_Lig_tot**: total interactions of type PIPISTACK with ligand   
- **Inter_IONIC_Lig_tot**: total interactions of type IONIC with ligand      
- **Inter_SSBOND_Lig_tot**: total interactions of type SSBOND with ligand      
- **Inter_PICATION_Lig_tot**: total interactions of type PICATION with ligand    
- **Inter_IAC_Res_tot**: total interactions of type IAC between residues
- **Inter_VDW_Res_tot**: total interactions of type VDW between residues
- **Inter_HBOND_Res_tot**: total interactions of type HBOND between residues               
- **Inter_PIPISTACK_Res_tot**: total interactions of type PIPISTACK between residues           
- **Inter_IONIC_Res_tot**: total interactions of type IONIC between residues       
- **Inter_SSBOND_Res_tot**: total interactions of type SSBOND between residues              
- **Inter_PICATION_Res_tot**: total interactions of type PICATION between residues

In [None]:
# Column layout of the per-node result table for target nodes.
COLUMN_NAMES = [
    # node identity
    'Uniprot_AF_id_RING',
    'F_AF_RING',
    'Node_RING',
    'Node_pos_RING',
    'Node_chain_RING',
    'Node_type',
    # overall totals
    'Inter_Lig_tot',
    'Inter_Res_tot',
    # per-type totals, interactions with ligands
    'Inter_IAC_Lig_tot',
    'Inter_VDW_Lig_tot',
    'Inter_HBOND_Lig_tot',
    'Inter_PIPISTACK_Lig_tot',
    'Inter_IONIC_Lig_tot',
    'Inter_SSBOND_Lig_tot',
    'Inter_PICATION_Lig_tot',
    # per-type totals, interactions between residues
    'Inter_IAC_Res_tot',
    'Inter_VDW_Res_tot',
    'Inter_HBOND_Res_tot',
    'Inter_PIPISTACK_Res_tot',
    'Inter_IONIC_Res_tot',
    'Inter_SSBOND_Res_tot',
    'Inter_PICATION_Res_tot',
]
# Empty frame with that layout; its columns are filled in a later cell.
df_proc_RING_2 = pd.DataFrame(columns=COLUMN_NAMES)

In [None]:
df_proc_RING_2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Data columns (total 22 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Uniprot_AF_id_RING       0 non-null      object
 1   F_AF_RING                0 non-null      object
 2   Node_RING                0 non-null      object
 3   Node_pos_RING            0 non-null      object
 4   Node_chain_RING          0 non-null      object
 5   Node_type                0 non-null      object
 6   Inter_Lig_tot            0 non-null      object
 7   Inter_Res_tot            0 non-null      object
 8   Inter_IAC_Lig_tot        0 non-null      object
 9   Inter_VDW_Lig_tot        0 non-null      object
 10  Inter_HBOND_Lig_tot      0 non-null      object
 11  Inter_PIPISTACK_Lig_tot  0 non-null      object
 12  Inter_IONIC_Lig_tot      0 non-null      object
 13  Inter_SSBOND_Lig_tot     0 non-null      object
 14  Inter_PICATION_Lig_tot   0 non-null      object
 15  Inter_

Processing target nodes (nodes present in the **Residue2_RING** attribute) stored in the **df_groupnode2** dataframe.

In [None]:

# Reset the output accumulators before processing the target nodes:
# process_reg_group (redefined below in this cell) appends one entry per node
# to each of these lists and returns them. Each name gets its own empty list.
(
    l_Uniprot_AF_id_RING, l_F_AF_RING, l_Node_RING, l__Node_pos_RING, l_Node_chain_RING, l_Node_type,
    l_Inter_Lig_tot, l_Inter_Res_tot,
    l_Inter_IAC_Lig_tot, l_Inter_VDW_Lig_tot, l_Inter_HBOND_Lig_tot, l_Inter_PIPISTACK_Lig_tot,
    l_Inter_IONIC_Lig_tot, l_Inter_SSBOND_Lig_tot, l_Inter_PICATION_Lig_tot,
    l_Inter_IAC_Res_tot, l_Inter_VDW_Res_tot, l_Inter_HBOND_Res_tot, l_Inter_PIPISTACK_Res_tot,
    l_Inter_IONIC_Res_tot, l_Inter_SSBOND_Res_tot, l_Inter_PICATION_Res_tot,
) = ([] for _ in range(22))

# Short-named lists: not referenced by process_reg_group or by the other cells
# visible in this section; kept defined in case other cells rely on them.
(
    l_Uniprot, l_F, l_Node, l_pos, l_chain, l_type,
    l_lig, l_res,
    l_IAC_L, l_VDW_L, l_HBOND_L, l_PIPISTACK_L, l_IONIC_L, l_SSBOND_L, l_PICATION_L,
    l_IAC_R, l_VDW_R, l_HBOND_R, l_PIPISTACK_R, l_IONIC_R, l_SSBOND_R, l_PICATION_R,
) = ([] for _ in range(22))

def process_reg_group(df):
  """Aggregate grouped edge counts into one record per target node.

  Parameters
  ----------
  df : pandas.DataFrame
      Must provide the columns Uniprot_AF_id_RING, F_AF_RING, Residue2_RING,
      Residue2_pos_RING, Residue2_chain_RING, Interaction_RING and count.
      Rows belonging to the same node must be adjacent: a new record is started
      whenever the (Uniprot, F, residue, position, chain) key differs from the
      key of the previous row.

  Returns
  -------
  tuple of 22 lists
      The module-level ``l_*`` accumulator lists, in output-column order
      (identity columns, node type, ligand/residue totals, seven per-type ligand
      totals, seven per-type residue totals). The function appends to those
      module-level lists, so they must be reset before calling it again.

  Notes
  -----
  The interaction type is the text before the first ":" in Interaction_RING.
  A row whose Interaction_RING contains "LIG" is added to the ligand totals,
  any other row to the residue totals. Types outside the seven known ones
  still count toward the overall totals but have no per-type column.
  Every record is tagged with node type "T".
  """
  tipos = ("IAC", "VDW", "HBOND", "PIPISTACK", "IONIC", "SSBOND", "PICATION")
  # Per-type output lists, in the same order as `tipos`.
  lig_out = (l_Inter_IAC_Lig_tot, l_Inter_VDW_Lig_tot, l_Inter_HBOND_Lig_tot,
             l_Inter_PIPISTACK_Lig_tot, l_Inter_IONIC_Lig_tot, l_Inter_SSBOND_Lig_tot,
             l_Inter_PICATION_Lig_tot)
  res_out = (l_Inter_IAC_Res_tot, l_Inter_VDW_Res_tot, l_Inter_HBOND_Res_tot,
             l_Inter_PIPISTACK_Res_tot, l_Inter_IONIC_Res_tot, l_Inter_SSBOND_Res_tot,
             l_Inter_PICATION_Res_tot)

  def flush(key, totlig, totres, lig_by_type, res_by_type):
    # Write the finished node's totals to the output lists.
    uniprot, F, no, pos, chain = key
    l_Uniprot_AF_id_RING.append(uniprot)
    l_F_AF_RING.append(F)
    l_Node_RING.append(no)
    l__Node_pos_RING.append(pos)
    l_Node_chain_RING.append(chain)
    l_Node_type.append("T")
    l_Inter_Lig_tot.append(totlig)
    l_Inter_Res_tot.append(totres)
    for tipo, destino in zip(tipos, lig_out):
      destino.append(lig_by_type[tipo])
    for tipo, destino in zip(tipos, res_out):
      destino.append(res_by_type[tipo])

  # None until the first row is seen. This replaces the old `Index == 0` test,
  # which emitted a bogus all-zero record when the index did not start at 0.
  current_key = None
  totlig = 0
  totres = 0
  lig_by_type = dict.fromkeys(tipos, 0)
  res_by_type = dict.fromkeys(tipos, 0)

  for row in df.itertuples():
    key = (row.Uniprot_AF_id_RING, row.F_AF_RING, row.Residue2_RING,
           row.Residue2_pos_RING, row.Residue2_chain_RING)
    if key != current_key:
      if current_key is not None:
        flush(current_key, totlig, totres, lig_by_type, res_by_type)
      current_key = key
      totlig = 0
      totres = 0
      lig_by_type = dict.fromkeys(tipos, 0)
      res_by_type = dict.fromkeys(tipos, 0)

    interacao = row.Interaction_RING.split(":")[0]
    if "LIG" in row.Interaction_RING:
      totlig = totlig + row.count
      if interacao in lig_by_type:
        lig_by_type[interacao] += row.count
    else:    # residue - residue interaction
      totres = totres + row.count
      if interacao in res_by_type:
        res_by_type[interacao] += row.count

  # Flush the last node: no key change follows it, so the loop never emits it.
  if current_key is not None:
    flush(current_key, totlig, totres, lig_by_type, res_by_type)

  return l_Uniprot_AF_id_RING,l_F_AF_RING,l_Node_RING,l__Node_pos_RING,l_Node_chain_RING,l_Node_type,l_Inter_Lig_tot,l_Inter_Res_tot,l_Inter_IAC_Lig_tot,l_Inter_VDW_Lig_tot,l_Inter_HBOND_Lig_tot,l_Inter_PIPISTACK_Lig_tot,l_Inter_IONIC_Lig_tot,l_Inter_SSBOND_Lig_tot,l_Inter_PICATION_Lig_tot,l_Inter_IAC_Res_tot,l_Inter_VDW_Res_tot,l_Inter_HBOND_Res_tot,l_Inter_PIPISTACK_Res_tot,l_Inter_IONIC_Res_tot,l_Inter_SSBOND_Res_tot,l_Inter_PICATION_Res_tot


In [None]:
# Aggregate the target-node groups. process_reg_group returns 22 lists,
# unpacked here in output-column order.
(
    l_Uniprot2, l_F2, l_Node2, l_pos2, l_chain2, l_type2,
    l_lig2, l_res2,
    l_IAC_L2, l_VDW_L2, l_HBOND_L2, l_PIPISTACK_L2, l_IONIC_L2, l_SSBOND_L2, l_PICATION_L2,
    l_IAC_R2, l_VDW_R2, l_HBOND_R2, l_PIPISTACK_R2, l_IONIC_R2, l_SSBOND_R2, l_PICATION_R2,
) = process_reg_group(df_groupnode2)

In [None]:
# Number of target-node records produced: the length of the first list
# returned by process_reg_group.
tam = len(l_Uniprot2)
print(tam)

6065772


In [None]:
# Fill df_proc_RING_2 column by column from the lists returned for the target nodes.
# The dict keeps the original assignment order.
for column, values in {
    'Uniprot_AF_id_RING': l_Uniprot2,
    'F_AF_RING': l_F2,
    'Node_RING': l_Node2,
    'Node_pos_RING': l_pos2,
    'Node_chain_RING': l_chain2,
    'Node_type': l_type2,
    'Inter_Lig_tot': l_lig2,
    'Inter_Res_tot': l_res2,
    'Inter_IAC_Lig_tot': l_IAC_L2,
    'Inter_VDW_Lig_tot': l_VDW_L2,
    'Inter_HBOND_Lig_tot': l_HBOND_L2,
    'Inter_PIPISTACK_Lig_tot': l_PIPISTACK_L2,
    'Inter_IONIC_Lig_tot': l_IONIC_L2,
    'Inter_SSBOND_Lig_tot': l_SSBOND_L2,
    'Inter_PICATION_Lig_tot': l_PICATION_L2,
    'Inter_IAC_Res_tot': l_IAC_R2,
    'Inter_VDW_Res_tot': l_VDW_R2,
    'Inter_HBOND_Res_tot': l_HBOND_R2,
    'Inter_PIPISTACK_Res_tot': l_PIPISTACK_R2,
    'Inter_IONIC_Res_tot': l_IONIC_R2,
    'Inter_SSBOND_Res_tot': l_SSBOND_R2,
    'Inter_PICATION_Res_tot': l_PICATION_R2,
}.items():
  df_proc_RING_2[column] = values

In [None]:
df_proc_RING_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6065772 entries, 0 to 6065771
Data columns (total 22 columns):
 #   Column                   Dtype 
---  ------                   ----- 
 0   Uniprot_AF_id_RING       object
 1   F_AF_RING                object
 2   Node_RING                object
 3   Node_pos_RING            int64 
 4   Node_chain_RING          object
 5   Node_type                object
 6   Inter_Lig_tot            int64 
 7   Inter_Res_tot            int64 
 8   Inter_IAC_Lig_tot        int64 
 9   Inter_VDW_Lig_tot        int64 
 10  Inter_HBOND_Lig_tot      int64 
 11  Inter_PIPISTACK_Lig_tot  int64 
 12  Inter_IONIC_Lig_tot      int64 
 13  Inter_SSBOND_Lig_tot     int64 
 14  Inter_PICATION_Lig_tot   int64 
 15  Inter_IAC_Res_tot        int64 
 16  Inter_VDW_Res_tot        int64 
 17  Inter_HBOND_Res_tot      int64 
 18  Inter_PIPISTACK_Res_tot  int64 
 19  Inter_IONIC_Res_tot      int64 
 20  Inter_SSBOND_Res_tot     int64 
 21  Inter_PICATION_Res_tot   int64 

In [None]:
df_proc_RING_2.head(20)

Unnamed: 0,Uniprot_AF_id_RING,F_AF_RING,Node_RING,Node_pos_RING,Node_chain_RING,Node_type,Inter_Lig_tot,Inter_Res_tot,Inter_IAC_Lig_tot,Inter_VDW_Lig_tot,Inter_HBOND_Lig_tot,Inter_PIPISTACK_Lig_tot,Inter_IONIC_Lig_tot,Inter_SSBOND_Lig_tot,Inter_PICATION_Lig_tot,Inter_IAC_Res_tot,Inter_VDW_Res_tot,Inter_HBOND_Res_tot,Inter_PIPISTACK_Res_tot,Inter_IONIC_Res_tot,Inter_SSBOND_Res_tot,Inter_PICATION_Res_tot
0,A0AUZ9,F1,Ala,222,A,T,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0
1,A0AUZ9,F1,Ala,238,A,T,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2,A0AUZ9,F1,Ala,250,A,T,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0
3,A0AUZ9,F1,Ala,325,A,T,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0
4,A0AUZ9,F1,Ala,328,A,T,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
5,A0AUZ9,F1,Ala,343,A,T,0,2,0,0,0,0,0,0,0,0,1,1,0,0,0,0
6,A0AUZ9,F1,Ala,376,A,T,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0
7,A0AUZ9,F1,Ala,387,A,T,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0
8,A0AUZ9,F1,Ala,408,A,T,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0
9,A0AUZ9,F1,Ala,432,A,T,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0


###1.1.14 Integration of Databases that have the interactions of source and target nodes

In [None]:
# Stack the source-node records on top of the target-node records and renumber
# the rows. DataFrame.append was removed in pandas 2.0; pd.concat is the
# supported equivalent and gives the same result here.
df_proc_RING = pd.concat([df_proc_RING_1, df_proc_RING_2], ignore_index=True)

In [None]:
df_proc_RING.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12085386 entries, 0 to 12085385
Data columns (total 22 columns):
 #   Column                   Dtype 
---  ------                   ----- 
 0   Uniprot_AF_id_RING       object
 1   F_AF_RING                object
 2   Node_RING                object
 3   Node_pos_RING            int64 
 4   Node_chain_RING          object
 5   Node_type                object
 6   Inter_Lig_tot            int64 
 7   Inter_Res_tot            int64 
 8   Inter_IAC_Lig_tot        int64 
 9   Inter_VDW_Lig_tot        int64 
 10  Inter_HBOND_Lig_tot      int64 
 11  Inter_PIPISTACK_Lig_tot  int64 
 12  Inter_IONIC_Lig_tot      int64 
 13  Inter_SSBOND_Lig_tot     int64 
 14  Inter_PICATION_Lig_tot   int64 
 15  Inter_IAC_Res_tot        int64 
 16  Inter_VDW_Res_tot        int64 
 17  Inter_HBOND_Res_tot      int64 
 18  Inter_PIPISTACK_Res_tot  int64 
 19  Inter_IONIC_Res_tot      int64 
 20  Inter_SSBOND_Res_tot     int64 
 21  Inter_PICATION_Res_tot   int6

In [None]:
df_proc_RING.head(20)

Unnamed: 0,Uniprot_AF_id_RING,F_AF_RING,Node_RING,Node_pos_RING,Node_chain_RING,Node_type,Inter_Lig_tot,Inter_Res_tot,Inter_IAC_Lig_tot,Inter_VDW_Lig_tot,Inter_HBOND_Lig_tot,Inter_PIPISTACK_Lig_tot,Inter_IONIC_Lig_tot,Inter_SSBOND_Lig_tot,Inter_PICATION_Lig_tot,Inter_IAC_Res_tot,Inter_VDW_Res_tot,Inter_HBOND_Res_tot,Inter_PIPISTACK_Res_tot,Inter_IONIC_Res_tot,Inter_SSBOND_Res_tot,Inter_PICATION_Res_tot
0,A0AUZ9,F1,Ala,213,A,S,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0
1,A0AUZ9,F1,Ala,214,A,S,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0
2,A0AUZ9,F1,Ala,222,A,S,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0
3,A0AUZ9,F1,Ala,238,A,S,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0
4,A0AUZ9,F1,Ala,250,A,S,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0
5,A0AUZ9,F1,Ala,318,A,S,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0
6,A0AUZ9,F1,Ala,319,A,S,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0
7,A0AUZ9,F1,Ala,325,A,S,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
8,A0AUZ9,F1,Ala,328,A,S,0,2,0,0,0,0,0,0,0,0,1,1,0,0,0,0
9,A0AUZ9,F1,Ala,376,A,S,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0


In [None]:
#Checking for the existence of 'missing' values.
df_proc_RING.isna().sum()

Uniprot_AF_id_RING         0
F_AF_RING                  0
Node_RING                  0
Node_pos_RING              0
Node_chain_RING            0
Node_type                  0
Inter_Lig_tot              0
Inter_Res_tot              0
Inter_IAC_Lig_tot          0
Inter_VDW_Lig_tot          0
Inter_HBOND_Lig_tot        0
Inter_PIPISTACK_Lig_tot    0
Inter_IONIC_Lig_tot        0
Inter_SSBOND_Lig_tot       0
Inter_PICATION_Lig_tot     0
Inter_IAC_Res_tot          0
Inter_VDW_Res_tot          0
Inter_HBOND_Res_tot        0
Inter_PIPISTACK_Res_tot    0
Inter_IONIC_Res_tot        0
Inter_SSBOND_Res_tot       0
Inter_PICATION_Res_tot     0
dtype: int64

In [None]:
# Flag records that are identical, in every column, to an earlier record.
dupes = df_proc_RING.duplicated()
# Series.sum() is vectorized; the builtin sum() iterates the ~12M booleans
# one by one in Python. int() keeps the displayed value a plain integer.
int(dupes.sum())

0

###1.1.15  Generating a file with the RING edges database, characterizing the nodes as source and target

In this file, each node is characterized by two records: one counts its interactions when it is the source, and the other counts its interactions when it is the target.

In [None]:
# Persist the combined source/target table. The file is tab-separated despite
# its .csv extension, and the row index is not written. Section 1.2 reads this
# same path back with delimiter='\t'.
df_proc_RING.to_csv("drive/My Drive/ProcessaNovaBase/TrataArqsRINGAlphaFold/edgesDB_proc_ST.csv",sep='\t',index=False)

##1.2 Generating an edge file without explicitly stating the node when it is source or target

In [None]:
#increasing the viewing capacity of columns and rows
import pandas as pd

pd.set_option('display.max_columns', 7000)
pd.set_option('display.max_rows',1000000)
pd.set_option('display.width', 7000)

In [None]:
import pandas as pd
# Reload the per-node source/target table written in section 1.1.15 (tab-separated).
# NOTE: this rebinds df_RING_edge. In section 1.1 the same name held the edge-level
# table (with Residue2_* / Interaction_RING columns); from here on it holds the
# node-level totals.
df_RING_edge = pd.read_csv("drive/My Drive/ProcessaNovaBase/TrataArqsRINGAlphaFold/edgesDB_proc_ST.csv",delimiter='\t')

In [None]:
df_RING_edge.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12085386 entries, 0 to 12085385
Data columns (total 22 columns):
 #   Column                   Dtype 
---  ------                   ----- 
 0   Uniprot_AF_id_RING       object
 1   F_AF_RING                object
 2   Node_RING                object
 3   Node_pos_RING            int64 
 4   Node_chain_RING          object
 5   Node_type                object
 6   Inter_Lig_tot            int64 
 7   Inter_Res_tot            int64 
 8   Inter_IAC_Lig_tot        int64 
 9   Inter_VDW_Lig_tot        int64 
 10  Inter_HBOND_Lig_tot      int64 
 11  Inter_PIPISTACK_Lig_tot  int64 
 12  Inter_IONIC_Lig_tot      int64 
 13  Inter_SSBOND_Lig_tot     int64 
 14  Inter_PICATION_Lig_tot   int64 
 15  Inter_IAC_Res_tot        int64 
 16  Inter_VDW_Res_tot        int64 
 17  Inter_HBOND_Res_tot      int64 
 18  Inter_PIPISTACK_Res_tot  int64 
 19  Inter_IONIC_Res_tot      int64 
 20  Inter_SSBOND_Res_tot     int64 
 21  Inter_PICATION_Res_tot   int6

Let's sort the file by the following attributes to facilitate our processing:
- Uniprot_AF_id_RING   
- F_AF_RING           
- Node_RING
- Node_pos_RING
- Node_chain_RING  

The dataframe **df_ord** will be generated

In [None]:
# Sort so that the source ("S") and target ("T") records of the same node
# become adjacent; the row index is renumbered after sorting.
df_ord = df_RING_edge.sort_values(
    by=[
        'Uniprot_AF_id_RING',
        'F_AF_RING',
        'Node_RING',
        'Node_pos_RING',
        'Node_chain_RING',
    ],
    ignore_index=True,
)

In [None]:
df_ord.head(10)

Unnamed: 0,Uniprot_AF_id_RING,F_AF_RING,Node_RING,Node_pos_RING,Node_chain_RING,Node_type,Inter_Lig_tot,Inter_Res_tot,Inter_IAC_Lig_tot,Inter_VDW_Lig_tot,Inter_HBOND_Lig_tot,Inter_PIPISTACK_Lig_tot,Inter_IONIC_Lig_tot,Inter_SSBOND_Lig_tot,Inter_PICATION_Lig_tot,Inter_IAC_Res_tot,Inter_VDW_Res_tot,Inter_HBOND_Res_tot,Inter_PIPISTACK_Res_tot,Inter_IONIC_Res_tot,Inter_SSBOND_Res_tot,Inter_PICATION_Res_tot
0,A0AUZ9,F1,Ala,213,A,S,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0
1,A0AUZ9,F1,Ala,214,A,S,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0
2,A0AUZ9,F1,Ala,222,A,S,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0
3,A0AUZ9,F1,Ala,222,A,T,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0
4,A0AUZ9,F1,Ala,238,A,S,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0
5,A0AUZ9,F1,Ala,238,A,T,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
6,A0AUZ9,F1,Ala,250,A,S,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0
7,A0AUZ9,F1,Ala,250,A,T,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0
8,A0AUZ9,F1,Ala,318,A,S,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0
9,A0AUZ9,F1,Ala,319,A,S,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0


Creating the DataFrame **df_proc_edge**, which will store the result of processing the **df_ord** DataFrame.
The Dataframe will have the following attributes:
- **Uniprot_AF_id_RING**: Uniprot  
- **F_AF_RING**: AlphaFold  F
- **Node_RING**: the node can be a residue or a ligand        
- **Node_pos_RING**: position of the node in the chain
- **Node_chain_RING**: chain                     
- **Inter_Lig_tot**: total residue-ligand interactions            
- **Inter_Res_tot**: total residue-residue interactions        
- **Inter_IAC_Lig_tot**: total interactions of type IAC with ligand
- **Inter_VDW_Lig_tot**: total interactions of type VDW with ligand
- **Inter_HBOND_Lig_tot**: total interactions of type HBOND with ligand       
- **Inter_PIPISTACK_Lig_tot**: total interactions of type PIPISTACK with ligand   
- **Inter_IONIC_Lig_tot**: total interactions of type IONIC with ligand      
- **Inter_SSBOND_Lig_tot**: total interactions of type SSBOND with ligand      
- **Inter_PICATION_Lig_tot**: total interactions of type PICATION with ligand    
- **Inter_IAC_Res_tot**: total interactions of type IAC between residues
- **Inter_VDW_Res_tot**: total interactions of type VDW between residues
- **Inter_HBOND_Res_tot**: total interactions of type HBOND between residues               
- **Inter_PIPISTACK_Res_tot**: total interactions of type PIPISTACK between residues           
- **Inter_IONIC_Res_tot**: total interactions of type IONIC between residues       
- **Inter_SSBOND_Res_tot**: total interactions of type SSBOND between residues              
- **Inter_PICATION_Res_tot**: total interactions of type PICATION between residues

In [None]:
# Column layout of the per-node result table once source and target records
# are merged (no Node_type column).
COLUMN_NAMES = [
    # node identity
    'Uniprot_AF_id_RING',
    'F_AF_RING',
    'Node_RING',
    'Node_pos_RING',
    'Node_chain_RING',
    # overall totals
    'Inter_Lig_tot',
    'Inter_Res_tot',
    # per-type totals, interactions with ligands
    'Inter_IAC_Lig_tot',
    'Inter_VDW_Lig_tot',
    'Inter_HBOND_Lig_tot',
    'Inter_PIPISTACK_Lig_tot',
    'Inter_IONIC_Lig_tot',
    'Inter_SSBOND_Lig_tot',
    'Inter_PICATION_Lig_tot',
    # per-type totals, interactions between residues
    'Inter_IAC_Res_tot',
    'Inter_VDW_Res_tot',
    'Inter_HBOND_Res_tot',
    'Inter_PIPISTACK_Res_tot',
    'Inter_IONIC_Res_tot',
    'Inter_SSBOND_Res_tot',
    'Inter_PICATION_Res_tot',
]
# Empty frame with that layout.
df_proc_edge = pd.DataFrame(columns=COLUMN_NAMES)

In [None]:
df_proc_edge.info()

<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Data columns (total 21 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Uniprot_AF_id_RING       0 non-null      object
 1   F_AF_RING                0 non-null      object
 2   Node_RING                0 non-null      object
 3   Node_pos_RING            0 non-null      object
 4   Node_chain_RING          0 non-null      object
 5   Inter_Lig_tot            0 non-null      object
 6   Inter_Res_tot            0 non-null      object
 7   Inter_IAC_Lig_tot        0 non-null      object
 8   Inter_VDW_Lig_tot        0 non-null      object
 9   Inter_HBOND_Lig_tot      0 non-null      object
 10  Inter_PIPISTACK_Lig_tot  0 non-null      object
 11  Inter_IONIC_Lig_tot      0 non-null      object
 12  Inter_SSBOND_Lig_tot     0 non-null      object
 13  Inter_PICATION_Lig_tot   0 non-null      object
 14  Inter_IAC_Res_tot        0 non-null      object
 15  Inter_

The processing will total the number of interactions of a node (belonging to a Uniprot-AF) in a given F, in a given chain and a given position, regardless of whether it was Source or Target, adding everything up. As a result, the dataframe **df_proc_edge** will be generated.

In [None]:

# Reset the output accumulators that process_reg_ord (defined below in this
# cell) appends to. Each name gets its own, independent empty list.
(
    l_Uniprot_AF_id_RING, l_F_AF_RING, l_Node_RING, l__Node_pos_RING, l_Node_chain_RING,
    l_Inter_Lig_tot, l_Inter_Res_tot,
    l_Inter_IAC_Lig_tot, l_Inter_VDW_Lig_tot, l_Inter_HBOND_Lig_tot, l_Inter_PIPISTACK_Lig_tot,
    l_Inter_IONIC_Lig_tot, l_Inter_SSBOND_Lig_tot, l_Inter_PICATION_Lig_tot,
    l_Inter_IAC_Res_tot, l_Inter_VDW_Res_tot, l_Inter_HBOND_Res_tot, l_Inter_PIPISTACK_Res_tot,
    l_Inter_IONIC_Res_tot, l_Inter_SSBOND_Res_tot, l_Inter_PICATION_Res_tot,
) = ([] for _ in range(21))

# Short-named lists: not referenced by the code visible in this section;
# kept defined in case other cells rely on them.
(
    l_Uniprot, l_F, l_Node, l_pos, l_chain,
    l_lig, l_res,
    l_IAC_L, l_VDW_L, l_HBOND_L, l_PIPISTACK_L, l_IONIC_L, l_SSBOND_L, l_PICATION_L,
    l_IAC_R, l_VDW_R, l_HBOND_R, l_PIPISTACK_R, l_IONIC_R, l_SSBOND_R, l_PICATION_R,
) = ([] for _ in range(21))

def process_reg_ord(df):
  """Collapse consecutive rows of `df` that share a node key into one record.

  The key is (Uniprot_AF_id_RING, F_AF_RING, Node_RING, Node_pos_RING,
  Node_chain_RING). A row is compared only with the row immediately before
  it, so `df` must already be ordered by that key for each node to end up in
  a single record. Every ``Inter_*`` total column is summed over each run.

  Each call builds and returns new lists, so calling the function again
  (for example by re-running the cell) never appends to earlier results.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain the five key columns and the sixteen ``Inter_*`` total
      columns; any other column is ignored.

  Returns
  -------
  tuple of 21 lists
      One list per output column, one entry per run, in this order: the
      five key columns, Inter_Lig_tot, Inter_Res_tot, the seven
      ``Inter_*_Lig_tot`` columns and the seven ``Inter_*_Res_tot`` columns
      (IAC, VDW, HBOND, PIPISTACK, IONIC, SSBOND, PICATION). An empty `df`
      yields 21 empty lists.
  """
  from operator import attrgetter

  key_cols = ('Uniprot_AF_id_RING', 'F_AF_RING', 'Node_RING',
              'Node_pos_RING', 'Node_chain_RING')
  total_cols = ('Inter_Lig_tot', 'Inter_Res_tot',
                'Inter_IAC_Lig_tot', 'Inter_VDW_Lig_tot', 'Inter_HBOND_Lig_tot',
                'Inter_PIPISTACK_Lig_tot', 'Inter_IONIC_Lig_tot',
                'Inter_SSBOND_Lig_tot', 'Inter_PICATION_Lig_tot',
                'Inter_IAC_Res_tot', 'Inter_VDW_Res_tot', 'Inter_HBOND_Res_tot',
                'Inter_PIPISTACK_Res_tot', 'Inter_IONIC_Res_tot',
                'Inter_SSBOND_Res_tot', 'Inter_PICATION_Res_tot')

  # attrgetter with several names returns a tuple of the row's values.
  read_key = attrgetter(*key_cols)
  read_totals = attrgetter(*total_cols)

  out_keys = tuple([] for _ in key_cols)
  out_totals = tuple([] for _ in total_cols)

  def emit(key, sums):
    # Append one finished run to the output columns.
    for column, value in zip(out_keys, key):
      column.append(value)
    for column, value in zip(out_totals, sums):
      column.append(value)

  current_key = None
  running = None  # None until the first row has been read
  for row in df.itertuples():
    key = read_key(row)
    values = read_totals(row)
    if running is None:
      # First row by position, whatever its index label is.
      print("e o primeiro")
      current_key, running = key, list(values)
    elif key == current_key:
      running = [acc + value for acc, value in zip(running, values)]
    else:
      # Key changed: store the finished run and start a new one.
      emit(current_key, running)
      current_key, running = key, list(values)

  # The last run has no following row to trigger the flush, so store it here.
  if running is not None:
    emit(current_key, running)

  return (*out_keys, *out_totals)


In [None]:
#Processing the edges and joining the information when it is source and target
l_Uniprot1,l_F1,l_Node1,l_pos1,l_chain1,l_lig1,l_res1,l_IAC_L1,l_VDW_L1,l_HBOND_L1,l_PIPISTACK_L1,l_IONIC_L1,l_SSBOND_L1,l_PICATION_L1,l_IAC_R1,l_VDW_R1,l_HBOND_R1,l_PIPISTACK_R1,l_IONIC_R1,l_SSBOND_R1,l_PICATION_R1 = process_reg_ord(df_ord)

e o primeiro


In [None]:
#Resulting size of processing Source
tam = len(l_Uniprot1)
print(tam)

7669552


In [None]:
# Build the aggregated edge table in a single step from the per-column lists.
# Creating a new frame (instead of filling an existing one column by column)
# keeps this cell re-runnable: it no longer matters how many rows a
# previously created df_proc_edge holds.
df_proc_edge = pd.DataFrame({
    'Uniprot_AF_id_RING': l_Uniprot1,
    'F_AF_RING': l_F1,
    'Node_RING': l_Node1,
    'Node_pos_RING': l_pos1,
    'Node_chain_RING': l_chain1,
    'Inter_Lig_tot': l_lig1,
    'Inter_Res_tot': l_res1,
    'Inter_IAC_Lig_tot': l_IAC_L1,
    'Inter_VDW_Lig_tot': l_VDW_L1,
    'Inter_HBOND_Lig_tot': l_HBOND_L1,
    'Inter_PIPISTACK_Lig_tot': l_PIPISTACK_L1,
    'Inter_IONIC_Lig_tot': l_IONIC_L1,
    'Inter_SSBOND_Lig_tot': l_SSBOND_L1,
    'Inter_PICATION_Lig_tot': l_PICATION_L1,
    'Inter_IAC_Res_tot': l_IAC_R1,
    'Inter_VDW_Res_tot': l_VDW_R1,
    'Inter_HBOND_Res_tot': l_HBOND_R1,
    'Inter_PIPISTACK_Res_tot': l_PIPISTACK_R1,
    'Inter_IONIC_Res_tot': l_IONIC_R1,
    'Inter_SSBOND_Res_tot': l_SSBOND_R1,
    'Inter_PICATION_Res_tot': l_PICATION_R1,
})

In [None]:
df_proc_edge.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7669552 entries, 0 to 7669551
Data columns (total 21 columns):
 #   Column                   Dtype 
---  ------                   ----- 
 0   Uniprot_AF_id_RING       object
 1   F_AF_RING                object
 2   Node_RING                object
 3   Node_pos_RING            int64 
 4   Node_chain_RING          object
 5   Inter_Lig_tot            int64 
 6   Inter_Res_tot            int64 
 7   Inter_IAC_Lig_tot        int64 
 8   Inter_VDW_Lig_tot        int64 
 9   Inter_HBOND_Lig_tot      int64 
 10  Inter_PIPISTACK_Lig_tot  int64 
 11  Inter_IONIC_Lig_tot      int64 
 12  Inter_SSBOND_Lig_tot     int64 
 13  Inter_PICATION_Lig_tot   int64 
 14  Inter_IAC_Res_tot        int64 
 15  Inter_VDW_Res_tot        int64 
 16  Inter_HBOND_Res_tot      int64 
 17  Inter_PIPISTACK_Res_tot  int64 
 18  Inter_IONIC_Res_tot      int64 
 19  Inter_SSBOND_Res_tot     int64 
 20  Inter_PICATION_Res_tot   int64 
dtypes: int64(17), object(4)
memory 

Comparing the result (**df_proc_edge**) with the ordered edges database (**df_ord**)

In [None]:
df_ord.head(10)

Unnamed: 0,Uniprot_AF_id_RING,F_AF_RING,Node_RING,Node_pos_RING,Node_chain_RING,Node_type,Inter_Lig_tot,Inter_Res_tot,Inter_IAC_Lig_tot,Inter_VDW_Lig_tot,Inter_HBOND_Lig_tot,Inter_PIPISTACK_Lig_tot,Inter_IONIC_Lig_tot,Inter_SSBOND_Lig_tot,Inter_PICATION_Lig_tot,Inter_IAC_Res_tot,Inter_VDW_Res_tot,Inter_HBOND_Res_tot,Inter_PIPISTACK_Res_tot,Inter_IONIC_Res_tot,Inter_SSBOND_Res_tot,Inter_PICATION_Res_tot
0,A0AUZ9,F1,Ala,213,A,S,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0
1,A0AUZ9,F1,Ala,214,A,S,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0
2,A0AUZ9,F1,Ala,222,A,S,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0
3,A0AUZ9,F1,Ala,222,A,T,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0
4,A0AUZ9,F1,Ala,238,A,S,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0
5,A0AUZ9,F1,Ala,238,A,T,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
6,A0AUZ9,F1,Ala,250,A,S,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0
7,A0AUZ9,F1,Ala,250,A,T,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0
8,A0AUZ9,F1,Ala,318,A,S,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0
9,A0AUZ9,F1,Ala,319,A,S,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0


In [None]:
df_proc_edge.head(10)

Unnamed: 0,Uniprot_AF_id_RING,F_AF_RING,Node_RING,Node_pos_RING,Node_chain_RING,Inter_Lig_tot,Inter_Res_tot,Inter_IAC_Lig_tot,Inter_VDW_Lig_tot,Inter_HBOND_Lig_tot,Inter_PIPISTACK_Lig_tot,Inter_IONIC_Lig_tot,Inter_SSBOND_Lig_tot,Inter_PICATION_Lig_tot,Inter_IAC_Res_tot,Inter_VDW_Res_tot,Inter_HBOND_Res_tot,Inter_PIPISTACK_Res_tot,Inter_IONIC_Res_tot,Inter_SSBOND_Res_tot,Inter_PICATION_Res_tot
0,A0AUZ9,F1,Ala,213,A,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0
1,A0AUZ9,F1,Ala,214,A,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0
2,A0AUZ9,F1,Ala,222,A,0,4,0,0,0,0,0,0,0,0,0,4,0,0,0,0
3,A0AUZ9,F1,Ala,238,A,0,3,0,0,0,0,0,0,0,0,0,3,0,0,0,0
4,A0AUZ9,F1,Ala,250,A,0,4,0,0,0,0,0,0,0,0,0,4,0,0,0,0
5,A0AUZ9,F1,Ala,318,A,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0
6,A0AUZ9,F1,Ala,319,A,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0
7,A0AUZ9,F1,Ala,325,A,0,3,0,0,0,0,0,0,0,0,0,3,0,0,0,0
8,A0AUZ9,F1,Ala,328,A,0,3,0,0,0,0,0,0,0,0,1,2,0,0,0,0
9,A0AUZ9,F1,Ala,343,A,0,2,0,0,0,0,0,0,0,0,1,1,0,0,0,0


In [None]:
df_proc_edge.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7669552 entries, 0 to 7669551
Data columns (total 21 columns):
 #   Column                   Dtype 
---  ------                   ----- 
 0   Uniprot_AF_id_RING       object
 1   F_AF_RING                object
 2   Node_RING                object
 3   Node_pos_RING            int64 
 4   Node_chain_RING          object
 5   Inter_Lig_tot            int64 
 6   Inter_Res_tot            int64 
 7   Inter_IAC_Lig_tot        int64 
 8   Inter_VDW_Lig_tot        int64 
 9   Inter_HBOND_Lig_tot      int64 
 10  Inter_PIPISTACK_Lig_tot  int64 
 11  Inter_IONIC_Lig_tot      int64 
 12  Inter_SSBOND_Lig_tot     int64 
 13  Inter_PICATION_Lig_tot   int64 
 14  Inter_IAC_Res_tot        int64 
 15  Inter_VDW_Res_tot        int64 
 16  Inter_HBOND_Res_tot      int64 
 17  Inter_PIPISTACK_Res_tot  int64 
 18  Inter_IONIC_Res_tot      int64 
 19  Inter_SSBOND_Res_tot     int64 
 20  Inter_PICATION_Res_tot   int64 
dtypes: int64(17), object(4)
memory 

In [None]:
df_proc_edge.columns

Index(['Uniprot_AF_id_RING', 'F_AF_RING', 'Node_RING', 'Node_pos_RING', 'Node_chain_RING', 'Inter_Lig_tot', 'Inter_Res_tot', 'Inter_IAC_Lig_tot', 'Inter_VDW_Lig_tot', 'Inter_HBOND_Lig_tot', 'Inter_PIPISTACK_Lig_tot', 'Inter_IONIC_Lig_tot', 'Inter_SSBOND_Lig_tot', 'Inter_PICATION_Lig_tot', 'Inter_IAC_Res_tot', 'Inter_VDW_Res_tot', 'Inter_HBOND_Res_tot', 'Inter_PIPISTACK_Res_tot', 'Inter_IONIC_Res_tot', 'Inter_SSBOND_Res_tot', 'Inter_PICATION_Res_tot'], dtype='object')

In [None]:
def categories_column(df, cols=None):
    """Print, for each selected column, a dict of distinct values and their counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.
    cols : iterable of str, optional
        Columns to report. When omitted, the five node-key columns of the
        processed edge table are used.

    Each column is printed as ``<name> <dict>`` followed by two blank lines.
    """
    if cols is None:
        cols = ['Uniprot_AF_id_RING', 'F_AF_RING', 'Node_RING', 'Node_pos_RING', 'Node_chain_RING']
    for col in cols:
        mydic = df[col].value_counts().to_dict()
        print(col, mydic)
        print('\n')

categories_column(df_proc_edge)

Uniprot_AF_id_RING {'Q8NF91': 49529, 'Q8WXH0': 35319, 'Q9UPN3': 34741, 'Q03001': 33569, 'Q8WXG9': 30309, 'Q96RW7': 27441, 'Q9NU22': 25332, 'O75445': 24636, 'Q14204': 22990, 'Q15149': 22982, 'Q8TE73': 22953, 'P21817': 22783, 'Q6V0I7': 22777, 'Q15413': 22615, 'Q5T4S7': 22373, 'Q92736': 22334, 'Q9NYC9': 22173, 'Q8IVF4': 22060, 'Q96DT5': 22011, 'Q9P225': 21963, 'Q96JB1': 21829, 'P98164': 21673, 'Q9NZJ4': 21047, 'Q4G0P3': 20979, 'O95714': 20777, 'Q07954': 20750, 'Q9NZR2': 20702, 'Q96M86': 20688, 'Q8NCM8': 20534, 'Q14517': 20381, 'Q8TDW7': 20380, 'Q86WI1': 19635, 'Q8WXX0': 19360, 'Q8TD57': 19345, 'Q9C0G6': 19264, 'Q9NYQ8': 19155, 'Q15751': 18905, 'P78527': 18074, 'P98160': 17937, 'Q09666': 17809, 'P98161': 17761, 'P08F94': 17592, 'Q9Y4A5': 16924, 'Q9NRC6': 16529, 'Q96Q15': 15734, 'Q685J3': 15717, 'O60494': 15528, 'O15230': 15496, 'Q7Z407': 15305, 'Q7Z7G8': 15150, 'P46939': 14864, 'Q99996': 14720, 'Q709C8': 14468, 'Q96PZ7': 14347, 'Q2LD37': 14230, 'Q8IZT6': 14204, 'Q7Z408': 14111, 'P78509': 1

In [None]:
#Checking for missing values.
df_proc_edge.isna().sum()

Uniprot_AF_id_RING         0
F_AF_RING                  0
Node_RING                  0
Node_pos_RING              0
Node_chain_RING            0
Inter_Lig_tot              0
Inter_Res_tot              0
Inter_IAC_Lig_tot          0
Inter_VDW_Lig_tot          0
Inter_HBOND_Lig_tot        0
Inter_PIPISTACK_Lig_tot    0
Inter_IONIC_Lig_tot        0
Inter_SSBOND_Lig_tot       0
Inter_PICATION_Lig_tot     0
Inter_IAC_Res_tot          0
Inter_VDW_Res_tot          0
Inter_HBOND_Res_tot        0
Inter_PIPISTACK_Res_tot    0
Inter_IONIC_Res_tot        0
Inter_SSBOND_Res_tot       0
Inter_PICATION_Res_tot     0
dtype: int64

In [None]:
#Identify duplicates records in the data
dupes=df_proc_edge.duplicated()
sum(dupes)

0

The check above returned 0, so **df_proc_edge** contains no fully duplicated rows.

In [None]:
df_proc_edge.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7669552 entries, 0 to 7669551
Data columns (total 21 columns):
 #   Column                   Dtype 
---  ------                   ----- 
 0   Uniprot_AF_id_RING       object
 1   F_AF_RING                object
 2   Node_RING                object
 3   Node_pos_RING            int64 
 4   Node_chain_RING          object
 5   Inter_Lig_tot            int64 
 6   Inter_Res_tot            int64 
 7   Inter_IAC_Lig_tot        int64 
 8   Inter_VDW_Lig_tot        int64 
 9   Inter_HBOND_Lig_tot      int64 
 10  Inter_PIPISTACK_Lig_tot  int64 
 11  Inter_IONIC_Lig_tot      int64 
 12  Inter_SSBOND_Lig_tot     int64 
 13  Inter_PICATION_Lig_tot   int64 
 14  Inter_IAC_Res_tot        int64 
 15  Inter_VDW_Res_tot        int64 
 16  Inter_HBOND_Res_tot      int64 
 17  Inter_PIPISTACK_Res_tot  int64 
 18  Inter_IONIC_Res_tot      int64 
 19  Inter_SSBOND_Res_tot     int64 
 20  Inter_PICATION_Res_tot   int64 
dtypes: int64(17), object(4)
memory 

In [None]:
res = df_proc_edge[['Uniprot_AF_id_RING','F_AF_RING','Node_RING','Node_pos_RING']].sort_values(by='Uniprot_AF_id_RING')

In [None]:
res.head(200)

Unnamed: 0,Uniprot_AF_id_RING,F_AF_RING,Node_RING,Node_pos_RING
0,A0AUZ9,F1,Ala,213
177,A0AUZ9,F1,Lys,216
178,A0AUZ9,F1,Lys,230
179,A0AUZ9,F1,Lys,232
180,A0AUZ9,F1,Lys,243
181,A0AUZ9,F1,Lys,251
182,A0AUZ9,F1,Lys,255
183,A0AUZ9,F1,Lys,262
184,A0AUZ9,F1,Lys,266
185,A0AUZ9,F1,Lys,271


In [None]:
df_proc_edge.to_csv("drive/My Drive/ProcessaNovaBase/TrataArqsRINGAlphaFold/edgesDB_proc.csv",sep='\t',index=False)

#2 - Reading and processing the RING nodes file

##2.1 Processing the *nodesDB.txt* database

In [None]:
#increasing the viewing capacity of columns and rows
import pandas as pd

pd.set_option('display.max_columns', 7000)
pd.set_option('display.max_rows',1000000)
pd.set_option('display.width', 7000)

In [None]:
import pandas as pd

df_RING_nodes = pd.read_csv("drive/My Drive/ProcessaNovaBase/TrataArqsRINGAlphaFold/nodesDB.txt",index_col=False, header=None, delimiter='\t')


In [None]:
df_RING_nodes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7669554 entries, 0 to 7669553
Data columns (total 14 columns):
 #   Column  Dtype  
---  ------  -----  
 0   0       object 
 1   1       object 
 2   2       object 
 3   3       int64  
 4   4       object 
 5   5       object 
 6   6       int64  
 7   7       float64
 8   8       float64
 9   9       float64
 10  10      float64
 11  11      object 
 12  12      float64
 13  13      float64
dtypes: float64(6), int64(2), object(6)
memory usage: 819.2+ MB


In [None]:
df_RING_nodes.head(100)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,AF-A0AUZ9-F1-model_v4.pdb.nodes,A:38:_:GLU,A,38,GLU,,1,40.31,-21.856,32.756,54.251,AF-A0AUZ9-F1-model_v4.pdb#38.A,-12.176,-0.12
1,AF-A0AUZ9-F1-model_v4.pdb.nodes,A:118:_:ASN,A,118,ASN,,2,29.9,-16.932,5.694,2.723,AF-A0AUZ9-F1-model_v4.pdb#118.A,-38.785,-0.065
2,AF-A0AUZ9-F1-model_v4.pdb.nodes,A:131:_:GLU,A,131,GLU,,2,37.29,-35.378,17.112,-28.386,AF-A0AUZ9-F1-model_v4.pdb#131.A,71.569,-0.306
3,AF-A0AUZ9-F1-model_v4.pdb.nodes,A:132:_:PHE,A,132,PHE,,4,35.84,-36.765,19.741,-30.761,AF-A0AUZ9-F1-model_v4.pdb#132.A,-15.654,-0.007
4,AF-A0AUZ9-F1-model_v4.pdb.nodes,A:211:_:SER,A,211,SER,,2,50.26,-45.704,32.789,-1.701,AF-A0AUZ9-F1-model_v4.pdb#211.A,-15.998,-0.32
5,AF-A0AUZ9-F1-model_v4.pdb.nodes,A:212:_:SER,A,212,SER,,6,58.17,-45.714,30.046,0.934,AF-A0AUZ9-F1-model_v4.pdb#212.A,-37.472,0.722
6,AF-A0AUZ9-F1-model_v4.pdb.nodes,A:213:_:ALA,A,213,ALA,H,2,66.0,-43.161,29.634,3.752,AF-A0AUZ9-F1-model_v4.pdb#213.A,-28.577,0.637
7,AF-A0AUZ9-F1-model_v4.pdb.nodes,A:214:_:ALA,A,214,ALA,H,2,67.33,-42.111,26.335,2.052,AF-A0AUZ9-F1-model_v4.pdb#214.A,-43.074,0.541
8,AF-A0AUZ9-F1-model_v4.pdb.nodes,A:215:_:GLU,A,215,GLU,H,6,68.39,-41.317,28.056,-1.332,AF-A0AUZ9-F1-model_v4.pdb#215.A,-88.078,0.369
9,AF-A0AUZ9-F1-model_v4.pdb.nodes,A:216:_:LYS,A,216,LYS,H,5,78.53,-38.962,30.462,0.557,AF-A0AUZ9-F1-model_v4.pdb#216.A,-122.191,0.188


###2.1.1 Renaming the fields

In [None]:
# Replace the positional column labels 0..13 with descriptive names;
# enumerate() pairs each name with its column position.
df_RING_nodes.rename(
    columns=dict(enumerate([
        'FileName',
        'NodeId_RING',
        'Chain_RING',
        'Position_RING',
        'Residue_RING',
        'Dssp_RING',
        'Degree_RING',
        'pLDDT_RING',   # In the AlphaFold context the Bfactor_CA field is called pLDDT
        'x',
        'y',
        'z',
        'pdbFileName',
        'Rapdf_RING',
        'Tap_RING',
    ])),
    inplace=True,
)

In [None]:
df_RING_nodes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7669554 entries, 0 to 7669553
Data columns (total 14 columns):
 #   Column         Dtype  
---  ------         -----  
 0   FileName       object 
 1   NodeId_RING    object 
 2   Chain_RING     object 
 3   Position_RING  int64  
 4   Residue_RING   object 
 5   Dssp_RING      object 
 6   Degree_RING    int64  
 7   pLDDT_RING     float64
 8   x              float64
 9   y              float64
 10  z              float64
 11  pdbFileName    object 
 12  Rapdf_RING     float64
 13  Tap_RING       float64
dtypes: float64(6), int64(2), object(6)
memory usage: 819.2+ MB


In [None]:
df_RING_nodes.head()

Unnamed: 0,FileName,NodeId_RING,Chain_RING,Position_RING,Residue_RING,Dssp_RING,Degree_RING,pLDDT_RING,x,y,z,pdbFileName,Rapdf_RING,Tap_RING
0,AF-A0AUZ9-F1-model_v4.pdb.nodes,A:38:_:GLU,A,38,GLU,,1,40.31,-21.856,32.756,54.251,AF-A0AUZ9-F1-model_v4.pdb#38.A,-12.176,-0.12
1,AF-A0AUZ9-F1-model_v4.pdb.nodes,A:118:_:ASN,A,118,ASN,,2,29.9,-16.932,5.694,2.723,AF-A0AUZ9-F1-model_v4.pdb#118.A,-38.785,-0.065
2,AF-A0AUZ9-F1-model_v4.pdb.nodes,A:131:_:GLU,A,131,GLU,,2,37.29,-35.378,17.112,-28.386,AF-A0AUZ9-F1-model_v4.pdb#131.A,71.569,-0.306
3,AF-A0AUZ9-F1-model_v4.pdb.nodes,A:132:_:PHE,A,132,PHE,,4,35.84,-36.765,19.741,-30.761,AF-A0AUZ9-F1-model_v4.pdb#132.A,-15.654,-0.007
4,AF-A0AUZ9-F1-model_v4.pdb.nodes,A:211:_:SER,A,211,SER,,2,50.26,-45.704,32.789,-1.701,AF-A0AUZ9-F1-model_v4.pdb#211.A,-15.998,-0.32


###2.1.2  Selection of fields that will be used

In [None]:
#Selection of fields
df_RING_nodes = df_RING_nodes.loc[:,['FileName','NodeId_RING', 'Chain_RING','Position_RING', 'Residue_RING','Dssp_RING','Degree_RING','pLDDT_RING']]

In [None]:
df_RING_nodes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7669554 entries, 0 to 7669553
Data columns (total 8 columns):
 #   Column         Dtype  
---  ------         -----  
 0   FileName       object 
 1   NodeId_RING    object 
 2   Chain_RING     object 
 3   Position_RING  int64  
 4   Residue_RING   object 
 5   Dssp_RING      object 
 6   Degree_RING    int64  
 7   pLDDT_RING     float64
dtypes: float64(1), int64(2), object(5)
memory usage: 468.1+ MB


###2.1.3 Generating the *Uniprot_AF_id_RING* attribute extracted from the *FileName* attribute.

In [None]:
def get_uniprot(filename):
  """Return the second "-"-separated field of a RING file name.

  Example: "AF-A0AUZ9-F1-model_v4.pdb.nodes" -> "A0AUZ9".
  """
  # Only the first two separators matter for reaching field 1.
  return filename.split("-", 2)[1]

In [None]:
df_RING_nodes['Uniprot_AF_id_RING'] = df_RING_nodes['FileName'].apply(get_uniprot)

In [None]:
df_RING_nodes.head()

Unnamed: 0,FileName,NodeId_RING,Chain_RING,Position_RING,Residue_RING,Dssp_RING,Degree_RING,pLDDT_RING,Uniprot_AF_id_RING
0,AF-A0AUZ9-F1-model_v4.pdb.nodes,A:38:_:GLU,A,38,GLU,,1,40.31,A0AUZ9
1,AF-A0AUZ9-F1-model_v4.pdb.nodes,A:118:_:ASN,A,118,ASN,,2,29.9,A0AUZ9
2,AF-A0AUZ9-F1-model_v4.pdb.nodes,A:131:_:GLU,A,131,GLU,,2,37.29,A0AUZ9
3,AF-A0AUZ9-F1-model_v4.pdb.nodes,A:132:_:PHE,A,132,PHE,,4,35.84,A0AUZ9
4,AF-A0AUZ9-F1-model_v4.pdb.nodes,A:211:_:SER,A,211,SER,,2,50.26,A0AUZ9


###2.1.4 Generating the *F_AF_RING* attribute extracted from the *FileName* attribute

In [None]:
def get_F(filename):
  """Return the third "-"-separated field of a RING file name.

  Example: "AF-A0AUZ9-F1-model_v4.pdb.nodes" -> "F1".
  """
  # Only the first three separators matter for reaching field 2.
  return filename.split("-", 3)[2]

In [None]:
df_RING_nodes['F_AF_RING'] = df_RING_nodes['FileName'].apply(get_F)

In [None]:
df_RING_nodes.tail(50)

Unnamed: 0,FileName,NodeId_RING,Chain_RING,Position_RING,Residue_RING,Dssp_RING,Degree_RING,pLDDT_RING,Uniprot_AF_id_RING,F_AF_RING
7669504,AF-Q9Y6Z7-F1-model_v4.pdb.nodes,A:216:_:ARG,A,216,ARG,T,7,94.6,Q9Y6Z7,F1
7669505,AF-Q9Y6Z7-F1-model_v4.pdb.nodes,A:218:_:GLY,A,218,GLY,T,1,93.39,Q9Y6Z7,F1
7669506,AF-Q9Y6Z7-F1-model_v4.pdb.nodes,A:219:_:GLN,A,219,GLN,,7,95.06,Q9Y6Z7,F1
7669507,AF-Q9Y6Z7-F1-model_v4.pdb.nodes,A:220:_:TYR,A,220,TYR,,20,96.86,Q9Y6Z7,F1
7669508,AF-Q9Y6Z7-F1-model_v4.pdb.nodes,A:221:_:MET,A,221,MET,E,5,97.16,Q9Y6Z7,F1
7669509,AF-Q9Y6Z7-F1-model_v4.pdb.nodes,A:222:_:PHE,A,222,PHE,E,15,97.55,Q9Y6Z7,F1
7669510,AF-Q9Y6Z7-F1-model_v4.pdb.nodes,A:223:_:THR,A,223,THR,T,8,96.45,Q9Y6Z7,F1
7669511,AF-Q9Y6Z7-F1-model_v4.pdb.nodes,A:224:_:ASP,A,224,ASP,T,9,95.22,Q9Y6Z7,F1
7669512,AF-Q9Y6Z7-F1-model_v4.pdb.nodes,A:225:_:ASN,A,225,ASN,T,1,94.38,Q9Y6Z7,F1
7669513,AF-Q9Y6Z7-F1-model_v4.pdb.nodes,A:226:_:THR,A,226,THR,E,2,95.47,Q9Y6Z7,F1


The **FileName** attribute will no longer be needed, so it will be removed.

In [None]:
del df_RING_nodes['FileName']

In [None]:
df_RING_nodes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7669554 entries, 0 to 7669553
Data columns (total 9 columns):
 #   Column              Dtype  
---  ------              -----  
 0   NodeId_RING         object 
 1   Chain_RING          object 
 2   Position_RING       int64  
 3   Residue_RING        object 
 4   Dssp_RING           object 
 5   Degree_RING         int64  
 6   pLDDT_RING          float64
 7   Uniprot_AF_id_RING  object 
 8   F_AF_RING           object 
dtypes: float64(1), int64(2), object(6)
memory usage: 526.6+ MB


In [None]:
#Identify duplicates records in the data
dupes=df_RING_nodes.duplicated()
sum(dupes)

0

In [None]:
#Checking for missing values
df_RING_nodes.isna().sum()

NodeId_RING           0
Chain_RING            0
Position_RING         0
Residue_RING          0
Dssp_RING             0
Degree_RING           0
pLDDT_RING            0
Uniprot_AF_id_RING    0
F_AF_RING             0
dtype: int64

In [None]:
df_RING_nodes.head()

Unnamed: 0,NodeId_RING,Chain_RING,Position_RING,Residue_RING,Dssp_RING,Degree_RING,pLDDT_RING,Uniprot_AF_id_RING,F_AF_RING
0,A:38:_:GLU,A,38,GLU,,1,40.31,A0AUZ9,F1
1,A:118:_:ASN,A,118,ASN,,2,29.9,A0AUZ9,F1
2,A:131:_:GLU,A,131,GLU,,2,37.29,A0AUZ9,F1
3,A:132:_:PHE,A,132,PHE,,4,35.84,A0AUZ9,F1
4,A:211:_:SER,A,211,SER,,2,50.26,A0AUZ9,F1


In [None]:
df_RING_nodes.columns

Index(['NodeId_RING', 'Chain_RING', 'Position_RING', 'Residue_RING', 'Dssp_RING', 'Degree_RING', 'pLDDT_RING', 'Uniprot_AF_id_RING', 'F_AF_RING'], dtype='object')

In [None]:
#Except NodeId
def categories_column(df, cols=None):
    """Print, for each selected column, a dict of distinct values and their counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.
    cols : iterable of str, optional
        Columns to report. When omitted, every column of the RING nodes
        table except NodeId_RING is used.

    Each column is printed as ``<name> <dict>`` followed by two blank lines.
    """
    if cols is None:
        cols = ['Uniprot_AF_id_RING', 'F_AF_RING', 'Chain_RING', 'Position_RING',
                'Residue_RING', 'Dssp_RING', 'Degree_RING', 'pLDDT_RING']
    for col in cols:
        mydic = df[col].value_counts().to_dict()
        print(col, mydic)
        print('\n')

categories_column(df_RING_nodes)

Uniprot_AF_id_RING {'Q8NF91': 49529, 'Q8WXH0': 35319, 'Q9UPN3': 34741, 'Q03001': 33569, 'Q8WXG9': 30309, 'Q96RW7': 27441, 'Q9NU22': 25332, 'O75445': 24636, 'Q14204': 22990, 'Q15149': 22982, 'Q8TE73': 22953, 'P21817': 22783, 'Q6V0I7': 22777, 'Q15413': 22615, 'Q5T4S7': 22373, 'Q92736': 22334, 'Q9NYC9': 22173, 'Q8IVF4': 22060, 'Q96DT5': 22011, 'Q9P225': 21963, 'Q96JB1': 21829, 'P98164': 21673, 'Q9NZJ4': 21047, 'Q4G0P3': 20979, 'O95714': 20777, 'Q07954': 20750, 'Q9NZR2': 20702, 'Q96M86': 20688, 'Q8NCM8': 20534, 'Q14517': 20381, 'Q8TDW7': 20380, 'Q86WI1': 19635, 'Q8WXX0': 19360, 'Q8TD57': 19345, 'Q9C0G6': 19264, 'Q9NYQ8': 19155, 'Q15751': 18905, 'P78527': 18074, 'P98160': 17937, 'Q09666': 17809, 'P98161': 17761, 'P08F94': 17592, 'Q9Y4A5': 16924, 'Q9NRC6': 16529, 'Q96Q15': 15734, 'Q685J3': 15717, 'O60494': 15528, 'O15230': 15496, 'Q7Z407': 15305, 'Q7Z7G8': 15150, 'P46939': 14864, 'Q99996': 14720, 'Q709C8': 14468, 'Q96PZ7': 14347, 'Q2LD37': 14230, 'Q8IZT6': 14204, 'Q7Z408': 14111, 'P78509': 1

In [None]:
df_RING_nodes['Residue_RING'].value_counts()

LEU    908070
GLU    533650
VAL    522981
ALA    512650
SER    505002
LYS    443008
ARG    434394
ILE    419089
THR    391382
GLY    389432
GLN    378407
ASP    358144
PHE    341844
ASN    286443
PRO    282835
TYR    248207
CYS    212140
HIS    209539
MET    178996
TRP    113341
Name: Residue_RING, dtype: int64

In [None]:
df_RING_nodes['Dssp_RING'].value_counts()

H    3980300
E    1651891
     1037644
S     331591
T     300097
G     298746
B      58521
I      10764
Name: Dssp_RING, dtype: int64

In [None]:
res = df_RING_nodes.query('Dssp_RING == " " ')

In [None]:
res.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1037644 entries, 0 to 7669553
Data columns (total 9 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   NodeId_RING         1037644 non-null  object 
 1   Chain_RING          1037644 non-null  object 
 2   Position_RING       1037644 non-null  int64  
 3   Residue_RING        1037644 non-null  object 
 4   Dssp_RING           1037644 non-null  object 
 5   Degree_RING         1037644 non-null  int64  
 6   pLDDT_RING          1037644 non-null  float64
 7   Uniprot_AF_id_RING  1037644 non-null  object 
 8   F_AF_RING           1037644 non-null  object 
dtypes: float64(1), int64(2), object(6)
memory usage: 79.2+ MB


In [None]:
res.head(100)

Unnamed: 0,NodeId_RING,Chain_RING,Position_RING,Residue_RING,Dssp_RING,Degree_RING,pLDDT_RING,Uniprot_AF_id_RING,F_AF_RING
0,A:38:_:GLU,A,38,GLU,,1,40.31,A0AUZ9,F1
1,A:118:_:ASN,A,118,ASN,,2,29.9,A0AUZ9,F1
2,A:131:_:GLU,A,131,GLU,,2,37.29,A0AUZ9,F1
3,A:132:_:PHE,A,132,PHE,,4,35.84,A0AUZ9,F1
4,A:211:_:SER,A,211,SER,,2,50.26,A0AUZ9,F1
5,A:212:_:SER,A,212,SER,,6,58.17,A0AUZ9,F1
65,A:317:_:THR,A,317,THR,,5,58.06,A0AUZ9,F1
88,A:340:_:ASP,A,340,ASP,,4,61.9,A0AUZ9,F1
89,A:343:_:ALA,A,343,ALA,,3,61.45,A0AUZ9,F1
137,A:411:_:GLY,A,411,GLY,,1,73.11,A0AUZ9,F1


Records with **Dssp = " "** will be discarded.

In [None]:
base_nodes_valida  = df_RING_nodes.query('Dssp_RING !=  " "')

In [None]:
base_nodes_valida.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6631910 entries, 6 to 7669551
Data columns (total 9 columns):
 #   Column              Dtype  
---  ------              -----  
 0   NodeId_RING         object 
 1   Chain_RING          object 
 2   Position_RING       int64  
 3   Residue_RING        object 
 4   Dssp_RING           object 
 5   Degree_RING         int64  
 6   pLDDT_RING          float64
 7   Uniprot_AF_id_RING  object 
 8   F_AF_RING           object 
dtypes: float64(1), int64(2), object(6)
memory usage: 506.0+ MB


In [None]:
#Except NodeId
def categories_column(df, cols=None):
    """Print, for each selected column, a dict of distinct values and their counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.
    cols : iterable of str, optional
        Columns to report. When omitted, every column of the RING nodes
        table except NodeId_RING is used.

    Each column is printed as ``<name> <dict>`` followed by two blank lines.
    """
    if cols is None:
        cols = ['Uniprot_AF_id_RING', 'F_AF_RING', 'Chain_RING', 'Position_RING',
                'Residue_RING', 'Dssp_RING', 'Degree_RING', 'pLDDT_RING']
    for col in cols:
        mydic = df[col].value_counts().to_dict()
        print(col, mydic)
        print('\n')

categories_column(base_nodes_valida)

Uniprot_AF_id_RING {'Q8NF91': 47420, 'Q8WXH0': 33647, 'Q9UPN3': 32670, 'Q03001': 31506, 'Q8WXG9': 24100, 'Q96RW7': 23634, 'Q9NU22': 21877, 'Q15149': 21833, 'Q8TE73': 20334, 'Q14204': 20315, 'Q5T4S7': 19905, 'P21817': 19758, 'Q15413': 19659, 'Q9NYC9': 19542, 'Q8IVF4': 19519, 'Q9P225': 19449, 'Q96DT5': 19419, 'Q92736': 19382, 'Q96JB1': 19197, 'O75445': 18956, 'P98164': 18456, 'Q8NCM8': 18248, 'Q96M86': 18228, 'Q6V0I7': 17955, 'O95714': 17938, 'Q9NZR2': 17857, 'Q07954': 17824, 'Q4G0P3': 17755, 'Q9NZJ4': 17658, 'Q8WXX0': 16953, 'Q8TD57': 16951, 'Q9C0G6': 16918, 'Q15751': 16374, 'Q8TDW7': 16315, 'Q14517': 16195, 'P78527': 16192, 'Q86WI1': 16138, 'Q9NRC6': 15775, 'Q9Y4A5': 15318, 'Q9NYQ8': 15286, 'P98161': 15172, 'P98160': 15092, 'P08F94': 14777, 'Q99996': 14497, 'Q96Q15': 14183, 'P46939': 14125, 'Q8IZT6': 13690, 'Q7Z7G8': 13025, 'O15230': 13014, 'Q14789': 12988, 'O60494': 12737, 'Q709C8': 12718, 'Q2LD37': 12323, 'Q99698': 12173, 'Q7Z407': 12112, 'Q0VDD8': 11831, 'Q70CQ2': 11613, 'Q96PZ7': 1

###2.1.5  Generating an intermediate file with the processed *nodesDB.txt* database  

In [None]:
base_nodes_valida.to_csv("drive/My Drive/ProcessaNovaBase/TrataArqsRINGAlphaFold/nodesDB_sel.csv",sep='\t',index=False)

###2.1.6 Processing the *Residue* attribute

In [None]:
#increasing the viewing capacity of columns and rows
import pandas as pd

pd.set_option('display.max_columns', 7000)
pd.set_option('display.max_rows',1000000)
pd.set_option('display.width', 7000)

In [None]:
import pandas as pd
df_RING_nodes = pd.read_csv("drive/My Drive/ProcessaNovaBase/TrataArqsRINGAlphaFold/nodesDB_sel.csv",delimiter='\t')

In [None]:
df_RING_nodes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6631910 entries, 0 to 6631909
Data columns (total 9 columns):
 #   Column              Dtype  
---  ------              -----  
 0   NodeId_RING         object 
 1   Chain_RING          object 
 2   Position_RING       int64  
 3   Residue_RING        object 
 4   Dssp_RING           object 
 5   Degree_RING         int64  
 6   pLDDT_RING          float64
 7   Uniprot_AF_id_RING  object 
 8   F_AF_RING           object 
dtypes: float64(1), int64(2), object(6)
memory usage: 455.4+ MB


Residue names are converted to the three-letter amino-acid notation in which only the first letter is capitalized (for example, `ALA` becomes `Ala`).

In [None]:
# Upper-case three-letter residue codes to normalise: the 20 standard amino
# acids plus PYL, SEC, ASX and GLX.
Amin = [
    'ALA', 'ARG', 'ASN', 'ASP', 'CYS', 'GLN',
    'GLU', 'GLY', 'HIS', 'ILE', 'LEU', 'LYS',
    'MET', 'PHE', 'PRO', 'PYL', 'SER', 'SEC',
    'THR', 'TRP', 'TYR', 'VAL', 'ASX', 'GLX',
]

# Codes listed in Amin become e.g. 'ALA' -> 'Ala'; any other value is kept as is.
df_RING_nodes["Residue_RING"] = df_RING_nodes["Residue_RING"].map(
    lambda residue: residue if residue not in Amin else residue.capitalize()
)


In [None]:
df_RING_nodes["Residue_RING"].value_counts()

Leu    817050
Glu    482846
Val    467228
Ala    463814
Ser    411384
Lys    391977
Arg    383184
Ile    376359
Gly    344161
Gln    341366
Thr    332227
Phe    304579
Asp    277850
Asn    234975
Tyr    221429
His    180666
Cys    176973
Pro    165852
Met    156725
Trp    101265
Name: Residue_RING, dtype: int64

In [None]:
df_RING_nodes.head(270)

Unnamed: 0,NodeId_RING,Chain_RING,Position_RING,Residue_RING,Dssp_RING,Degree_RING,pLDDT_RING,Uniprot_AF_id_RING,F_AF_RING
0,A:213:_:ALA,A,213,Ala,H,2,66.0,A0AUZ9,F1
1,A:214:_:ALA,A,214,Ala,H,2,67.33,A0AUZ9,F1
2,A:215:_:GLU,A,215,Glu,H,6,68.39,A0AUZ9,F1
3,A:216:_:LYS,A,216,Lys,H,5,78.53,A0AUZ9,F1
4,A:217:_:GLU,A,217,Glu,H,4,76.7,A0AUZ9,F1
5,A:218:_:GLU,A,218,Glu,H,3,79.38,A0AUZ9,F1
6,A:219:_:GLU,A,219,Glu,H,7,85.15,A0AUZ9,F1
7,A:220:_:VAL,A,220,Val,H,3,88.54,A0AUZ9,F1
8,A:221:_:HIS,A,221,His,H,5,88.97,A0AUZ9,F1
9,A:222:_:ALA,A,222,Ala,H,4,90.48,A0AUZ9,F1


In [None]:
df_RING_nodes.tail(30)

Unnamed: 0,NodeId_RING,Chain_RING,Position_RING,Residue_RING,Dssp_RING,Degree_RING,pLDDT_RING,Uniprot_AF_id_RING,F_AF_RING
6631880,A:223:_:THR,A,223,Thr,T,8,96.45,Q9Y6Z7,F1
6631881,A:224:_:ASP,A,224,Asp,T,9,95.22,Q9Y6Z7,F1
6631882,A:225:_:ASN,A,225,Asn,T,1,94.38,Q9Y6Z7,F1
6631883,A:226:_:THR,A,226,Thr,E,2,95.47,Q9Y6Z7,F1
6631884,A:234:_:TRP,A,234,Trp,B,16,97.75,Q9Y6Z7,F1
6631885,A:239:_:PRO,A,239,Pro,S,10,96.31,Q9Y6Z7,F1
6631886,A:242:_:PRO,A,242,Pro,T,1,89.71,Q9Y6Z7,F1
6631887,A:244:_:GLY,A,244,Gly,S,4,91.8,Q9Y6Z7,F1
6631888,A:247:_:ASP,A,247,Asp,E,4,97.82,Q9Y6Z7,F1
6631889,A:248:_:CYS,A,248,Cys,E,14,98.32,Q9Y6Z7,F1


In [None]:
#Checking for missing values (NaN and its variations).
df_RING_nodes.isna().sum()

NodeId_RING           0
Chain_RING            0
Position_RING         0
Residue_RING          0
Dssp_RING             0
Degree_RING           0
pLDDT_RING            0
Uniprot_AF_id_RING    0
F_AF_RING             0
dtype: int64

In [None]:
# Count fully duplicated rows (rows whose every column repeats an earlier row).
# Series.sum() is vectorised; the builtin sum() would loop over every row in Python.
dupes = df_RING_nodes.duplicated()
int(dupes.sum())

0

In [None]:
df_RING_nodes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6631910 entries, 0 to 6631909
Data columns (total 9 columns):
 #   Column              Dtype  
---  ------              -----  
 0   NodeId_RING         object 
 1   Chain_RING          object 
 2   Position_RING       int64  
 3   Residue_RING        object 
 4   Dssp_RING           object 
 5   Degree_RING         int64  
 6   pLDDT_RING          float64
 7   Uniprot_AF_id_RING  object 
 8   F_AF_RING           object 
dtypes: float64(1), int64(2), object(6)
memory usage: 455.4+ MB


###2.1.7 Generating a file with the processed *nodesDB.txt* database.

In [None]:
# Persist the processed node table as a tab-separated file, without the index column.
df_RING_nodes.to_csv(
    "drive/My Drive/ProcessaNovaBase/TrataArqsRINGAlphaFold/nodesDB_proc0.csv",
    index=False,
    sep='\t',
)

##2.2 - Generating the *global pLDDT* attribute in the *RING* nodes file.

The **pLDDT_global** attribute (representing the resolution) was generated by Elionai and is stored in the **plddt-global.csv** file. The fields of this file were processed by the notebook **Trata_Arq_plDDT**, which is located in the folder **drive/My Drive/ProcessaNovaBase/TrataArq_pLDDT**. The file produced by this processing is **plddt-global_proc.csv**.

In [None]:
# Raise pandas' display limits (columns, rows, line width) for this section.
import pandas as pd

for _option, _value in (
    ('display.max_columns', 7000),
    ('display.max_rows', 1000000),
    ('display.width', 7000),
):
    pd.set_option(_option, _value)

In [None]:
import pandas as pd
# Load the tab-separated node table nodesDB_proc0.csv into df_RING_nodes.
df_RING_nodes = pd.read_csv(
    "drive/My Drive/ProcessaNovaBase/TrataArqsRINGAlphaFold/nodesDB_proc0.csv",
    sep='\t',
)

In [None]:
df_RING_nodes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6631910 entries, 0 to 6631909
Data columns (total 9 columns):
 #   Column              Dtype  
---  ------              -----  
 0   NodeId_RING         object 
 1   Chain_RING          object 
 2   Position_RING       int64  
 3   Residue_RING        object 
 4   Dssp_RING           object 
 5   Degree_RING         int64  
 6   pLDDT_RING          float64
 7   Uniprot_AF_id_RING  object 
 8   F_AF_RING           object 
dtypes: float64(1), int64(2), object(6)
memory usage: 455.4+ MB


###2.2.1 Reading the *plddt-global_proc* database which has the resolution of *Uniprot-AF*.

In [None]:
# Load the comma-separated table that holds one pLDDT_global value per
# (Uniprot_AF_id, F_AF) pair.
import pandas as pd
base_plddt = pd.read_csv(
    "drive/My Drive/ProcessaNovaBase/TrataArq_pLDDT/plddt-global_proc.csv",
    sep=',',
)

In [None]:
base_plddt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17107 entries, 0 to 17106
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Uniprot_AF_id  17107 non-null  object 
 1   F_AF           17107 non-null  object 
 2   pLDDT_global   17107 non-null  float64
dtypes: float64(1), object(2)
memory usage: 401.1+ KB


In [None]:
base_plddt.head()

Unnamed: 0,Uniprot_AF_id,F_AF,pLDDT_global
0,Q9Y6Z7,F1,81.64
1,Q9Y6Y9,F1,87.88
2,Q9Y6Y8,F1,66.07
3,Q9Y6Y1,F1,52.18
4,Q9Y6Y0,F1,84.82


In [None]:
#checking missing values
base_plddt.isna().sum()

Uniprot_AF_id    0
F_AF             0
pLDDT_global     0
dtype: int64

In [None]:
# Count fully duplicated rows (rows whose every column repeats an earlier row).
# Series.sum() is vectorised; the builtin sum() would loop over every row in Python.
dupes = base_plddt.duplicated()
int(dupes.sum())

0

In [None]:
def categories_column(df, cols=None):
    """Print the value counts of selected columns of ``df``.

    For every column, prints the column name followed by a dict that maps each
    distinct value to its number of occurrences (most frequent first), and
    then an empty separator. Missing values are not counted, which is the
    ``value_counts`` default.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to summarise; it must contain every column named in ``cols``.
    cols : iterable of str, optional
        Columns to summarise. When omitted, the columns 'Uniprot_AF_id',
        'F_AF' and 'pLDDT_global' are used.

    Raises
    ------
    KeyError
        If a requested column is not present in ``df``.
    """
    if cols is None:
        cols = ['Uniprot_AF_id', 'F_AF', 'pLDDT_global']
    for col in cols:
        mydic = df[col].value_counts().to_dict()
        print(col, mydic)
        print('\n')

categories_column(base_plddt)

Uniprot_AF_id {'Q8NF91': 38, 'Q03001': 32, 'Q9UPN3': 31, 'Q8WXH0': 29, 'Q8WXG9': 26, 'Q09666': 24, 'Q8IVF2': 23, 'Q9HC84': 23, 'Q96RW7': 23, 'O14686': 22, 'Q9NU22': 22, 'O75445': 21, 'Q9Y6V0': 20, 'Q2LD37': 20, 'Q5T4S7': 20, 'P21817': 20, 'Q4G0P3': 20, 'O95714': 19, 'Q15751': 19, 'Q8NEZ4': 19, 'Q6V0I7': 19, 'Q15413': 19, 'Q92736': 19, 'Q96M86': 18, 'P98164': 18, 'Q14204': 18, 'Q15149': 18, 'Q8TE73': 18, 'Q9NZR2': 17, 'Q96JB1': 17, 'Q14517': 17, 'Q9P225': 17, 'Q9NZJ4': 17, 'Q07954': 17, 'Q8IVF4': 17, 'Q9NYC9': 17, 'Q8TDW7': 17, 'Q96DT5': 17, 'Q685J3': 17, 'Q86WI1': 16, 'P98160': 16, 'Q8NCM8': 16, 'Q12955': 16, 'Q9NYQ8': 16, 'P98161': 16, 'P08F94': 15, 'P78527': 15, 'Q8TD57': 15, 'Q8N3K9': 15, 'Q7Z7G8': 15, 'Q8WXX0': 15, 'Q9C0G6': 15, 'P20930': 15, 'Q99996': 14, 'Q01484': 14, 'Q03164': 14, 'Q9UPA5': 14, 'Q99698': 14, 'Q9Y4A5': 14, 'Q96T58': 13, 'Q15911': 13, 'O15230': 13, 'Q709C8': 13, 'Q9NRC6': 13, 'Q7Z407': 13, 'O60494': 13, 'Q96Q15': 13, 'Q8IZQ1': 12, 'Q96PZ7': 12, 'Q4LDE5': 12, 'Q70C

In [None]:
# Preview a reproducible random sample rather than rendering every row of the table.
base_plddt.sample(n=min(10, len(base_plddt)), random_state=0)

Output hidden; open in https://colab.research.google.com to view.

In [None]:
# Rows whose identifier is blank: empty or made only of whitespace
# (a single space, several spaces, tabs, ...).
base_plddt[base_plddt["Uniprot_AF_id"].str.strip() == ""]

Unnamed: 0,Uniprot_AF_id,F_AF,pLDDT_global


In [None]:
# Spot check: show the base_plddt rows (fragments and their pLDDT_global) stored for accession O15417.
base_plddt.query("Uniprot_AF_id == 'O15417'")

Unnamed: 0,Uniprot_AF_id,F_AF,pLDDT_global
16240,O15417,F9,51.82
16241,O15417,F6,50.91
16242,O15417,F7,50.88
16243,O15417,F8,49.87
16244,O15417,F5,49.17
16245,O15417,F4,48.26
16246,O15417,F2,46.58
16247,O15417,F3,46.58
16248,O15417,F1,43.36


In [None]:
# Spot check: show the base_plddt rows (fragments and their pLDDT_global) stored for accession Q8WXG9.
base_plddt.query("Uniprot_AF_id == 'Q8WXG9'")

Unnamed: 0,Uniprot_AF_id,F_AF,pLDDT_global
5438,Q8WXG9,F3,86.27
5439,Q8WXG9,F12,85.08
5440,Q8WXG9,F5,84.76
5441,Q8WXG9,F6,84.59
5442,Q8WXG9,F19,84.45
5443,Q8WXG9,F17,84.38
5444,Q8WXG9,F14,84.34
5445,Q8WXG9,F11,84.19
5446,Q8WXG9,F2,84.11
5447,Q8WXG9,F10,84.06


In [None]:
# Spot check: show the base_plddt rows (fragments and their pLDDT_global) stored for accession P98160.
base_plddt.query("Uniprot_AF_id == 'P98160'")

Unnamed: 0,Uniprot_AF_id,F_AF,pLDDT_global
11850,P98160,F3,84.88
11851,P98160,F4,82.85
11852,P98160,F5,82.74
11853,P98160,F6,81.63
11854,P98160,F2,81.13
11855,P98160,F16,80.59
11856,P98160,F7,79.94
11857,P98160,F8,79.36
11858,P98160,F15,78.79
11859,P98160,F13,78.17


In [None]:
# Spot check: show the base_plddt rows (fragments and their pLDDT_global) stored for accession Q5T011.
base_plddt.query("Uniprot_AF_id == 'Q5T011'")

Unnamed: 0,Uniprot_AF_id,F_AF,pLDDT_global
9500,Q5T011,F1,73.51
9501,Q5T011,F12,73.39
9502,Q5T011,F11,71.03
9503,Q5T011,F10,69.14
9504,Q5T011,F3,67.97
9505,Q5T011,F2,67.33
9506,Q5T011,F9,65.89
9507,Q5T011,F8,65.07
9508,Q5T011,F4,62.67
9509,Q5T011,F7,62.21


In [None]:
# Spot check: show the base_plddt rows (fragments and their pLDDT_global) stored for accession A0AUZ9.
base_plddt.query("Uniprot_AF_id == 'A0AUZ9'")

Unnamed: 0,Uniprot_AF_id,F_AF,pLDDT_global
17105,A0AUZ9,F1,50.24


###2.2.2 Joining the df_RING_nodes table (through the fields Uniprot_AF_id_RING and F_AF_RING) with the base_plddt table (through the fields Uniprot_AF_id and F_AF) to add the *global pLDDT* information.

In [None]:
# Value counts of the key columns on the df_RING_nodes side of the join with base_plddt.
def categories_column(df, cols=None):
    """Print the value counts of selected columns of ``df``.

    For every column, prints the column name followed by a dict that maps each
    distinct value to its number of occurrences (most frequent first), and
    then an empty separator. Missing values are not counted, which is the
    ``value_counts`` default.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to summarise; it must contain every column named in ``cols``.
    cols : iterable of str, optional
        Columns to summarise. When omitted, the columns 'Uniprot_AF_id_RING'
        and 'F_AF_RING' are used.

    Raises
    ------
    KeyError
        If a requested column is not present in ``df``.
    """
    if cols is None:
        cols = ['Uniprot_AF_id_RING', 'F_AF_RING']
    for col in cols:
        mydic = df[col].value_counts().to_dict()
        print(col, mydic)
        print('\n')

categories_column(df_RING_nodes)

Uniprot_AF_id_RING {'Q8NF91': 47420, 'Q8WXH0': 33647, 'Q9UPN3': 32670, 'Q03001': 31506, 'Q8WXG9': 24100, 'Q96RW7': 23634, 'Q9NU22': 21877, 'Q15149': 21833, 'Q8TE73': 20334, 'Q14204': 20315, 'Q5T4S7': 19905, 'P21817': 19758, 'Q15413': 19659, 'Q9NYC9': 19542, 'Q8IVF4': 19519, 'Q9P225': 19449, 'Q96DT5': 19419, 'Q92736': 19382, 'Q96JB1': 19197, 'O75445': 18956, 'P98164': 18456, 'Q8NCM8': 18248, 'Q96M86': 18228, 'Q6V0I7': 17955, 'O95714': 17938, 'Q9NZR2': 17857, 'Q07954': 17824, 'Q4G0P3': 17755, 'Q9NZJ4': 17658, 'Q8WXX0': 16953, 'Q8TD57': 16951, 'Q9C0G6': 16918, 'Q15751': 16374, 'Q8TDW7': 16315, 'Q14517': 16195, 'P78527': 16192, 'Q86WI1': 16138, 'Q9NRC6': 15775, 'Q9Y4A5': 15318, 'Q9NYQ8': 15286, 'P98161': 15172, 'P98160': 15092, 'P08F94': 14777, 'Q99996': 14497, 'Q96Q15': 14183, 'P46939': 14125, 'Q8IZT6': 13690, 'Q7Z7G8': 13025, 'O15230': 13014, 'Q14789': 12988, 'O60494': 12737, 'Q709C8': 12718, 'Q2LD37': 12323, 'Q99698': 12173, 'Q7Z407': 12112, 'Q0VDD8': 11831, 'Q70CQ2': 11613, 'Q96PZ7': 1

In [None]:
# Value counts of the key columns on the base_plddt side of the join with df_RING_nodes.
def categories_column(df, cols=None):
    """Print the value counts of selected columns of ``df``.

    For every column, prints the column name followed by a dict that maps each
    distinct value to its number of occurrences (most frequent first), and
    then an empty separator. Missing values are not counted, which is the
    ``value_counts`` default.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to summarise; it must contain every column named in ``cols``.
    cols : iterable of str, optional
        Columns to summarise. When omitted, the columns 'Uniprot_AF_id' and
        'F_AF' are used.

    Raises
    ------
    KeyError
        If a requested column is not present in ``df``.
    """
    if cols is None:
        cols = ['Uniprot_AF_id', 'F_AF']
    for col in cols:
        mydic = df[col].value_counts().to_dict()
        print(col, mydic)
        print('\n')

categories_column(base_plddt)

Uniprot_AF_id {'Q8NF91': 38, 'Q03001': 32, 'Q9UPN3': 31, 'Q8WXH0': 29, 'Q8WXG9': 26, 'Q09666': 24, 'Q8IVF2': 23, 'Q9HC84': 23, 'Q96RW7': 23, 'O14686': 22, 'Q9NU22': 22, 'O75445': 21, 'Q9Y6V0': 20, 'Q2LD37': 20, 'Q5T4S7': 20, 'P21817': 20, 'Q4G0P3': 20, 'O95714': 19, 'Q15751': 19, 'Q8NEZ4': 19, 'Q6V0I7': 19, 'Q15413': 19, 'Q92736': 19, 'Q96M86': 18, 'P98164': 18, 'Q14204': 18, 'Q15149': 18, 'Q8TE73': 18, 'Q9NZR2': 17, 'Q96JB1': 17, 'Q14517': 17, 'Q9P225': 17, 'Q9NZJ4': 17, 'Q07954': 17, 'Q8IVF4': 17, 'Q9NYC9': 17, 'Q8TDW7': 17, 'Q96DT5': 17, 'Q685J3': 17, 'Q86WI1': 16, 'P98160': 16, 'Q8NCM8': 16, 'Q12955': 16, 'Q9NYQ8': 16, 'P98161': 16, 'P08F94': 15, 'P78527': 15, 'Q8TD57': 15, 'Q8N3K9': 15, 'Q7Z7G8': 15, 'Q8WXX0': 15, 'Q9C0G6': 15, 'P20930': 15, 'Q99996': 14, 'Q01484': 14, 'Q03164': 14, 'Q9UPA5': 14, 'Q99698': 14, 'Q9Y4A5': 14, 'Q96T58': 13, 'Q15911': 13, 'O15230': 13, 'Q709C8': 13, 'Q9NRC6': 13, 'Q7Z407': 13, 'O60494': 13, 'Q96Q15': 13, 'Q8IZQ1': 12, 'Q96PZ7': 12, 'Q4LDE5': 12, 'Q70C

In [None]:
import pandas as pd
# Left join: every RING node row is kept and receives the pLDDT_global of its
# (UniProt accession, fragment) key pair.
# validate='many_to_one' makes pandas raise MergeError if a key pair appears more
# than once in base_plddt, which would otherwise silently multiply node rows.
base_merge = pd.merge(
    df_RING_nodes,
    base_plddt,
    left_on=['Uniprot_AF_id_RING', 'F_AF_RING'],
    right_on=['Uniprot_AF_id', 'F_AF'],
    how='left',
    validate='many_to_one',
)


In [None]:
base_merge.info(max_cols=150)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6631910 entries, 0 to 6631909
Data columns (total 12 columns):
 #   Column              Dtype  
---  ------              -----  
 0   NodeId_RING         object 
 1   Chain_RING          object 
 2   Position_RING       int64  
 3   Residue_RING        object 
 4   Dssp_RING           object 
 5   Degree_RING         int64  
 6   pLDDT_RING          float64
 7   Uniprot_AF_id_RING  object 
 8   F_AF_RING           object 
 9   Uniprot_AF_id       object 
 10  F_AF                object 
 11  pLDDT_global        float64
dtypes: float64(2), int64(2), object(8)
memory usage: 657.8+ MB


In [None]:
base_merge.head(250)

Unnamed: 0,NodeId_RING,Chain_RING,Position_RING,Residue_RING,Dssp_RING,Degree_RING,pLDDT_RING,Uniprot_AF_id_RING,F_AF_RING,Uniprot_AF_id,F_AF,pLDDT_global
0,A:213:_:ALA,A,213,Ala,H,2,66.0,A0AUZ9,F1,A0AUZ9,F1,50.24
1,A:214:_:ALA,A,214,Ala,H,2,67.33,A0AUZ9,F1,A0AUZ9,F1,50.24
2,A:215:_:GLU,A,215,Glu,H,6,68.39,A0AUZ9,F1,A0AUZ9,F1,50.24
3,A:216:_:LYS,A,216,Lys,H,5,78.53,A0AUZ9,F1,A0AUZ9,F1,50.24
4,A:217:_:GLU,A,217,Glu,H,4,76.7,A0AUZ9,F1,A0AUZ9,F1,50.24
5,A:218:_:GLU,A,218,Glu,H,3,79.38,A0AUZ9,F1,A0AUZ9,F1,50.24
6,A:219:_:GLU,A,219,Glu,H,7,85.15,A0AUZ9,F1,A0AUZ9,F1,50.24
7,A:220:_:VAL,A,220,Val,H,3,88.54,A0AUZ9,F1,A0AUZ9,F1,50.24
8,A:221:_:HIS,A,221,His,H,5,88.97,A0AUZ9,F1,A0AUZ9,F1,50.24
9,A:222:_:ALA,A,222,Ala,H,4,90.48,A0AUZ9,F1,A0AUZ9,F1,50.24


In [None]:
# Collect the merged rows with at least one missing value; after the left join,
# a missing pLDDT_global means the node's key pair was not found in base_plddt.
df = base_merge.loc[base_merge.isna().any(axis="columns")]

In [None]:
df

Unnamed: 0,NodeId_RING,Chain_RING,Position_RING,Residue_RING,Dssp_RING,Degree_RING,pLDDT_RING,Uniprot_AF_id_RING,F_AF_RING,Uniprot_AF_id,F_AF,pLDDT_global


In [None]:
# Count fully duplicated rows (rows whose every column repeats an earlier row).
# Series.sum() is vectorised; the builtin sum() would loop over every row in Python.
dupes = base_merge.duplicated()
int(dupes.sum())

0

In [None]:
# Spot check of the join: rows for Val at position 986 of Q5T011, one per fragment in which it appears,
# so the attached pLDDT_global can be compared with the base_plddt values.
base_merge.query("Uniprot_AF_id_RING == 'Q5T011' & Residue_RING == 'Val' & Position_RING == 986")

Unnamed: 0,NodeId_RING,Chain_RING,Position_RING,Residue_RING,Dssp_RING,Degree_RING,pLDDT_RING,Uniprot_AF_id_RING,F_AF_RING,Uniprot_AF_id,F_AF,pLDDT_global
2915087,A:986:_:VAL,A,986,Val,E,1,86.6,Q5T011,F1,Q5T011,F1,73.51
2917375,A:986:_:VAL,A,986,Val,E,2,90.09,Q5T011,F12,Q5T011,F12,73.39
2920663,A:986:_:VAL,A,986,Val,S,7,88.29,Q5T011,F6,Q5T011,F6,61.89


In [None]:
# Spot check of the join: rows for Gly at position 295 of Q8WXG9, one per fragment in which it appears,
# so the attached pLDDT_global can be compared with the base_plddt values.
base_merge.query("Uniprot_AF_id_RING == 'Q8WXG9' & Residue_RING == 'Gly' & Position_RING == 295")

Unnamed: 0,NodeId_RING,Chain_RING,Position_RING,Residue_RING,Dssp_RING,Degree_RING,pLDDT_RING,Uniprot_AF_id_RING,F_AF_RING,Uniprot_AF_id,F_AF,pLDDT_global
4487799,A:295:_:GLY,A,295,Gly,S,4,85.44,Q8WXG9,F1,Q8WXG9,F1,81.51
4490595,A:295:_:GLY,A,295,Gly,H,2,79.18,Q8WXG9,F12,Q8WXG9,F12,85.08
4498279,A:295:_:GLY,A,295,Gly,G,2,68.69,Q8WXG9,F2,Q8WXG9,F2,84.11


In [None]:
# Spot check of the join: rows for Arg at position 933 of Q7Z7M0, one per fragment in which it appears.
base_merge.query("Uniprot_AF_id_RING == 'Q7Z7M0' & Residue_RING == 'Arg' & Position_RING == 933")

Unnamed: 0,NodeId_RING,Chain_RING,Position_RING,Residue_RING,Dssp_RING,Degree_RING,pLDDT_RING,Uniprot_AF_id_RING,F_AF_RING,Uniprot_AF_id,F_AF,pLDDT_global
3567687,A:933:_:ARG,A,933,Arg,T,7,90.5,Q7Z7M0,F1,Q7Z7M0,F1,83.29
3573162,A:933:_:ARG,A,933,Arg,S,6,75.07,Q7Z7M0,F7,Q7Z7M0,F7,78.39


In [None]:
# Spot check of the join: rows for Ser at position 180 of P98160, one per fragment in which it appears,
# so the attached pLDDT_global can be compared with the base_plddt values.
base_merge.query("Uniprot_AF_id_RING == 'P98160' & Residue_RING == 'Ser' & Position_RING == 180")

Unnamed: 0,NodeId_RING,Chain_RING,Position_RING,Residue_RING,Dssp_RING,Degree_RING,pLDDT_RING,Uniprot_AF_id_RING,F_AF_RING,Uniprot_AF_id,F_AF,pLDDT_global
1878852,A:180:_:SER,A,180,Ser,E,3,87.79,P98160,F1,P98160,F1,78.07
1889251,A:180:_:SER,A,180,Ser,G,1,90.17,P98160,F5,P98160,F5,82.74
1892109,A:180:_:SER,A,180,Ser,E,2,89.58,P98160,F8,P98160,F8,79.36
1893048,A:180:_:SER,A,180,Ser,E,1,90.02,P98160,F9,P98160,F9,78.13


##2.3 Generating a file with the processed *nodesDB.txt* database

In [None]:
# Persist the node table enriched with pLDDT_global as a tab-separated file, without the index column.
base_merge.to_csv(
    "drive/My Drive/ProcessaNovaBase/TrataArqsRINGAlphaFold/nodesDB_proc.csv",
    index=False,
    sep='\t',
)