#0 - Basic Settings

In [None]:
#Permission to access any file on Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#Increasing the capacity to view columns and rows
import pandas as pd

pd.set_option('display.max_columns', 7000)
pd.set_option('display.max_rows',90000)

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, 
and then re-execute this cell.


In [None]:
from psutil import virtual_memory
# Total memory reported by psutil, converted from bytes to decimal gigabytes (bytes / 1e9).
total_bytes = virtual_memory().total
ram_gb = total_bytes / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# 20 GB is the cut-off this cell uses to label the runtime as "high-RAM".
if ram_gb >= 20:
  print('You are using a high-RAM runtime!')
else:
  for hint_line in ('To enable a high-RAM runtime, select the Runtime > "Change runtime type"',
                    'menu, and then select High-RAM in the Runtime shape dropdown. Then, ',
                    're-execute this cell.'):
    print(hint_line)

Your runtime has 27.3 gigabytes of available RAM

You are using a high-RAM runtime!


#1 - Reading and processing the ALL-AF_NodesResult and ALL-AF_GraphsResult files, computed from protein structures generated by AlphaFold

The files **ALL-AF_NodesResult.csv** and **ALL-AF_GraphsResult.csv** contain the concatenated records of all NodesResult and GraphsResult files, respectively, generated by the R script of Diego Morais & Dalmolin. This script takes as input the edges files generated by RING and calculates the **Clustering Coefficient**, the **Betweenness**, and other metrics of the interaction network of each AlphaFold PDB structure. The script is in the **TrataArqsScriptDiegoAlphaFold** folder of this drive.


##1.1 Processing of the *ALL-AF_NodesResult* database

In [None]:
import pandas as pd

# Load the combined per-node results; the path is relative to the notebook's working directory.
nodes_result_csv = "drive/My Drive/ProcessaNovaBase/TrataArqsScriptDiegoAlphaFold/ALL-AF_NodesResult.csv"
base_NodesResult = pd.read_csv(
    nodes_result_csv,
    delimiter=',',
    index_col=False,  # do not use the first CSV column as the row index
)

In [None]:
base_NodesResult.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7695638 entries, 0 to 7695637
Data columns (total 8 columns):
 #   Column               Dtype  
---  ------               -----  
 0   Unnamed: 0           int64  
 1   node                 object 
 2   degree               int64  
 3   aminoAcid            object 
 4   triangles            int64  
 5   clusteringCoef       float64
 6   betweennessWeighted  float64
 7   filename             object 
dtypes: float64(2), int64(3), object(3)
memory usage: 469.7+ MB


In [None]:
base_NodesResult.head()

Unnamed: 0.1,Unnamed: 0,node,degree,aminoAcid,triangles,clusteringCoef,betweennessWeighted,filename
0,1,A:118:_:ASN,2,ASN,1,1.0,0.0,AF-A0AUZ9-F1-model_v4.pdb.edges
1,2,A:131:_:GLU,2,GLU,0,0.0,0.0,AF-A0AUZ9-F1-model_v4.pdb.edges
2,3,A:132:_:PHE,4,PHE,0,0.0,0.00049,AF-A0AUZ9-F1-model_v4.pdb.edges
3,4,A:211:_:SER,2,SER,0,0.0,0.0,AF-A0AUZ9-F1-model_v4.pdb.edges
4,5,A:212:_:SER,6,SER,0,0.0,0.000387,AF-A0AUZ9-F1-model_v4.pdb.edges


In [None]:
base_NodesResult.tail()

Unnamed: 0.1,Unnamed: 0,node,degree,aminoAcid,triangles,clusteringCoef,betweennessWeighted,filename
7695633,7695634,A:5:_:ALA,1,ALA,0,0.0,0.0,AF-Q9Y6Z7-F1-model_v4.pdb.edges
7695634,7695635,A:6:_:SER,3,SER,0,0.0,0.00155,AF-Q9Y6Z7-F1-model_v4.pdb.edges
7695635,7695636,A:7:_:LEU,2,LEU,0,0.0,0.000423,AF-Q9Y6Z7-F1-model_v4.pdb.edges
7695636,7695637,A:8:_:LEU,3,LEU,0,0.0,0.00155,AF-Q9Y6Z7-F1-model_v4.pdb.edges
7695637,7695638,A:9:_:ARG,4,ARG,0,0.0,0.002113,AF-Q9Y6Z7-F1-model_v4.pdb.edges


In [None]:
#checking missing values
base_NodesResult.isna().sum()

Unnamed: 0              0
node                    0
degree                  0
aminoAcid               0
triangles               0
clusteringCoef          0
betweennessWeighted    32
filename                0
dtype: int64

In [None]:
def categories_column(df, cols=('degree', 'aminoAcid', 'triangles')):
    """Print, for each selected column, its distinct values and how often each occurs.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarize.
    cols : str or iterable of str, optional
        Column name(s) to report. Defaults to the three columns this cell
        originally hard-coded, so ``categories_column(df)`` behaves as before.

    Returns
    -------
    None
        One ``<column> {value: count, ...}`` line is printed per column
        (most frequent value first), each followed by blank lines.

    Raises
    ------
    KeyError
        If a requested column is not present in ``df``.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(cols, str):
        cols = [cols]
    for col in cols:
        counts = df[col].value_counts().to_dict()
        print(col, counts)
        print('\n')

categories_column(base_NodesResult)

degree {4: 1081048, 3: 942523, 2: 936190, 5: 810831, 1: 760610, 6: 659826, 7: 542485, 8: 439566, 9: 346372, 10: 265002, 11: 203191, 12: 152457, 13: 116492, 14: 89460, 15: 70316, 16: 57475, 17: 46326, 18: 37200, 19: 30024, 20: 23936, 21: 19205, 22: 15174, 23: 11854, 24: 9292, 25: 7099, 26: 5415, 27: 4118, 28: 3131, 29: 2314, 30: 1660, 31: 1245, 32: 922, 33: 716, 34: 558, 35: 412, 36: 305, 37: 233, 38: 189, 39: 133, 40: 79, 41: 62, 43: 41, 42: 32, 44: 27, 45: 17, 46: 16, 47: 16, 49: 7, 48: 5, 53: 5, 50: 5, 55: 4, 52: 3, 57: 3, 51: 3, 56: 2, 54: 2, 58: 1, 60: 1, 66: 1, 73: 1}


aminoAcid {'LEU': 911051, 'GLU': 535788, 'VAL': 524668, 'ALA': 514198, 'SER': 506771, 'LYS': 444848, 'ARG': 435989, 'ILE': 420522, 'THR': 392681, 'GLY': 390496, 'GLN': 379766, 'ASP': 359382, 'PHE': 342922, 'ASN': 287436, 'PRO': 283770, 'TYR': 249076, 'CYS': 212769, 'HIS': 210217, 'MET': 179555, 'TRP': 113733}


triangles {0: 4752711, 1: 1271258, 2: 679023, 3: 419953, 4: 246732, 5: 147803, 6: 82837, 7: 47562, 8: 236

In [None]:
def categories_column(df, cols=('clusteringCoef',)):
    """Print the ``{value: count}`` frequency table of each selected column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarize.
    cols : str or iterable of str, optional
        Column name(s) to report. Defaults to ``clusteringCoef``, the only
        column this cell originally hard-coded.

    Returns
    -------
    None
        The table is printed, not returned; each column line is followed by
        blank lines.

    Raises
    ------
    KeyError
        If a requested column is not present in ``df``.
    """
    # Accept a single column name without iterating over its characters.
    if isinstance(cols, str):
        cols = [cols]
    for col in cols:
        counts = df[col].value_counts().to_dict()
        print(col, counts)
        print('\n')

categories_column(base_NodesResult)

clusteringCoef {0.0: 4752711, 0.0666666666666667: 235902, 0.1: 197818, 0.166666666666667: 164824, 0.0476190476190476: 163496, 0.333333333333333: 145710, 0.0357142857142857: 118556, 0.2: 104291, 0.133333333333333: 100741, 0.0952380952380952: 97511, 0.0714285714285714: 87302, 0.0277777777777778: 85575, 0.142857142857143: 80116, 0.0555555555555556: 70875, 0.0222222222222222: 59834, 0.107142857142857: 54012, 0.0444444444444444: 53770, 0.0833333333333333: 51050, 1.0: 50186, 0.111111111111111: 47057, 0.0181818181818182: 41925, 0.0363636363636364: 40696, 0.0545454545454545: 34022, 0.0303030303030303: 30495, 0.0151515151515152: 29663, 0.0888888888888889: 27805, 0.0454545454545455: 26120, 0.0727272727272727: 23147, 0.0909090909090909: 22871, 0.19047619047619: 22683, 0.0256410256410256: 22390, 0.0128205128205128: 21355, 0.0384615384615385: 20148, 0.3: 19216, 0.0606060606060606: 18656, 0.021978021978022: 16715, 0.138888888888889: 16368, 0.032967032967033: 15907, 0.019047619047619: 15590, 0.010989

In [None]:
base_NodesResult['betweennessWeighted'].value_counts()

0.000000    1736954
0.009524        214
0.006061        182
0.015152        153
0.023810        149
             ...   
0.011630          1
0.003959          1
0.013998          1
0.008573          1
0.095027          1
Name: betweennessWeighted, Length: 3165374, dtype: int64

###1.1.1 Selection of fields to be used

In [None]:
base_NodesResult.columns

Index(['Unnamed: 0', 'node', 'degree', 'aminoAcid', 'triangles',
       'clusteringCoef', 'betweennessWeighted', 'filename'],
      dtype='object')

In [None]:
#Field selection: keep the analysis columns, leaving out the 'Unnamed: 0' row counter
node_fields = [
    'node',
    'degree',
    'aminoAcid',
    'triangles',
    'clusteringCoef',
    'betweennessWeighted',
    'filename',
]
base_NodesResult = base_NodesResult[node_fields]

In [None]:
base_NodesResult.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7695638 entries, 0 to 7695637
Data columns (total 7 columns):
 #   Column               Dtype  
---  ------               -----  
 0   node                 object 
 1   degree               int64  
 2   aminoAcid            object 
 3   triangles            int64  
 4   clusteringCoef       float64
 5   betweennessWeighted  float64
 6   filename             object 
dtypes: float64(2), int64(2), object(3)
memory usage: 411.0+ MB


###1.1.2 Renaming the fields

In [None]:
# Old -> new names for the per-node columns.
node_column_names = {
    'node': 'node_ScriptR',
    'degree': 'degree_node_ScriptR',
    'aminoAcid': 'aminoAcid_ScriptR',
    'triangles': 'triangles_node',
    'clusteringCoef': 'clusteringCoef_node',
    'betweennessWeighted': 'betweennessWeighted_node',
}
# 'filename' is not in the mapping, so it keeps its name; the frame is modified in place.
base_NodesResult.rename(columns=node_column_names, inplace=True)

In [None]:
base_NodesResult.head()

Unnamed: 0,node_ScriptR,degree_node_ScriptR,aminoAcid_ScriptR,triangles_node,clusteringCoef_node,betweennessWeighted_node,filename
0,A:118:_:ASN,2,ASN,1,1.0,0.0,AF-A0AUZ9-F1-model_v4.pdb.edges
1,A:131:_:GLU,2,GLU,0,0.0,0.0,AF-A0AUZ9-F1-model_v4.pdb.edges
2,A:132:_:PHE,4,PHE,0,0.0,0.00049,AF-A0AUZ9-F1-model_v4.pdb.edges
3,A:211:_:SER,2,SER,0,0.0,0.0,AF-A0AUZ9-F1-model_v4.pdb.edges
4,A:212:_:SER,6,SER,0,0.0,0.000387,AF-A0AUZ9-F1-model_v4.pdb.edges


###1.1.3 Generation of an intermediate file with the selected fields from the *base_NodesResult* database.

In [None]:
base_NodesResult.to_csv("drive/My Drive/ProcessaNovaBase/TrataArqsScriptDiegoAlphaFold/ALL-AF_NodesResult_sel.csv",sep=',',index=False)

###1.1.4 Extraction of the residue position and chain from the *node_ScriptR* attribute.

The **node_ScriptR** attribute was obtained from the **edge** file (RING output). It has the following format:

`<chain>:<index>:<insertion_code>:<residue_3_letter_code>` (for example, `A:118:_:ASN`)

The chain and the position (index) will be extracted:
**node_pos_ScriptR**: the position of the residue

**node_chain_ScriptR**: the chain where the residue is located

In [None]:
#Increasing the capacity to view columns and rows
import pandas as pd

pd.set_option('display.max_columns', 7000)
pd.set_option('display.max_rows',90000)

In [None]:
import pandas as pd
base_NodesResult = pd.read_csv("drive/My Drive/ProcessaNovaBase/TrataArqsScriptDiegoAlphaFold/ALL-AF_NodesResult_sel.csv",sep=',')

In [None]:
base_NodesResult.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7695638 entries, 0 to 7695637
Data columns (total 7 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   node_ScriptR              object 
 1   degree_node_ScriptR       int64  
 2   aminoAcid_ScriptR         object 
 3   triangles_node            int64  
 4   clusteringCoef_node       float64
 5   betweennessWeighted_node  float64
 6   filename                  object 
dtypes: float64(2), int64(2), object(3)
memory usage: 411.0+ MB


In [None]:
base_NodesResult["node_ScriptR"].value_counts()

A:1:_:MET       2676
A:66:_:LEU      1632
A:114:_:LEU     1611
A:86:_:LEU      1607
A:14:_:LEU      1581
A:15:_:LEU      1567
A:115:_:LEU     1564
A:107:_:LEU     1563
A:13:_:LEU      1546
A:82:_:LEU      1538
A:84:_:LEU      1531
A:63:_:LEU      1526
A:90:_:LEU      1522
A:96:_:LEU      1520
A:81:_:LEU      1514
A:97:_:LEU      1508
A:104:_:LEU     1506
A:117:_:LEU     1505
A:116:_:LEU     1504
A:106:_:LEU     1504
A:101:_:LEU     1502
A:79:_:LEU      1501
A:98:_:LEU      1497
A:12:_:LEU      1497
A:105:_:LEU     1488
A:87:_:LEU      1487
A:113:_:LEU     1487
A:103:_:LEU     1487
A:67:_:LEU      1486
A:85:_:LEU      1485
A:166:_:LEU     1482
A:55:_:LEU      1482
A:144:_:LEU     1482
A:62:_:LEU      1479
A:75:_:LEU      1478
A:93:_:LEU      1476
A:44:_:LEU      1473
A:58:_:LEU      1470
A:99:_:LEU      1470
A:112:_:LEU     1467
A:142:_:LEU     1467
A:83:_:LEU      1466
A:56:_:LEU      1463
A:64:_:LEU      1462
A:65:_:LEU      1461
A:102:_:LEU     1460
A:153:_:LEU     1459
A:77:_:LEU   

In [None]:
# Residue position = second ':'-separated field of node_ScriptR (e.g. 'A:118:_:ASN' -> '118').
# The vectorized .str accessor replaces a per-row Python lambda; the result stays text (not cast to int).
# A value with fewer than two fields yields NaN rather than raising.
base_NodesResult["node_pos_ScriptR"] = base_NodesResult["node_ScriptR"].str.split(":").str[1]

In [None]:
# Chain = first ':'-separated field of node_ScriptR, upper-cased (e.g. 'A:118:_:ASN' -> 'A').
# Only the first field is needed, so a single split (n=1) is enough; the vectorized .str
# accessor replaces a per-row Python lambda.
base_NodesResult["node_chain_ScriptR"] = (
    base_NodesResult["node_ScriptR"].str.split(":", n=1).str[0].str.upper()
)

In [None]:
base_NodesResult["node_chain_ScriptR"].value_counts()

A    7695638
Name: node_chain_ScriptR, dtype: int64

###1.1.5 Conversion of the *aminoAcid_ScriptR* attribute to the pattern used.

In [None]:
base_NodesResult['aminoAcid_ScriptR'].value_counts()

LEU    911051
GLU    535788
VAL    524668
ALA    514198
SER    506771
LYS    444848
ARG    435989
ILE    420522
THR    392681
GLY    390496
GLN    379766
ASP    359382
PHE    342922
ASN    287436
PRO    283770
TYR    249076
CYS    212769
HIS    210217
MET    179555
TRP    113733
Name: aminoAcid_ScriptR, dtype: int64

In [None]:
#Converting to the pattern: Ala, Arg, Asn,...
# Upper-case three-letter codes that get converted; any value outside this list is left untouched.
Amin = ['LEU','GLU','VAL','ALA','SER','LYS','ARG','ILE','THR','GLY','GLN', 'ASP','PHE', 'ASN', 'PRO', 'TYR', 'CYS', 'HIS', 'MET', 'TRP']

residue_codes = base_NodesResult["aminoAcid_ScriptR"]
# Keep values that are NOT in Amin; replace the listed ones by their capitalized form ('LEU' -> 'Leu').
base_NodesResult["aminoAcid_ScriptR"] = residue_codes.where(
    ~residue_codes.isin(Amin),
    residue_codes.str.capitalize(),
)

In [None]:
base_NodesResult["aminoAcid_ScriptR"].value_counts()

Leu    911051
Glu    535788
Val    524668
Ala    514198
Ser    506771
Lys    444848
Arg    435989
Ile    420522
Thr    392681
Gly    390496
Gln    379766
Asp    359382
Phe    342922
Asn    287436
Pro    283770
Tyr    249076
Cys    212769
His    210217
Met    179555
Trp    113733
Name: aminoAcid_ScriptR, dtype: int64

In [None]:
base_NodesResult.head(25)

Unnamed: 0,node_ScriptR,degree_node_ScriptR,aminoAcid_ScriptR,triangles_node,clusteringCoef_node,betweennessWeighted_node,filename,node_pos_ScriptR,node_chain_ScriptR
0,A:118:_:ASN,2,Asn,1,1.0,0.0,AF-A0AUZ9-F1-model_v4.pdb.edges,118,A
1,A:131:_:GLU,2,Glu,0,0.0,0.0,AF-A0AUZ9-F1-model_v4.pdb.edges,131,A
2,A:132:_:PHE,4,Phe,0,0.0,0.00049,AF-A0AUZ9-F1-model_v4.pdb.edges,132,A
3,A:211:_:SER,2,Ser,0,0.0,0.0,AF-A0AUZ9-F1-model_v4.pdb.edges,211,A
4,A:212:_:SER,6,Ser,0,0.0,0.000387,AF-A0AUZ9-F1-model_v4.pdb.edges,212,A
5,A:213:_:ALA,2,Ala,0,0.0,0.000232,AF-A0AUZ9-F1-model_v4.pdb.edges,213,A
6,A:214:_:ALA,2,Ala,0,0.0,0.000181,AF-A0AUZ9-F1-model_v4.pdb.edges,214,A
7,A:215:_:GLU,6,Glu,0,0.0,0.003172,AF-A0AUZ9-F1-model_v4.pdb.edges,215,A
8,A:216:_:LYS,5,Lys,0,0.0,0.004048,AF-A0AUZ9-F1-model_v4.pdb.edges,216,A
9,A:217:_:GLU,4,Glu,0,0.0,0.003404,AF-A0AUZ9-F1-model_v4.pdb.edges,217,A


###1.1.6 Generation of the *Uniprot_AF_id_ScriptR* attribute extracted from the *filename* attribute  

In [None]:
def get_uniprot(filename: str) -> str:
    """Return the second '-'-separated field of an edges file name.

    For ``AF-A0AUZ9-F1-model_v4.pdb.edges`` the result is ``A0AUZ9``, the value
    stored in the *Uniprot_AF_id_ScriptR* column.

    Raises
    ------
    IndexError
        If ``filename`` contains no '-'.
    """
    # Two splits are enough to isolate field 1 from whatever follows it.
    fields = filename.split("-", 2)
    return fields[1]

In [None]:
base_NodesResult['Uniprot_AF_id_ScriptR'] = base_NodesResult['filename'].apply(get_uniprot)

In [None]:
base_NodesResult["Uniprot_AF_id_ScriptR"].value_counts()

Q8NF91    49529
Q8WXH0    35319
Q9UPN3    34741
Q03001    33569
Q8WXG9    30309
Q96RW7    27441
Q9NU22    25332
O75445    24636
Q14204    22990
Q15149    22982
Q8TE73    22953
P21817    22783
Q6V0I7    22777
Q15413    22615
Q5T4S7    22373
Q92736    22334
Q9NYC9    22173
Q8IVF4    22060
Q96DT5    22011
Q9P225    21963
Q96JB1    21829
P98164    21673
Q9NZJ4    21047
Q4G0P3    20979
O95714    20777
Q07954    20750
Q9NZR2    20702
Q96M86    20688
Q8NCM8    20534
Q14517    20381
Q8TDW7    20380
Q86WI1    19635
Q8WXX0    19360
Q8TD57    19345
Q9C0G6    19264
Q9NYQ8    19155
Q15751    18905
P78527    18074
P98160    17937
Q09666    17809
P98161    17761
P08F94    17592
Q9Y4A5    16924
Q9NRC6    16529
Q96Q15    15734
Q685J3    15717
O60494    15528
O15230    15496
Q7Z407    15305
Q7Z7G8    15150
P46939    14864
Q99996    14720
Q709C8    14468
Q96PZ7    14347
Q2LD37    14230
Q8IZT6    14204
Q7Z408    14111
P78509    14016
Q99698    13976
Q4LDE5    13594
Q0VDD8    13517
Q9H251    13306
Q70CQ2  

In [None]:
base_NodesResult.head()

Unnamed: 0,node_ScriptR,degree_node_ScriptR,aminoAcid_ScriptR,triangles_node,clusteringCoef_node,betweennessWeighted_node,filename,node_pos_ScriptR,node_chain_ScriptR,Uniprot_AF_id_ScriptR
0,A:118:_:ASN,2,Asn,1,1.0,0.0,AF-A0AUZ9-F1-model_v4.pdb.edges,118,A,A0AUZ9
1,A:131:_:GLU,2,Glu,0,0.0,0.0,AF-A0AUZ9-F1-model_v4.pdb.edges,131,A,A0AUZ9
2,A:132:_:PHE,4,Phe,0,0.0,0.00049,AF-A0AUZ9-F1-model_v4.pdb.edges,132,A,A0AUZ9
3,A:211:_:SER,2,Ser,0,0.0,0.0,AF-A0AUZ9-F1-model_v4.pdb.edges,211,A,A0AUZ9
4,A:212:_:SER,6,Ser,0,0.0,0.000387,AF-A0AUZ9-F1-model_v4.pdb.edges,212,A,A0AUZ9


In [None]:
base_NodesResult.tail()

Unnamed: 0,node_ScriptR,degree_node_ScriptR,aminoAcid_ScriptR,triangles_node,clusteringCoef_node,betweennessWeighted_node,filename,node_pos_ScriptR,node_chain_ScriptR,Uniprot_AF_id_ScriptR
7695633,A:5:_:ALA,1,Ala,0,0.0,0.0,AF-Q9Y6Z7-F1-model_v4.pdb.edges,5,A,Q9Y6Z7
7695634,A:6:_:SER,3,Ser,0,0.0,0.00155,AF-Q9Y6Z7-F1-model_v4.pdb.edges,6,A,Q9Y6Z7
7695635,A:7:_:LEU,2,Leu,0,0.0,0.000423,AF-Q9Y6Z7-F1-model_v4.pdb.edges,7,A,Q9Y6Z7
7695636,A:8:_:LEU,3,Leu,0,0.0,0.00155,AF-Q9Y6Z7-F1-model_v4.pdb.edges,8,A,Q9Y6Z7
7695637,A:9:_:ARG,4,Arg,0,0.0,0.002113,AF-Q9Y6Z7-F1-model_v4.pdb.edges,9,A,Q9Y6Z7


###1.1.7 Generation of the *F_AF_ScriptR* attribute extracted from the *filename* attribute.  

In [None]:
def get_F(filename: str) -> str:
    """Return the third '-'-separated field of an edges file name.

    For ``AF-A0AUZ9-F1-model_v4.pdb.edges`` the result is ``F1``, the value
    stored in the *F_AF_ScriptR* column.

    Raises
    ------
    IndexError
        If ``filename`` contains fewer than two '-' characters.
    """
    # Three splits are enough to isolate field 2 from the remainder of the name.
    fields = filename.split("-", 3)
    return fields[2]

In [None]:
base_NodesResult['F_AF_ScriptR'] = base_NodesResult['filename'].apply(get_F)

In [None]:
base_NodesResult.tail(50)

Unnamed: 0,node_ScriptR,degree_node_ScriptR,aminoAcid_ScriptR,triangles_node,clusteringCoef_node,betweennessWeighted_node,filename,node_pos_ScriptR,node_chain_ScriptR,Uniprot_AF_id_ScriptR,F_AF_ScriptR
7695588,A:23:_:ILE,5,Ile,0,0.0,0.001338,AF-Q9Y6Z7-F1-model_v4.pdb.edges,23,A,Q9Y6Z7,F1
7695589,A:231:_:TYR,11,Tyr,3,0.054545,0.012539,AF-Q9Y6Z7-F1-model_v4.pdb.edges,231,A,Q9Y6Z7,F1
7695590,A:232:_:SER,2,Ser,1,1.0,0.0,AF-Q9Y6Z7-F1-model_v4.pdb.edges,232,A,Q9Y6Z7,F1
7695591,A:233:_:ASN,7,Asn,2,0.095238,0.008876,AF-Q9Y6Z7-F1-model_v4.pdb.edges,233,A,Q9Y6Z7,F1
7695592,A:234:_:TRP,16,Trp,4,0.033333,0.039659,AF-Q9Y6Z7-F1-model_v4.pdb.edges,234,A,Q9Y6Z7,F1
7695593,A:235:_:ASN,4,Asn,0,0.0,0.006058,AF-Q9Y6Z7-F1-model_v4.pdb.edges,235,A,Q9Y6Z7,F1
7695594,A:238:_:GLU,6,Glu,1,0.066667,0.000493,AF-Q9Y6Z7-F1-model_v4.pdb.edges,238,A,Q9Y6Z7,F1
7695595,A:239:_:PRO,10,Pro,1,0.022222,0.032192,AF-Q9Y6Z7-F1-model_v4.pdb.edges,239,A,Q9Y6Z7,F1
7695596,A:24:_:GLN,2,Gln,0,0.0,0.0,AF-Q9Y6Z7-F1-model_v4.pdb.edges,24,A,Q9Y6Z7,F1
7695597,A:240:_:SER,2,Ser,0,0.0,0.0,AF-Q9Y6Z7-F1-model_v4.pdb.edges,240,A,Q9Y6Z7,F1


The **filename** attribute is no longer needed, so it is removed.

In [None]:
del base_NodesResult['filename']

In [None]:
base_NodesResult.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7695638 entries, 0 to 7695637
Data columns (total 10 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   node_ScriptR              object 
 1   degree_node_ScriptR       int64  
 2   aminoAcid_ScriptR         object 
 3   triangles_node            int64  
 4   clusteringCoef_node       float64
 5   betweennessWeighted_node  float64
 6   node_pos_ScriptR          object 
 7   node_chain_ScriptR        object 
 8   Uniprot_AF_id_ScriptR     object 
 9   F_AF_ScriptR              object 
dtypes: float64(2), int64(2), object(6)
memory usage: 587.1+ MB


###1.1.8 Removal of records that have the *betweennessWeighted_node* attribute with the value *NaN*

In [None]:
def categories_column(df, cols=('degree_node_ScriptR', 'aminoAcid_ScriptR', 'triangles_node', 'clusteringCoef_node')):
    """Print the value frequencies of each selected column as a ``{value: count}`` dict.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarize.
    cols : str or iterable of str, optional
        Column name(s) to report. Defaults to the four renamed node columns
        this cell originally hard-coded.

    Returns
    -------
    None
        Output goes to stdout: one line per column, each followed by blank lines.

    Raises
    ------
    KeyError
        If a requested column is not present in ``df``.
    """
    # Wrap a lone column name so it is not iterated character by character.
    if isinstance(cols, str):
        cols = [cols]
    for col in cols:
        counts = df[col].value_counts().to_dict()
        print(col, counts)
        print('\n')

categories_column(base_NodesResult)

degree_node_ScriptR {4: 1081048, 3: 942523, 2: 936190, 5: 810831, 1: 760610, 6: 659826, 7: 542485, 8: 439566, 9: 346372, 10: 265002, 11: 203191, 12: 152457, 13: 116492, 14: 89460, 15: 70316, 16: 57475, 17: 46326, 18: 37200, 19: 30024, 20: 23936, 21: 19205, 22: 15174, 23: 11854, 24: 9292, 25: 7099, 26: 5415, 27: 4118, 28: 3131, 29: 2314, 30: 1660, 31: 1245, 32: 922, 33: 716, 34: 558, 35: 412, 36: 305, 37: 233, 38: 189, 39: 133, 40: 79, 41: 62, 43: 41, 42: 32, 44: 27, 45: 17, 46: 16, 47: 16, 49: 7, 48: 5, 53: 5, 50: 5, 55: 4, 52: 3, 57: 3, 51: 3, 56: 2, 54: 2, 58: 1, 60: 1, 66: 1, 73: 1}


aminoAcid_ScriptR {'Leu': 911051, 'Glu': 535788, 'Val': 524668, 'Ala': 514198, 'Ser': 506771, 'Lys': 444848, 'Arg': 435989, 'Ile': 420522, 'Thr': 392681, 'Gly': 390496, 'Gln': 379766, 'Asp': 359382, 'Phe': 342922, 'Asn': 287436, 'Pro': 283770, 'Tyr': 249076, 'Cys': 212769, 'His': 210217, 'Met': 179555, 'Trp': 113733}


triangles_node {0: 4752711, 1: 1271258, 2: 679023, 3: 419953, 4: 246732, 5: 147803, 

In [None]:
base_NodesResult['betweennessWeighted_node'].value_counts()

0.000000    1736954
0.009524        214
0.006061        182
0.015152        153
0.023810        149
             ...   
0.011630          1
0.003959          1
0.013998          1
0.008573          1
0.095027          1
Name: betweennessWeighted_node, Length: 3165374, dtype: int64

In [None]:
def categories_column(df, cols=('node_pos_ScriptR', 'node_chain_ScriptR', 'Uniprot_AF_id_ScriptR', 'F_AF_ScriptR')):
    """Print how often each distinct value occurs in each selected column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarize.
    cols : str or iterable of str, optional
        Column name(s) to report. Defaults to the four derived columns this
        cell originally hard-coded.

    Returns
    -------
    None
        For every column a ``<column> {value: count, ...}`` line is printed,
        followed by blank lines.

    Raises
    ------
    KeyError
        If a requested column is not present in ``df``.
    """
    # A single name passed as str is treated as a one-column selection.
    if isinstance(cols, str):
        cols = [cols]
    for col in cols:
        counts = df[col].value_counts().to_dict()
        print(col, counts)
        print('\n')

categories_column(base_NodesResult)

node_pos_ScriptR {'95': 12318, '91': 12317, '93': 12278, '97': 12274, '82': 12274, '115': 12268, '106': 12267, '120': 12266, '94': 12266, '98': 12265, '114': 12264, '108': 12264, '87': 12261, '83': 12261, '109': 12261, '92': 12252, '118': 12251, '86': 12249, '96': 12249, '113': 12245, '117': 12242, '81': 12238, '99': 12238, '103': 12238, '85': 12233, '84': 12233, '116': 12229, '119': 12226, '111': 12224, '107': 12222, '80': 12222, '110': 12218, '101': 12213, '112': 12207, '105': 12205, '90': 12200, '102': 12196, '122': 12192, '104': 12186, '123': 12176, '100': 12170, '126': 12158, '88': 12158, '121': 12156, '124': 12151, '79': 12140, '125': 12126, '68': 12124, '77': 12120, '89': 12115, '75': 12111, '78': 12102, '67': 12086, '76': 12079, '72': 12075, '74': 12073, '70': 12060, '63': 12059, '61': 12055, '62': 12055, '69': 12052, '128': 12048, '65': 12048, '71': 12045, '127': 12042, '66': 12038, '73': 11993, '129': 11978, '60': 11970, '64': 11966, '59': 11933, '141': 11927, '134': 11909, '

In [None]:
#checking for missing values
base_NodesResult.isna().sum()

node_ScriptR                 0
degree_node_ScriptR          0
aminoAcid_ScriptR            0
triangles_node               0
clusteringCoef_node          0
betweennessWeighted_node    32
node_pos_ScriptR             0
node_chain_ScriptR           0
Uniprot_AF_id_ScriptR        0
F_AF_ScriptR                 0
dtype: int64

In [None]:
#Converting NaN values to '.'
# The replacement is applied to the whole frame, not to a single column; per the isna() summary
# above, only betweennessWeighted_node holds NaN at this point. Mixing the '.' string with floats
# turns that column into object dtype (see the .info() output further down).
# NOTE(review): regex=True looks unnecessary because the value being replaced is np.nan, not a
# pattern - confirm before removing it.
import numpy as np
base_NodesResult = base_NodesResult.replace(np.nan, '.', regex=True)

In [None]:
#checking for missing values
base_NodesResult.isna().sum()

node_ScriptR                0
degree_node_ScriptR         0
aminoAcid_ScriptR           0
triangles_node              0
clusteringCoef_node         0
betweennessWeighted_node    0
node_pos_ScriptR            0
node_chain_ScriptR          0
Uniprot_AF_id_ScriptR       0
F_AF_ScriptR                0
dtype: int64

In [None]:
import numpy as np
def clean_column(df, columns=('betweennessWeighted_node',), sentinels=('.',)):
    """Drop rows with a missing value in ``columns`` and restore numeric dtype.

    A value counts as missing when it equals one of ``sentinels`` (the '.'
    placeholder by default) or is NaN. After filtering, each cleaned column is
    converted back to a numeric dtype, so it does not stay ``object`` merely
    because it once shared the column with the placeholder string.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    columns : str or iterable of str, optional
        Column(s) to clean. Defaults to ``betweennessWeighted_node``.
    sentinels : str or iterable, optional
        Placeholder value(s) marking a missing entry. Defaults to ``'.'``.

    Returns
    -------
    pandas.DataFrame
        A copy holding only the rows that are valid in every cleaned column;
        the original index labels are kept.

    Raises
    ------
    KeyError
        If a requested column is not present in ``df``.
    ValueError
        If a cleaned column still contains a non-numeric value.
    """
    # Accept a single name/placeholder without iterating over its characters.
    if isinstance(columns, str):
        columns = [columns]
    if isinstance(sentinels, str):
        sentinels = [sentinels]

    # Positional boolean mask, so the filter does not depend on index alignment.
    keep = np.ones(len(df), dtype=bool)
    for col in columns:
        values = df[col]
        keep &= values.notna().to_numpy()
        for marker in sentinels:
            keep &= (values != marker).to_numpy()

    # Copy before assigning so the caller's frame is untouched (and no SettingWithCopyWarning).
    cleaned = df[keep].copy()
    for col in columns:
        cleaned[col] = pd.to_numeric(cleaned[col])
    return cleaned

base_NodesResult_ok = clean_column(base_NodesResult)

In [None]:
base_NodesResult_ok.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7695606 entries, 0 to 7695637
Data columns (total 10 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   node_ScriptR              object 
 1   degree_node_ScriptR       int64  
 2   aminoAcid_ScriptR         object 
 3   triangles_node            int64  
 4   clusteringCoef_node       float64
 5   betweennessWeighted_node  object 
 6   node_pos_ScriptR          object 
 7   node_chain_ScriptR        object 
 8   Uniprot_AF_id_ScriptR     object 
 9   F_AF_ScriptR              object 
dtypes: float64(1), int64(2), object(7)
memory usage: 645.8+ MB


In [None]:
#Identify duplicate records in the data
# duplicated() flags every row that repeats an earlier row across all columns.
dupes=base_NodesResult_ok.duplicated()
# Vectorized count of flagged rows (the builtin sum() would iterate the Series element by element);
# int() keeps the displayed result a plain Python integer.
int(dupes.sum())

0

###1.1.9 Generating a file with the processed *base_NodesResult* database

In [None]:
base_NodesResult_ok.to_csv("drive/My Drive/ProcessaNovaBase/TrataArqsScriptDiegoAlphaFold/ALL-AF_NodesResult_proc.csv",sep=',',index=False)

##1.2 Processing of the *ALL-AF_GraphsResult* database

In [None]:
#Increasing the capacity to view columns and rows
import pandas as pd

pd.set_option('display.max_columns', 7000)
pd.set_option('display.max_rows',90000)

In [None]:
import pandas as pd

# Load the combined per-graph results; the path is relative to the notebook's working directory.
graphs_result_csv = "drive/My Drive/ProcessaNovaBase/TrataArqsScriptDiegoAlphaFold/ALL-AF_GraphsResult.csv"
base_GraphsResult = pd.read_csv(
    graphs_result_csv,
    delimiter=',',
    index_col=False,  # do not use the first CSV column as the row index
)

In [None]:
base_GraphsResult.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17091 entries, 0 to 17090
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Unnamed: 0           17091 non-null  int64  
 1   degree               17091 non-null  float64
 2   clusteringCoef       17091 non-null  float64
 3   betweennessWeighted  17075 non-null  float64
 4   graphAssortativity   17071 non-null  float64
 5   filename             17091 non-null  object 
dtypes: float64(4), int64(1), object(1)
memory usage: 801.3+ KB


In [None]:
base_GraphsResult.head()

Unnamed: 0.1,Unnamed: 0,degree,clusteringCoef,betweennessWeighted,graphAssortativity,filename
0,1,3.664286,0.013895,0.007536,0.390473,AF-A0AUZ9-F1-model_v4.pdb.edges
1,2,5.507576,0.04131,0.007514,0.39217,AF-A0AV02-F1-model_v4.pdb.edges
2,3,5.54902,0.048405,0.020242,0.410746,AF-A0AV96-F1-model_v4.pdb.edges
3,4,7.157058,0.059276,0.021251,0.240322,AF-A0AVF1-F1-model_v4.pdb.edges
4,5,5.949527,0.057706,0.029168,0.395552,AF-A0AVI4-F1-model_v4.pdb.edges


In [None]:
base_GraphsResult.tail()

Unnamed: 0.1,Unnamed: 0,degree,clusteringCoef,betweennessWeighted,graphAssortativity,filename
17086,17087,6.077071,0.078517,0.005897,0.317584,AF-Q9Y6Y0-F1-model_v4.pdb.edges
17087,17088,5.159639,0.035099,0.009461,0.436944,AF-Q9Y6Y1-F1-model_v4.pdb.edges
17088,17089,6.108475,0.039751,0.010267,0.405921,AF-Q9Y6Y8-F1-model_v4.pdb.edges
17089,17090,4.911765,0.042665,0.036651,0.387542,AF-Q9Y6Y9-F1-model_v4.pdb.edges
17090,17091,6.329412,0.039308,0.022113,0.329955,AF-Q9Y6Z7-F1-model_v4.pdb.edges


In [None]:
#checking for missing values
base_GraphsResult.isna().sum()

Unnamed: 0              0
degree                  0
clusteringCoef          0
betweennessWeighted    16
graphAssortativity     20
filename                0
dtype: int64

In [None]:
def categories_column(df, cols=('degree', 'clusteringCoef', 'graphAssortativity')):
    """Print the ``{value: count}`` frequency table of each selected column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarize.
    cols : str or iterable of str, optional
        Column name(s) to report. Defaults to the three graph-level columns
        this cell originally hard-coded.

    Returns
    -------
    None
        One line per column is printed, each followed by blank lines. NaN
        entries are not counted (``value_counts`` default).

    Raises
    ------
    KeyError
        If a requested column is not present in ``df``.
    """
    # Treat a bare string as one column name rather than a sequence of characters.
    if isinstance(cols, str):
        cols = [cols]
    for col in cols:
        counts = df[col].value_counts().to_dict()
        print(col, counts)
        print('\n')

categories_column(base_GraphsResult)

degree {6.0: 46, 4.0: 29, 5.0: 20, 1.0: 16, 2.0: 12, 5.33333333333333: 12, 5.5: 10, 4.5: 10, 4.25: 9, 4.66666666666667: 9, 3.0: 9, 5.71428571428571: 8, 5.2: 8, 6.5: 8, 6.66666666666667: 8, 6.28571428571429: 8, 5.25: 7, 6.25: 7, 5.42857142857143: 7, 6.6: 7, 4.47619047619048: 6, 5.41666666666667: 6, 4.28571428571429: 6, 6.4: 6, 4.85714285714286: 6, 2.66666666666667: 6, 4.52631578947368: 6, 4.4390243902439: 6, 4.75: 6, 5.75: 6, 5.81818181818182: 6, 6.57142857142857: 6, 5.76470588235294: 5, 4.8: 5, 6.2: 5, 6.15384615384615: 5, 6.16666666666667: 5, 5.66666666666667: 5, 4.94117647058824: 5, 3.6: 5, 4.22222222222222: 5, 6.1: 5, 5.05084745762712: 5, 6.24: 5, 6.12121212121212: 5, 6.3: 5, 5.73333333333333: 5, 4.90909090909091: 5, 5.6969696969697: 5, 7.0: 5, 5.04761904761905: 5, 6.42857142857143: 4, 3.2: 4, 5.375: 4, 5.4: 4, 1.5: 4, 5.69230769230769: 4, 4.11428571428571: 4, 6.21052631578947: 4, 4.21052631578947: 4, 6.22222222222222: 4, 4.76923076923077: 4, 4.33333333333333: 4, 4.85245901639344: 4

In [None]:
base_GraphsResult['betweennessWeighted'].value_counts()

0.000000    15
0.017224     2
0.009524     2
0.002020     2
0.083333     2
0.001166     2
0.080054     2
0.216667     2
0.053834     1
0.003922     1
0.012510     1
0.011490     1
0.007536     1
0.011680     1
0.003280     1
0.026803     1
0.001010     1
0.016666     1
0.004666     1
0.017091     1
0.001500     1
0.021883     1
0.012569     1
0.002180     1
0.009271     1
0.009143     1
0.008189     1
0.011907     1
0.005515     1
0.013537     1
0.004911     1
0.007591     1
0.001808     1
0.006950     1
0.009908     1
0.015559     1
0.008583     1
0.026264     1
0.010578     1
0.015785     1
0.013101     1
0.012939     1
0.005869     1
0.013466     1
0.020510     1
0.011444     1
0.003714     1
0.014702     1
0.017775     1
0.020981     1
0.023696     1
0.024089     1
0.022322     1
0.011853     1
0.018950     1
0.014136     1
0.012194     1
0.002677     1
0.007265     1
0.013372     1
0.006803     1
0.016141     1
0.016674     1
0.002976     1
0.030248     1
0.014563     1
0.012785  

###1.2.1 Selection of fields to be used

In [None]:
#Field selection: keep the graph-level columns, leaving out the 'Unnamed: 0' row counter
graph_fields = [
    'degree',
    'clusteringCoef',
    'betweennessWeighted',
    'graphAssortativity',
    'filename',
]
base_GraphsResult = base_GraphsResult[graph_fields]

In [None]:
base_GraphsResult.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17091 entries, 0 to 17090
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   degree               17091 non-null  float64
 1   clusteringCoef       17091 non-null  float64
 2   betweennessWeighted  17075 non-null  float64
 3   graphAssortativity   17071 non-null  float64
 4   filename             17091 non-null  object 
dtypes: float64(4), object(1)
memory usage: 667.7+ KB


###1.2.2 Renaming the fields

In [None]:
# Old -> new names for the per-graph columns.
graph_column_names = {
    'degree': 'degree_Graph_ScriptR',
    'clusteringCoef': 'clusteringCoef_Graph',
    'betweennessWeighted': 'betweennessWeighted_Graph',
    'filename': 'filename_Graph',
}
# 'graphAssortativity' is not in the mapping, so it keeps its name; the frame is modified in place.
base_GraphsResult.rename(columns=graph_column_names, inplace=True)

In [None]:
base_GraphsResult.head()

Unnamed: 0,degree_Graph_ScriptR,clusteringCoef_Graph,betweennessWeighted_Graph,graphAssortativity,filename_Graph
0,3.664286,0.013895,0.007536,0.390473,AF-A0AUZ9-F1-model_v4.pdb.edges
1,5.507576,0.04131,0.007514,0.39217,AF-A0AV02-F1-model_v4.pdb.edges
2,5.54902,0.048405,0.020242,0.410746,AF-A0AV96-F1-model_v4.pdb.edges
3,7.157058,0.059276,0.021251,0.240322,AF-A0AVF1-F1-model_v4.pdb.edges
4,5.949527,0.057706,0.029168,0.395552,AF-A0AVI4-F1-model_v4.pdb.edges


###1.2.3 Generation of an intermediate file with the selected fields from the *base_GraphsResult* database.  

In [None]:
base_GraphsResult.to_csv("drive/My Drive/ProcessaNovaBase/TrataArqsScriptDiegoAlphaFold/ALL-AF_GraphsResult_sel.csv",sep=',',index=False)

###1.2.4 Generation of the *Uniprot_AF_id_Graph_ScriptR* attribute extracted from the *filename_Graph* attribute.

In [None]:
#Increasing the capacity to view columns and rows
import pandas as pd

pd.set_option('display.max_columns', 7000)
pd.set_option('display.max_rows',90000)

In [None]:
import pandas as pd
base_GraphsResult = pd.read_csv("drive/My Drive/ProcessaNovaBase/TrataArqsScriptDiegoAlphaFold/ALL-AF_GraphsResult_sel.csv",sep=',')

In [None]:
base_GraphsResult.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17091 entries, 0 to 17090
Data columns (total 5 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   degree_Graph_ScriptR       17091 non-null  float64
 1   clusteringCoef_Graph       17091 non-null  float64
 2   betweennessWeighted_Graph  17075 non-null  float64
 3   graphAssortativity         17071 non-null  float64
 4   filename_Graph             17091 non-null  object 
dtypes: float64(4), object(1)
memory usage: 667.7+ KB


In [None]:
def get_uniprot(filename):
  """Return the second hyphen-separated field of ``filename``.

  For a name shaped like ``AF-A0AUZ9-F1-model_v4.pdb.edges`` this field is
  the accession ``A0AUZ9``.

  Raises:
    IndexError: if ``filename`` contains no hyphen.
  """
  # Only the first two hyphens matter; whatever follows stays in one trailing piece.
  fields = filename.split("-", 2)
  return fields[1]

In [None]:
# Derive the accession column element-wise from each edges-file name.
base_GraphsResult['Uniprot_AF_id_Graph_ScriptR'] = base_GraphsResult['filename_Graph'].map(get_uniprot)

In [None]:
# Most frequent accessions: a count above 1 means several rows share that accession.
# The listing is capped at 20 rows because this notebook raises display.max_rows
# to 90000, so the uncapped result would print one line per distinct accession.
base_GraphsResult["Uniprot_AF_id_Graph_ScriptR"].value_counts().head(20)

Q8NF91    38
Q03001    32
Q9UPN3    31
Q8WXH0    29
Q8WXG9    26
Q09666    24
Q96RW7    23
Q8IVF2    23
Q9HC84    23
O14686    22
Q9NU22    22
O75445    21
Q5T4S7    20
P21817    20
Q9Y6V0    20
Q2LD37    20
Q4G0P3    20
Q8NEZ4    19
Q15413    19
Q6V0I7    19
Q15751    19
O95714    19
Q92736    19
P98164    18
Q8TE73    18
Q15149    18
Q14204    18
Q96M86    18
Q14517    17
Q685J3    17
Q96JB1    17
Q07954    17
Q9NZR2    17
Q9NYC9    17
Q8IVF4    17
Q8TDW7    17
Q9NZJ4    17
Q96DT5    17
Q9P225    17
Q86WI1    16
Q12955    16
Q9NYQ8    16
Q8NCM8    16
P98160    16
P98161    16
Q8N3K9    15
Q7Z7G8    15
P20930    15
P78527    15
Q8WXX0    15
Q8TD57    15
Q9C0G6    15
P08F94    15
Q99996    14
Q01484    14
Q9UPA5    14
Q9Y4A5    14
Q03164    14
Q99698    14
Q709C8    13
Q9NRC6    13
Q96T58    13
Q7Z407    13
Q15911    13
O60494    13
Q96Q15    13
O15230    13
P78509    12
Q5T011    12
Q4LDE5    12
Q9UKN7    12
Q0VDD8    12
P46939    12
Q96PZ7    12
Q8IZQ1    12
Q8IZT6    12
Q7Z408    12

In [None]:
# First five rows, now including the derived accession column.
base_GraphsResult.head(n=5)

Unnamed: 0,degree_Graph_ScriptR,clusteringCoef_Graph,betweennessWeighted_Graph,graphAssortativity,filename_Graph,Uniprot_AF_id_Graph_ScriptR
0,3.664286,0.013895,0.007536,0.390473,AF-A0AUZ9-F1-model_v4.pdb.edges,A0AUZ9
1,5.507576,0.04131,0.007514,0.39217,AF-A0AV02-F1-model_v4.pdb.edges,A0AV02
2,5.54902,0.048405,0.020242,0.410746,AF-A0AV96-F1-model_v4.pdb.edges,A0AV96
3,7.157058,0.059276,0.021251,0.240322,AF-A0AVF1-F1-model_v4.pdb.edges,A0AVF1
4,5.949527,0.057706,0.029168,0.395552,AF-A0AVI4-F1-model_v4.pdb.edges,A0AVI4


In [None]:
# Last five rows, now including the derived accession column.
base_GraphsResult.tail(n=5)

Unnamed: 0,degree_Graph_ScriptR,clusteringCoef_Graph,betweennessWeighted_Graph,graphAssortativity,filename_Graph,Uniprot_AF_id_Graph_ScriptR
17086,6.077071,0.078517,0.005897,0.317584,AF-Q9Y6Y0-F1-model_v4.pdb.edges,Q9Y6Y0
17087,5.159639,0.035099,0.009461,0.436944,AF-Q9Y6Y1-F1-model_v4.pdb.edges,Q9Y6Y1
17088,6.108475,0.039751,0.010267,0.405921,AF-Q9Y6Y8-F1-model_v4.pdb.edges,Q9Y6Y8
17089,4.911765,0.042665,0.036651,0.387542,AF-Q9Y6Y9-F1-model_v4.pdb.edges,Q9Y6Y9
17090,6.329412,0.039308,0.022113,0.329955,AF-Q9Y6Z7-F1-model_v4.pdb.edges,Q9Y6Z7


###1.2.5 Generation of the *F_AF_Graph_ScriptR* attribute extracted from the *filename_Graph* attribute.

In [None]:
def get_F(filename):
  """Return the third hyphen-separated field of ``filename``.

  For a name shaped like ``AF-A0AUZ9-F1-model_v4.pdb.edges`` this field is
  ``F1``.

  Raises:
    IndexError: if ``filename`` contains fewer than two hyphens.
  """
  # Splitting stops after the third hyphen; the remainder is never inspected.
  fields = filename.split("-", 3)
  return fields[2]

In [None]:
# Derive the F-field column element-wise from each edges-file name.
base_GraphsResult['F_AF_Graph_ScriptR'] = base_GraphsResult['filename_Graph'].map(get_F)

In [None]:
# Last 50 rows, to check both derived columns against the file names.
base_GraphsResult.tail(n=50)

Unnamed: 0,degree_Graph_ScriptR,clusteringCoef_Graph,betweennessWeighted_Graph,graphAssortativity,filename_Graph,Uniprot_AF_id_Graph_ScriptR,F_AF_Graph_ScriptR
17041,7.007194,0.055948,0.012151,0.459191,AF-Q9Y6P5-F1-model_v4.pdb.edges,Q9Y6P5,F1
17042,5.676329,0.028845,0.016001,0.434037,AF-Q9Y6Q2-F1-model_v4.pdb.edges,Q9Y6Q2,F1
17043,5.218509,0.056392,0.008552,0.365718,AF-Q9Y6Q3-F1-model_v4.pdb.edges,Q9Y6Q3,F1
17044,5.593583,0.04608,0.028891,0.353928,AF-Q9Y6Q5-F1-model_v4.pdb.edges,Q9Y6Q5,F1
17045,4.875576,0.018653,0.016333,0.427626,AF-Q9Y6Q6-F1-model_v4.pdb.edges,Q9Y6Q6,F1
17046,4.468271,0.015343,0.004433,0.650899,AF-Q9Y6Q9-F1-model_v4.pdb.edges,Q9Y6Q9,F1
17047,4.348148,0.034895,0.005991,0.527461,AF-Q9Y6R0-F1-model_v4.pdb.edges,Q9Y6R0,F1
17048,6.181646,0.053207,0.005158,0.472993,AF-Q9Y6R4-F1-model_v4.pdb.edges,Q9Y6R4,F1
17049,5.797235,0.060859,0.00324,0.365102,AF-Q9Y6R6-F1-model_v4.pdb.edges,Q9Y6R6,F1
17050,4.5,0.016776,0.010732,0.487765,AF-Q9Y6R9-F1-model_v4.pdb.edges,Q9Y6R9,F1


The **filename_Graph** attribute is no longer needed, so it is removed.

In [None]:
# Remove the source file-name column now that both derived columns exist.
# errors="ignore" keeps this cell re-runnable: `del` raised KeyError on a second
# execution because the column was already gone.
base_GraphsResult = base_GraphsResult.drop(columns=['filename_Graph'], errors='ignore')

In [None]:
# Confirm the remaining columns and their non-null counts after removing filename_Graph.
base_GraphsResult.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17091 entries, 0 to 17090
Data columns (total 6 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   degree_Graph_ScriptR         17091 non-null  float64
 1   clusteringCoef_Graph         17091 non-null  float64
 2   betweennessWeighted_Graph    17075 non-null  float64
 3   graphAssortativity           17071 non-null  float64
 4   Uniprot_AF_id_Graph_ScriptR  17091 non-null  object 
 5   F_AF_Graph_ScriptR           17091 non-null  object 
dtypes: float64(4), object(2)
memory usage: 801.3+ KB


###1.2.6 Deletion of records that have attributes with the value *NaN*

In [None]:
# Number of missing values in each column before cleaning.
base_GraphsResult.isnull().sum()

degree_Graph_ScriptR            0
clusteringCoef_Graph            0
betweennessWeighted_Graph      16
graphAssortativity             20
Uniprot_AF_id_Graph_ScriptR     0
F_AF_Graph_ScriptR              0
dtype: int64

In [None]:
# Keep only the rows in which every column has a value; the original frame is left untouched.
base_GraphsResult_ok = base_GraphsResult.dropna(axis=0, how='any')


In [None]:
# Number of missing values in each column after cleaning (all expected to be zero).
base_GraphsResult_ok.isnull().sum()

degree_Graph_ScriptR           0
clusteringCoef_Graph           0
betweennessWeighted_Graph      0
graphAssortativity             0
Uniprot_AF_id_Graph_ScriptR    0
F_AF_Graph_ScriptR             0
dtype: int64

In [None]:
# Column dtypes and row count of the cleaned table.
base_GraphsResult_ok.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17071 entries, 0 to 17090
Data columns (total 6 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   degree_Graph_ScriptR         17071 non-null  float64
 1   clusteringCoef_Graph         17071 non-null  float64
 2   betweennessWeighted_Graph    17071 non-null  float64
 3   graphAssortativity           17071 non-null  float64
 4   Uniprot_AF_id_Graph_ScriptR  17071 non-null  object 
 5   F_AF_Graph_ScriptR           17071 non-null  object 
dtypes: float64(4), object(2)
memory usage: 933.6+ KB


In [None]:
# Flag rows that repeat an earlier row across all columns, then count them.
dupes = base_GraphsResult_ok.duplicated()
# Vectorized count; the builtin sum() would iterate the Series element by element.
int(dupes.sum())

0

###1.2.7 Generation of an intermediate file with the processed *base_GraphsResult* database.

In [None]:
# Persist the cleaned table as a comma-separated file, without the row index.
base_GraphsResult_ok.to_csv(
    "drive/My Drive/ProcessaNovaBase/TrataArqsScriptDiegoAlphaFold/ALL-AF_GraphsResult_proc.csv",
    index=False,
    sep=',',
)
