#0 - Basic Settings

In [None]:
#Mount Google Drive at /content/drive so this runtime is allowed to access the files stored there.
#Later cells use relative paths such as "drive/My Drive/...", so they only resolve while the
#working directory is /content.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#Raise the pandas display limits so wide and long frames are shown without truncation
import pandas as pd

for display_option, display_limit in (('display.max_columns', 7000),
                                      ('display.max_rows', 90000)):
    pd.set_option(display_option, display_limit)

In [None]:
#Capture the output lines of the `nvidia-smi` shell command (IPython `!` syntax) and join them into one string
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
#The word 'failed' in the captured text is taken to mean that no GPU is attached to this runtime
if gpu_info.find('failed') >= 0:
  print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  #Otherwise show the nvidia-smi report itself
  print(gpu_info)

Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, 
and then re-execute this cell.


In [None]:
from psutil import virtual_memory

#Memory of the runtime in decimal gigabytes (bytes / 1e9).
#Note: this reads `.total`, even though the message below calls it "available".
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

#20 GB or more is reported as a high-RAM runtime; anything below gets the upgrade hint
if ram_gb >= 20:
  print('You are using a high-RAM runtime!')
else:
  print('To enable a high-RAM runtime, select the Runtime > "Change runtime type"')
  print('menu, and then select High-RAM in the Runtime shape dropdown. Then, ')
  print('re-execute this cell.')

Your runtime has 27.3 gigabytes of available RAM

You are using a high-RAM runtime!


#1 - Reading and processing of cifs_pdbs_NodesResult files


In this section, the following files will be read:
- **cifs_pdbs_NodesResult_G1.csv**
- **cifs_pdbs_NodesResult_G2.csv**
- **cifs_pdbs_NodesResult_G3.csv**
- **cifs_pdbs_NodesResult_G3_3D3W.csv**
- **cifs_pdbs_NodesResult_G4.csv**
- **cifs_pdbs_NodesResult_G5.csv**
- **cifs_pdbs_NodesResult_G6.csv**
- **cifs_pdbs_NodesResult_G7.csv**
- **cifs_pdbs_NodesResult_G8.csv**
- **cifs_pdbs_NodesResult_G9.csv**

They contain the output of the **R** script by Diego Morais & Dalmolin, which calculates the **Clustering Coefficient** and the **Betweenness** of the interaction network of each PDB. The R script received as input the edge files of each PDB, which were generated by **RING**. The script is in the **/My Drive/ProcessaNovaBase/TrataArqsScriptDiego** folder on this drive.

##1.1 Processing of cifs_pdbs_NodesResult databases

In [None]:
import pandas as pd

#All group files sit in the same folder and differ only in the suffix after "NodesResult_"
nodes_csv_pattern = "drive/My Drive/ProcessaNovaBase/TrataArqsScriptDiego/cifs_pdbs_NodesResult_{}.csv"
nodes_group_suffixes = ("G1", "G2", "G3", "G3_3D3W", "G4", "G5", "G6", "G7", "G8", "G9")

#Read the files in the listed order and bind each frame to its own df_<suffix> name
(df_G1, df_G2, df_G3, df_G3_3D3W, df_G4,
 df_G5, df_G6, df_G7, df_G8, df_G9) = (
    pd.read_csv(nodes_csv_pattern.format(suffix), index_col=False, delimiter=',')
    for suffix in nodes_group_suffixes
)


In [None]:
#Stack the ten group frames, in order, into one table with a fresh 0..n-1 index.
#pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and removed in pandas 2.0.
base_NodesResult = pd.concat(
    [df_G1, df_G2, df_G3, df_G3_3D3W, df_G4, df_G5, df_G6, df_G7, df_G8, df_G9],
    ignore_index=True,
)

In [None]:
#Summarize row count, column dtypes and memory use of the concatenated node table
base_NodesResult.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3048870 entries, 0 to 3048869
Data columns (total 8 columns):
 #   Column               Dtype  
---  ------               -----  
 0   Unnamed: 0           int64  
 1   node                 object 
 2   degree               int64  
 3   aminoAcid            object 
 4   triangles            int64  
 5   clusteringCoef       float64
 6   betweennessWeighted  float64
 7   filename             object 
dtypes: float64(2), int64(3), object(3)
memory usage: 186.1+ MB


In [None]:
#Preview the first rows of the concatenated node table
base_NodesResult.head()

Unnamed: 0.1,Unnamed: 0,node,degree,aminoAcid,triangles,clusteringCoef,betweennessWeighted,filename
0,1,A:10:_:VAL,84,VAL,2,0.000574,0.022039,10GS.pdb.edges
1,2,A:100:_:ARG,11,ARG,3,0.054545,0.032282,10GS.pdb.edges
2,3,A:101:_:CYS,5,CYS,0,0.0,0.014851,10GS.pdb.edges
3,4,A:102:_:LYS,7,LYS,2,0.095238,0.035155,10GS.pdb.edges
4,5,A:103:_:TYR,16,TYR,1,0.008333,0.029593,10GS.pdb.edges


###1.1.1 Selection of fields that will be used

In [None]:
#Selection of fields: keep the node label, its metrics and the source file name
node_fields = ['node', 'degree', 'triangles', 'clusteringCoef', 'betweennessWeighted', 'filename']
base_NodesResult = base_NodesResult.loc[:, node_fields]

In [None]:
#Confirm that only the selected columns remain
base_NodesResult.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3048870 entries, 0 to 3048869
Data columns (total 6 columns):
 #   Column               Dtype  
---  ------               -----  
 0   node                 object 
 1   degree               int64  
 2   triangles            int64  
 3   clusteringCoef       float64
 4   betweennessWeighted  float64
 5   filename             object 
dtypes: float64(2), int64(2), object(2)
memory usage: 139.6+ MB


###1.1.2 Renaming of fields

In [None]:
#Tag the node-level columns with a node/ScriptR suffix; 'filename' keeps its name.
#(Presumably this keeps them apart from the graph-level metrics of section 1.2, which use the same original names.)
node_column_names = {
    'node': 'node_ScriptR',
    'degree': 'degree_node_ScriptR',
    'triangles': 'triangles_node',
    'clusteringCoef': 'clusteringCoef_node',
    'betweennessWeighted': 'betweennessWeighted_node',
}
base_NodesResult = base_NodesResult.rename(columns=node_column_names)

In [None]:
#Preview the table with the renamed columns
base_NodesResult.head()

Unnamed: 0,node_ScriptR,degree_node_ScriptR,triangles_node,clusteringCoef_node,betweennessWeighted_node,filename
0,A:10:_:VAL,84,2,0.000574,0.022039,10GS.pdb.edges
1,A:100:_:ARG,11,3,0.054545,0.032282,10GS.pdb.edges
2,A:101:_:CYS,5,0,0.0,0.014851,10GS.pdb.edges
3,A:102:_:LYS,7,2,0.095238,0.035155,10GS.pdb.edges
4,A:103:_:TYR,16,1,0.008333,0.029593,10GS.pdb.edges


###1.1.3 Generation of an intermediate file with the selected fields from the *base_NodesResult* database

In [None]:
#Write the selected and renamed columns to an intermediate CSV; index=False leaves the row index out of the file
base_NodesResult.to_csv("drive/My Drive/ProcessaNovaBase/TrataArqsScriptDiego/cifs_pdbs_NodesResult_All_sel.csv",sep=',',index=False)

###1.1.4 Extraction of the residue, its position, and chain in the *node* attribute

The **node_ScriptR** attribute was obtained from the **edge** file (RING output). It has the following format:

`<chain>:<index>:<insertion_code>:<residue_3_letter_code>`

The chain, the position (index), and the node identifier (which can be a residue or a ligand) will be extracted from **node_ScriptR** into the following attributes:

**node_id_ScriptR**: can be a residue or ligand, 3-letter code

**node_pos_ScriptR**: the position of the residue

**node_chain_ScriptR**: the chain where the residue is located

In [None]:
#Increasing the capacity to view columns and rows.
#Same settings as in section 0 - presumably repeated so that this section can be run on its own from a fresh kernel (TODO confirm).
import pandas as pd

pd.set_option('display.max_columns', 7000)
pd.set_option('display.max_rows',90000)

In [None]:
#Reload the intermediate file written in 1.1.3, replacing whatever base_NodesResult held in memory
import pandas as pd
base_NodesResult = pd.read_csv("drive/My Drive/ProcessaNovaBase/TrataArqsScriptDiego/cifs_pdbs_NodesResult_All_sel.csv",sep=',')

In [None]:
#Check that the reloaded table has the expected columns and dtypes
base_NodesResult.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3048870 entries, 0 to 3048869
Data columns (total 6 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   node_ScriptR              object 
 1   degree_node_ScriptR       int64  
 2   triangles_node            int64  
 3   clusteringCoef_node       float64
 4   betweennessWeighted_node  float64
 5   filename                  object 
dtypes: float64(2), int64(2), object(2)
memory usage: 139.6+ MB


In [None]:
#Count how many rows carry each distinct node label
base_NodesResult["node_ScriptR"].value_counts()

A:144:_:LEU     607
A:66:_:LEU      597
A:121:_:VAL     590
A:164:_:LEU     574
A:105:_:LEU     561
               ... 
F:1289:_:ACT      1
B:1040:_:MET      1
N:339:_:LYS       1
I:326:_:GLY       1
S:45:_:TYR        1
Name: node_ScriptR, Length: 263084, dtype: int64

In [None]:
#node_ScriptR looks like "A:10:_:VAL"; the 4th ':'-separated field is the residue/ligand code.
#The vectorized .str accessor replaces the per-row lambda and leaves a missing value, instead of
#raising, for a label that is missing or has fewer than four fields.
base_NodesResult["node_id_ScriptR"] = base_NodesResult["node_ScriptR"].str.split(":").str[3]

In [None]:
#Residue codes that are rewritten from upper case to capitalized form: ALA -> Ala, ARG -> Arg, ...
Amin = ['ALA', 'ARG', 'ASN', 'ASP', 'CYS', 'GLN', 'GLU', 'GLY', 'HIS', 'ILE', 'LEU', 'LYS',
      'MET', 'PHE', 'PRO', 'PYL', 'SER', 'SEC', 'THR', 'TRP', 'TYR', 'VAL', 'ASX', 'GLX']

#Only codes listed in Amin are capitalized; every other code (e.g. a ligand) is kept unchanged
node_codes = base_NodesResult["node_id_ScriptR"]
base_NodesResult["node_id_ScriptR"] = node_codes.where(~node_codes.isin(Amin), node_codes.str.capitalize())

In [None]:
#Frequency of each node code after the conversion; codes outside Amin stay in upper case
base_NodesResult["node_id_ScriptR"].value_counts()

Leu    311427
Val    219907
Ala    203816
Glu    191880
Lys    184075
Ser    182856
Gly    181078
Ile    163877
Thr    159156
Arg    156769
Asp    156082
Phe    138723
Pro    128487
Gln    126604
Asn    118223
Tyr    115247
His     83041
Met     67470
Cys     65802
Trp     47745
EDO      2740
SO4      2716
DG       2216
NAG      2196
DC       2193
DT       2080
GOL      2052
DA       2052
ZN       2040
MSE      1967
CA       1896
UNX      1439
CL       1390
MG       1158
HEM       675
NA        628
G         597
C         558
A         542
U         535
PO4       480
ACT       431
MAN       309
MN        299
PEG       273
IOD       257
HG        225
BR        193
ACE       189
K         184
FUC       183
MPD       171
SEP       166
DMS       160
FMT       159
GDP       157
NAD       156
BMA       155
FAD       137
CIT       127
ADP       126
GLC       126
PTR       125
NAP       124
BME       122
TPO       117
TYS       110
MES       109
CMO       107
NI        100
CGU        99
GSH   

In [None]:
#The 2nd ':'-separated field of node_ScriptR is the position; it is kept as text, not converted to a number.
#The vectorized .str accessor replaces the per-row lambda and yields a missing value instead of raising on a bad label.
base_NodesResult["node_pos_ScriptR"] = base_NodesResult["node_ScriptR"].str.split(":").str[1]

In [None]:
#The 1st ':'-separated field of node_ScriptR is the chain identifier, stored here in upper case.
#The vectorized .str accessor replaces the per-row lambda and yields a missing value instead of raising on a bad label.
#NOTE(review): upper-casing makes chains such as 'a' and 'A' indistinguishable - confirm that this is intended.
base_NodesResult["node_chain_ScriptR"] = base_NodesResult["node_ScriptR"].str.split(":").str[0].str.upper()

In [None]:
#Number of node rows per chain identifier
base_NodesResult["node_chain_ScriptR"].value_counts()

A    1443111
B     725059
C     256840
D     218103
H      81646
E      78782
F      55704
L      38686
G      37650
I      18768
X      14538
J      12652
K       8775
P       7623
M       7323
R       4744
O       4735
Y       4638
S       4347
N       4149
Q       3684
T       3683
U       3330
Z       2708
W       2251
V       2189
2       1604
1        583
3        258
4        214
0         87
5         82
9         82
8         81
6         81
7         80
Name: node_chain_ScriptR, dtype: int64

In [None]:
#Inspect the rows whose chain identifier is the digit '1' (the comparison is against the string '1')
base_NodesResult.query("node_chain_ScriptR == '1'")

Unnamed: 0,node_ScriptR,degree_node_ScriptR,triangles_node,clusteringCoef_node,betweennessWeighted_node,filename,node_id_ScriptR,node_pos_ScriptR,node_chain_ScriptR
103150,1:1:_:CYS,14,2,0.021978,0.018978,1C4U.pdb.edges,Cys,1,1
103151,1:1:A:ASP,1,0,0.0,0.0,1C4U.pdb.edges,Asp,1,1
103152,1:1:B:ALA,1,0,0.0,0.0,1C4U.pdb.edges,Ala,1,1
103153,1:1:C:GLU,10,0,0.0,0.005157,1C4U.pdb.edges,Glu,1,1
103154,1:1:D:GLY,4,1,0.166667,0.004229,1C4U.pdb.edges,Gly,1,1
103155,1:1:E:SER,3,1,0.333333,0.0,1C4U.pdb.edges,Ser,1,1
103156,1:10:_:LYS,2,0,0.0,0.009077,1C4U.pdb.edges,Lys,10,1
103157,1:11:_:SER,1,0,0.0,0.0,1C4U.pdb.edges,Ser,11,1
103158,1:12:_:LEU,1,0,0.0,0.0,1C4U.pdb.edges,Leu,12,1
103159,1:13:_:GLU,2,0,0.0,0.0,1C4U.pdb.edges,Glu,13,1


In [None]:
#Preview the first 25 rows with the three columns extracted from node_ScriptR
base_NodesResult.head(25)

Unnamed: 0,node_ScriptR,degree_node_ScriptR,triangles_node,clusteringCoef_node,betweennessWeighted_node,filename,node_id_ScriptR,node_pos_ScriptR,node_chain_ScriptR
0,A:10:_:VAL,84,2,0.000574,0.022039,10GS.pdb.edges,Val,10,A
1,A:100:_:ARG,11,3,0.054545,0.032282,10GS.pdb.edges,Arg,100,A
2,A:101:_:CYS,5,0,0.0,0.014851,10GS.pdb.edges,Cys,101,A
3,A:102:_:LYS,7,2,0.095238,0.035155,10GS.pdb.edges,Lys,102,A
4,A:103:_:TYR,16,1,0.008333,0.029593,10GS.pdb.edges,Tyr,103,A
5,A:104:_:ILE,39,3,0.004049,0.01391,10GS.pdb.edges,Ile,104,A
6,A:105:_:SER,4,0,0.0,0.005403,10GS.pdb.edges,Ser,105,A
7,A:106:_:LEU,8,3,0.107143,0.012945,10GS.pdb.edges,Leu,106,A
8,A:107:_:ILE,17,1,0.007353,0.008544,10GS.pdb.edges,Ile,107,A
9,A:108:_:TYR,92,2,0.000478,0.001577,10GS.pdb.edges,Tyr,108,A


###1.1.5 Extraction of *PDB_id* from the *filename* attribute

In [None]:
#The PDB id is the part of the file name before the first '.', e.g. "10GS.pdb.edges" -> "10GS".
#The vectorized .str accessor replaces the per-row lambda and propagates a missing file name instead of raising.
base_NodesResult["PDB_id_ScriptR"] = base_NodesResult["filename"].str.split(".").str[0]

In [None]:
#Make the PDB id all upper case; Series.str.upper() is the vectorized form of the per-row lambda
#and propagates missing values instead of raising.
base_NodesResult["PDB_id_ScriptR"] = base_NodesResult["PDB_id_ScriptR"].str.upper()

In [None]:
#Number of node rows per PDB id
base_NodesResult["PDB_id_ScriptR"].value_counts()

5LE5    6046
1QO5    6046
5LF4    6013
5LF6    6004
2Q3E    5401
4DVQ    5317
1ZY8    4558
4AY1    4175
4ZUL    3964
3N80    3911
1YDE    3887
1O02    3885
1O01    3877
1CW3    3860
1O00    3859
1NZZ    3856
1NZX    3845
1O05    3841
1N4S    3819
1N4Q    3803
5Z2C    3690
6I35    3689
1ZMD    3664
1ZMC    3663
3SOM    3588
6K0R    3497
4EJH    3491
2VCV    3376
3LPP    3322
6Y41    3306
5OKM    3267
3HHD    3212
5EOM    3192
5K1A    3067
1MX1    3015
6F3T    2877
7JNT    2836
1O7A    2801
3GJX    2797
7JOV    2792
1R9M    2784
1R9N    2763
3IWP    2715
2C10    2694
5FQD    2686
6YND    2567
3P8C    2567
4OKN    2545
3T3P    2531
1I10    2529
6I7S    2524
6UEL    2515
1HL5    2489
5Q0C    2482
4I5L    2423
5JYO    2371
3HN3    2365
3U1K    2348
1H6K    2301
5UZ0    2272
1Z6T    2240
1PKX    2235
4A63    2214
3HEI    2213
1PL0    2203
2VX2    2197
1DO8    2165
1IRI    2156
1GZ4    2155
1JIQ    2149
3UOM    2142
1JLH    2141
4CG4    2097
3V9G    2093
6HXL    2067
6ZEJ    2010
7DR4    1982

In [None]:
#Preview the first 20 rows including the new PDB_id_ScriptR column
base_NodesResult.head(20)

Unnamed: 0,node_ScriptR,degree_node_ScriptR,triangles_node,clusteringCoef_node,betweennessWeighted_node,filename,node_id_ScriptR,node_pos_ScriptR,node_chain_ScriptR,PDB_id_ScriptR
0,A:10:_:VAL,84,2,0.000574,0.022039,10GS.pdb.edges,Val,10,A,10GS
1,A:100:_:ARG,11,3,0.054545,0.032282,10GS.pdb.edges,Arg,100,A,10GS
2,A:101:_:CYS,5,0,0.0,0.014851,10GS.pdb.edges,Cys,101,A,10GS
3,A:102:_:LYS,7,2,0.095238,0.035155,10GS.pdb.edges,Lys,102,A,10GS
4,A:103:_:TYR,16,1,0.008333,0.029593,10GS.pdb.edges,Tyr,103,A,10GS
5,A:104:_:ILE,39,3,0.004049,0.01391,10GS.pdb.edges,Ile,104,A,10GS
6,A:105:_:SER,4,0,0.0,0.005403,10GS.pdb.edges,Ser,105,A,10GS
7,A:106:_:LEU,8,3,0.107143,0.012945,10GS.pdb.edges,Leu,106,A,10GS
8,A:107:_:ILE,17,1,0.007353,0.008544,10GS.pdb.edges,Ile,107,A,10GS
9,A:108:_:TYR,92,2,0.000478,0.001577,10GS.pdb.edges,Tyr,108,A,10GS


In [None]:
#Checking for 'missing' values: number of missing entries per column
base_NodesResult.isna().sum()

node_ScriptR                0
degree_node_ScriptR         0
triangles_node              0
clusteringCoef_node         0
betweennessWeighted_node    0
filename                    0
node_id_ScriptR             0
node_pos_ScriptR            0
node_chain_ScriptR          0
PDB_id_ScriptR              0
dtype: int64

###1.1.6 Generation of an intermediate file with the processed *base_NodesResult* database

In [None]:
#Write the processed node table, including the extracted columns, to CSV; index=False leaves the row index out of the file
base_NodesResult.to_csv("drive/My Drive/ProcessaNovaBase/TrataArqsScriptDiego/cifs_pdbs_NodesResult_proc.csv",sep=',',index=False)

##1.2 Processing of cifs_pds_GraphsResult databases

In [None]:
import pandas as pd

#All group files sit in the same folder and differ only in the suffix after "GraphsResult_"
graphs_csv_pattern = "drive/My Drive/ProcessaNovaBase/TrataArqsScriptDiego/cifs_pds_GraphsResult_{}.csv"
graphs_group_suffixes = ("G1", "G2", "G3", "G3_3D3W", "G4", "G5", "G6", "G7", "G8", "G9")

#Read the files in the listed order; the df_<suffix> names are rebound to the graph-level frames here
(df_G1, df_G2, df_G3, df_G3_3D3W, df_G4,
 df_G5, df_G6, df_G7, df_G8, df_G9) = (
    pd.read_csv(graphs_csv_pattern.format(suffix), index_col=False, delimiter=',')
    for suffix in graphs_group_suffixes
)


In [None]:
#Stack the ten group frames, in order, into one table with a fresh 0..n-1 index.
#pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and removed in pandas 2.0.
base_GraphsResult = pd.concat(
    [df_G1, df_G2, df_G3, df_G3_3D3W, df_G4, df_G5, df_G6, df_G7, df_G8, df_G9],
    ignore_index=True,
)

In [None]:
#Summarize row count, non-null counts, dtypes and memory use of the concatenated graph table
base_GraphsResult.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6713 entries, 0 to 6712
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Unnamed: 0           6713 non-null   int64  
 1   degree               6713 non-null   float64
 2   clusteringCoef       6713 non-null   float64
 3   betweennessWeighted  6713 non-null   float64
 4   graphAssortativity   6713 non-null   float64
 5   filename             6713 non-null   object 
dtypes: float64(4), int64(1), object(1)
memory usage: 314.8+ KB


In [None]:
#Preview the first rows of the concatenated graph table
base_GraphsResult.head()

Unnamed: 0.1,Unnamed: 0,degree,clusteringCoef,betweennessWeighted,graphAssortativity,filename
0,1,34.842365,0.043091,0.011917,-0.48242,10GS.pdb.edges
1,2,39.482927,0.047376,0.011863,-0.483344,11GS.pdb.edges
2,3,43.3125,0.052289,0.022028,-0.706934,121P.pdb.edges
3,4,8.189655,0.045981,0.020806,0.012907,12CA.pdb.edges
4,5,34.397059,0.052068,0.011775,-0.480735,12GS.pdb.edges


###1.2.1 Selection of fields that will be used.

In [None]:
#Selection of fields: keep the four graph-level metrics and the source file name
graph_fields = ['degree', 'clusteringCoef', 'betweennessWeighted', 'graphAssortativity', 'filename']
base_GraphsResult = base_GraphsResult.loc[:, graph_fields]

In [None]:
#Confirm that only the selected columns remain
base_GraphsResult.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6713 entries, 0 to 6712
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   degree               6713 non-null   float64
 1   clusteringCoef       6713 non-null   float64
 2   betweennessWeighted  6713 non-null   float64
 3   graphAssortativity   6713 non-null   float64
 4   filename             6713 non-null   object 
dtypes: float64(4), object(1)
memory usage: 262.4+ KB


###1.2.2 Renaming of fields

In [None]:
#Tag the graph-level columns with a Graph suffix; 'graphAssortativity' keeps its name
graph_column_names = {
    'degree': 'degree_Graph_ScriptR',
    'clusteringCoef': 'clusteringCoef_Graph',
    'betweennessWeighted': 'betweennessWeighted_Graph',
    'filename': 'filename_Graph',
}
base_GraphsResult = base_GraphsResult.rename(columns=graph_column_names)

In [None]:
#Preview the table with the renamed columns
base_GraphsResult.head()

Unnamed: 0,degree_Graph_ScriptR,clusteringCoef_Graph,betweennessWeighted_Graph,graphAssortativity,filename_Graph
0,34.842365,0.043091,0.011917,-0.48242,10GS.pdb.edges
1,39.482927,0.047376,0.011863,-0.483344,11GS.pdb.edges
2,43.3125,0.052289,0.022028,-0.706934,121P.pdb.edges
3,8.189655,0.045981,0.020806,0.012907,12CA.pdb.edges
4,34.397059,0.052068,0.011775,-0.480735,12GS.pdb.edges


###1.2.3 Generation of an intermediate file with the selected fields from the *base_GraphsResult* database

In [None]:
#Write the selected and renamed columns to an intermediate CSV; index=False leaves the row index out of the file
base_GraphsResult.to_csv("drive/My Drive/ProcessaNovaBase/TrataArqsScriptDiego/cifs_pds_GraphsResult_All_sel.csv",sep=',',index=False)

###1.2.4 Extraction of the *PDB_id* from the *filename* attribute.

In [None]:
#Increasing the capacity to view columns and rows.
#Same settings as in section 0 - presumably repeated so that this section can be run on its own from a fresh kernel (TODO confirm).
import pandas as pd

pd.set_option('display.max_columns', 7000)
pd.set_option('display.max_rows',90000)

In [None]:
#Reload the intermediate file written in 1.2.3, replacing whatever base_GraphsResult held in memory
import pandas as pd
base_GraphsResult = pd.read_csv("drive/My Drive/ProcessaNovaBase/TrataArqsScriptDiego/cifs_pds_GraphsResult_All_sel.csv",sep=',')

In [None]:
#Check that the reloaded table has the expected columns and dtypes
base_GraphsResult.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6713 entries, 0 to 6712
Data columns (total 5 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   degree_Graph_ScriptR       6713 non-null   float64
 1   clusteringCoef_Graph       6713 non-null   float64
 2   betweennessWeighted_Graph  6713 non-null   float64
 3   graphAssortativity         6713 non-null   float64
 4   filename_Graph             6713 non-null   object 
dtypes: float64(4), object(1)
memory usage: 262.4+ KB


In [None]:
#The PDB id is the part of the file name before the first '.', e.g. "10GS.pdb.edges" -> "10GS".
#The vectorized .str accessor replaces the per-row lambda and propagates a missing file name instead of raising.
base_GraphsResult["PDB_id_Graph_ScriptR"] = base_GraphsResult["filename_Graph"].str.split(".").str[0]

In [None]:
#Processing to ensure that the code is all uppercase.
#Series.str.upper() is the vectorized form of the per-row lambda and propagates missing values instead of raising.
base_GraphsResult["PDB_id_Graph_ScriptR"] = base_GraphsResult["PDB_id_Graph_ScriptR"].str.upper()

In [None]:
#Count rows per PDB id (one row per structure is presumably expected in the graph-level table - TODO confirm)
base_GraphsResult["PDB_id_Graph_ScriptR"].value_counts()

10GS    1
4EFO    1
4D86    1
4D6K    1
4D4Z    1
4D1P    1
4D0W    1
4D0P    1
4D0N    1
4CVO    1
4CVH    1
4CSR    1
4CRW    1
4CRU    1
4CPC    1
4CO8    1
4CO7    1
4CNM    1
4CN0    1
4CMT    1
4CML    1
4CLL    1
4CLF    1
4CKJ    1
4CH9    1
4D8O    1
4DCK    1
4DD8    1
4DRK    1
4ED5    1
4EBB    1
4E6R    1
4E5Y    1
4DZO    1
4DYL    1
4DY0    1
4DXT    1
4DX8    1
4DVQ    1
4DRI    1
4DEM    1
4DOT    1
4DNL    1
4DND    1
4DMB    1
4DM9    1
4DLO    1
4DKC    1
4DJ9    1
4DIP    1
4DHX    1
4CGV    1
4CG4    1
4CEK    1
4AYC    1
4BC2    1
4BBQ    1
4B94    1
4B93    1
4B91    1
4B90    1
4B4C    1
4B3F    1
4AZ9    1
4AYI    1
4AYA    1
4BHG    1
4AY9    1
4AY1    1
4AWA    1
4AVX    1
4AVS    1
4AUQ    1
4AU8    1
4ATM    1
4AT5    1
4ASZ    1
4BC3    1
4BK0    1
4CCG    1
4C0O    1
4CC9    1
4CC0    1
4CBZ    1
4CA1    1
4C9Y    1
4C9B    1
4C5W    1
4C5I    1
4C3Z    1
4C2J    1
4BX8    1
4BKW    1
4BWS    1
4BWE    1
4BVX    1
4BUZ    1
4BSP    1
4BSJ    1
4BQM    1


In [None]:
#Preview the first 20 rows including the new PDB_id_Graph_ScriptR column
base_GraphsResult.head(20)

Unnamed: 0,degree_Graph_ScriptR,clusteringCoef_Graph,betweennessWeighted_Graph,graphAssortativity,filename_Graph,PDB_id_Graph_ScriptR
0,34.842365,0.043091,0.011917,-0.48242,10GS.pdb.edges,10GS
1,39.482927,0.047376,0.011863,-0.483344,11GS.pdb.edges,11GS
2,43.3125,0.052289,0.022028,-0.706934,121P.pdb.edges,121P
3,8.189655,0.045981,0.020806,0.012907,12CA.pdb.edges,12CA
4,34.397059,0.052068,0.011775,-0.480735,12GS.pdb.edges,12GS
5,6.533333,0.065934,0.033248,0.421536,133L.pdb.edges,133L
6,6.559322,0.049219,0.032411,0.266855,134L.pdb.edges,134L
7,49.050847,0.047885,0.011922,-0.570438,13GS.pdb.edges,13GS
8,16.307692,0.055159,0.013318,-0.167203,16GS.pdb.edges,16GS
9,33.577396,0.051262,0.012556,-0.45291,18GS.pdb.edges,18GS


In [None]:
#Checking for 'missing' values: number of missing entries per column
base_GraphsResult.isna().sum()

degree_Graph_ScriptR         0
clusteringCoef_Graph         0
betweennessWeighted_Graph    0
graphAssortativity           0
filename_Graph               0
PDB_id_Graph_ScriptR         0
dtype: int64

In [None]:
#Final structure of the graph table before it is written to disk
base_GraphsResult.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6713 entries, 0 to 6712
Data columns (total 6 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   degree_Graph_ScriptR       6713 non-null   float64
 1   clusteringCoef_Graph       6713 non-null   float64
 2   betweennessWeighted_Graph  6713 non-null   float64
 3   graphAssortativity         6713 non-null   float64
 4   filename_Graph             6713 non-null   object 
 5   PDB_id_Graph_ScriptR       6713 non-null   object 
dtypes: float64(4), object(2)
memory usage: 314.8+ KB


In [None]:
#Spot-check the graph-level metrics of a single structure (PDB id 5SYT)
base_GraphsResult.query("PDB_id_Graph_ScriptR  == '5SYT'")

Unnamed: 0,degree_Graph_ScriptR,clusteringCoef_Graph,betweennessWeighted_Graph,graphAssortativity,filename_Graph,PDB_id_Graph_ScriptR
5419,186.920993,0.025547,0.010857,-0.494055,5SYT.pdb.edges,5SYT


###1.2.5 Generation of an intermediate file with the processed *base_GraphsResult* database.

In [None]:
#Write the processed graph table, including PDB_id_Graph_ScriptR, to CSV; index=False leaves the row index out of the file
base_GraphsResult.to_csv("drive/My Drive/ProcessaNovaBase/TrataArqsScriptDiego/cifs_pds_GraphsResult_proc.csv",sep=',',index=False)
