#0 - Basic Settings

In [None]:
# Request permission to access Google Drive and mount it at /content/drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Raise the pandas display limits for columns, rows and overall width
import pandas as pd

for _option, _limit in (
    ('display.max_columns', 7000),
    ('display.max_rows', 1000000),
    ('display.width', 7000),
):
    pd.set_option(_option, _limit)

#1 - Reading and processing of the file **plddt-global**


In this section, the file **plddt-global.csv** (created by Elionai) is read. It contains the **global pLDDT** value associated with each **Uniprot_AF**.

##1.1 Processing of the *plddt-global.csv* database.

In [None]:
import pandas as pd

# Load the raw global-pLDDT table; index_col=False tells pandas not to use the first column as the index
df_plddt = pd.read_csv(
    "drive/My Drive/ProcessaNovaBase/TrataArq_pLDDT/plddt-global.csv",
    sep=',',
    index_col=False,
)


In [None]:
df_plddt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17107 entries, 0 to 17106
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   id      17107 non-null  object 
 1    plddt  17107 non-null  float64
dtypes: float64(1), object(1)
memory usage: 267.4+ KB


In [None]:
df_plddt.head()

Unnamed: 0,id,plddt
0,AF-A0A183-F1,62.78
1,AF-A0AUZ9-F1,50.24
2,AF-A0AV02-F1,68.87
3,AF-A0AV96-F1,64.31
4,AF-A0AVF1-F1,91.28


In [None]:
#Identify duplicate records in the data (number of fully repeated rows)
dupes = df_plddt.duplicated()
int(dupes.sum())

0

In [None]:
#Checking for 'missing' values
df_plddt.isna().sum()

id        0
 plddt    0
dtype: int64

###1.1.1 Renaming fields

The name of the attribute that represents the global pLDDT has a blank space at the beginning (`' plddt'`); it will be renamed to **pLDDT_global**.

In [None]:
# The pLDDT column name carries a leading blank (' plddt'); give it its final name.
# The result is reassigned instead of using inplace=True, so no existing frame is mutated.
df_plddt = df_plddt.rename(columns={' plddt': 'pLDDT_global'})

In [None]:
df_plddt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17107 entries, 0 to 17106
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            17107 non-null  object 
 1   pLDDT_global  17107 non-null  float64
dtypes: float64(1), object(1)
memory usage: 267.4+ KB


##1.2 Analysis of attribute values

In [None]:
df_plddt.columns

Index(['id', 'pLDDT_global'], dtype='object')

In [None]:

def categories_column(df, columns=('id', 'pLDDT_global')):
    """Print, for each requested column, a dict of its distinct values and their counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    columns : iterable of str, optional
        Column labels to report, in the order given. Defaults to
        ('id', 'pLDDT_global'), the columns this function reported originally.

    Returns
    -------
    None
        The result is printed: the column name followed by the dict built from
        ``value_counts()``, then a blank separator.

    Raises
    ------
    KeyError
        If a requested column is not present in ``df``.
    """
    for col in columns:
        mydic = df[col].value_counts().to_dict()
        print(col, mydic)
        print('\n')

categories_column(df_plddt)

id {'AF-A0A183-F1': 1, 'AF-Q8TEU8-F1': 1, 'AF-Q8TED4-F1': 1, 'AF-Q8TED9-F1': 1, 'AF-Q8TEF2-F1': 1, 'AF-Q8TEH3-F1': 1, 'AF-Q8TEJ3-F1': 1, 'AF-Q8TEK3-F1': 1, 'AF-Q8TEL6-F1': 1, 'AF-Q8TEM1-F1': 1, 'AF-Q8TEQ6-F1': 1, 'AF-Q8TEQ8-F1': 1, 'AF-Q8TER0-F1': 1, 'AF-Q8TER5-F1': 1, 'AF-Q8TET4-F1': 1, 'AF-Q8TEU7-F1': 1, 'AF-Q8TEV9-F1': 1, 'AF-Q8TF39-F1': 1, 'AF-Q8TEW0-F1': 1, 'AF-Q8TEW6-F1': 1, 'AF-Q8TEW8-F1': 1, 'AF-Q8TEX9-F1': 1, 'AF-Q8TEY5-F1': 1, 'AF-Q8TEY7-F1': 1, 'AF-Q8TEZ7-F1': 1, 'AF-Q8TF01-F1': 1, 'AF-Q8TF05-F1': 1, 'AF-Q8TF08-F1': 1, 'AF-Q8TF17-F1': 1, 'AF-Q8TF20-F1': 1, 'AF-Q8TF21-F1': 1, 'AF-Q8TF30-F1': 1, 'AF-Q8TED0-F1': 1, 'AF-Q8TEC5-F1': 1, 'AF-Q8TEB9-F1': 1, 'AF-Q8TEB1-F1': 1, 'AF-Q8TE58-F1': 1, 'AF-Q8TE60-F1': 1, 'AF-Q8TE67-F1': 1, 'AF-Q8TE73-F1': 1, 'AF-Q8TE73-F10': 1, 'AF-Q8TE73-F11': 1, 'AF-Q8TE73-F12': 1, 'AF-Q8TE73-F13': 1, 'AF-Q8TE73-F14': 1, 'AF-Q8TE73-F15': 1, 'AF-Q8TE73-F16': 1, 'AF-Q8TE73-F17': 1, 'AF-Q8TE73-F18': 1, 'AF-Q8TE73-F2': 1, 'AF-Q8TE73-F3': 1, 'AF-Q8TE73-F4': 1,

##1.3 Generation of the attribute *Uniprot_AF_id* extracted from the attribute *id*

In [None]:
def get_uniprot(id):
  """Return the second '-'-separated field of an identifier, e.g. 'A0A183' for 'AF-A0A183-F1'."""
  return id.split("-")[1]

In [None]:
# Derive Uniprot_AF_id from every 'id' value, element by element
df_plddt['Uniprot_AF_id'] = df_plddt['id'].map(get_uniprot)

In [None]:
df_plddt.head(10)

Unnamed: 0,id,pLDDT_global,Uniprot_AF_id
0,AF-A0A183-F1,62.78,A0A183
1,AF-A0AUZ9-F1,50.24,A0AUZ9
2,AF-A0AV02-F1,68.87,A0AV02
3,AF-A0AV96-F1,64.31,A0AV96
4,AF-A0AVF1-F1,91.28,A0AVF1
5,AF-A0AVI4-F1,88.18,A0AVI4
6,AF-A0AVK6-F1,53.69,A0AVK6
7,AF-A0AVT1-F1,91.7,A0AVT1
8,AF-A0FGR8-F1,74.29,A0FGR8
9,AF-A0FGR9-F1,72.13,A0FGR9


##1.4 Generation of the attribute *F_AF* extracted from the attribute *id*

In [None]:
def get_F(id):
  """Return the third '-'-separated field of an identifier, e.g. 'F1' for 'AF-A0A183-F1'."""
  return id.split("-")[2]

In [None]:
# Derive F_AF from every 'id' value, element by element
df_plddt['F_AF'] = df_plddt['id'].map(get_F)

In [None]:
df_plddt

Output hidden; open in https://colab.research.google.com to view.

The *id* attribute is no longer needed, so it will be removed.

In [None]:
# Remove the 'id' column; drop() returns a new frame, so the name is rebound rather than mutated in place
df_plddt = df_plddt.drop(columns='id')

In [None]:
#Identify duplicate records in the data (number of fully repeated rows)
dupes = df_plddt.duplicated()
int(dupes.sum())

0

In [None]:
#Checking for 'missing' values
df_plddt.isna().sum()

pLDDT_global     0
Uniprot_AF_id    0
F_AF             0
dtype: int64

In [None]:
df_plddt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17107 entries, 0 to 17106
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   pLDDT_global   17107 non-null  float64
 1   Uniprot_AF_id  17107 non-null  object 
 2   F_AF           17107 non-null  object 
dtypes: float64(1), object(2)
memory usage: 401.1+ KB


In [None]:
# Rows whose identifier is blank: empty or whitespace-only, not just exactly one space
df_plddt[df_plddt['Uniprot_AF_id'].str.strip() == '']

Unnamed: 0,pLDDT_global,Uniprot_AF_id,F_AF


In [None]:

def categories_column(df, columns=('pLDDT_global', 'Uniprot_AF_id', 'F_AF')):
    """Print, for each requested column, a dict of its distinct values and their counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    columns : iterable of str, optional
        Column labels to report, in the order given. Defaults to
        ('pLDDT_global', 'Uniprot_AF_id', 'F_AF'), the columns this
        definition reported originally.

    Returns
    -------
    None
        The result is printed: the column name followed by the dict built from
        ``value_counts()``, then a blank separator.

    Raises
    ------
    KeyError
        If a requested column is not present in ``df``.
    """
    for col in columns:
        mydic = df[col].value_counts().to_dict()
        print(col, mydic)
        print('\n')

categories_column(df_plddt)

pLDDT_global {83.29: 15, 85.13: 15, 85.25: 14, 81.42: 13, 84.81: 13, 75.3: 13, 75.01: 13, 77.27: 12, 83.78: 12, 83.81: 12, 85.91: 12, 80.24: 12, 81.99: 12, 87.42: 12, 78.54: 12, 79.85: 12, 81.76: 12, 88.12: 12, 89.45: 12, 80.35: 12, 84.11: 11, 84.76: 11, 80.51: 11, 80.48: 11, 79.0: 11, 80.11: 11, 84.33: 11, 82.86: 11, 81.82: 11, 87.6: 11, 77.06: 11, 84.19: 11, 85.02: 11, 85.24: 10, 79.06: 10, 88.96: 10, 86.35: 10, 76.09: 10, 82.39: 10, 83.56: 10, 85.14: 10, 87.78: 10, 78.29: 10, 81.31: 10, 83.49: 10, 83.97: 10, 74.65: 10, 78.51: 10, 79.12: 10, 64.85: 10, 82.08: 10, 78.28: 10, 81.49: 10, 79.42: 10, 86.98: 10, 82.57: 10, 79.93: 10, 84.06: 10, 80.9: 10, 80.39: 10, 88.27: 10, 83.46: 10, 84.65: 10, 83.66: 10, 85.96: 10, 81.9: 10, 77.82: 10, 86.84: 10, 92.08: 10, 92.97: 9, 84.38: 9, 87.38: 9, 81.47: 9, 86.03: 9, 85.85: 9, 90.59: 9, 91.1: 9, 88.07: 9, 86.24: 9, 86.44: 9, 81.26: 9, 80.83: 9, 88.39: 9, 87.1: 9, 86.3: 9, 87.54: 9, 70.55: 9, 76.92: 9, 63.81: 9, 87.39: 9, 73.85: 9, 84.45: 9, 78.82

##1.5 Changing the order of attributes

In [None]:
# Reorder the columns: identifier fields first, score last
base_plddt = df_plddt.loc[:, ['Uniprot_AF_id', 'F_AF', 'pLDDT_global']]

In [None]:
base_plddt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17107 entries, 0 to 17106
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Uniprot_AF_id  17107 non-null  object 
 1   F_AF           17107 non-null  object 
 2   pLDDT_global   17107 non-null  float64
dtypes: float64(1), object(2)
memory usage: 401.1+ KB


In [None]:
#Identify duplicate records in the data (number of fully repeated rows)
dupes = base_plddt.duplicated()
int(dupes.sum())

0

In [None]:
#Checking for 'missing' values
base_plddt.isna().sum()

Uniprot_AF_id    0
F_AF             0
pLDDT_global     0
dtype: int64

In [None]:
# Rows whose identifier is blank: empty or whitespace-only, not just exactly one space
base_plddt[base_plddt['Uniprot_AF_id'].str.strip() == '']

Unnamed: 0,Uniprot_AF_id,F_AF,pLDDT_global


In [None]:

def categories_column(df, columns=('Uniprot_AF_id', 'F_AF', 'pLDDT_global')):
    """Print, for each requested column, a dict of its distinct values and their counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    columns : iterable of str, optional
        Column labels to report, in the order given. Defaults to
        ('Uniprot_AF_id', 'F_AF', 'pLDDT_global'), the columns this
        definition reported originally.

    Returns
    -------
    None
        The result is printed: the column name followed by the dict built from
        ``value_counts()``, then a blank separator.

    Raises
    ------
    KeyError
        If a requested column is not present in ``df``.
    """
    for col in columns:
        mydic = df[col].value_counts().to_dict()
        print(col, mydic)
        print('\n')

categories_column(base_plddt)

Uniprot_AF_id {'Q8NF91': 38, 'Q03001': 32, 'Q9UPN3': 31, 'Q8WXH0': 29, 'Q8WXG9': 26, 'Q09666': 24, 'Q9HC84': 23, 'Q96RW7': 23, 'Q8IVF2': 23, 'Q9NU22': 22, 'O14686': 22, 'O75445': 21, 'Q5T4S7': 20, 'Q2LD37': 20, 'Q4G0P3': 20, 'P21817': 20, 'Q9Y6V0': 20, 'Q15751': 19, 'Q15413': 19, 'O95714': 19, 'Q6V0I7': 19, 'Q8NEZ4': 19, 'Q92736': 19, 'Q14204': 18, 'Q96M86': 18, 'Q15149': 18, 'Q8TE73': 18, 'P98164': 18, 'Q8TDW7': 17, 'Q8IVF4': 17, 'Q9P225': 17, 'Q07954': 17, 'Q14517': 17, 'Q9NYC9': 17, 'Q9NZR2': 17, 'Q685J3': 17, 'Q96DT5': 17, 'Q9NZJ4': 17, 'Q96JB1': 17, 'Q12955': 16, 'Q86WI1': 16, 'Q9NYQ8': 16, 'P98160': 16, 'Q8NCM8': 16, 'P98161': 16, 'P08F94': 15, 'Q8N3K9': 15, 'Q9C0G6': 15, 'Q8TD57': 15, 'Q8WXX0': 15, 'P78527': 15, 'P20930': 15, 'Q7Z7G8': 15, 'Q99698': 14, 'Q03164': 14, 'Q9Y4A5': 14, 'Q99996': 14, 'Q9UPA5': 14, 'Q01484': 14, 'O15230': 13, 'Q96Q15': 13, 'Q7Z407': 13, 'Q15911': 13, 'Q709C8': 13, 'Q96T58': 13, 'Q9NRC6': 13, 'O60494': 13, 'Q5T011': 12, 'P46939': 12, 'P78509': 12, 'Q4LD

In [None]:
base_plddt.query("Uniprot_AF_id == 'O15417'")

Unnamed: 0,Uniprot_AF_id,F_AF,pLDDT_global
858,O15417,F1,43.36
859,O15417,F2,46.58
860,O15417,F3,46.58
861,O15417,F4,48.26
862,O15417,F5,49.17
863,O15417,F6,50.91
864,O15417,F7,50.88
865,O15417,F8,49.87
866,O15417,F9,51.82


In [None]:
base_plddt.query("Uniprot_AF_id == 'Q8WXG9'")

Unnamed: 0,Uniprot_AF_id,F_AF,pLDDT_global
11643,Q8WXG9,F1,81.51
11644,Q8WXG9,F10,84.06
11645,Q8WXG9,F11,84.19
11646,Q8WXG9,F12,85.08
11647,Q8WXG9,F13,82.77
11648,Q8WXG9,F14,84.34
11649,Q8WXG9,F15,83.21
11650,Q8WXG9,F16,83.84
11651,Q8WXG9,F17,84.38
11652,Q8WXG9,F18,82.78


##1.6 Sorting the database by the attributes *Uniprot_AF_id* and *pLDDT_global*

As can be observed, a single Uniprot-AF can be linked to more than one resolution (pLDDT) value. The records differ in the F of AlphaFold (*F_AF*), since one Uniprot can be linked to more than one F. The analysis of this duplication will be performed after the merge with the database that already contains the RING data.

To aid in this analysis, the records will be sorted by *Uniprot_AF_id* and, within each Uniprot-AF, by *pLDDT_global* in descending order, so that the entry with the highest pLDDT (the best quality) comes first.



In [None]:
#Sort by Uniprot_AF_id and then pLDDT_global, both in descending order; row labels are renumbered from 0
base_plddt_sort = base_plddt.sort_values(
    by=['Uniprot_AF_id', 'pLDDT_global'],
    ascending=[False, False],
    ignore_index=True,
)

In [None]:
base_plddt_sort

Output hidden; open in https://colab.research.google.com to view.

In [None]:
base_plddt_sort.query('Uniprot_AF_id =="Q9UPN3"')

Unnamed: 0,Uniprot_AF_id,F_AF,pLDDT_global
664,Q9UPN3,F27,85.79
665,Q9UPN3,F29,85.6
666,Q9UPN3,F2,85.46
667,Q9UPN3,F1,85.22
668,Q9UPN3,F28,85.02
669,Q9UPN3,F30,85.0
670,Q9UPN3,F26,84.58
671,Q9UPN3,F25,84.49
672,Q9UPN3,F24,84.26
673,Q9UPN3,F4,84.09


In [None]:
base_plddt_sort.query('Uniprot_AF_id =="A4UGR9"')

Unnamed: 0,Uniprot_AF_id,F_AF,pLDDT_global
16986,A4UGR9,F2,52.36
16987,A4UGR9,F3,50.86
16988,A4UGR9,F6,49.42
16989,A4UGR9,F5,48.81
16990,A4UGR9,F1,48.67
16991,A4UGR9,F4,48.59
16992,A4UGR9,F7,46.69
16993,A4UGR9,F9,45.34
16994,A4UGR9,F11,44.97
16995,A4UGR9,F8,44.74


In [None]:
base_plddt_sort.query('Uniprot_AF_id =="Q9Y6V0"')

Unnamed: 0,Uniprot_AF_id,F_AF,pLDDT_global
16,Q9Y6V0,F20,51.48
17,Q9Y6V0,F17,49.26
18,Q9Y6V0,F19,49.13
19,Q9Y6V0,F18,48.23
20,Q9Y6V0,F5,47.6
21,Q9Y6V0,F6,47.13
22,Q9Y6V0,F4,46.65
23,Q9Y6V0,F13,46.38
24,Q9Y6V0,F16,46.37
25,Q9Y6V0,F11,45.76


In [None]:
base_plddt_sort.query("Uniprot_AF_id == 'O15417'")

Unnamed: 0,Uniprot_AF_id,F_AF,pLDDT_global
16240,O15417,F9,51.82
16241,O15417,F6,50.91
16242,O15417,F7,50.88
16243,O15417,F8,49.87
16244,O15417,F5,49.17
16245,O15417,F4,48.26
16246,O15417,F2,46.58
16247,O15417,F3,46.58
16248,O15417,F1,43.36


In [None]:
base_plddt_sort.query("Uniprot_AF_id == 'Q8WXG9'")

Unnamed: 0,Uniprot_AF_id,F_AF,pLDDT_global
5438,Q8WXG9,F3,86.27
5439,Q8WXG9,F12,85.08
5440,Q8WXG9,F5,84.76
5441,Q8WXG9,F6,84.59
5442,Q8WXG9,F19,84.45
5443,Q8WXG9,F17,84.38
5444,Q8WXG9,F14,84.34
5445,Q8WXG9,F11,84.19
5446,Q8WXG9,F2,84.11
5447,Q8WXG9,F10,84.06


In [None]:
base_plddt_sort.query("Uniprot_AF_id == 'Q7Z7M0'")

Unnamed: 0,Uniprot_AF_id,F_AF,pLDDT_global
7828,Q7Z7M0,F1,83.29
7829,Q7Z7M0,F2,80.48
7830,Q7Z7M0,F6,79.84
7831,Q7Z7M0,F7,78.39
7832,Q7Z7M0,F5,77.57
7833,Q7Z7M0,F4,77.12
7834,Q7Z7M0,F8,76.67
7835,Q7Z7M0,F9,76.26
7836,Q7Z7M0,F3,74.7


In [None]:
base_plddt_sort.query("Uniprot_AF_id == 'P98160'")

Unnamed: 0,Uniprot_AF_id,F_AF,pLDDT_global
11850,P98160,F3,84.88
11851,P98160,F4,82.85
11852,P98160,F5,82.74
11853,P98160,F6,81.63
11854,P98160,F2,81.13
11855,P98160,F16,80.59
11856,P98160,F7,79.94
11857,P98160,F8,79.36
11858,P98160,F15,78.79
11859,P98160,F13,78.17


In [None]:
base_plddt_sort.query("Uniprot_AF_id == 'Q5T011'")

Unnamed: 0,Uniprot_AF_id,F_AF,pLDDT_global
9500,Q5T011,F1,73.51
9501,Q5T011,F12,73.39
9502,Q5T011,F11,71.03
9503,Q5T011,F10,69.14
9504,Q5T011,F3,67.97
9505,Q5T011,F2,67.33
9506,Q5T011,F9,65.89
9507,Q5T011,F8,65.07
9508,Q5T011,F4,62.67
9509,Q5T011,F7,62.21


In [None]:

def categories_column(df, columns=('Uniprot_AF_id', 'F_AF', 'pLDDT_global')):
    """Print, for each requested column, a dict of its distinct values and their counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    columns : iterable of str, optional
        Column labels to report, in the order given. Defaults to
        ('Uniprot_AF_id', 'F_AF', 'pLDDT_global'), the columns this
        definition reported originally.

    Returns
    -------
    None
        The result is printed: the column name followed by the dict built from
        ``value_counts()``, then a blank separator.

    Raises
    ------
    KeyError
        If a requested column is not present in ``df``.
    """
    for col in columns:
        mydic = df[col].value_counts().to_dict()
        print(col, mydic)
        print('\n')

categories_column(base_plddt_sort)

Uniprot_AF_id {'Q8NF91': 38, 'Q03001': 32, 'Q9UPN3': 31, 'Q8WXH0': 29, 'Q8WXG9': 26, 'Q09666': 24, 'Q8IVF2': 23, 'Q9HC84': 23, 'Q96RW7': 23, 'O14686': 22, 'Q9NU22': 22, 'O75445': 21, 'Q9Y6V0': 20, 'Q2LD37': 20, 'Q5T4S7': 20, 'P21817': 20, 'Q4G0P3': 20, 'O95714': 19, 'Q15751': 19, 'Q8NEZ4': 19, 'Q6V0I7': 19, 'Q15413': 19, 'Q92736': 19, 'Q96M86': 18, 'P98164': 18, 'Q14204': 18, 'Q15149': 18, 'Q8TE73': 18, 'Q9NZR2': 17, 'Q96JB1': 17, 'Q14517': 17, 'Q9P225': 17, 'Q9NZJ4': 17, 'Q07954': 17, 'Q8IVF4': 17, 'Q9NYC9': 17, 'Q8TDW7': 17, 'Q96DT5': 17, 'Q685J3': 17, 'Q86WI1': 16, 'P98160': 16, 'Q8NCM8': 16, 'Q12955': 16, 'Q9NYQ8': 16, 'P98161': 16, 'P08F94': 15, 'P78527': 15, 'Q8TD57': 15, 'Q8N3K9': 15, 'Q7Z7G8': 15, 'Q8WXX0': 15, 'Q9C0G6': 15, 'P20930': 15, 'Q99996': 14, 'Q01484': 14, 'Q03164': 14, 'Q9UPA5': 14, 'Q99698': 14, 'Q9Y4A5': 14, 'Q96T58': 13, 'Q15911': 13, 'O15230': 13, 'Q709C8': 13, 'Q9NRC6': 13, 'Q7Z407': 13, 'O60494': 13, 'Q96Q15': 13, 'Q8IZQ1': 12, 'Q96PZ7': 12, 'Q4LDE5': 12, 'Q70C

#2 - Generating a file with the fields of the *plddt-global* database processed and sorted by the largest plddt.

In [None]:
# Write the processed, sorted table to CSV without the row index
base_plddt_sort.to_csv(
    "drive/My Drive/ProcessaNovaBase/TrataArq_pLDDT/plddt-global_proc.csv",
    sep=',',
    index=False,
)