In [1]:
from Bio.PDB import PDBParser
from Bio.PDB.DSSP import DSSP

from graphein.protein.config import ProteinGraphConfig
from graphein.protein.graphs import construct_graph
from graphein.protein.visualisation import plotly_protein_structure_graph

from utils import *

In [5]:
path_og = '../datasets/test/HGD_normal.pdb'
path_m = '../datasets/test/A122V_4.pdb'


In [6]:
parser = PDBParser()

structure_og = parser.get_structure("HGD", path_og)
structure_m = parser.get_structure("A122V", path_m)

model_og = structure_og[0]
model_m = structure_m[0]

dssp_og = DSSP(model_og, path_og, dssp='mkdssp')
dssp_m = DSSP(model_m, path_m, dssp='mkdssp')


In [26]:
# print all the dssp attributes names
print(dssp_og.property_list)

[(1, 'M', '-', 0.9893617021276596, 360.0, 136.7, 0, 0.0, 2, -0.1, 0, 0.0, 3, -0.1), (2, 'A', 'P', 0.8584905660377359, -60.5, 142.7, 1, -0.1, 2, -0.2, 0, 0.0, 0, 0.0), (3, 'E', 'P', 0.8092783505154639, -62.5, 127.2, -2, -0.1, -1, -0.1, 1, -0.1, 2, -0.1), (4, 'L', 'P', 0.3231707317073171, -67.5, 147.4, -2, -0.2, 2, -0.3, -3, -0.1, -1, -0.1), (5, 'K', 'P', 0.624390243902439, -103.9, 153.0, -2, -0.1, 226, -1.1, 226, -0.1, 2, -0.3), (6, 'Y', 'E', 0.2702702702702703, -138.8, 150.4, -2, -0.3, 2, -0.2, 224, -0.2, 166, -0.0), (7, 'I', 'E', 0.23668639053254437, -82.0, 156.7, 222, -2.6, 222, -2.6, -2, -0.3, 2, -0.4), (8, 'S', 'E', 0.4230769230769231, -127.0, 143.4, 220, -0.2, 220, -0.2, -2, -0.2, 217, -0.1), (9, 'G', '-', 0.07142857142857142, 129.9, 108.4, 218, -2.6, 3, -1.7, 215, -0.5, 216, -0.2), (10, 'F', 'T', 0.2233502538071066, -58.4, 126.4, 214, -1.9, -2, -0.0, 1, -0.3, 215, -0.0), (11, 'G', 'T', 0.4642857142857143, 71.3, 10.5, 1, -0.2, -1, -0.3, -2, -0.1, 260, -0.0), (12, 'N', '-', 0.50955

In [8]:
# print the secondary structure
print('OG')
for key in dssp_og.keys():
    print(dssp_og[key])

print('Mutant')
for key in dssp_m.keys():
    print(dssp_m[key])

OG
(1, 'M', '-', 0.9893617021276596, 360.0, 136.7, 0, 0.0, 2, -0.1, 0, 0.0, 3, -0.1)
(2, 'A', 'P', 0.8584905660377359, -60.5, 142.7, 1, -0.1, 2, -0.2, 0, 0.0, 0, 0.0)
(3, 'E', 'P', 0.8092783505154639, -62.5, 127.2, -2, -0.1, -1, -0.1, 1, -0.1, 2, -0.1)
(4, 'L', 'P', 0.3231707317073171, -67.5, 147.4, -2, -0.2, 2, -0.3, -3, -0.1, -1, -0.1)
(5, 'K', 'P', 0.624390243902439, -103.9, 153.0, -2, -0.1, 226, -1.1, 226, -0.1, 2, -0.3)
(6, 'Y', 'E', 0.2702702702702703, -138.8, 150.4, -2, -0.3, 2, -0.2, 224, -0.2, 166, -0.0)
(7, 'I', 'E', 0.23668639053254437, -82.0, 156.7, 222, -2.6, 222, -2.6, -2, -0.3, 2, -0.4)
(8, 'S', 'E', 0.4230769230769231, -127.0, 143.4, 220, -0.2, 220, -0.2, -2, -0.2, 217, -0.1)
(9, 'G', '-', 0.07142857142857142, 129.9, 108.4, 218, -2.6, 3, -1.7, 215, -0.5, 216, -0.2)
(10, 'F', 'T', 0.2233502538071066, -58.4, 126.4, 214, -1.9, -2, -0.0, 1, -0.3, 215, -0.0)
(11, 'G', 'T', 0.4642857142857143, 71.3, 10.5, 1, -0.2, -1, -0.3, -2, -0.1, 260, -0.0)
(12, 'N', '-', 0.50955414012738

In [9]:
secondary_structure = [(x[1], x[2]) for x in dssp_og.property_list]

secondary_structure2 = [(x[1],x[2]) for x in dssp_m.property_list]
# print differences between the two
"""for i, (aa1, aa2) in enumerate(zip(secondary_structure, secondary_structure2)):
    if aa1 != aa2:
        print(i, aa1, aa2)"""

for i in range(len(secondary_structure)):
    print(secondary_structure[i], secondary_structure2[i])



('M', '-') ('M', '-')
('A', 'P') ('A', '-')
('E', 'P') ('E', '-')
('L', 'P') ('L', '-')
('K', 'P') ('K', '-')
('Y', 'E') ('Y', 'E')
('I', 'E') ('I', 'E')
('S', 'E') ('S', 'E')
('G', '-') ('G', '-')
('F', 'T') ('F', 'S')
('G', 'T') ('G', 'S')
('N', '-') ('N', '-')
('E', '-') ('E', '-')
('C', 'E') ('C', 'E')
('S', 'E') ('S', 'E')
('S', 'E') ('S', 'E')
('E', 'E') ('E', 'E')
('D', '-') ('D', '-')
('P', 'T') ('P', 'T')
('R', 'T') ('R', 'T')
('C', 'S') ('C', 'S')
('P', 'T') ('P', 'T')
('G', 'T') ('G', 'T')
('S', '-') ('S', '-')
('L', 'S') ('L', 'S')
('P', 'P') ('P', 'P')
('E', 'S') ('E', 'S')
('G', 'S') ('G', 'S')
('Q', '-') ('Q', '-')
('N', 'S') ('N', 'S')
('N', '-') ('N', 'S')
('P', '-') ('P', '-')
('Q', 'S') ('Q', 'S')
('V', 'S') ('V', 'S')
('C', '-') ('C', '-')
('P', 'G') ('P', 'G')
('Y', 'G') ('Y', 'G')
('N', 'G') ('N', 'G')
('L', 'P') ('L', 'P')
('Y', 'E') ('Y', 'E')
('A', 'E') ('A', 'E')
('E', 'E') ('E', 'E')
('Q', 'E') ('Q', 'E')
('L', 'E') ('L', 'E')
('S', 'E') ('S', 'E')
('G', 'S')

In [6]:
print(secondary_structure[197], secondary_structure2[197])

('G', 'E') ('D', 'E')


In [10]:
# show the alpha sheet and beta sheet
for key, value in dssp_og.property_dict.items():
    if value[2] == 'H' or value[2] == 'G':
        print(key, value)

('A', (' ', 36, ' ')) (36, 'P', 'G', 0.27941176470588236, -46.6, 140.8, 0, 0.0, 3, -1.7, 0, 0.0, -1, -0.1)
('A', (' ', 37, ' ')) (37, 'Y', 'G', 0.536036036036036, 57.1, 17.2, 1, -0.3, -2, -0.1, -3, -0.1, -13, -0.1)
('A', (' ', 38, ' ')) (38, 'N', 'G', 0.821656050955414, 59.1, 29.7, -3, -2.1, -1, -0.3, 1, -0.2, 2, -0.0)
('A', (' ', 85, ' ')) (85, 'W', 'G', 0.11013215859030837, -70.2, -17.5, -2, -1.4, 3, -0.7, 1, -0.3, -1, -0.2)
('A', (' ', 86, ' ')) (86, 'D', 'G', 0.9447852760736196, -82.3, 2.4, 1, -0.2, -1, -0.3, -3, -0.1, -2, -0.1)
('A', (' ', 87, ' ')) (87, 'E', 'G', 0.23711340206185566, -80.6, -14.9, -3, -1.2, 2, -0.4, 1, -0.1, -1, -0.2)
('A', (' ', 113, ' ')) (113, 'V', 'G', 0.07042253521126761, -65.0, -44.0, 1, -0.3, 3, -1.6, 2, -0.2, -1, -0.2)
('A', (' ', 114, ' ')) (114, 'S', 'G', 0.5846153846153846, -70.6, -15.7, 1, -0.3, -1, -0.3, -35, -0.1, -2, -0.2)
('A', (' ', 115, ' ')) (115, 'G', 'G', 0.0, -99.3, 11.4, -4, -1.2, 22, -2.1, -3, -0.8, 2, -0.5)
('A', (' ', 125, ' ')) (125, 'I

In [11]:
# comprare the two structures
for key in dssp_og.keys():
    if key in dssp_m.keys():
        if dssp_og[key][1] != dssp_m[key][1]:
            print(key, dssp_og[key][1], dssp_m[key][1])
    else:
        print(key, dssp_og[key][1], 'None')


# print(dssp_og)



('A', (' ', 122, ' ')) A V


-----
test

In [46]:
# print the features of the pdb
from Bio.PDB import AbstractPropertyMap

# print the property map of the pdbs
AbstractPropertyMap.AbstractPropertyMap(dssp_og)





TypeError: AbstractPropertyMap.__init__() missing 2 required positional arguments: 'property_keys' and 'property_list'

In [12]:

# Load the default config
c = ProteinGraphConfig(granularity='CA')

# Construct the graph!
g = construct_graph(path=path_og)

Output()

In [13]:
plotly_protein_structure_graph(g, node_size_multiplier=0.5, colour_nodes_by="residue_name")

In [14]:
# Load the default config
c = ProteinGraphConfig(granularity='CA')

# Construct the graph!
g1 = construct_graph(path=path_m)

Output()

In [15]:
plotly_protein_structure_graph(g1, node_size_multiplier=0.5, colour_nodes_by="residue_name")

In [16]:
print(type(g1))

<class 'networkx.classes.graph.Graph'>


In [17]:
# difference between the two graphs
for node in g.nodes:
    if node not in g1.nodes:
        print(node)

# distance between the two graphs
from networkx.algorithms.similarity import graph_edit_distance
distance = graph_edit_distance(g, g1)
print(distance)


A:ALA:122
0.0


In [29]:
g1

<networkx.classes.graph.Graph at 0x30ada53c0>

In [85]:
path_fs = '../datasets/test/A218fs.pdb'

path_m_alpha = '../datasets/ALPHAFOLD PDBs/A122V_41d52/A122V_41d52_unrelaxed_rank_001_alphafold2_ptm_model_5_seed_000.pdb'
path_fs_alpha = '../datasets/ALPHAFOLD PDBs/A218fs_8b909/A218fs_8b909_unrelaxed_rank_001_alphafold2_ptm_model_4_seed_000.pdb'
from Bio.PDB import PDBParser, Superimposer, is_aa

# Function to get backbone atoms from residues that are present in both structures
def get_common_backbone_atoms(structure1, structure2):
    backbone_atoms1 = []
    backbone_atoms2 = []
    for model1, model2 in zip(structure1, structure2):
        for chain1, chain2 in zip(model1, model2):
            res_dict2 = {res.get_id(): res for res in chain2 if is_aa(res)}
            for res1 in chain1:
                if is_aa(res1) and res1.get_id() in res_dict2:
                    res2 = res_dict2[res1.get_id()]
                    for atom1 in res1:
                        if atom1.get_id() in ('N', 'CA', 'C', 'O'):
                            atom2 = res2[atom1.get_id()]
                            backbone_atoms1.append(atom1)
                            backbone_atoms2.append(atom2)
    return backbone_atoms1, backbone_atoms2

# Load the original and mutant PDB files
pdb_parser = PDBParser(QUIET=True)
structure_original = pdb_parser.get_structure("original", path_fs_alpha)
structure_mutant = pdb_parser.get_structure("mutant", path_fs)

# Extract common backbone atoms
backbone_atoms_original, backbone_atoms_mutant = get_common_backbone_atoms(structure_original, structure_mutant)

# Ensure there are common backbone atoms to align
assert len(backbone_atoms_original) > 0 and len(backbone_atoms_mutant) > 0, "No common backbone atoms found."

# Superimpose the structures
super_imposer = Superimposer()
super_imposer.set_atoms(backbone_atoms_original, backbone_atoms_mutant)
super_imposer.apply(structure_mutant.get_atoms())

# Calculate RMSD
print(f"RMSD: {super_imposer.rms:.4f} Å")

# Optionally, save the aligned mutant structure to a new PDB file
from Bio.PDB import PDBIO
io = PDBIO()
io.set_structure(structure_mutant)
io.save("aligned_mutant.pdb")




RMSD: 0.0005 Å


RMSD: 2.0825 Å


# let's check the graph edges

In [35]:
for edge in g.edges:
    if edge in g1.edges:
        continue
        if g.edges[edge]['distance'] != g1.edges[edge]['distance']:
            print(edge, g.edges[edge]['distance'], g1.edges[edge]['distance'])
    else:
        
        print(edge, g.edges[edge]['distance'], 'None')



('A:GLY:121', 'A:ALA:122') 3.7943766286440255 None
('A:ALA:122', 'A:GLY:123') 3.8291502973897478 None


# they are slighlty different.. mutation fault or graph construction??

---

In [16]:
#read the csv file aku_prin_v2.0_cleaned.csv

import pandas as pd

df = pd.read_excel('../datasets/aku_prin_v2.0.xlsx', sheet_name=0)

In [17]:
df2 = df.iloc[:, list(range(12, 18)) + list(range(41, 52))] 
df2.head()

Unnamed: 0,Protein change allele 1,DNA change allele 1,Exon/intron allele 1,Protein change allele 2,DNA change allele 2,ex/in allele 2,physical_health_score,mental_health_score,AKUSSI_jointpain,AKUSSI_spinalpain,KOOSpain,KOOSsymptoms,KOOSdaily_living,KOOSsport,KOOS_QOL,HAQ_hapVAS,HAQ_haqDI
0,G161R,c.481G>A,ex8,His371Profs,c.1111dupC,ex13,26.0,43.0,57.0,75.0,67.0,89.0,75.0,70.0,69.0,63.0,1.5
1,G161R,c.481G>A,ex8,G161R,c.481G>A,ex8,53.0,51.0,29.0,25.0,92.0,96.0,91.0,85.0,75.0,23.0,0.25
2,G161R,c.481G>A,ex8,M368V,c.1102A>G,ex13,23.0,40.0,36.0,100.0,47.0,39.0,31.0,0.0,19.0,53.0,1.75
3,Y6_G29del ivs1-1G>A,c.16-1G>A,in1,Y6_G29del ivs1-1G>A,c.16-1G>A,in1,,,,,,,,,,,
4,G161R,c.481G>A,ex8,G161R,c.481G>A,ex8,23.0,29.0,50.0,100.0,33.0,29.0,60.0,25.0,13.0,70.0,2.0


---

In [36]:
from graphein.protein.config import ProteinGraphConfig
from graphein.protein.edges.distance import *
from graphein.protein.graphs import construct_graph

edge_fns = [
    add_aromatic_interactions,
    add_hydrophobic_interactions,
    add_aromatic_sulphur_interactions,
    add_cation_pi_interactions,
    add_disulfide_interactions,
    add_hydrogen_bond_interactions,
    add_ionic_interactions,
    add_peptide_bonds
    ]
config = ProteinGraphConfig(edge_construction_functions=edge_fns)

g = construct_graph(config=config, path= '../datasets/test/HGD_normal.pdb')

Output()

In [37]:
p = plotly_protein_structure_graph(
    g,
    colour_edges_by="kind",
    colour_nodes_by="degree",
    label_node_ids=False,
    plot_title="Peptide backbone graph. Nodes coloured by degree.",
    node_size_multiplier=1
    )
p.show()

In [38]:
type(g)

networkx.classes.graph.Graph

# funzione per 

In [21]:
active_site = [292, 335, 365, 371, 341]

In [22]:
active_nodes = []
active_edges = {}
for i in active_site:
    active_edges[i] = []


for node in g.nodes:
    #get the int of the node at the end of the string
    nnode = int(node.split(':')[2])
    if nnode in active_site:
        active_nodes.append(node)
        # get the edges of the node
        for edge in g.edges(node, data=True):
            active_edges[nnode].append(edge)

In [23]:
active_nodes = get_edges_from_nodes(g, active_site)[0]

In [24]:
active_nodes

['A:HIS:292', 'A:HIS:335', 'A:GLU:341', 'A:HIS:365', 'A:HIS:371']

In [25]:
SG = get_subgraph(g, active_nodes)

In [26]:
#g.subgraph(active_nodes)
SG = g.__class__()
SG.add_nodes_from((n, g.nodes[n]) for n in active_nodes)
if SG.is_multigraph():
    SG.add_edges_from(
        (n, nbr, key, d)
        for n, nbrs in g.adj.items()
        if n in active_nodes
        for nbr, keydict in nbrs.items()
        if nbr in active_nodes
        for key, d in keydict.items()
    )
else:
    SG.add_edges_from(
        (n, nbr, d)
        for n, nbrs in g.adj.items()
        if n in active_nodes
        for nbr, d in nbrs.items()
        if nbr in active_nodes
    )
SG.graph.update(g.graph)

In [27]:
p = plotly_protein_structure_graph(
    SG,
    colour_edges_by="kind",
    colour_nodes_by="degree",
    label_node_ids=False,
    plot_title="Peptide backbone graph. Nodes coloured by degree.",
    node_size_multiplier=1
    )
p.show()

In [28]:
edge[1].split(':')[2]

'372'

In [29]:
active_edges

{292: [('A:HIS:292',
   'A:TYR:333',
   {'kind': {'aromatic'}, 'distance': 7.4876871595974155}),
  ('A:HIS:292',
   'A:GLU:351',
   {'kind': {'ionic'}, 'distance': 13.701098532599495}),
  ('A:HIS:292',
   'A:ASP:291',
   {'kind': {'peptide_bond'}, 'distance': 3.833183272425151}),
  ('A:HIS:292',
   'A:ALA:293',
   {'kind': {'peptide_bond'}, 'distance': 3.839483428796118})],
 335: [('A:HIS:335',
   'A:HIS:371',
   {'kind': {'aromatic'}, 'distance': 5.410436211619173}),
  ('A:HIS:335',
   'A:TYR:333',
   {'kind': {'aromatic'}, 'distance': 5.903799623970991}),
  ('A:HIS:335',
   'A:GLU:341',
   {'kind': {'ionic'}, 'distance': 11.37200197854362}),
  ('A:HIS:335',
   'A:TYR:334',
   {'kind': {'peptide_bond'}, 'distance': 3.7909056965321635}),
  ('A:HIS:335',
   'A:ARG:336',
   {'kind': {'peptide_bond'}, 'distance': 3.84312906366674})],
 365: [('A:HIS:365',
   'A:HIS:371',
   {'kind': {'aromatic'}, 'distance': 7.411853816691206}),
  ('A:HIS:365',
   'A:TRP:427',
   {'kind': {'aromatic'}, 'di

In [30]:
active_site = [292, 335, 365, 371, 341]

active_edges_num = {}
for i in active_site:
    active_edges_num[i] = []

for key, value in active_edges.items():
    for edge in value:
        active_edges_num[key].append(edge[1].split(':')[2])
    #active_edges_num[key] = value[2][1].split(':')[2]

active_edges_num

{292: ['333', '351', '291', '293'],
 335: ['371', '333', '341', '334', '336'],
 365: ['371', '427', '423', '341', '364', '366'],
 371: ['335', '365', '341', '370', '372'],
 341: ['365', '335', '371', '340', '342']}

In [31]:
def get_neighbor_nodes(nodes, n):
    active_site_neighbors = []
    for node in nodes:
        for i in range(n*2+1):
            active_site_neighbors.append(node - n + i)
    
    active_site_neighbors = list(set(active_site_neighbors))
    return active_site_neighbors


In [32]:
get_neighbor_nodes(active_site, 1)

[291, 292, 293, 364, 365, 334, 335, 336, 366, 370, 371, 372, 340, 341, 342]

---

In [33]:
import warnings; warnings.simplefilter('ignore')
df_g = create_graph_df(config=config)

In [34]:
df_g.head()

Unnamed: 0,Protein change allele 1,Protein change allele 2,graph_allele1,graph_allele2,structure_allele1,structure_allele2,pdb_file_allele1,pdb_file_allele2
0,G161R,His371Profs,"(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(((<Residue MET het= resseq=1 icode= >, <Resi...","(((<Residue MET het= resseq=1 icode= >, <Resi...",../datasets/ALPHAFOLD PDBs/G161R_c8ba5/G161R_c...,../datasets/ALPHAFOLD PDBs/H371fs_9b3dd/H371fs...
1,G161R,G161R,"(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(((<Residue MET het= resseq=1 icode= >, <Resi...","(((<Residue MET het= resseq=1 icode= >, <Resi...",../datasets/ALPHAFOLD PDBs/G161R_c8ba5/G161R_c...,../datasets/ALPHAFOLD PDBs/G161R_c8ba5/G161R_c...
2,G161R,M368V,"(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(((<Residue MET het= resseq=1 icode= >, <Resi...","(((<Residue MET het= resseq=1 icode= >, <Resi...",../datasets/ALPHAFOLD PDBs/G161R_c8ba5/G161R_c...,../datasets/ALPHAFOLD PDBs/M368V_be390/M368V_b...
3,Y6_G29del ivs1-1G>A,Y6_G29del ivs1-1G>A,,,,,,
4,G161R,G161R,"(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(((<Residue MET het= resseq=1 icode= >, <Resi...","(((<Residue MET het= resseq=1 icode= >, <Resi...",../datasets/ALPHAFOLD PDBs/G161R_c8ba5/G161R_c...,../datasets/ALPHAFOLD PDBs/G161R_c8ba5/G161R_c...


In [35]:
len(df_g)

219

In [36]:
df_all=pd.read_excel('../datasets/aku_prin_v2.0.xlsx')

In [37]:
df_scores=df_all[['physical_health_score', 'mental_health_score', 'AKUSSI_jointpain',
        'AKUSSI_spinalpain', 'KOOSpain', 'KOOSsymptoms', 'KOOSdaily_living',
        'KOOSsport', 'KOOS_QOL', 'HAQ_hapVAS', 'HAQ_haqDI']]

In [38]:
df_scores.head()

Unnamed: 0,physical_health_score,mental_health_score,AKUSSI_jointpain,AKUSSI_spinalpain,KOOSpain,KOOSsymptoms,KOOSdaily_living,KOOSsport,KOOS_QOL,HAQ_hapVAS,HAQ_haqDI
0,26.0,43.0,57.0,75.0,67.0,89.0,75.0,70.0,69.0,63.0,1.5
1,53.0,51.0,29.0,25.0,92.0,96.0,91.0,85.0,75.0,23.0,0.25
2,23.0,40.0,36.0,100.0,47.0,39.0,31.0,0.0,19.0,53.0,1.75
3,,,,,,,,,,,
4,23.0,29.0,50.0,100.0,33.0,29.0,60.0,25.0,13.0,70.0,2.0


In [39]:
# append df_g and df_scores
df_g = pd.concat([df_g, df_scores], axis=1)

In [40]:
df_g.head()

Unnamed: 0,Protein change allele 1,Protein change allele 2,graph_allele1,graph_allele2,structure_allele1,structure_allele2,pdb_file_allele1,pdb_file_allele2,physical_health_score,mental_health_score,AKUSSI_jointpain,AKUSSI_spinalpain,KOOSpain,KOOSsymptoms,KOOSdaily_living,KOOSsport,KOOS_QOL,HAQ_hapVAS,HAQ_haqDI
0,G161R,His371Profs,"(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(((<Residue MET het= resseq=1 icode= >, <Resi...","(((<Residue MET het= resseq=1 icode= >, <Resi...",../datasets/ALPHAFOLD PDBs/G161R_c8ba5/G161R_c...,../datasets/ALPHAFOLD PDBs/H371fs_9b3dd/H371fs...,26.0,43.0,57.0,75.0,67.0,89.0,75.0,70.0,69.0,63.0,1.5
1,G161R,G161R,"(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(((<Residue MET het= resseq=1 icode= >, <Resi...","(((<Residue MET het= resseq=1 icode= >, <Resi...",../datasets/ALPHAFOLD PDBs/G161R_c8ba5/G161R_c...,../datasets/ALPHAFOLD PDBs/G161R_c8ba5/G161R_c...,53.0,51.0,29.0,25.0,92.0,96.0,91.0,85.0,75.0,23.0,0.25
2,G161R,M368V,"(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(((<Residue MET het= resseq=1 icode= >, <Resi...","(((<Residue MET het= resseq=1 icode= >, <Resi...",../datasets/ALPHAFOLD PDBs/G161R_c8ba5/G161R_c...,../datasets/ALPHAFOLD PDBs/M368V_be390/M368V_b...,23.0,40.0,36.0,100.0,47.0,39.0,31.0,0.0,19.0,53.0,1.75
3,Y6_G29del ivs1-1G>A,Y6_G29del ivs1-1G>A,,,,,,,,,,,,,,,,,
4,G161R,G161R,"(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(((<Residue MET het= resseq=1 icode= >, <Resi...","(((<Residue MET het= resseq=1 icode= >, <Resi...",../datasets/ALPHAFOLD PDBs/G161R_c8ba5/G161R_c...,../datasets/ALPHAFOLD PDBs/G161R_c8ba5/G161R_c...,23.0,29.0,50.0,100.0,33.0,29.0,60.0,25.0,13.0,70.0,2.0


In [41]:
# drop the rows with nan values in the columns 'Protein change allele 1' and 'Protein change allele 2'
df_g = df_g.dropna(subset=['Protein change allele 1 ', 'Protein change allele 2'])
len(df_g)


199

In [42]:
# print all the rows that have None in the 'graph_allele1' or 'graph_allele2' column
print(len(df_g[df_g['graph_allele1'].isnull() | df_g['graph_allele2'].isnull()]))

df_g[df_g['graph_allele1'].isnull() | df_g['graph_allele2'].isnull()]


40


Unnamed: 0,Protein change allele 1,Protein change allele 2,graph_allele1,graph_allele2,structure_allele1,structure_allele2,pdb_file_allele1,pdb_file_allele2,physical_health_score,mental_health_score,AKUSSI_jointpain,AKUSSI_spinalpain,KOOSpain,KOOSsymptoms,KOOSdaily_living,KOOSsport,KOOS_QOL,HAQ_hapVAS,HAQ_haqDI
3,Y6_G29del ivs1-1G>A,Y6_G29del ivs1-1G>A,,,,,,,,,,,,,,,,,
8,G161R,ivs1-1G>A (Y6_G29del),"(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...",,"(((<Residue MET het= resseq=1 icode= >, <Resi...",,../datasets/ALPHAFOLD PDBs/G161R_c8ba5/G161R_c...,,32.0,30.0,50.0,100.0,44.0,68.0,54.0,45.0,31.0,53.0,0.63
9,G251D,del ex13 (MLPA),"(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...",,"(((<Residue MET het= resseq=1 icode= >, <Resi...",,../datasets/ALPHAFOLD PDBs/G251D_5ea06/G251D_5...,,31.0,39.0,36.0,75.0,44.0,46.0,52.0,0.0,44.0,80.0,1.25
15,D153fs,ivs1-1G>A (Y6_G29del) (2copies)**,"(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...",,"(((<Residue MET het= resseq=1 icode= >, <Resi...",,../datasets/ALPHAFOLD PDBs/D153fs_315fd/D153fs...,,39.0,51.0,14.0,75.0,,,,,,70.0,0.63
24,ivs1-1G>A (Y6_G29del),H371fs,,"(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...",,"(((<Residue MET het= resseq=1 icode= >, <Resi...",,../datasets/ALPHAFOLD PDBs/H371fs_9b3dd/H371fs...,40.0,46.0,7.0,50.0,100.0,96.0,100.0,95.0,100.0,40.0,0.38
45,G161R,n.i. (MLPAneg),"(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...",,"(((<Residue MET het= resseq=1 icode= >, <Resi...",,../datasets/ALPHAFOLD PDBs/G161R_c8ba5/G161R_c...,,43.0,30.0,7.0,75.0,67.0,64.0,54.0,42.0,56.0,48.0,0.75
51,c.140C>T,c.140C>T,,,,,,,,,,,,,,,,,
60,R336S fs ivs12-2A>T,E168*,,"(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...",,"(((<Residue MET het= resseq=1 icode= >, <Resi...",,../datasets/ALPHAFOLD PDBs/E168__10c4e/E168_10...,,,,,,,,,,,
74,ivs1-1G>A (Y6_G29del),ivs1-1G>A (Y6_G29del),,,,,,,,,,,,,,,,,
78,K248R,ivs7+5G>A (R145Sfs),,,,,,,,,,,,,,,,,


In [43]:
# drop the rows with None value of the column 'graph_allele1'
df_g = df_g.dropna(subset=['graph_allele1'])



In [44]:
df_g = df_g.dropna(subset=['graph_allele2'])

In [45]:
len(df_g)

159

---
# da qui cose brutte

In [46]:
from Bio.PDB.Superimposer import Superimposer
from numpy import array, dot, set_printoptions

parser = PDBParser()
structure = parser.get_structure("HGD", path_og)
model = structure[0]
model_1 = (model.get_chains() == "A") | (model.get_chains() == "B")

# get the coordinates of the atoms of the structure
coords = []

for atom in model.get_atoms():
    coords.append(atom.get_coord())


coords = array(coords)

structure2 = parser.get_structure("G198D", path_m)
model2 = structure2[0]
model_2 = (model2.get_chains() == "A") | (model2.get_chains() == "B")

coords2 = []

for atom in model2.get_atoms():
    coords2.append(atom.get_coord())


coords2 = array(coords2)

print(len(coords), len(coords2))

print("Shape of coords:", coords.shape)
print("Shape of coords2:", coords2.shape)




3523 3526
Shape of coords: (3523, 3)
Shape of coords2: (3526, 3)


In [85]:
import biotite.structure as struc
from biotite.structure.io.pdb import PDBFile
from biotite.structure import AtomArray, AtomArrayStack
from biotite.structure.info import residue
from biotite.structure import array

file = PDBFile.read(path_og)
array1 = file.get_structure()

file2 = PDBFile.read(path_m)
array2 = file2.get_structure()



In [86]:
array1.shape, array2.shape

((1, 3523), (1, 3522))

In [87]:
array1

stack([
	array([
	Atom(np.array([-18.693,  20.898,  -0.686], dtype=float32), chain_id="A", res_id=1, ins_code="", res_name="MET", hetero=False, atom_name="N", element="N"),
	Atom(np.array([-18.036,  20.589,   0.604], dtype=float32), chain_id="A", res_id=1, ins_code="", res_name="MET", hetero=False, atom_name="CA", element="C"),
	Atom(np.array([-17.101,  21.732,   0.941], dtype=float32), chain_id="A", res_id=1, ins_code="", res_name="MET", hetero=False, atom_name="C", element="C"),
	Atom(np.array([-17.244,  19.274,   0.534], dtype=float32), chain_id="A", res_id=1, ins_code="", res_name="MET", hetero=False, atom_name="CB", element="C"),
	Atom(np.array([-16.437,  22.209,   0.03 ], dtype=float32), chain_id="A", res_id=1, ins_code="", res_name="MET", hetero=False, atom_name="O", element="O"),
	Atom(np.array([-18.052,  18.111,   1.107], dtype=float32), chain_id="A", res_id=1, ins_code="", res_name="MET", hetero=False, atom_name="CG", element="C"),
	Atom(np.array([-17.177,  16.539,   0.958], 

In [88]:
array1 = array1[0]

In [89]:
array2 = array2[0]

In [90]:
array1 = array1[(array1.chain_id == "A") | (array1.chain_id == "B")]
array1

array([
	Atom(np.array([-18.693,  20.898,  -0.686], dtype=float32), chain_id="A", res_id=1, ins_code="", res_name="MET", hetero=False, atom_name="N", element="N"),
	Atom(np.array([-18.036,  20.589,   0.604], dtype=float32), chain_id="A", res_id=1, ins_code="", res_name="MET", hetero=False, atom_name="CA", element="C"),
	Atom(np.array([-17.101,  21.732,   0.941], dtype=float32), chain_id="A", res_id=1, ins_code="", res_name="MET", hetero=False, atom_name="C", element="C"),
	Atom(np.array([-17.244,  19.274,   0.534], dtype=float32), chain_id="A", res_id=1, ins_code="", res_name="MET", hetero=False, atom_name="CB", element="C"),
	Atom(np.array([-16.437,  22.209,   0.03 ], dtype=float32), chain_id="A", res_id=1, ins_code="", res_name="MET", hetero=False, atom_name="O", element="O"),
	Atom(np.array([-18.052,  18.111,   1.107], dtype=float32), chain_id="A", res_id=1, ins_code="", res_name="MET", hetero=False, atom_name="CG", element="C"),
	Atom(np.array([-17.177,  16.539,   0.958], dtype=flo

In [91]:
array1 = array1[~struc.filter_solvent(array1)]

In [92]:
array2 = array2[~struc.filter_solvent(array2)]

In [55]:
test = array1[struc.filter_intersection(array1, array2)]
test_common = array2[struc.filter_intersection(array2, array1)]

In [93]:
test

array([
	Atom(np.array([-18.693,  20.898,  -0.686], dtype=float32), chain_id="A", res_id=1, ins_code="", res_name="MET", hetero=False, atom_name="N", element="N"),
	Atom(np.array([-18.036,  20.589,   0.604], dtype=float32), chain_id="A", res_id=1, ins_code="", res_name="MET", hetero=False, atom_name="CA", element="C"),
	Atom(np.array([-17.101,  21.732,   0.941], dtype=float32), chain_id="A", res_id=1, ins_code="", res_name="MET", hetero=False, atom_name="C", element="C"),
	Atom(np.array([-17.244,  19.274,   0.534], dtype=float32), chain_id="A", res_id=1, ins_code="", res_name="MET", hetero=False, atom_name="CB", element="C"),
	Atom(np.array([-16.437,  22.209,   0.03 ], dtype=float32), chain_id="A", res_id=1, ins_code="", res_name="MET", hetero=False, atom_name="O", element="O"),
	Atom(np.array([-18.052,  18.111,   1.107], dtype=float32), chain_id="A", res_id=1, ins_code="", res_name="MET", hetero=False, atom_name="CG", element="C"),
	Atom(np.array([-17.177,  16.539,   0.958], dtype=flo

In [94]:
test_common

array([
	Atom(np.array([-10.664, -23.312,   5.98 ], dtype=float32), chain_id="A", res_id=1, ins_code="", res_name="MET", hetero=False, atom_name="N", element="N"),
	Atom(np.array([-11.523, -22.125,   5.941], dtype=float32), chain_id="A", res_id=1, ins_code="", res_name="MET", hetero=False, atom_name="CA", element="C"),
	Atom(np.array([-12.352, -22.094,   4.664], dtype=float32), chain_id="A", res_id=1, ins_code="", res_name="MET", hetero=False, atom_name="C", element="C"),
	Atom(np.array([-10.68 , -20.859,   6.051], dtype=float32), chain_id="A", res_id=1, ins_code="", res_name="MET", hetero=False, atom_name="CB", element="C"),
	Atom(np.array([-11.852, -22.438,   3.586], dtype=float32), chain_id="A", res_id=1, ins_code="", res_name="MET", hetero=False, atom_name="O", element="O"),
	Atom(np.array([-10.531, -20.344,   7.477], dtype=float32), chain_id="A", res_id=1, ins_code="", res_name="MET", hetero=False, atom_name="CG", element="C"),
	Atom(np.array([-10.258, -18.531,   7.543], dtype=flo

In [95]:
test_superimposed, transformation = struc.superimpose(
    test, test_common
)

In [96]:
test_superimposed = transformation.apply(array1)

In [97]:
test_superimposed

array([
	Atom(np.array([ 9.158576 , -3.3709402, 25.941381 ], dtype=float32), chain_id="A", res_id=1, ins_code="", res_name="MET", hetero=False, atom_name="N", element="N"),
	Atom(np.array([ 7.9976697, -4.1152287, 25.40327  ], dtype=float32), chain_id="A", res_id=1, ins_code="", res_name="MET", hetero=False, atom_name="CA", element="C"),
	Atom(np.array([ 8.4271   , -5.5481977, 25.165722 ], dtype=float32), chain_id="A", res_id=1, ins_code="", res_name="MET", hetero=False, atom_name="C", element="C"),
	Atom(np.array([ 7.492182, -3.500229, 24.08887 ], dtype=float32), chain_id="A", res_id=1, ins_code="", res_name="MET", hetero=False, atom_name="CB", element="C"),
	Atom(np.array([ 9.535296 , -5.7351685, 24.680658 ], dtype=float32), chain_id="A", res_id=1, ins_code="", res_name="MET", hetero=False, atom_name="O", element="O"),
	Atom(np.array([ 6.2988653, -2.5782022, 24.33305  ], dtype=float32), chain_id="A", res_id=1, ins_code="", res_name="MET", hetero=False, atom_name="CG", element="C"),
	A

In [100]:
import biotite.structure as struc


struc.io.save_structure("../datasets/superimposed_structure_nomask.pdb", test_superimposed)

In [61]:
len(array1), len(test_superimposed)

(3523, 3523)

In [98]:
# Calculate the RMSD of all models to the first model
rms = struc.rmsd(array1, test_superimposed)

In [99]:
rms

27.251003

In [64]:
df_g.head()

Unnamed: 0,Protein change allele 1,Protein change allele 2,graph_allele1,graph_allele2,structure_allele1,structure_allele2,pdb_file_allele1,pdb_file_allele2,physical_health_score,mental_health_score,AKUSSI_jointpain,AKUSSI_spinalpain,KOOSpain,KOOSsymptoms,KOOSdaily_living,KOOSsport,KOOS_QOL,HAQ_hapVAS,HAQ_haqDI
0,G161R,His371Profs,"(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(((<Residue MET het= resseq=1 icode= >, <Resi...","(((<Residue MET het= resseq=1 icode= >, <Resi...",../datasets/ALPHAFOLD PDBs/G161R_c8ba5/G161R_c...,../datasets/ALPHAFOLD PDBs/H371fs_9b3dd/H371fs...,26.0,43.0,57.0,75.0,67.0,89.0,75.0,70.0,69.0,63.0,1.5
1,G161R,G161R,"(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(((<Residue MET het= resseq=1 icode= >, <Resi...","(((<Residue MET het= resseq=1 icode= >, <Resi...",../datasets/ALPHAFOLD PDBs/G161R_c8ba5/G161R_c...,../datasets/ALPHAFOLD PDBs/G161R_c8ba5/G161R_c...,53.0,51.0,29.0,25.0,92.0,96.0,91.0,85.0,75.0,23.0,0.25
2,G161R,M368V,"(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(((<Residue MET het= resseq=1 icode= >, <Resi...","(((<Residue MET het= resseq=1 icode= >, <Resi...",../datasets/ALPHAFOLD PDBs/G161R_c8ba5/G161R_c...,../datasets/ALPHAFOLD PDBs/M368V_be390/M368V_b...,23.0,40.0,36.0,100.0,47.0,39.0,31.0,0.0,19.0,53.0,1.75
4,G161R,G161R,"(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(((<Residue MET het= resseq=1 icode= >, <Resi...","(((<Residue MET het= resseq=1 icode= >, <Resi...",../datasets/ALPHAFOLD PDBs/G161R_c8ba5/G161R_c...,../datasets/ALPHAFOLD PDBs/G161R_c8ba5/G161R_c...,23.0,29.0,50.0,100.0,33.0,29.0,60.0,25.0,13.0,70.0,2.0
5,G161R,G161R,"(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(((<Residue MET het= resseq=1 icode= >, <Resi...","(((<Residue MET het= resseq=1 icode= >, <Resi...",../datasets/ALPHAFOLD PDBs/G161R_c8ba5/G161R_c...,../datasets/ALPHAFOLD PDBs/G161R_c8ba5/G161R_c...,26.0,31.0,50.0,100.0,44.0,29.0,32.0,35.0,13.0,71.0,1.88


In [65]:
# get the pdb_file_allele2 of the first row
pdb_file_allele2 = df_g.iloc[0]['pdb_file_allele2']
pdb_file_allele2

'../datasets/ALPHAFOLD PDBs/H371fs_9b3dd/H371fs_9b3dd_unrelaxed_rank_001_alphafold2_ptm_model_3_seed_000.pdb'

In [66]:
# Remove DNA and water
def get_rmsd(path_og, path_m):

    file = PDBFile.read(path_og)
    original = file.get_structure()
    original = original[0]

    file2 = PDBFile.read(path_m)
    mutation = file2.get_structure()
    mutation = mutation[0]


    original = original[(original.chain_id == "A") | (original.chain_id == "B")]
    original = original[~struc.filter_solvent(original)]
    mutation = mutation[~struc.filter_solvent(mutation)]
    common = original[struc.filter_intersection(original, mutation)]
    common_mutation = mutation[struc.filter_intersection(mutation, original)]
    # NOTE: The superimpose() function requires the same number of atoms
    #       in both structures
    # on the doc we work with the longest array
    # but here the original array is longer than the mutation array
    # so we will work with the original array
    
    superimposed, transformation = struc.superimpose(
        common, common_mutation, (common.atom_name == "CA")
    )
    superimposed = transformation.apply(original)
    return struc.rmsd(original, superimposed)

get_rmsd(path_og, '../datasets/ALPHAFOLD PDBs/Q29fs_f0607/Q29fs_f0607_unrelaxed_rank_001_alphafold2_ptm_model_1_seed_000.pdb')

39.129913

In [67]:
print(np.around(rms, decimals=3))


27.252


In [134]:
pdb_files = pd.read_csv('../datasets/pdb_files.csv')

In [69]:
pdb_files.head()

Unnamed: 0,mutation,pdb_file
0,G309V,../datasets/ALPHAFOLD PDBs/G309V_e3b31/G309V_e...
1,G185R,../datasets/ALPHAFOLD PDBs/G185R_e6971/G185R_e...
2,E168L,../datasets/ALPHAFOLD PDBs/E168L_0f4af/E168L_0...
3,R225H,../datasets/ALPHAFOLD PDBs/R225H_3ba56/R225H_3...
4,R53Q,../datasets/ALPHAFOLD PDBs/R53Q_8ee59/R53Q_8ee...


In [70]:
pdb_list = pdb_files['pdb_file'].to_list()

In [71]:
pdb_list

['../datasets/ALPHAFOLD PDBs/G309V_e3b31/G309V_e3b31_unrelaxed_rank_001_alphafold2_ptm_model_5_seed_000.pdb',
 '../datasets/ALPHAFOLD PDBs/G185R_e6971/G185R_e6971_unrelaxed_rank_001_alphafold2_ptm_model_5_seed_000.pdb',
 '../datasets/ALPHAFOLD PDBs/E168L_0f4af/E168L_0f4af_unrelaxed_rank_001_alphafold2_ptm_model_5_seed_000.pdb',
 '../datasets/ALPHAFOLD PDBs/R225H_3ba56/R225H_3ba56_unrelaxed_rank_001_alphafold2_ptm_model_5_seed_000.pdb',
 '../datasets/ALPHAFOLD PDBs/R53Q_8ee59/R53Q_8ee59_unrelaxed_rank_001_alphafold2_ptm_model_5_seed_000.pdb',
 '../datasets/ALPHAFOLD PDBs/W97C_b9ccb/W97C_b9ccb_unrelaxed_rank_001_alphafold2_ptm_model_5_seed_000.pdb',
 '../datasets/ALPHAFOLD PDBs/K353Q_ed31e/K353Q_ed31e_unrelaxed_rank_001_alphafold2_ptm_model_5_seed_000.pdb',
 '../datasets/ALPHAFOLD PDBs/G360A_4d8ad/G360A_4d8ad_unrelaxed_rank_001_alphafold2_ptm_model_5_seed_000.pdb',
 '../datasets/ALPHAFOLD PDBs/G360R_057f7/G360R_057f7_unrelaxed_rank_001_alphafold2_ptm_model_5_seed_000.pdb',
 '../datasets/

In [72]:
rms = []

for path_m in pdb_list:
    rms.append(get_rmsd(path_og, path_m))


In [135]:
# add column RMSD to pdb_files
pdb_files['RMSD'] = rms



In [137]:
pdb_files.tail()

Unnamed: 0,mutation,pdb_file,RMSD
59,G270R,../datasets/ALPHAFOLD PDBs/G270R_3f501/G270R_3...,27.251003
60,G123A,../datasets/ALPHAFOLD PDBs/G123A_a58f9/G123A_a...,27.251003
61,G115Mfs*,../datasets/ALPHAFOLD PDBs/G115Mfs__4df58/G115...,27.251003
62,His371Profs,../datasets/ALPHAFOLD PDBs/H371fs_9b3dd/H371fs...,27.251003
63,L353Q,../datasets/ALPHAFOLD PDBs/K353Q_ed31e/K353Q_e...,27.251003


In [142]:
pdb_files.iloc[62]

mutation                                          His371Profs
pdb_file    ../datasets/ALPHAFOLD PDBs/H371fs_9b3dd/H371fs...
RMSD                                                27.251003
Name: 62, dtype: object

In [140]:
h371 = pdb_files.iloc[62]['pdb_file']

In [141]:
get_rmsd(path_og, h371)

26.035992

In [125]:
# reorder the rows of the dataframe
pdb_files = pdb_files.reset_index(drop=True)

In [133]:
pdb_files.head()

Unnamed: 0,mutation,pdb_file,RMSD
0,W60*,../datasets/ALPHAFOLD PDBs/W60__b58bd/W60__b58...,40.944534
1,G11fs,../datasets/ALPHAFOLD PDBs/G11fs_0fe2e/G11fs_0...,40.628494
2,G115Mfs*,../datasets/ALPHAFOLD PDBs/G115Mfs__4df58/G115...,39.664482
3,Q29fs,../datasets/ALPHAFOLD PDBs/Q29fs_f0607/Q29fs_f...,39.129913
4,A218fs,../datasets/ALPHAFOLD PDBs/A218fs_8b909/A218fs...,38.497429


# test con pdf salvato da chimeraX

In [144]:
path_og

'../datasets/test/HGD_normal.pdb'

In [58]:
path_fs = '../datasets/test/A218fs.pdb'

In [64]:
get_rmsd(path_fs, path_m)

0.32535726

---
# da qui cose meglio

In [137]:
# create a column called 'AKUSSI' which is the sum of the columns 'AKUSSI_jointpain', 'AKUSSI_spinalpain'
df_g['AKUSSI'] = df_g['AKUSSI_jointpain'] + df_g['AKUSSI_spinalpain']

df_g.head()


Unnamed: 0,Protein change allele 1,Protein change allele 2,graph_allele1,graph_allele2,structure_allele1,structure_allele2,pdb_file_allele1,pdb_file_allele2,physical_health_score,mental_health_score,AKUSSI_jointpain,AKUSSI_spinalpain,KOOSpain,KOOSsymptoms,KOOSdaily_living,KOOSsport,KOOS_QOL,AKUSSI
0,G161R,His371Profs,"(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(((<Residue MET het= resseq=1 icode= >, <Resi...","(((<Residue MET het= resseq=1 icode= >, <Resi...",../datasets/ALPHAFOLD PDBs/G161R_c8ba5/G161R_c...,../datasets/ALPHAFOLD PDBs/H371fs_9b3dd/H371fs...,26.0,43.0,57.0,75.0,67.0,89.0,75.0,70.0,69.0,132.0
1,G161R,G161R,"(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(((<Residue MET het= resseq=1 icode= >, <Resi...","(((<Residue MET het= resseq=1 icode= >, <Resi...",../datasets/ALPHAFOLD PDBs/G161R_c8ba5/G161R_c...,../datasets/ALPHAFOLD PDBs/G161R_c8ba5/G161R_c...,53.0,51.0,29.0,25.0,92.0,96.0,91.0,85.0,75.0,54.0
2,G161R,M368V,"(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(((<Residue MET het= resseq=1 icode= >, <Resi...","(((<Residue MET het= resseq=1 icode= >, <Resi...",../datasets/ALPHAFOLD PDBs/G161R_c8ba5/G161R_c...,../datasets/ALPHAFOLD PDBs/M368V_be390/M368V_b...,23.0,40.0,36.0,100.0,47.0,39.0,31.0,0.0,19.0,136.0
4,G161R,G161R,"(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(((<Residue MET het= resseq=1 icode= >, <Resi...","(((<Residue MET het= resseq=1 icode= >, <Resi...",../datasets/ALPHAFOLD PDBs/G161R_c8ba5/G161R_c...,../datasets/ALPHAFOLD PDBs/G161R_c8ba5/G161R_c...,23.0,29.0,50.0,100.0,33.0,29.0,60.0,25.0,13.0,150.0
5,G161R,G161R,"(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(((<Residue MET het= resseq=1 icode= >, <Resi...","(((<Residue MET het= resseq=1 icode= >, <Resi...",../datasets/ALPHAFOLD PDBs/G161R_c8ba5/G161R_c...,../datasets/ALPHAFOLD PDBs/G161R_c8ba5/G161R_c...,26.0,31.0,50.0,100.0,44.0,29.0,32.0,35.0,13.0,150.0


In [139]:
#sort the dataframe by the column 'AKUSSI'
df_g = df_g.sort_values(by='AKUSSI', ascending=False)

df_g.head()

Unnamed: 0,Protein change allele 1,Protein change allele 2,graph_allele1,graph_allele2,structure_allele1,structure_allele2,pdb_file_allele1,pdb_file_allele2,physical_health_score,mental_health_score,AKUSSI_jointpain,AKUSSI_spinalpain,KOOSpain,KOOSsymptoms,KOOSdaily_living,KOOSsport,KOOS_QOL,AKUSSI
40,D153fs,A218fs,"(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(((<Residue MET het= resseq=1 icode= >, <Resi...","(((<Residue MET het= resseq=1 icode= >, <Resi...",../datasets/ALPHAFOLD PDBs/D153fs_315fd/D153fs...,../datasets/ALPHAFOLD PDBs/A218fs_8b909/A218fs...,28.0,26.0,93.0,100.0,36.0,50.0,22.0,6.0,19.0,193.0
21,M368V,M368V,"(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(((<Residue MET het= resseq=1 icode= >, <Resi...","(((<Residue MET het= resseq=1 icode= >, <Resi...",../datasets/ALPHAFOLD PDBs/M368V_be390/M368V_b...,../datasets/ALPHAFOLD PDBs/M368V_be390/M368V_b...,15.0,50.0,79.0,100.0,19.0,14.0,19.0,5.0,0.0,179.0
105,C120F,C120F,"(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(((<Residue MET het= resseq=1 icode= >, <Resi...","(((<Residue MET het= resseq=1 icode= >, <Resi...",../datasets/ALPHAFOLD PDBs/C120F_3f78a/C120F_3...,../datasets/ALPHAFOLD PDBs/C120F_3f78a/C120F_3...,19.0,61.0,79.0,100.0,31.0,14.0,41.0,0.0,25.0,179.0
126,G161R,V300G,"(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(((<Residue MET het= resseq=1 icode= >, <Resi...","(((<Residue MET het= resseq=1 icode= >, <Resi...",../datasets/ALPHAFOLD PDBs/G161R_c8ba5/G161R_c...,../datasets/ALPHAFOLD PDBs/V300G_f0e19/V300G_f...,29.0,35.0,71.0,100.0,33.0,36.0,34.0,5.0,19.0,171.0
120,D153fs,D153fs,"(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(((<Residue MET het= resseq=1 icode= >, <Resi...","(((<Residue MET het= resseq=1 icode= >, <Resi...",../datasets/ALPHAFOLD PDBs/D153fs_315fd/D153fs...,../datasets/ALPHAFOLD PDBs/D153fs_315fd/D153fs...,25.0,29.0,64.0,100.0,56.0,29.0,31.0,5.0,0.0,164.0


In [145]:
# drop the rows with nan value in the column AKUSSI
df_NO_AKUSSI = df_g.dropna(subset=['AKUSSI'])

In [148]:
len(df_NO_AKUSSI)

df_NO_AKUSSI.tail(10)

Unnamed: 0,Protein change allele 1,Protein change allele 2,graph_allele1,graph_allele2,structure_allele1,structure_allele2,pdb_file_allele1,pdb_file_allele2,physical_health_score,mental_health_score,AKUSSI_jointpain,AKUSSI_spinalpain,KOOSpain,KOOSsymptoms,KOOSdaily_living,KOOSsport,KOOS_QOL,AKUSSI
156,Q29fs,Q29fs,"(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(((<Residue MET het= resseq=1 icode= >, <Resi...","(((<Residue MET het= resseq=1 icode= >, <Resi...",../datasets/ALPHAFOLD PDBs/Q29fs_f0607/Q29fs_f...,../datasets/ALPHAFOLD PDBs/Q29fs_f0607/Q29fs_f...,44.0,52.0,0.0,25.0,100.0,96.0,97.0,100.0,100.0,25.0
34,G161R,G161R,"(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(((<Residue MET het= resseq=1 icode= >, <Resi...","(((<Residue MET het= resseq=1 icode= >, <Resi...",../datasets/ALPHAFOLD PDBs/G161R_c8ba5/G161R_c...,../datasets/ALPHAFOLD PDBs/G161R_c8ba5/G161R_c...,54.0,56.0,0.0,25.0,97.0,93.0,100.0,100.0,88.0,25.0
93,G161R,E178G,"(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(((<Residue MET het= resseq=1 icode= >, <Resi...","(((<Residue MET het= resseq=1 icode= >, <Resi...",../datasets/ALPHAFOLD PDBs/G161R_c8ba5/G161R_c...,../datasets/ALPHAFOLD PDBs/E178G_aeca3/E178G_a...,45.0,51.0,0.0,25.0,100.0,93.0,100.0,95.0,100.0,25.0
7,G161R,G161R,"(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(((<Residue MET het= resseq=1 icode= >, <Resi...","(((<Residue MET het= resseq=1 icode= >, <Resi...",../datasets/ALPHAFOLD PDBs/G161R_c8ba5/G161R_c...,../datasets/ALPHAFOLD PDBs/G161R_c8ba5/G161R_c...,41.0,64.0,21.0,0.0,78.0,39.0,76.0,60.0,56.0,21.0
173,G161R,V300G,"(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(((<Residue MET het= resseq=1 icode= >, <Resi...","(((<Residue MET het= resseq=1 icode= >, <Resi...",../datasets/ALPHAFOLD PDBs/G161R_c8ba5/G161R_c...,../datasets/ALPHAFOLD PDBs/V300G_f0e19/V300G_f...,54.0,60.0,14.0,0.0,97.0,82.0,99.0,90.0,100.0,14.0
122,G161R,G161R,"(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(((<Residue MET het= resseq=1 icode= >, <Resi...","(((<Residue MET het= resseq=1 icode= >, <Resi...",../datasets/ALPHAFOLD PDBs/G161R_c8ba5/G161R_c...,../datasets/ALPHAFOLD PDBs/G161R_c8ba5/G161R_c...,,,14.0,0.0,100.0,100.0,100.0,100.0,100.0,14.0
150,Q29fs,Q29fs,"(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(((<Residue MET het= resseq=1 icode= >, <Resi...","(((<Residue MET het= resseq=1 icode= >, <Resi...",../datasets/ALPHAFOLD PDBs/Q29fs_f0607/Q29fs_f...,../datasets/ALPHAFOLD PDBs/Q29fs_f0607/Q29fs_f...,45.0,52.0,7.0,0.0,33.0,36.0,40.0,30.0,44.0,7.0
14,G161R,G372_P373delinsA,"(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(((<Residue MET het= resseq=1 icode= >, <Resi...","(((<Residue MET het= resseq=1 icode= >, <Resi...",../datasets/ALPHAFOLD PDBs/G161R_c8ba5/G161R_c...,../datasets/ALPHAFOLD PDBs/G372_P373delinsA_8a...,56.0,53.0,0.0,0.0,100.0,89.0,100.0,90.0,69.0,0.0
35,G161R,H371fs,"(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(((<Residue MET het= resseq=1 icode= >, <Resi...","(((<Residue MET het= resseq=1 icode= >, <Resi...",../datasets/ALPHAFOLD PDBs/G161R_c8ba5/G161R_c...,../datasets/ALPHAFOLD PDBs/H371fs_9b3dd/H371fs...,57.0,62.0,0.0,0.0,100.0,93.0,100.0,100.0,100.0,0.0
113,I216T,G360R,"(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(((<Residue MET het= resseq=1 icode= >, <Resi...","(((<Residue MET het= resseq=1 icode= >, <Resi...",../datasets/ALPHAFOLD PDBs/I216T_85a22/I216T_8...,../datasets/ALPHAFOLD PDBs/G360R_057f7/G360R_0...,61.0,24.0,0.0,0.0,100.0,100.0,100.0,95.0,100.0,0.0
