In [6]:
from Bio.PDB import PDBParser
from Bio.PDB.DSSP import DSSP

from tqdm import tqdm

from utils import *

In [7]:
path_og = '../datasets/test/HGD_normal.pdb'
path_m = '../datasets/test/A122V_4.pdb'


| Code | Structure                     |
|------|-------------------------------|
| H    | Alpha helix (4-12)            |
| B    | Isolated beta-bridge residue  |
| E    | Strand                        |
| G    | 3-10 helix                    |
| I    | Pi helix                      |
| T    | Turn                          |
| S    | Bend                          |
| -    | None                          |

In [8]:
parser = PDBParser()

structure_og = parser.get_structure("HGD", path_og)
structure_m = parser.get_structure("A122V", path_m)

model_og = structure_og[0]
model_m = structure_m[0]

dssp_og = DSSP(model_og, path_og, dssp='mkdssp')
dssp_m = DSSP(model_m, path_m, dssp='mkdssp')


In [6]:
# print the dssp 
print(dssp_og['A', 122])

(122, 'A', 'E', 0.0660377358490566, -158.0, 160.4, 9, -2.5, 9, -2.1, -2, -0.3, 2, -0.1)


In [7]:
type(dssp_og)

Bio.PDB.DSSP.DSSP

| Tuple Index | Value             |
|-------------|-------------------|
| 0           | DSSP index        |
| 1           | Amino acid        |
| 2           | Secondary structure |
| 3           | Relative ASA      |
| 4           | Phi               |
| 5           | Psi               |
| 6           | NH–>O_1_relidx    |
| 7           | NH–>O_1_energy    |
| 8           | O–>NH_1_relidx    |
| 9           | O–>NH_1_energy    |
| 10          | NH–>O_2_relidx    |
| 11          | NH–>O_2_energy    |
| 12          | O–>NH_2_relidx    |
| 13          | O–>NH_2_energy    |

In [None]:
# print the secondary structure
print('OG')
for key in dssp_og.keys():
    print(dssp_og[key])

print('Mutant')
for key in dssp_m.keys():
    print(dssp_m[key])

In [None]:
secondary_structure = [(x[1], x[2]) for x in dssp_og.property_list]

secondary_structure2 = [(x[1],x[2]) for x in dssp_m.property_list]

print('OG', 'Mutant')
print('-----------------')
print('(Ammino Acid, Scondary structure) (Ammino Acid, Scondary structure)')
print('-----------------')
for i in range(len(secondary_structure)):
    print(secondary_structure[i], secondary_structure2[i])



In [18]:
def get_secondary_structure(path, parser):

    structure = parser.get_structure("HGD", path)

    model = structure[0]

    dssp = DSSP(model, path, dssp='mkdssp')

    secondary_structure = [x[2] for x in dssp.property_list]
    # convert in a string
    secondary_structure = ''.join(secondary_structure)
    return secondary_structure

In [19]:
get_secondary_structure(path_og, PDBParser())

'-PPPPEEE-TT--EEEE-TTSTT-SPSS-S--SS-GGGPEEEEEESS-TTS-TTT--EEEEEESS-TT-SPPPEE---TB----GGGS----S-EEEPPPPPPPTTT----TGGGEEEEEEEEEGGGTEEEEEEEEEE-S---SEEEEESSEEEEEEEEES-EEEEETTEEEEE-TTEEEEE-TT--EEEEESSSEEEEEEEEES--EEPPP-GGG-SS-BS-GGGEEEEPP-----EEEEEEEEEEEETTEEEEEEEEE-S--EEEEEES---EEEEGGG-B----SSSS---GGGGEEEEEE-SSTT-EEEEEEEE-SEEE--TTS--SPPPB--SSEEEEEEEES--TT-SSS--TT-EEEE-TT--B---HHHHHHHTTS----EEESTT-EEEEEEESS-PEEPHIIIIIT--B-TTGGGGTTT------TT-SSPPP--'

In [20]:
pdb_files = pd.read_csv('../datasets/pdb_files.csv')

In [26]:
# for each path in the pdb_files['pdb_file'] column add a row at the head of the pdb_files dataframe with 'HEADER test' 
# and 'SEQRES test' as values

for i in tqdm(range(len(pdb_files))):
    path = pdb_files['pdb_file'][i]
    # add 'HEADER test' at the start of the file
    with open(path, 'r') as original: data = original.read()
    with open(path, 'w') as modified: modified.write('HEADER test\n' + data)



100%|██████████| 64/64 [00:00<00:00, 590.72it/s]


In [None]:
# add a column 'secondary_structure' colum

pdb_files['secondary_structure'] = pdb_files['pdb_file'].apply(lambda x: get_secondary_structure(x, parser))

In [None]:
secondary = []
# for each path in pdb_files['pdb_files'] use get_secondary_structure to get the secondary structure

for i in tqdm(range(len(pdb_files))):
    path = pdb_files['pdb_file'][i]
    print(path)
    secondary_structure = get_secondary_structure(path, PDBParser())
    # add the secondary structure to the pdb_files dataframe

    secondary.append(secondary_structure)

pdb_files['secondary_structure'] = secondary

In [37]:
pdb_files.head(10)

Unnamed: 0,mutation,pdb_file,secondary_structure
0,G309V,../datasets/ALPHAFOLD PDBs/G309V_e3b31/G309V_e...,-----EEE-SS--EEEE-TTSTT-SPSS-SS-SS-GGGPEEEEEES...
1,G185R,../datasets/ALPHAFOLD PDBs/G185R_e6971/G185R_e...,--PPPEEE-SS--EEEE-TTSTT-SPSS-SS-SS-GGGPEEEEEES...
2,E168L,../datasets/ALPHAFOLD PDBs/E168L_0f4af/E168L_0...,-----EEE-SS--EEEE-TTSTT-SPSS-SS-SS-GGGPEEEEEES...
3,R225H,../datasets/ALPHAFOLD PDBs/R225H_3ba56/R225H_3...,--PPPEEE-SS--EEEE-TTSTT-SPSS-SS-SS-GGGPEEEEEES...
4,R53Q,../datasets/ALPHAFOLD PDBs/R53Q_8ee59/R53Q_8ee...,-----EEE-SS--EEEE-TTSTT-SPSS-SS-SS-GGGPEEEEEES...
5,W97C,../datasets/ALPHAFOLD PDBs/W97C_b9ccb/W97C_b9c...,--PPPEEE-SS--EEEE-TTSTT-SPSS-SS-SS-GGGPEEEEEES...
6,K353Q,../datasets/ALPHAFOLD PDBs/K353Q_ed31e/K353Q_e...,-----EEE-SS--EEEE-TTSTT-SPSS-SS-SS-GGGPEEEEEES...
7,G360A,../datasets/ALPHAFOLD PDBs/G360A_4d8ad/G360A_4...,-----EEE-SS--EEEE-TTSTT-SPSS-SS-SS-GGGPEEEEEES...
8,G360R,../datasets/ALPHAFOLD PDBs/G360R_057f7/G360R_0...,-----EEE-SS--EEEE-TTSTT-SPSS-SS-SS-GGGPEEEEEES...
9,S59fs,../datasets/ALPHAFOLD PDBs/S59fs_cd7b7/S59fs_c...,---PPPPP-TTS------TTSTT-SPSS-SS-SS-GGG-EEEEEE-...


In [38]:
# save as csv
pdb_files.to_csv('../datasets/pdb_files_ss.csv', index=False)

In [16]:
len(get_secondary_structure(dssp_og))

445

In [17]:
print(secondary_structure[197], secondary_structure2[197])

('G', 'E') ('G', 'E')


In [18]:
# check the active site
active_site = [292, 335, 365, 371, 341]
for i in active_site:
    print(i, secondary_structure[i], secondary_structure2[i])

292 ('A', '-') ('A', '-')
335 ('R', '-') ('R', '-')
365 ('S', '-') ('S', '-')
371 ('G', '-') ('G', '-')
341 ('F', 'E') ('F', 'E')


In [11]:
# comprare the two structures
for key in dssp_og.keys():
    if key in dssp_m.keys():
        if dssp_og[key][1] != dssp_m[key][1]:
            print(key, dssp_og[key][1], dssp_m[key][1])
    else:
        print(key, dssp_og[key][1], 'None')

('A', (' ', 122, ' ')) A V


# RMSD

In [None]:
path_fs = '../datasets/test/A218fs.pdb'

path_m_alpha = '../datasets/ALPHAFOLD PDBs/A122V_41d52/A122V_41d52_unrelaxed_rank_001_alphafold2_ptm_model_5_seed_000.pdb'
path_fs_alpha = '../datasets/ALPHAFOLD PDBs/A218fs_8b909/A218fs_8b909_unrelaxed_rank_001_alphafold2_ptm_model_4_seed_000.pdb'

from Bio.PDB import PDBParser, Superimposer, is_aa

# Function to get backbone atoms from residues that are present in both structures
def get_common_backbone_atoms(structure1, structure2):
    backbone_atoms1 = []
    backbone_atoms2 = []
    for model1, model2 in zip(structure1, structure2):
        for chain1, chain2 in zip(model1, model2):
            res_dict2 = {res.get_id(): res for res in chain2 if is_aa(res)}
            for res1 in chain1:
                if is_aa(res1) and res1.get_id() in res_dict2:
                    res2 = res_dict2[res1.get_id()]
                    for atom1 in res1:
                        if atom1.get_id() in ('N', 'CA', 'C', 'O'):
                            atom2 = res2[atom1.get_id()]
                            backbone_atoms1.append(atom1)
                            backbone_atoms2.append(atom2)
    return backbone_atoms1, backbone_atoms2

# Load the original and mutant PDB files
pdb_parser = PDBParser(QUIET=True)
structure_original = pdb_parser.get_structure("original", path_fs_alpha)
structure_mutant = pdb_parser.get_structure("mutant", path_fs)

# Extract common backbone atoms
backbone_atoms_original, backbone_atoms_mutant = get_common_backbone_atoms(structure_original, structure_mutant)

# Ensure there are common backbone atoms to align
assert len(backbone_atoms_original) > 0 and len(backbone_atoms_mutant) > 0, "No common backbone atoms found."

# Superimpose the structures
super_imposer = Superimposer()
super_imposer.set_atoms(backbone_atoms_original, backbone_atoms_mutant)
super_imposer.apply(structure_mutant.get_atoms())

# Calculate RMSD
print(f"RMSD: {super_imposer.rms:.4f} Å")

# Optionally, save the aligned mutant structure to a new PDB file
from Bio.PDB import PDBIO
io = PDBIO()
io.set_structure(structure_mutant)
io.save("aligned_mutant.pdb")




RMSD: 0.0005 Å


----
# Test su Grafi

In [None]:
from graphein.protein.visualisation import plotly_protein_structure_graph

# Load the default config
c = ProteinGraphConfig(granularity='CA')

# Construct the graph!
g = construct_graph(path=path_og)

In [20]:
plotly_protein_structure_graph(g, node_size_multiplier=0.5, colour_nodes_by="residue_name", figsize=(1000, 1000))

In [None]:
# Load the default config
c = ProteinGraphConfig(granularity='CA')

# Construct the graph!
g1 = construct_graph(path=path_m)

In [15]:
plotly_protein_structure_graph(g1, node_size_multiplier=0.5, colour_nodes_by="residue_name")

In [16]:
print(type(g1))

<class 'networkx.classes.graph.Graph'>


In [17]:
# difference between the two graphs
for node in g.nodes:
    if node not in g1.nodes:
        print(node)

# distance between the two graphs
from networkx.algorithms.similarity import graph_edit_distance
distance = graph_edit_distance(g, g1)
print(distance)


A:ALA:122
0.0


---
Include the graph in the dataset

In [16]:
#read the csv file aku_prin_v2.0_cleaned.csv

import pandas as pd

df = pd.read_excel('../datasets/aku_prin_v2.0.xlsx', sheet_name=0)

In [17]:
df2 = df.iloc[:, list(range(12, 18)) + list(range(41, 52))] 
df2.head()

Unnamed: 0,Protein change allele 1,DNA change allele 1,Exon/intron allele 1,Protein change allele 2,DNA change allele 2,ex/in allele 2,physical_health_score,mental_health_score,AKUSSI_jointpain,AKUSSI_spinalpain,KOOSpain,KOOSsymptoms,KOOSdaily_living,KOOSsport,KOOS_QOL,HAQ_hapVAS,HAQ_haqDI
0,G161R,c.481G>A,ex8,His371Profs,c.1111dupC,ex13,26.0,43.0,57.0,75.0,67.0,89.0,75.0,70.0,69.0,63.0,1.5
1,G161R,c.481G>A,ex8,G161R,c.481G>A,ex8,53.0,51.0,29.0,25.0,92.0,96.0,91.0,85.0,75.0,23.0,0.25
2,G161R,c.481G>A,ex8,M368V,c.1102A>G,ex13,23.0,40.0,36.0,100.0,47.0,39.0,31.0,0.0,19.0,53.0,1.75
3,Y6_G29del ivs1-1G>A,c.16-1G>A,in1,Y6_G29del ivs1-1G>A,c.16-1G>A,in1,,,,,,,,,,,
4,G161R,c.481G>A,ex8,G161R,c.481G>A,ex8,23.0,29.0,50.0,100.0,33.0,29.0,60.0,25.0,13.0,70.0,2.0


---

In [None]:
from graphein.protein.config import ProteinGraphConfig
from graphein.protein.edges.distance import *
from graphein.protein.graphs import construct_graph

edge_fns = [
    add_aromatic_interactions,
    add_hydrophobic_interactions,
    add_aromatic_sulphur_interactions,
    add_cation_pi_interactions,
    add_disulfide_interactions,
    add_hydrogen_bond_interactions,
    add_ionic_interactions,
    add_peptide_bonds
    ]
config = ProteinGraphConfig(edge_construction_functions=edge_fns)

g = construct_graph(config=config, path= '../datasets/test/HGD_normal.pdb')

In [None]:
p = plotly_protein_structure_graph(
    g,
    colour_edges_by="kind",
    colour_nodes_by="degree",
    label_node_ids=False,
    plot_title="Peptide backbone graph. Nodes coloured by degree.",
    node_size_multiplier=1
    )
p.show()

In [38]:
type(g)

networkx.classes.graph.Graph

# funzione per controllare i nodi neighbor dei siti attivi

In [21]:
active_site = [292, 335, 365, 371, 341]

In [22]:
active_nodes = []
active_edges = {}
for i in active_site:
    active_edges[i] = []


for node in g.nodes:
    #get the int of the node at the end of the string
    nnode = int(node.split(':')[2])
    if nnode in active_site:
        active_nodes.append(node)
        # get the edges of the node
        for edge in g.edges(node, data=True):
            active_edges[nnode].append(edge)

In [23]:
active_nodes = get_edges_from_nodes(g, active_site)[0]

In [24]:
active_nodes

['A:HIS:292', 'A:HIS:335', 'A:GLU:341', 'A:HIS:365', 'A:HIS:371']

In [25]:
SG = get_subgraph(g, active_nodes)

In [26]:
#g.subgraph(active_nodes)
SG = g.__class__()
SG.add_nodes_from((n, g.nodes[n]) for n in active_nodes)
if SG.is_multigraph():
    SG.add_edges_from(
        (n, nbr, key, d)
        for n, nbrs in g.adj.items()
        if n in active_nodes
        for nbr, keydict in nbrs.items()
        if nbr in active_nodes
        for key, d in keydict.items()
    )
else:
    SG.add_edges_from(
        (n, nbr, d)
        for n, nbrs in g.adj.items()
        if n in active_nodes
        for nbr, d in nbrs.items()
        if nbr in active_nodes
    )
SG.graph.update(g.graph)

In [27]:
p = plotly_protein_structure_graph(
    SG,
    colour_edges_by="kind",
    colour_nodes_by="degree",
    label_node_ids=False,
    plot_title="Peptide backbone graph. Nodes coloured by degree.",
    node_size_multiplier=1
    )
p.show()

In [28]:
edge[1].split(':')[2]

'372'

In [29]:
active_edges

{292: [('A:HIS:292',
   'A:TYR:333',
   {'kind': {'aromatic'}, 'distance': 7.4876871595974155}),
  ('A:HIS:292',
   'A:GLU:351',
   {'kind': {'ionic'}, 'distance': 13.701098532599495}),
  ('A:HIS:292',
   'A:ASP:291',
   {'kind': {'peptide_bond'}, 'distance': 3.833183272425151}),
  ('A:HIS:292',
   'A:ALA:293',
   {'kind': {'peptide_bond'}, 'distance': 3.839483428796118})],
 335: [('A:HIS:335',
   'A:HIS:371',
   {'kind': {'aromatic'}, 'distance': 5.410436211619173}),
  ('A:HIS:335',
   'A:TYR:333',
   {'kind': {'aromatic'}, 'distance': 5.903799623970991}),
  ('A:HIS:335',
   'A:GLU:341',
   {'kind': {'ionic'}, 'distance': 11.37200197854362}),
  ('A:HIS:335',
   'A:TYR:334',
   {'kind': {'peptide_bond'}, 'distance': 3.7909056965321635}),
  ('A:HIS:335',
   'A:ARG:336',
   {'kind': {'peptide_bond'}, 'distance': 3.84312906366674})],
 365: [('A:HIS:365',
   'A:HIS:371',
   {'kind': {'aromatic'}, 'distance': 7.411853816691206}),
  ('A:HIS:365',
   'A:TRP:427',
   {'kind': {'aromatic'}, 'di

In [30]:
active_site = [292, 335, 365, 371, 341]

active_edges_num = {}
for i in active_site:
    active_edges_num[i] = []

for key, value in active_edges.items():
    for edge in value:
        active_edges_num[key].append(edge[1].split(':')[2])
    #active_edges_num[key] = value[2][1].split(':')[2]

active_edges_num

{292: ['333', '351', '291', '293'],
 335: ['371', '333', '341', '334', '336'],
 365: ['371', '427', '423', '341', '364', '366'],
 371: ['335', '365', '341', '370', '372'],
 341: ['365', '335', '371', '340', '342']}

In [31]:
def get_neighbor_nodes(nodes, n):
    active_site_neighbors = []
    for node in nodes:
        for i in range(n*2+1):
            active_site_neighbors.append(node - n + i)
    
    active_site_neighbors = list(set(active_site_neighbors))
    return active_site_neighbors


In [32]:
get_neighbor_nodes(active_site, 1)

[291, 292, 293, 364, 365, 334, 335, 336, 366, 370, 371, 372, 340, 341, 342]

---

# creazione del df con informazioni su grafo (networkx object)

In [None]:
from utils import create_graph_df
df_g = create_graph_df(g)


In [35]:
len(df_g)

219

In [36]:
df_all=pd.read_excel('../datasets/aku_prin_v2.0.xlsx')

In [37]:
df_scores=df_all[['physical_health_score', 'mental_health_score', 'AKUSSI_jointpain',
        'AKUSSI_spinalpain', 'KOOSpain', 'KOOSsymptoms', 'KOOSdaily_living',
        'KOOSsport', 'KOOS_QOL', 'HAQ_hapVAS', 'HAQ_haqDI']]

In [38]:
df_scores.head()

Unnamed: 0,physical_health_score,mental_health_score,AKUSSI_jointpain,AKUSSI_spinalpain,KOOSpain,KOOSsymptoms,KOOSdaily_living,KOOSsport,KOOS_QOL,HAQ_hapVAS,HAQ_haqDI
0,26.0,43.0,57.0,75.0,67.0,89.0,75.0,70.0,69.0,63.0,1.5
1,53.0,51.0,29.0,25.0,92.0,96.0,91.0,85.0,75.0,23.0,0.25
2,23.0,40.0,36.0,100.0,47.0,39.0,31.0,0.0,19.0,53.0,1.75
3,,,,,,,,,,,
4,23.0,29.0,50.0,100.0,33.0,29.0,60.0,25.0,13.0,70.0,2.0


In [39]:
# append df_g and df_scores
df_g = pd.concat([df_g, df_scores], axis=1)

In [40]:
df_g.head()

Unnamed: 0,Protein change allele 1,Protein change allele 2,graph_allele1,graph_allele2,structure_allele1,structure_allele2,pdb_file_allele1,pdb_file_allele2,physical_health_score,mental_health_score,AKUSSI_jointpain,AKUSSI_spinalpain,KOOSpain,KOOSsymptoms,KOOSdaily_living,KOOSsport,KOOS_QOL,HAQ_hapVAS,HAQ_haqDI
0,G161R,His371Profs,"(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(((<Residue MET het= resseq=1 icode= >, <Resi...","(((<Residue MET het= resseq=1 icode= >, <Resi...",../datasets/ALPHAFOLD PDBs/G161R_c8ba5/G161R_c...,../datasets/ALPHAFOLD PDBs/H371fs_9b3dd/H371fs...,26.0,43.0,57.0,75.0,67.0,89.0,75.0,70.0,69.0,63.0,1.5
1,G161R,G161R,"(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(((<Residue MET het= resseq=1 icode= >, <Resi...","(((<Residue MET het= resseq=1 icode= >, <Resi...",../datasets/ALPHAFOLD PDBs/G161R_c8ba5/G161R_c...,../datasets/ALPHAFOLD PDBs/G161R_c8ba5/G161R_c...,53.0,51.0,29.0,25.0,92.0,96.0,91.0,85.0,75.0,23.0,0.25
2,G161R,M368V,"(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(((<Residue MET het= resseq=1 icode= >, <Resi...","(((<Residue MET het= resseq=1 icode= >, <Resi...",../datasets/ALPHAFOLD PDBs/G161R_c8ba5/G161R_c...,../datasets/ALPHAFOLD PDBs/M368V_be390/M368V_b...,23.0,40.0,36.0,100.0,47.0,39.0,31.0,0.0,19.0,53.0,1.75
3,Y6_G29del ivs1-1G>A,Y6_G29del ivs1-1G>A,,,,,,,,,,,,,,,,,
4,G161R,G161R,"(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(((<Residue MET het= resseq=1 icode= >, <Resi...","(((<Residue MET het= resseq=1 icode= >, <Resi...",../datasets/ALPHAFOLD PDBs/G161R_c8ba5/G161R_c...,../datasets/ALPHAFOLD PDBs/G161R_c8ba5/G161R_c...,23.0,29.0,50.0,100.0,33.0,29.0,60.0,25.0,13.0,70.0,2.0


In [41]:
# drop the rows with nan values in the columns 'Protein change allele 1' and 'Protein change allele 2'
df_g = df_g.dropna(subset=['Protein change allele 1 ', 'Protein change allele 2'])
len(df_g)


199

In [42]:
# print all the rows that have None in the 'graph_allele1' or 'graph_allele2' column
print(len(df_g[df_g['graph_allele1'].isnull() | df_g['graph_allele2'].isnull()]))

df_g[df_g['graph_allele1'].isnull() | df_g['graph_allele2'].isnull()]


40


Unnamed: 0,Protein change allele 1,Protein change allele 2,graph_allele1,graph_allele2,structure_allele1,structure_allele2,pdb_file_allele1,pdb_file_allele2,physical_health_score,mental_health_score,AKUSSI_jointpain,AKUSSI_spinalpain,KOOSpain,KOOSsymptoms,KOOSdaily_living,KOOSsport,KOOS_QOL,HAQ_hapVAS,HAQ_haqDI
3,Y6_G29del ivs1-1G>A,Y6_G29del ivs1-1G>A,,,,,,,,,,,,,,,,,
8,G161R,ivs1-1G>A (Y6_G29del),"(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...",,"(((<Residue MET het= resseq=1 icode= >, <Resi...",,../datasets/ALPHAFOLD PDBs/G161R_c8ba5/G161R_c...,,32.0,30.0,50.0,100.0,44.0,68.0,54.0,45.0,31.0,53.0,0.63
9,G251D,del ex13 (MLPA),"(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...",,"(((<Residue MET het= resseq=1 icode= >, <Resi...",,../datasets/ALPHAFOLD PDBs/G251D_5ea06/G251D_5...,,31.0,39.0,36.0,75.0,44.0,46.0,52.0,0.0,44.0,80.0,1.25
15,D153fs,ivs1-1G>A (Y6_G29del) (2copies)**,"(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...",,"(((<Residue MET het= resseq=1 icode= >, <Resi...",,../datasets/ALPHAFOLD PDBs/D153fs_315fd/D153fs...,,39.0,51.0,14.0,75.0,,,,,,70.0,0.63
24,ivs1-1G>A (Y6_G29del),H371fs,,"(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...",,"(((<Residue MET het= resseq=1 icode= >, <Resi...",,../datasets/ALPHAFOLD PDBs/H371fs_9b3dd/H371fs...,40.0,46.0,7.0,50.0,100.0,96.0,100.0,95.0,100.0,40.0,0.38
45,G161R,n.i. (MLPAneg),"(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...",,"(((<Residue MET het= resseq=1 icode= >, <Resi...",,../datasets/ALPHAFOLD PDBs/G161R_c8ba5/G161R_c...,,43.0,30.0,7.0,75.0,67.0,64.0,54.0,42.0,56.0,48.0,0.75
51,c.140C>T,c.140C>T,,,,,,,,,,,,,,,,,
60,R336S fs ivs12-2A>T,E168*,,"(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...",,"(((<Residue MET het= resseq=1 icode= >, <Resi...",,../datasets/ALPHAFOLD PDBs/E168__10c4e/E168_10...,,,,,,,,,,,
74,ivs1-1G>A (Y6_G29del),ivs1-1G>A (Y6_G29del),,,,,,,,,,,,,,,,,
78,K248R,ivs7+5G>A (R145Sfs),,,,,,,,,,,,,,,,,


In [43]:
# drop the rows with None value of the column 'graph_allele1'
df_g = df_g.dropna(subset=['graph_allele1'])

In [44]:
df_g = df_g.dropna(subset=['graph_allele2'])

In [45]:
len(df_g)

159

---
# df con solo pazienti che abbiamo entrambi i pdb delle mutazioni

In [137]:
# create a column called 'AKUSSI' which is the sum of the columns 'AKUSSI_jointpain', 'AKUSSI_spinalpain'
df_g['AKUSSI'] = df_g['AKUSSI_jointpain'] + df_g['AKUSSI_spinalpain']

df_g.head()


Unnamed: 0,Protein change allele 1,Protein change allele 2,graph_allele1,graph_allele2,structure_allele1,structure_allele2,pdb_file_allele1,pdb_file_allele2,physical_health_score,mental_health_score,AKUSSI_jointpain,AKUSSI_spinalpain,KOOSpain,KOOSsymptoms,KOOSdaily_living,KOOSsport,KOOS_QOL,AKUSSI
0,G161R,His371Profs,"(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(((<Residue MET het= resseq=1 icode= >, <Resi...","(((<Residue MET het= resseq=1 icode= >, <Resi...",../datasets/ALPHAFOLD PDBs/G161R_c8ba5/G161R_c...,../datasets/ALPHAFOLD PDBs/H371fs_9b3dd/H371fs...,26.0,43.0,57.0,75.0,67.0,89.0,75.0,70.0,69.0,132.0
1,G161R,G161R,"(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(((<Residue MET het= resseq=1 icode= >, <Resi...","(((<Residue MET het= resseq=1 icode= >, <Resi...",../datasets/ALPHAFOLD PDBs/G161R_c8ba5/G161R_c...,../datasets/ALPHAFOLD PDBs/G161R_c8ba5/G161R_c...,53.0,51.0,29.0,25.0,92.0,96.0,91.0,85.0,75.0,54.0
2,G161R,M368V,"(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(((<Residue MET het= resseq=1 icode= >, <Resi...","(((<Residue MET het= resseq=1 icode= >, <Resi...",../datasets/ALPHAFOLD PDBs/G161R_c8ba5/G161R_c...,../datasets/ALPHAFOLD PDBs/M368V_be390/M368V_b...,23.0,40.0,36.0,100.0,47.0,39.0,31.0,0.0,19.0,136.0
4,G161R,G161R,"(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(((<Residue MET het= resseq=1 icode= >, <Resi...","(((<Residue MET het= resseq=1 icode= >, <Resi...",../datasets/ALPHAFOLD PDBs/G161R_c8ba5/G161R_c...,../datasets/ALPHAFOLD PDBs/G161R_c8ba5/G161R_c...,23.0,29.0,50.0,100.0,33.0,29.0,60.0,25.0,13.0,150.0
5,G161R,G161R,"(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(((<Residue MET het= resseq=1 icode= >, <Resi...","(((<Residue MET het= resseq=1 icode= >, <Resi...",../datasets/ALPHAFOLD PDBs/G161R_c8ba5/G161R_c...,../datasets/ALPHAFOLD PDBs/G161R_c8ba5/G161R_c...,26.0,31.0,50.0,100.0,44.0,29.0,32.0,35.0,13.0,150.0


In [139]:
#sort the dataframe by the column 'AKUSSI'
df_g = df_g.sort_values(by='AKUSSI', ascending=False)

df_g.head()

Unnamed: 0,Protein change allele 1,Protein change allele 2,graph_allele1,graph_allele2,structure_allele1,structure_allele2,pdb_file_allele1,pdb_file_allele2,physical_health_score,mental_health_score,AKUSSI_jointpain,AKUSSI_spinalpain,KOOSpain,KOOSsymptoms,KOOSdaily_living,KOOSsport,KOOS_QOL,AKUSSI
40,D153fs,A218fs,"(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(((<Residue MET het= resseq=1 icode= >, <Resi...","(((<Residue MET het= resseq=1 icode= >, <Resi...",../datasets/ALPHAFOLD PDBs/D153fs_315fd/D153fs...,../datasets/ALPHAFOLD PDBs/A218fs_8b909/A218fs...,28.0,26.0,93.0,100.0,36.0,50.0,22.0,6.0,19.0,193.0
21,M368V,M368V,"(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(((<Residue MET het= resseq=1 icode= >, <Resi...","(((<Residue MET het= resseq=1 icode= >, <Resi...",../datasets/ALPHAFOLD PDBs/M368V_be390/M368V_b...,../datasets/ALPHAFOLD PDBs/M368V_be390/M368V_b...,15.0,50.0,79.0,100.0,19.0,14.0,19.0,5.0,0.0,179.0
105,C120F,C120F,"(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(((<Residue MET het= resseq=1 icode= >, <Resi...","(((<Residue MET het= resseq=1 icode= >, <Resi...",../datasets/ALPHAFOLD PDBs/C120F_3f78a/C120F_3...,../datasets/ALPHAFOLD PDBs/C120F_3f78a/C120F_3...,19.0,61.0,79.0,100.0,31.0,14.0,41.0,0.0,25.0,179.0
126,G161R,V300G,"(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(((<Residue MET het= resseq=1 icode= >, <Resi...","(((<Residue MET het= resseq=1 icode= >, <Resi...",../datasets/ALPHAFOLD PDBs/G161R_c8ba5/G161R_c...,../datasets/ALPHAFOLD PDBs/V300G_f0e19/V300G_f...,29.0,35.0,71.0,100.0,33.0,36.0,34.0,5.0,19.0,171.0
120,D153fs,D153fs,"(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(((<Residue MET het= resseq=1 icode= >, <Resi...","(((<Residue MET het= resseq=1 icode= >, <Resi...",../datasets/ALPHAFOLD PDBs/D153fs_315fd/D153fs...,../datasets/ALPHAFOLD PDBs/D153fs_315fd/D153fs...,25.0,29.0,64.0,100.0,56.0,29.0,31.0,5.0,0.0,164.0


In [145]:
# drop the rows with nan value in the column AKUSSI
df_NO_AKUSSI = df_g.dropna(subset=['AKUSSI'])

In [148]:
len(df_NO_AKUSSI)

df_NO_AKUSSI.tail(10)

Unnamed: 0,Protein change allele 1,Protein change allele 2,graph_allele1,graph_allele2,structure_allele1,structure_allele2,pdb_file_allele1,pdb_file_allele2,physical_health_score,mental_health_score,AKUSSI_jointpain,AKUSSI_spinalpain,KOOSpain,KOOSsymptoms,KOOSdaily_living,KOOSsport,KOOS_QOL,AKUSSI
156,Q29fs,Q29fs,"(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(((<Residue MET het= resseq=1 icode= >, <Resi...","(((<Residue MET het= resseq=1 icode= >, <Resi...",../datasets/ALPHAFOLD PDBs/Q29fs_f0607/Q29fs_f...,../datasets/ALPHAFOLD PDBs/Q29fs_f0607/Q29fs_f...,44.0,52.0,0.0,25.0,100.0,96.0,97.0,100.0,100.0,25.0
34,G161R,G161R,"(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(((<Residue MET het= resseq=1 icode= >, <Resi...","(((<Residue MET het= resseq=1 icode= >, <Resi...",../datasets/ALPHAFOLD PDBs/G161R_c8ba5/G161R_c...,../datasets/ALPHAFOLD PDBs/G161R_c8ba5/G161R_c...,54.0,56.0,0.0,25.0,97.0,93.0,100.0,100.0,88.0,25.0
93,G161R,E178G,"(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(((<Residue MET het= resseq=1 icode= >, <Resi...","(((<Residue MET het= resseq=1 icode= >, <Resi...",../datasets/ALPHAFOLD PDBs/G161R_c8ba5/G161R_c...,../datasets/ALPHAFOLD PDBs/E178G_aeca3/E178G_a...,45.0,51.0,0.0,25.0,100.0,93.0,100.0,95.0,100.0,25.0
7,G161R,G161R,"(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(((<Residue MET het= resseq=1 icode= >, <Resi...","(((<Residue MET het= resseq=1 icode= >, <Resi...",../datasets/ALPHAFOLD PDBs/G161R_c8ba5/G161R_c...,../datasets/ALPHAFOLD PDBs/G161R_c8ba5/G161R_c...,41.0,64.0,21.0,0.0,78.0,39.0,76.0,60.0,56.0,21.0
173,G161R,V300G,"(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(((<Residue MET het= resseq=1 icode= >, <Resi...","(((<Residue MET het= resseq=1 icode= >, <Resi...",../datasets/ALPHAFOLD PDBs/G161R_c8ba5/G161R_c...,../datasets/ALPHAFOLD PDBs/V300G_f0e19/V300G_f...,54.0,60.0,14.0,0.0,97.0,82.0,99.0,90.0,100.0,14.0
122,G161R,G161R,"(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(((<Residue MET het= resseq=1 icode= >, <Resi...","(((<Residue MET het= resseq=1 icode= >, <Resi...",../datasets/ALPHAFOLD PDBs/G161R_c8ba5/G161R_c...,../datasets/ALPHAFOLD PDBs/G161R_c8ba5/G161R_c...,,,14.0,0.0,100.0,100.0,100.0,100.0,100.0,14.0
150,Q29fs,Q29fs,"(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(((<Residue MET het= resseq=1 icode= >, <Resi...","(((<Residue MET het= resseq=1 icode= >, <Resi...",../datasets/ALPHAFOLD PDBs/Q29fs_f0607/Q29fs_f...,../datasets/ALPHAFOLD PDBs/Q29fs_f0607/Q29fs_f...,45.0,52.0,7.0,0.0,33.0,36.0,40.0,30.0,44.0,7.0
14,G161R,G372_P373delinsA,"(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(((<Residue MET het= resseq=1 icode= >, <Resi...","(((<Residue MET het= resseq=1 icode= >, <Resi...",../datasets/ALPHAFOLD PDBs/G161R_c8ba5/G161R_c...,../datasets/ALPHAFOLD PDBs/G372_P373delinsA_8a...,56.0,53.0,0.0,0.0,100.0,89.0,100.0,90.0,69.0,0.0
35,G161R,H371fs,"(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(((<Residue MET het= resseq=1 icode= >, <Resi...","(((<Residue MET het= resseq=1 icode= >, <Resi...",../datasets/ALPHAFOLD PDBs/G161R_c8ba5/G161R_c...,../datasets/ALPHAFOLD PDBs/H371fs_9b3dd/H371fs...,57.0,62.0,0.0,0.0,100.0,93.0,100.0,100.0,100.0,0.0
113,I216T,G360R,"(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(A:MET:1, A:ALA:2, A:GLU:3, A:LEU:4, A:LYS:5, ...","(((<Residue MET het= resseq=1 icode= >, <Resi...","(((<Residue MET het= resseq=1 icode= >, <Resi...",../datasets/ALPHAFOLD PDBs/I216T_85a22/I216T_8...,../datasets/ALPHAFOLD PDBs/G360R_057f7/G360R_0...,61.0,24.0,0.0,0.0,100.0,100.0,100.0,95.0,100.0,0.0
