In [1]:
!pip install Biopython

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
from Bio.Data.IUPACData import protein_letters_1to3_extended as prot1to3
from Bio.Data.IUPACData import protein_letters_3to1_extended as prot3to1
import pandas as pd
import numpy as np
import re

In [3]:
# Load the mutation list; the first spreadsheet column becomes the row index.
mutations_xlsx = 'champ-mutation-list-q4-clean.xlsx'
df = pd.read_excel(io=mutations_xlsx, index_col=0)
df.head(5)

Unnamed: 0,HGVS cDNA,hg19 Coordinates,HGVS Protein,Mature Protein,Mutation Type,Mechanism,Exon,Codon,Domain,Subtype,...,Mild (>5 U/dL),Unclassified (no FVIII level),Reported Severity,History of Inhibitor,Comments,Reference ID,Year Reported,HGVS Wild Amino Acid,HGVS New Amino Acid,HGVS Position
0,C.65G>C,154250763,Arg22Thr,Arg3Thr,Missense,Substitution,1,3,A1,Heavy Chain,...,,,Severe,No,,129,2002,Arg,Thr,22
1,C.64A>G,154250764,Arg22Gly,Arg3Gly,Missense,Substitution,1,3,A1,Heavy Chain,...,,,Severe,Not Reported,,187,2008,Arg,Gly,22
2,C.65G>T,154250763,Arg22Ile,Arg3Ile,Missense,Substitution,1,3,A1,Heavy Chain,...,X,,Mild,No,,H,H,Arg,Ile,22
3,C.67A>G,154250761,Arg23Gly,Arg4Gly,Missense,Substitution,1,4,A1,Heavy Chain,...,X,,Mild,No,,H,H,Arg,Gly,23
4,C.72C>G,154250756,Tyr24*,Tyr5*,Nonsense,Substitution,1,5,A1,Heavy Chain,...,,,Severe,Not Reported,,260,2014,Tyr,*,24


In [4]:
# Load the tab-separated relative-surface-area table and summarise its
# columns, dtypes and non-null counts.
rsa_tsv = 'Relative_Surf_Area_2R7E_v2.csv'
df_rsa = pd.read_csv(filepath_or_buffer=rsa_tsv, sep='\t')
df_rsa.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1337 entries, 0 to 1336
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   pos_HGVS  1308 non-null   float64
 1   Residue   1337 non-null   object 
 2   RSA       1337 non-null   float64
dtypes: float64(2), object(1)
memory usage: 31.5+ KB


In [5]:
# Discard every row that has a missing value in any column, then re-check
# the non-null counts.
df_rsa = df_rsa.dropna()
df_rsa.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1308 entries, 0 to 1336
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   pos_HGVS  1308 non-null   float64
 1   Residue   1308 non-null   object 
 2   RSA       1308 non-null   float64
dtypes: float64(2), object(1)
memory usage: 40.9+ KB


In [6]:
# Give the RSA table the same key column names as the mutation table.
df_rsa = df_rsa.rename(columns={
    'pos_HGVS': 'HGVS Position',
    'Residue': 'HGVS Wild Amino Acid',
    'RSA': 'Relative Surface Area',
})
# Translate each residue through prot1to3 (an unknown code raises KeyError)
# and store positions as integers instead of floats.
df_rsa = df_rsa.assign(**{
    'HGVS Wild Amino Acid': df_rsa['HGVS Wild Amino Acid'].map(lambda code: prot1to3[code]),
    'HGVS Position': df_rsa['HGVS Position'].astype(int),
})
df_rsa.head()

Unnamed: 0,HGVS Position,HGVS Wild Amino Acid,Relative Surface Area
0,20,Ala,0.102683
1,21,Thr,0.691208
2,22,Arg,0.752081
3,23,Arg,0.523571
4,24,Tyr,0.785954


In [7]:
# Attach 'Relative Surface Area' to every mutation.
# Any copy of that column left behind by an earlier run of this cell is
# dropped first; without this, re-running the cell would merge a second time
# and produce suffixed 'Relative Surface Area_x' / '_y' columns.
# The inner join keeps only mutations whose position AND wild-type residue
# both match a row of df_rsa; unmatched mutations are removed from df.
df = (
    df.drop(columns=['Relative Surface Area'], errors='ignore')
      .merge(df_rsa, how='inner', on=['HGVS Position', 'HGVS Wild Amino Acid'])
)
df.head()

Unnamed: 0,HGVS cDNA,hg19 Coordinates,HGVS Protein,Mature Protein,Mutation Type,Mechanism,Exon,Codon,Domain,Subtype,...,Unclassified (no FVIII level),Reported Severity,History of Inhibitor,Comments,Reference ID,Year Reported,HGVS Wild Amino Acid,HGVS New Amino Acid,HGVS Position,Relative Surface Area
0,C.65G>C,154250763,Arg22Thr,Arg3Thr,Missense,Substitution,1,3,A1,Heavy Chain,...,,Severe,No,,129,2002,Arg,Thr,22,0.752081
1,C.64A>G,154250764,Arg22Gly,Arg3Gly,Missense,Substitution,1,3,A1,Heavy Chain,...,,Severe,Not Reported,,187,2008,Arg,Gly,22,0.752081
2,C.65G>T,154250763,Arg22Ile,Arg3Ile,Missense,Substitution,1,3,A1,Heavy Chain,...,,Mild,No,,H,H,Arg,Ile,22,0.752081
3,C.67A>G,154250761,Arg23Gly,Arg4Gly,Missense,Substitution,1,4,A1,Heavy Chain,...,,Mild,No,,H,H,Arg,Gly,23,0.523571
4,C.72C>G,154250756,Tyr24*,Tyr5*,Nonsense,Substitution,1,5,A1,Heavy Chain,...,,Severe,Not Reported,,260,2014,Tyr,*,24,0.785954


In [8]:
# Load the residue-by-residue table; the first spreadsheet column supplies
# the row labels.
distance_xlsx = 'Supplementary_Table_npj_paper.xlsx'
df_dm = pd.read_excel(io=distance_xlsx, index_col=0)
df_dm.head(5)

Unnamed: 0,ala,cys,asp,glu,phe,gly,his,ile,lys,leu,met,asn,pro,gln,arg,ser,thr,val,trp,tyr
ala,0.0,1.5,1.57,1.51,1.52,1.59,1.51,1.52,1.52,1.46,1.48,1.51,1.49,1.47,1.6,1.38,1.49,1.44,1.55,1.59
cys,1.5,0.0,1.6,1.54,1.48,1.68,1.51,1.46,1.61,1.49,1.48,1.59,1.34,1.5,1.61,1.53,1.54,1.42,1.57,1.56
asp,1.57,1.6,0.0,1.48,1.65,1.61,1.55,1.62,1.59,1.67,1.65,1.51,1.46,1.53,1.63,1.52,1.58,1.6,1.72,1.62
glu,1.51,1.54,1.48,0.0,1.62,1.63,1.53,1.6,1.58,1.63,1.58,1.5,1.51,1.47,1.61,1.56,1.58,1.57,1.7,1.63
phe,1.52,1.48,1.65,1.62,0.0,1.63,1.48,1.39,1.61,1.45,1.48,1.56,1.6,1.54,1.61,1.58,1.52,1.42,1.53,1.49


In [9]:
# Capitalise the row and column labels (e.g. 'ala' -> 'Ala') so that they
# can be looked up with the capitalised residue codes used in df.
df_dm.index = list(map(str.capitalize, df_dm.index))
df_dm.columns = list(map(str.capitalize, df_dm.columns))
df_dm.head()

Unnamed: 0,Ala,Cys,Asp,Glu,Phe,Gly,His,Ile,Lys,Leu,Met,Asn,Pro,Gln,Arg,Ser,Thr,Val,Trp,Tyr
Ala,0.0,1.5,1.57,1.51,1.52,1.59,1.51,1.52,1.52,1.46,1.48,1.51,1.49,1.47,1.6,1.38,1.49,1.44,1.55,1.59
Cys,1.5,0.0,1.6,1.54,1.48,1.68,1.51,1.46,1.61,1.49,1.48,1.59,1.34,1.5,1.61,1.53,1.54,1.42,1.57,1.56
Asp,1.57,1.6,0.0,1.48,1.65,1.61,1.55,1.62,1.59,1.67,1.65,1.51,1.46,1.53,1.63,1.52,1.58,1.6,1.72,1.62
Glu,1.51,1.54,1.48,0.0,1.62,1.63,1.53,1.6,1.58,1.63,1.58,1.5,1.51,1.47,1.61,1.56,1.58,1.57,1.7,1.63
Phe,1.52,1.48,1.65,1.62,0.0,1.63,1.48,1.39,1.61,1.45,1.48,1.56,1.6,1.54,1.61,1.58,1.52,1.42,1.53,1.49


In [10]:
def distance(row, matrix=None):
    """Look up the tabulated value between a row's wild-type and new residue.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the keys 'HGVS Wild Amino Acid' and
        'HGVS New Amino Acid'.
    matrix : pandas.DataFrame, optional
        Table labelled by residue code on both axes. When omitted, the
        notebook-level ``df_dm`` is used, so ``df.apply(distance, axis=1)``
        keeps working unchanged.

    Returns
    -------
    float
        The entry of ``matrix`` at column = wild residue, row = new residue.
        NaN when the new residue is the stop symbol '*' or is missing.

    Raises
    ------
    KeyError
        If either residue label is not present in ``matrix``.
    """
    if matrix is None:
        # Resolved at call time, so the function can be defined before the
        # table is loaded.
        matrix = df_dm
    wild = row['HGVS Wild Amino Acid']
    new = row['HGVS New Amino Acid']
    # A stop codon or an absent residue has no entry in the table.
    if pd.isna(new) or new == '*':
        return np.nan
    # Single scalar lookup; same orientation as the former df_dm[wild][new].
    return matrix.at[new, wild]

In [11]:
# Evaluate distance() once per mutation (row-wise), then store the results
# as a new column.
wild_new_distance = df.apply(distance, axis=1)
df['Distance Wild and New'] = wild_new_distance
df.head()

Unnamed: 0,HGVS cDNA,hg19 Coordinates,HGVS Protein,Mature Protein,Mutation Type,Mechanism,Exon,Codon,Domain,Subtype,...,Reported Severity,History of Inhibitor,Comments,Reference ID,Year Reported,HGVS Wild Amino Acid,HGVS New Amino Acid,HGVS Position,Relative Surface Area,Distance Wild and New
0,C.65G>C,154250763,Arg22Thr,Arg3Thr,Missense,Substitution,1,3,A1,Heavy Chain,...,Severe,No,,129,2002,Arg,Thr,22,0.752081,1.56
1,C.64A>G,154250764,Arg22Gly,Arg3Gly,Missense,Substitution,1,3,A1,Heavy Chain,...,Severe,Not Reported,,187,2008,Arg,Gly,22,0.752081,1.59
2,C.65G>T,154250763,Arg22Ile,Arg3Ile,Missense,Substitution,1,3,A1,Heavy Chain,...,Mild,No,,H,H,Arg,Ile,22,0.752081,1.57
3,C.67A>G,154250761,Arg23Gly,Arg4Gly,Missense,Substitution,1,4,A1,Heavy Chain,...,Mild,No,,H,H,Arg,Gly,23,0.523571,1.59
4,C.72C>G,154250756,Tyr24*,Tyr5*,Nonsense,Substitution,1,5,A1,Heavy Chain,...,Severe,Not Reported,,260,2014,Tyr,*,24,0.785954,


In [12]:
# Write the enriched mutation table (row index included) to a new workbook.
df.to_excel(excel_writer='champ-mutation-list-q4-clean-enhanced.xlsx')