In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import Bio
from Bio import SeqIO
from Bio.Seq import Seq, MutableSeq
from utils import *

In [2]:
# read the first sheet of the excel file
df = pd.read_excel('../datasets/aku_prin_v2.0.xlsx', sheet_name=0)

In [3]:
df.columns

Index(['patient', 'sex', 'birth', 'creatinine_urines mg/dl', 'HGA_u mg/dl',
       'HGA_U mg/24h', 'bqa', 'urate', 'uric_acid_urine', 'tyr_urines',
       'hypoxanthine_urines', 'xanthine_urines', 'Protein change allele 1 ',
       'DNA change allele 1', 'Exon/intron allele 1',
       'Protein change allele 2', 'DNA change allele 2', 'ex/in allele 2',
       'SAA (ug/mL) ', 'chitotriosidase', 'HGA_serum', 'CATD (ng/mL)',
       'IL-6 (pg/mL)', 'IL-1beta (pg/mL)', 'IL-1ra (pg/mL)', 'TNFalfa (pg/mL)',
       'CRP (mg/dL)', 'CRP (æg/mL) ELISA', 'MMP3 (ng/mL)', 'aopp',
       'glucose(mg/dL)', 'creatinine (mg/dL)', 'cholesterol (mg/dL)',
       'triglycerides  (mg/dL)', 'hdl_cholesterol (mg/dL)',
       'ldl_cholesterol (mg/dL)', 'alkaline_phosphatase (UI/L)',
       'cystatin_C (mg/dL)', 'smoker/cigarettes_a_day', 'alcohol_units_weekly',
       'bmi', 'physical_health_score', 'mental_health_score',
       'AKUSSI_jointpain', 'AKUSSI_spinalpain', 'KOOSpain', 'KOOSsymptoms',
       'KOOSdai

In [3]:
len(df.columns)

67

In [4]:
# get the index of Protein change allele 1 column
df.columns.get_loc('Protein change allele 1 ') + 6

18

In [5]:
# get the index of physical_health_score column
df.columns.get_loc('physical_health_score') + 11

52

In [6]:
# create a new dataframe with the columns of interest, from index 12 to 18 and 41 to 52
df2 = df.iloc[:, list(range(12, 18)) + list(range(41, 52))] 
df2.head()

Unnamed: 0,Protein change allele 1,DNA change allele 1,Exon/intron allele 1,Protein change allele 2,DNA change allele 2,ex/in allele 2,physical_health_score,mental_health_score,AKUSSI_jointpain,AKUSSI_spinalpain,KOOSpain,KOOSsymptoms,KOOSdaily_living,KOOSsport,KOOS_QOL,HAQ_hapVAS,HAQ_haqDI
0,G161R,c.481G>A,ex8,His371Profs,c.1111dupC,ex13,26.0,43.0,57.0,75.0,67.0,89.0,75.0,70.0,69.0,63.0,1.5
1,G161R,c.481G>A,ex8,G161R,c.481G>A,ex8,53.0,51.0,29.0,25.0,92.0,96.0,91.0,85.0,75.0,23.0,0.25
2,G161R,c.481G>A,ex8,M368V,c.1102A>G,ex13,23.0,40.0,36.0,100.0,47.0,39.0,31.0,0.0,19.0,53.0,1.75
3,Y6_G29del ivs1-1G>A,c.16-1G>A,in1,Y6_G29del ivs1-1G>A,c.16-1G>A,in1,,,,,,,,,,,
4,G161R,c.481G>A,ex8,G161R,c.481G>A,ex8,23.0,29.0,50.0,100.0,33.0,29.0,60.0,25.0,13.0,70.0,2.0


In [7]:
df2.shape

(219, 17)

In [8]:
# count null values of Protein change allele 1 column
df2['Protein change allele 1 '].isnull().sum()

19

In [9]:
# check if the null value of Protein change allele 1 is the same as Protein change allele 2
df2['Protein change allele 1 '].isnull().sum() == df2['Protein change allele 2'].isnull().sum()

True

In [10]:
# check if they are the same indexes
(df2['Protein change allele 1 '].isnull() == df2['Protein change allele 2'].isnull()).all()

False

In [11]:
# print the indexes of the null values
df2[df2['Protein change allele 1 '].isnull()].index

Int64Index([ 50,  54,  61,  66,  72,  80,  84,  87,  88,  89,  90, 165, 166,
            183, 185, 191, 192, 204, 213],
           dtype='int64')

In [12]:
# print the indexes of the null values
null1 = df2['Protein change allele 1 '][df2['Protein change allele 1 '].isnull()].index

In [13]:
null1

Int64Index([ 50,  54,  61,  66,  72,  80,  84,  87,  88,  89,  90, 165, 166,
            183, 185, 191, 192, 204, 213],
           dtype='int64')

In [14]:
null2 = df2['Protein change allele 2'][df2['Protein change allele 2'].isnull()].index

In [15]:
null2

Int64Index([ 50,  54,  61,  66,  80,  84,  87,  88,  89,  90, 165, 166, 178,
            183, 185, 191, 192, 204, 213],
           dtype='int64')

In [16]:
# merge null1 and null2 in a list without repetition
null = list(set(null1) | set(null2))
len(null)

20

In [17]:
len(null1), len(null2)

(19, 19)

In [18]:
# remove the null indexed rows from the dataframe
df2 = df2.drop(null)

In [19]:
df2.shape

(199, 17)

In [20]:
df2.head()

Unnamed: 0,Protein change allele 1,DNA change allele 1,Exon/intron allele 1,Protein change allele 2,DNA change allele 2,ex/in allele 2,physical_health_score,mental_health_score,AKUSSI_jointpain,AKUSSI_spinalpain,KOOSpain,KOOSsymptoms,KOOSdaily_living,KOOSsport,KOOS_QOL,HAQ_hapVAS,HAQ_haqDI
0,G161R,c.481G>A,ex8,His371Profs,c.1111dupC,ex13,26.0,43.0,57.0,75.0,67.0,89.0,75.0,70.0,69.0,63.0,1.5
1,G161R,c.481G>A,ex8,G161R,c.481G>A,ex8,53.0,51.0,29.0,25.0,92.0,96.0,91.0,85.0,75.0,23.0,0.25
2,G161R,c.481G>A,ex8,M368V,c.1102A>G,ex13,23.0,40.0,36.0,100.0,47.0,39.0,31.0,0.0,19.0,53.0,1.75
3,Y6_G29del ivs1-1G>A,c.16-1G>A,in1,Y6_G29del ivs1-1G>A,c.16-1G>A,in1,,,,,,,,,,,
4,G161R,c.481G>A,ex8,G161R,c.481G>A,ex8,23.0,29.0,50.0,100.0,33.0,29.0,60.0,25.0,13.0,70.0,2.0


In [21]:
# count the number of 'ex8' value in Exon/intron allele 1 column
df2['Exon/intron allele 1'].value_counts()

ex8      55
ex13     21
ex6      18
ex7      14
ex10     12
ex14      8
ex3       8
ex2       7
ex11      7
 in1      6
 ex3      6
ex9       4
 ex13     4
ex12      4
 ex14     3
ex7i      3
ex5       2
in1       2
 ex10     2
in7       2
ex4       2
in8       1
 ex2      1
S47L      1
 ex8      1
in12      1
in5       1
in10      1
in2       1
ex 8      1
Name: Exon/intron allele 1, dtype: int64

vediamo che si deve stare attenti: ci sono varie istanze di ex8, questo perche è stato scritto con gli spazi a volte (ex8, ex8,ex 8)

In [22]:
# sum all the values that contains "ex" in the string value
df2['Exon/intron allele 1'].str.contains('ex').sum()

183

In [23]:
# get the indexes of the values that contains "in" in the string value
in_indexes = df2['Exon/intron allele 1'][df2['Exon/intron allele 1'].str.contains('in')].index

In [24]:
in_indexes

Int64Index([3, 24, 74, 103, 115, 135, 141, 142, 148, 157, 159, 163, 170, 193,
            217],
           dtype='int64')

In [25]:
# drop the rows which index is in_indexes
df3 = df2.drop(in_indexes)

In [26]:
df3.shape

(184, 17)

In [27]:
# sum all the values that contains "ex" in the string value
df3['Exon/intron allele 1'].str.contains('ex').sum()

183

notiamo che 183 su 184 istanze hanno ex nella colonna `Exon/intron allele 1`, vediamo chi è la riga che non ha ex

In [28]:
# see instance that has no "ex" in the string value
df3['Exon/intron allele 1'][~df3['Exon/intron allele 1'].str.contains('ex')]

51    S47L
Name: Exon/intron allele 1, dtype: object

In [29]:
df3[48:49]

Unnamed: 0,Protein change allele 1,DNA change allele 1,Exon/intron allele 1,Protein change allele 2,DNA change allele 2,ex/in allele 2,physical_health_score,mental_health_score,AKUSSI_jointpain,AKUSSI_spinalpain,KOOSpain,KOOSsymptoms,KOOSdaily_living,KOOSsport,KOOS_QOL,HAQ_hapVAS,HAQ_haqDI
51,c.140C>T,ex3,S47L,c.140C>T,ex3,,,,,,,,,,,,


è evidente che i dati sono utilizzabili, ma vanno spostati nelle colonne corrette

siccome vediamo che ci sono altri pazienti con la stessa situazione, non serve nemmeno che facciamo questi cambiamenti perche il paziente 51 e gia rappresentato

In [30]:
# define df that has S47L in the Protein change allele 1 column
df_s = df3[df3['Protein change allele 1 '].str.contains('S47L')]
df_s

Unnamed: 0,Protein change allele 1,DNA change allele 1,Exon/intron allele 1,Protein change allele 2,DNA change allele 2,ex/in allele 2,physical_health_score,mental_health_score,AKUSSI_jointpain,AKUSSI_spinalpain,KOOSpain,KOOSsymptoms,KOOSdaily_living,KOOSsport,KOOS_QOL,HAQ_hapVAS,HAQ_haqDI
6,S47L,c.140C>T,ex3,S47L,c.140C>T,ex3,38.0,53.0,29.0,100.0,78.0,93.0,71.0,25.0,44.0,50.0,0.75


In [31]:
# get the index of the elements that does not contain "ex" in the string value
not_ex_indexes = df3['Exon/intron allele 1'][~df3['Exon/intron allele 1'].str.contains('ex')].index
not_ex_indexes

Int64Index([51], dtype='int64')

In [32]:
# drop the rows which index is not_ex_indexes
df3 = df3.drop(not_ex_indexes)

In [33]:
not_ex_indexes2 = df3['ex/in allele 2'][~df3['ex/in allele 2'].str.contains('ex')].index
not_ex_indexes2

Int64Index([8, 15, 45, 78, 98, 108, 130, 131, 146, 147, 153, 181, 207], dtype='int64')

In [34]:
# drop the rows which index is not_ex_indexes
df3 = df3.drop(not_ex_indexes2)
df3.shape

(170, 17)

In [35]:
def is_replace(mutation):

    if mutation[0].isupper() and mutation[-1].isupper() and mutation[1:-1].isdigit(): 
        return True
    return False

In [36]:
# drop the rows that does not contain a replacement mutation
df4 = df3[df3['Protein change allele 1 '].apply(is_replace)]
df4.shape

(135, 17)

In [38]:
# in the same way, drop the rows that does not contain a replacement mutation wrt allele 2
df4 = df4[df4['Protein change allele 2'].apply(is_replace)]

In [39]:
df4.shape

(116, 17)

In [40]:
df4.head(15)

Unnamed: 0,Protein change allele 1,DNA change allele 1,Exon/intron allele 1,Protein change allele 2,DNA change allele 2,ex/in allele 2,physical_health_score,mental_health_score,AKUSSI_jointpain,AKUSSI_spinalpain,KOOSpain,KOOSsymptoms,KOOSdaily_living,KOOSsport,KOOS_QOL,HAQ_hapVAS,HAQ_haqDI
1,G161R,c.481G>A,ex8,G161R,c.481G>A,ex8,53.0,51.0,29.0,25.0,92.0,96.0,91.0,85.0,75.0,23.0,0.25
2,G161R,c.481G>A,ex8,M368V,c.1102A>G,ex13,23.0,40.0,36.0,100.0,47.0,39.0,31.0,0.0,19.0,53.0,1.75
4,G161R,c.481G>A,ex8,G161R,c.481G>A,ex8,23.0,29.0,50.0,100.0,33.0,29.0,60.0,25.0,13.0,70.0,2.0
5,G161R,c.481G>A,ex8,G161R,c.481G>A,ex8,26.0,31.0,50.0,100.0,44.0,29.0,32.0,35.0,13.0,71.0,1.88
6,S47L,c.140C>T,ex3,S47L,c.140C>T,ex3,38.0,53.0,29.0,100.0,78.0,93.0,71.0,25.0,44.0,50.0,0.75
7,G161R,c.481G>A,ex8,G161R,c.481G>A,ex8,41.0,64.0,21.0,0.0,78.0,39.0,76.0,60.0,56.0,2.0,0.63
10,W97C,c.291G>C,ex5,W97C,c.291G>C,ex5,44.0,56.0,57.0,50.0,94.0,100.0,100.0,100.0,69.0,55.0,0.0
13,R225P,c.674G>C,ex10,I216T,c.647T>C,ex9,33.0,17.0,57.0,100.0,17.0,0.0,21.0,0.0,6.0,100.0,1.38
16,R53Q,c.158G>A,ex3,R53Q,c.158G>A,ex3,25.0,44.0,29.0,75.0,47.0,54.0,18.0,0.0,19.0,77.0,1.63
17,G161R,c.481G>A,ex8,G161R,c.481G>A,ex8,40.0,2.0,50.0,100.0,,,,,,98.0,2.5


In [41]:
seq = [
    'Protein change allele 1 ', 
    'DNA change allele 1',
    'Exon/intron allele 1', 
    'Protein change allele 2',
    'DNA change allele 2', 
    'ex/in allele 2'
]

# keep only the columns in the seq list
df5 = df4[seq]
df5.head()

Unnamed: 0,Protein change allele 1,DNA change allele 1,Exon/intron allele 1,Protein change allele 2,DNA change allele 2,ex/in allele 2
1,G161R,c.481G>A,ex8,G161R,c.481G>A,ex8
2,G161R,c.481G>A,ex8,M368V,c.1102A>G,ex13
4,G161R,c.481G>A,ex8,G161R,c.481G>A,ex8
5,G161R,c.481G>A,ex8,G161R,c.481G>A,ex8
6,S47L,c.140C>T,ex3,S47L,c.140C>T,ex3


In [42]:
df5['DNA change allele 1'].value_counts()

DNA change allele 1
c.481G>A      36
c.365C>T      12
 c.1201G>C     8
c.1102A>G      7
 c.158G>A      4
 c.1102A>G     3
 C158G>A       3
c.502G>A       3
c.688C>T       3
 c.688C>T      3
c.1078G>C      2
c.368G>C       2
c.647T>C       2
c.808G>A       2
c.359G>T       1
c.289T>G       1
c.217T>C       1
c.509G>C       1
c.1081G>A      1
c.553G>A       1
c.1079G>C      1
c.815A>G       1
c.508G>A       1
c.449C>T       1
c.130C>T       1
c.680T>C       1
c.593G>A       1
c.140C>T       1
c.742A>G       1
c.119A>C       1
c.752G>A       1
c.347T>C       1
c.533A>G       1
c.990G>C       1
c.1037T>C      1
c.1085G>A      1
c.800C>T       1
c.674G>C       1
c.291G>C       1
c.1057A>C      1
Name: count, dtype: int64

some instances have a space before the DNA change: let's uniform the data

In [43]:
# eliminate the space in the column DNA change allele 1
df5['DNA change allele 1'] = df5['DNA change allele 1'].str.replace(' ', '')
df5['DNA change allele 1'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df5['DNA change allele 1'] = df5['DNA change allele 1'].str.replace(' ', '')


DNA change allele 1
c.481G>A     36
c.365C>T     12
c.1102A>G    10
c.1201G>C     8
c.688C>T      6
c.158G>A      4
C158G>A       3
c.502G>A      3
c.1078G>C     2
c.368G>C      2
c.647T>C      2
c.808G>A      2
c.449C>T      1
c.359G>T      1
c.217T>C      1
c.509G>C      1
c.289T>G      1
c.1081G>A     1
c.553G>A      1
c.680T>C      1
c.815A>G      1
c.508G>A      1
c.1079G>C     1
c.742A>G      1
c.593G>A      1
c.130C>T      1
c.140C>T      1
c.119A>C      1
c.752G>A      1
c.347T>C      1
c.533A>G      1
c.990G>C      1
c.1037T>C     1
c.1085G>A     1
c.800C>T      1
c.674G>C      1
c.291G>C      1
c.1057A>C     1
Name: count, dtype: int64

In [44]:
# see value counts for Exon/intron allele 1 column
df5['Exon/intron allele 1'].value_counts()

Exon/intron allele 1
ex8      41
ex6      16
ex13     14
ex10      8
 ex3      6
ex14      5
ex11      4
ex3       4
ex9       4
 ex14     3
 ex13     3
ex5       2
 ex10     2
ex12      1
ex7       1
ex4       1
ex 8      1
Name: count, dtype: int64

In [45]:
# eliminate the space in the column Exon/intron allele 1
df5['Exon/intron allele 1'] = df5['Exon/intron allele 1'].str.replace(' ', '')
df5['Exon/intron allele 1'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df5['Exon/intron allele 1'] = df5['Exon/intron allele 1'].str.replace(' ', '')


Exon/intron allele 1
ex8     42
ex13    17
ex6     16
ex3     10
ex10    10
ex14     8
ex11     4
ex9      4
ex5      2
ex12     1
ex7      1
ex4      1
Name: count, dtype: int64

In [46]:
df5['DNA change allele 2'].value_counts()

DNA change allele 2
c.481G>A      23
c.365C>T      13
 c.1201G>C     9
c.1102A>G      8
 c.158G>A      6
c.1078G>C      6
 c.688C>T      5
 c.899T>G      4
c.808G>A       4
 C158G>A       3
c.680T>C       3
 c.1102A>G     3
c.647T>C       2
c.502G>A       2
c.368G>C       2
c.688C>T       2
c.500C>T       1
c.533A>G       1
c.359G>T       1
c.1057A>C      1
c.995C>G       1
c.815A>G       1
c.899T>G       1
c.752G>A       1
c.593G>A       1
c.130C>T       1
c.742A>G       1
c.119A>C       1
c.614G>A       1
c.347T>C       1
c.990G>C       1
c.1037T>C      1
c.454G>A       1
c.800C>T       1
c.291G>C       1
c.140C>T       1
c.52G>T        1
Name: count, dtype: int64

In [47]:
# eliminate the space in the column DNA change allele 2
df5['DNA change allele 2'] = df5['DNA change allele 2'].str.replace(' ', '')
df5['DNA change allele 2'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df5['DNA change allele 2'] = df5['DNA change allele 2'].str.replace(' ', '')


DNA change allele 2
c.481G>A     23
c.365C>T     13
c.1102A>G    11
c.1201G>C     9
c.688C>T      7
c.158G>A      6
c.1078G>C     6
c.899T>G      5
c.808G>A      4
c.680T>C      3
C158G>A       3
c.647T>C      2
c.368G>C      2
c.502G>A      2
c.995C>G      1
c.500C>T      1
c.359G>T      1
c.1057A>C     1
c.815A>G      1
c.533A>G      1
c.593G>A      1
c.614G>A      1
c.130C>T      1
c.742A>G      1
c.119A>C      1
c.752G>A      1
c.347T>C      1
c.990G>C      1
c.1037T>C     1
c.454G>A      1
c.800C>T      1
c.291G>C      1
c.140C>T      1
c.52G>T       1
Name: count, dtype: int64

In [48]:
df5['ex/in allele 2'].value_counts()

ex/in allele 2
ex8      26
ex6      17
ex13     16
 ex3     10
 ex14     9
ex10      7
ex11      6
 ex10     5
ex9       4
 ex12     4
ex12      3
 ex13     3
ex3       2
ex5       1
ex7       1
ex2       1
ex 8      1
Name: count, dtype: int64

In [49]:
# eliminate the space in the column ex/in allele 2
df5['ex/in allele 2'] = df5['ex/in allele 2'].str.replace(' ', '')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df5['ex/in allele 2'] = df5['ex/in allele 2'].str.replace(' ', '')


In [50]:
df5['ex/in allele 2'].value_counts()

ex/in allele 2
ex8     27
ex13    19
ex6     17
ex3     12
ex10    12
ex14     9
ex12     7
ex11     6
ex9      4
ex5      1
ex7      1
ex2      1
Name: count, dtype: int64

In [51]:
df5.head()

Unnamed: 0,Protein change allele 1,DNA change allele 1,Exon/intron allele 1,Protein change allele 2,DNA change allele 2,ex/in allele 2
1,G161R,c.481G>A,ex8,G161R,c.481G>A,ex8
2,G161R,c.481G>A,ex8,M368V,c.1102A>G,ex13
4,G161R,c.481G>A,ex8,G161R,c.481G>A,ex8
5,G161R,c.481G>A,ex8,G161R,c.481G>A,ex8
6,S47L,c.140C>T,ex3,S47L,c.140C>T,ex3


notiamo che il paziente 4 e 5 sono uguali all'1 in tutto e per tutto, quindi saranno rappresentati dalla proteina 3d ottenuta per il paziente1

In [52]:
# count how many different rows are in the dataframe
df5.drop_duplicates().shape

(53, 6)

In [53]:
df6 = df5.drop_duplicates()
df6.head()

Unnamed: 0,Protein change allele 1,DNA change allele 1,Exon/intron allele 1,Protein change allele 2,DNA change allele 2,ex/in allele 2
1,G161R,c.481G>A,ex8,G161R,c.481G>A,ex8
2,G161R,c.481G>A,ex8,M368V,c.1102A>G,ex13
6,S47L,c.140C>T,ex3,S47L,c.140C>T,ex3
10,W97C,c.291G>C,ex5,W97C,c.291G>C,ex5
13,R225P,c.674G>C,ex10,I216T,c.647T>C,ex9


vediamo infatti che i pazienti 4 e 5 sono stati tolti

In [54]:
# save the dataframe to a csv file
df6.to_csv('../datasets/aku_prin_v2.0_cleaned.csv', index=False)

---

In [56]:
# return the indexes of the elements that have the same value in both columns 
same_change = df4[df4['Protein change allele 1 '] == df4['Protein change allele 2']].index
print(len(same_change))
# do the same for exon/intron columns
same_exon = df4[df4['Exon/intron allele 1'] == df4['ex/in allele 2']].index
print(len(same_exon))

# count how many indexes are in both lists
indexes = set(same_change) & set(same_exon)

# print the df4 elements that are in the indexes list
df4.loc[list(indexes)]


78
76


Unnamed: 0,Protein change allele 1,DNA change allele 1,Exon/intron allele 1,Protein change allele 2,DNA change allele 2,ex/in allele 2,physical_health_score,mental_health_score,AKUSSI_jointpain,AKUSSI_spinalpain,KOOSpain,KOOSsymptoms,KOOSdaily_living,KOOSsport,KOOS_QOL,HAQ_hapVAS,HAQ_haqDI
128,A122V,c.365C>T,ex6,A122V,c.365C>T,ex6,24.0,50.0,29.0,25.0,44.0,61.0,25.0,20.0,50.0,100.0,1.88
1,G161R,c.481G>A,ex8,G161R,c.481G>A,ex8,53.0,51.0,29.0,25.0,92.0,96.0,91.0,85.0,75.0,23.0,0.25
129,A122V,c.365C>T,ex6,A122V,c.365C>T,ex6,40.0,60.0,14.0,25.0,78.0,86.0,75.0,85.0,63.0,0.0,0.00
4,G161R,c.481G>A,ex8,G161R,c.481G>A,ex8,23.0,29.0,50.0,100.0,33.0,29.0,60.0,25.0,13.0,70.0,2.00
5,G161R,c.481G>A,ex8,G161R,c.481G>A,ex8,26.0,31.0,50.0,100.0,44.0,29.0,32.0,35.0,13.0,71.0,1.88
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111,R53Q,c.158G>A,ex3,R53Q,c.158G>A,ex3,37.0,40.0,21.0,50.0,83.0,93.0,87.0,80.0,75.0,75.0,0.88
112,P230S,c.688C>T,ex10,P230S,c.688C>T,ex10,39.0,32.0,50.0,75.0,64.0,54.0,65.0,5.0,19.0,30.0,1.13
114,G123A,c.368G>C,ex6,G123A,c.368G>C,ex6,29.0,37.0,71.0,50.0,44.0,50.0,52.0,40.0,56.0,35.0,1.75
122,G161R,c.481G>A,ex8,G161R,c.481G>A,ex8,,,14.0,0.0,100.0,100.0,100.0,100.0,100.0,30.0,0.13


In [61]:
# count unique values of Protein change allele 1 column
unique1 = df['Protein change allele 1 '].unique()
unique2 = df['Protein change allele 2'].unique()
unique = set(unique1) | set(unique2)
len(unique)

90

# Obtain all the possible mutations we have: not only substitutions

In [37]:
df3.head()

Unnamed: 0,Protein change allele 1,DNA change allele 1,Exon/intron allele 1,Protein change allele 2,DNA change allele 2,ex/in allele 2,physical_health_score,mental_health_score,AKUSSI_jointpain,AKUSSI_spinalpain,KOOSpain,KOOSsymptoms,KOOSdaily_living,KOOSsport,KOOS_QOL,HAQ_hapVAS,HAQ_haqDI
0,G161R,c.481G>A,ex8,His371Profs,c.1111dupC,ex13,26.0,43.0,57.0,75.0,67.0,89.0,75.0,70.0,69.0,63.0,1.5
1,G161R,c.481G>A,ex8,G161R,c.481G>A,ex8,53.0,51.0,29.0,25.0,92.0,96.0,91.0,85.0,75.0,23.0,0.25
2,G161R,c.481G>A,ex8,M368V,c.1102A>G,ex13,23.0,40.0,36.0,100.0,47.0,39.0,31.0,0.0,19.0,53.0,1.75
4,G161R,c.481G>A,ex8,G161R,c.481G>A,ex8,23.0,29.0,50.0,100.0,33.0,29.0,60.0,25.0,13.0,70.0,2.0
5,G161R,c.481G>A,ex8,G161R,c.481G>A,ex8,26.0,31.0,50.0,100.0,44.0,29.0,32.0,35.0,13.0,71.0,1.88


In [38]:
# obtain all the values in the columns Protein change allele 1 and Protein change allele 2, without repetitions
unique1 = df3['Protein change allele 1 '].unique()
unique2 = df3['Protein change allele 2'].unique()
unique = set(unique1) | set(unique2)
len(unique)

71

In [39]:
# see which elements of unique are subs (is_replace(mutation))
subs = [mutation for mutation in unique if is_replace(mutation)]
len(subs)

49

In [40]:
subs

['D153G',
 'G270R',
 'W97C',
 'E168L',
 'G360A',
 'K248E',
 'P332R',
 'P230S',
 'G205V',
 'S47L',
 'F227S',
 'S150L',
 'E401Q',
 'G251D',
 'G360R',
 'R225P',
 'R225H',
 'D18Y',
 'V300G',
 'G185R',
 'G152R',
 'I216T',
 'W97G',
 'R53Q',
 'A122V',
 'T167I',
 'G123A',
 'R330S',
 'Y272C',
 'E168K',
 'G362E',
 'L353Q',
 'F73L',
 'K353Q',
 'D18N',
 'G170A',
 'G198D',
 'I346T',
 'A267V',
 'G161R',
 'G205D',
 'M368V',
 'E178G',
 'G361R',
 'L44F',
 'G170S',
 'Y40S',
 'L116P',
 'C120F']

In [41]:
# consider the file in datasets/mutated_sequences.csv
ms = pd.read_csv('../datasets/mutated_sequences.csv', header=None)
ms.head()

Unnamed: 0,0,1
0,G161R,MAELKYISGFGNECSSEDPRCPGSLPEGQNNPQVCPYNLYAEQLS...
1,S47L,MAELKYISGFGNECSSEDPRCPGSLPEGQNNPQVCPYNLYAEQLS...
2,W97C,MAELKYISGFGNECSSEDPRCPGSLPEGQNNPQVCPYNLYAEQLS...
3,R225P,MAELKYISGFGNECSSEDPRCPGSLPEGQNNPQVCPYNLYAEQLS...
4,R53Q,MAELKYISGFGNECSSEDPRCPGSLPEGQNNPQVCPYNLYAEQLS...


In [42]:
# see difference between subs and ms[0]
diff = set(subs) - set(ms[0])
diff

{'D153G', 'D18N', 'G205V', 'L353Q', 'R225H'}

In [43]:
def read_faa_file(file_path):
    sequences = []
    for record in SeqIO.parse(file_path, "fasta"):
        sequences.append(record)
    return sequences

# Specify the path to your .faa file
file_path = "../datasets/protein.faa"

# Read the .faa file
sequences = read_faa_file(file_path)
hgd_protein = sequences[0].seq
hgd_protein = Bio.Seq.MutableSeq(hgd_protein)
hgd_protein

MutableSeq('MAELKYISGFGNECSSEDPRCPGSLPEGQNNPQVCPYNLYAEQLSGSAFTCPRS...EPN')

In [44]:
hgd_protein[152], hgd_protein[17], hgd_protein[204], hgd_protein[352], hgd_protein[224]

('D', 'D', 'G', 'K', 'R')

In [45]:
# obtain the mutated sequences
seq = replace(hgd_protein, 'D153G')
print(str(seq))

MAELKYISGFGNECSSEDPRCPGSLPEGQNNPQVCPYNLYAEQLSGSAFTCPRSTNKRSWLYRILPSVSHKPFESIDEGQVTHNWDEVDPDPNQLRWKPFEIPKASQKKVDFVSGLHTLCGAGDIKSNNGLAIHIFLCNTSMENRCFYNSDGGFLIVPQKGNLLIYTEFGKMLVQPNEICVIQRGMRFSIDVFEETRGYILEVYGVHFELPDLGPIGANGLANPRDFLIPIAWYEDRQVPGGYTVINKYQGKLFAAKQDVSPFNVVAWHGNYTPYKYNLKNFMVINSVAFDHADPSIFTVLTAKSVRPGVAIADFVIFPPRWGVADKTFRPPYYHRNCMSEFMGLIRGHYEAKQGGFLPGGGSLHSTMTPHGPDADCFEKASKVKLAPERIADGTMAFMFESSLSLAVTKWGLKASRCLDENYHKCWEPLKSHFTPNSRNPAEPN


In [58]:
seq = replace(hgd_protein, 'G309V')
print(str(seq))

MAELKYISGFGNECSSEDPRCPGSLPEGQNNPQVCPYNLYAEQLSGSAFTCPRSTNKRSWLYRILPSVSHKPFESIDEGQVTHNWDEVDPDPNQLRWKPFEIPKASQKKVDFVSGLHTLCGAGDIKSNNGLAIHIFLCNTSMENRCFYNSDGDFLIVPQKGNLLIYTEFGKMLVQPNEICVIQRGMRFSIDVFEETRGYILEVYGVHFELPDLGPIGANGLANPRDFLIPIAWYEDRQVPGGYTVINKYQGKLFAAKQDVSPFNVVAWHGNYTPYKYNLKNFMVINSVAFDHADPSIFTVLTAKSVRPVVAIADFVIFPPRWGVADKTFRPPYYHRNCMSEFMGLIRGHYEAKQGGFLPGGGSLHSTMTPHGPDADCFEKASKVKLAPERIADGTMAFMFESSLSLAVTKWGLKASRCLDENYHKCWEPLKSHFTPNSRNPAEPN


In [46]:
# now consider unique - subs
other_mutations = set(unique) - set(subs)
len(other_mutations)

22

In [47]:
other_mutations

{'(p.(Ala218_Asn219insLysIle))',
 'A218fs',
 'D153fs',
 'E168*',
 'G115Mfs*',
 'G11fs',
 'G309V ',
 'G372_P373delinsA',
 'H371fs',
 'His371Profs',
 'M339fs',
 'Q29fs',
 'R145Sfs ivs7+5G>A',
 'R321*',
 'R336S fs  ivs12-2A>T',
 'S59fs',
 'V157fs',
 'W60*',
 'del ex13',
 'del ex13 (MLPA)',
 'del ex2 (MLPA)',
 'del ex5,6 (MLPA)'}

In [48]:
# We spotted another substitution: 'G309V '

In [49]:
hgd_protein[308]

'G'

In [50]:
# see if G309V is in ms[0]
'G309V' in ms[0].values

False

In [51]:
# eliminate G309V from other_mutations
other_mutations = other_mutations - {'G309V '}
len(other_mutations)

21

In [52]:
other_mutations

{'(p.(Ala218_Asn219insLysIle))',
 'A218fs',
 'D153fs',
 'E168*',
 'G115Mfs*',
 'G11fs',
 'G372_P373delinsA',
 'H371fs',
 'His371Profs',
 'M339fs',
 'Q29fs',
 'R145Sfs ivs7+5G>A',
 'R321*',
 'R336S fs  ivs12-2A>T',
 'S59fs',
 'V157fs',
 'W60*',
 'del ex13',
 'del ex13 (MLPA)',
 'del ex2 (MLPA)',
 'del ex5,6 (MLPA)'}

# Obtain mutated sequences (from the DNA change)

In [2]:
# proviamo con la sequenza trovata su `https://hgddatabase.cvtisr.sk/refseq/HGD_codingDNA.html`
dna = 'ATGGCTGAGTTAAAGTACATTTCTGGATTTGGGAATGAGTGTTCTTCAGAGGATCCTCGCTGCCCAGGTTCCCTGCCAGAAGGACAGAATAATCCTCAGGTCTGCCCCTACAATCTCTATGCTGAGCAGCTCTCAGGATCGGCTTTCACTTGTCCACGGAGCACCAATAAGAGAAGCTGGCTGTATAGGATTCTACCTTCAGTTTCTCACAAGCCCTTTGAATCCATTGACGAAGGCCAAGTCACTCACAACTGGGATGAAGTTGATCCTGATCCTAACCAGCTTAGATGGAAACCATTTGAGATTCCAAAAGCATCTCAGAAGAAAGTAGACTTTGTGAGTGGCCTGCATACCTTGTGTGGAGCTGGAGACATAAAGTCTAACAATGGGCTTGCTATCCACATTTTCCTCTGCAATACCTCCATGGAGAACAGATGCTTTTACAATTCAGATGGGGACTTCTTGATTGTTCCGCAGAAAGGGAACCTTCTCATTTACACCGAGTTTGGCAAGATGCTTGTACAGCCCAATGAGATCTGCGTCATTCAGAGAGGAATGCGGTTCAGCATAGATGTCTTTGAGGAGACCAGGGGCTACATCTTGGAGGTCTATGGTGTCCACTTTGAGTTACCTGACCTTGGACCAATTGGGGCCAATGGCTTGGCCAATCCTCGTGATTTCTTGATACCCATTGCCTGGTATGAGGATCGCCAAGTACCAGGTGGTTACACGGTCATTAATAAATACCAGGGCAAGCTGTTTGCTGCCAAACAGGATGTCTCCCCGTTCAATGTTGTGGCCTGGCACGGGAATTATACACCCTACAAGTACAACCTGAAGAATTTCATGGTTATCAACTCAGTGGCCTTTGACCATGCAGACCCATCCATTTTCACAGTATTGACTGCTAAGTCTGTCCGCCCTGGAGTGGCCATTGCTGATTTTGTCATCTTCCCACCTCGATGGGGGGTTGCTGATAAGACCTTCAGGCCTCCTTATTACCATAGGAACTGCATGAGTGAGTTCATGGGACTCATCCGAGGTCACTATGAGGCAAAGCAAGGTGGGTTCCTGCCAGGGGGAGGGAGTCTACACAGCACAATGACCCCCCATGGACCTGATGCTGACTGCTTTGAGAAGGCCAGCAAGGTCAAGCTGGCACCTGAGAGGATTGCCGATGGCACCATGGCATTTATGTTTGAATCATCTTTAAGTCTGGCGGTCACAAAGTGGGGACTCAAGGCCTCCAGGTGTTTGGATGAGAACTACCACAAGTGCTGGGAGCCACTCAAGAGCCACTTCACTCCCAACTCCAGGAACCCAGCAGAACCTAATTGA'
len(dna)

1338

In [3]:
# prendiamo la proteina di hgd sano

def read_faa_file(file_path):
    sequences = []
    for record in SeqIO.parse(file_path, "fasta"):
        sequences.append(record)
    return sequences

# Specify the path to your .faa file
file_path = "../datasets/protein.faa"


# Read the .faa file
sequences = read_faa_file(file_path)

hgd_protein = sequences[0].seq
hgd_protein = Bio.Seq.MutableSeq(hgd_protein)
hgd_protein

MutableSeq('MAELKYISGFGNECSSEDPRCPGSLPEGQNNPQVCPYNLYAEQLSGSAFTCPRS...EPN')

In [4]:
len(hgd_protein)

445

In [5]:
445*3

1335

La differenza di lunghezza tra il pezzo di DNA `dna` (1338) e `hgd_protein` si spiega col fatto che l'ultima tripletta di `dna` codifica uno stop e non un aminoacido!

In [6]:
# eliminate last 3 elements of the dna sequence
dna = dna[:-3]
len(dna)

1335

In [7]:
# translate the dna sequence into a protein

def translate(seq): 

    table = { 
        'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M', 
        'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T', 
        'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K', 
        'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',                  
        'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L', 
        'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P', 
        'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q', 
        'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R', 
        'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V', 
        'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A', 
        'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E', 
        'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G', 
        'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S', 
        'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L', 
        'TAC':'Y', 'TAT':'Y', 'TAA':'_', 'TAG':'_', 
        'TGC':'C', 'TGT':'C', 'TGA':'_', 'TGG':'W', 
    } 
    protein ="" 
    if len(seq)%3 == 0: 
        for i in range(0, len(seq), 3): 
            codon = seq[i:i + 3] 
            protein+= table[codon] 
    return protein 

In [8]:
# Apply the function to our case
protein = translate(dna)

# check if the protein is the same as the hgd_protein
protein == str(hgd_protein)

True

Let's apply the changes to DNA on the basis of our mutations and then obtain the mutated protein, which we will then give to AlphaFold

In [9]:
other_mutations = [
    '(p.(Ala218_Asn219insLysIle))',
    'A218fs',
    'D153fs',
    'E168*',
    'G115Mfs*',
    'G11fs',
    'G372_P373delinsA',
    'H371fs',
    'His371Profs',
    'M339fs',
    'Q29fs',
    'R145Sfs ivs7+5G>A',
    'R321*',
    'R336S fs  ivs12-2A>T',
    'S59fs',
    'V157fs',
    'W60*',
    'del ex13',
    'del ex13 (MLPA)',
    'del ex2 (MLPA)',
    'del ex5,6 (MLPA)'
    ]

1. Let's consider the first "strange" mutation, using `https://hgddatabase.cvtisr.sk/variants.php?action=search_unique&select_db=HGD`, we can obtain the DNA change causing the mutation, then we will obtain the amino acids sequences translated by the DNA!!!

In [10]:
# p.(Ala218_Asn219insLysIle) --> c.656_657insAATCAA
# insert the sequence AATCAA between the nucleotides 656 and 657
dna_mutated = dna[:656] + 'AATCAA' + dna[656:]

# obtain the protein sequence
protein = translate(dna_mutated)

# check positions 218-221 of the protein
protein[217:221]

'AKIN'

In [17]:
print(other_mutations[0] + ': ' + protein)

(p.(Ala218_Asn219insLysIle)): MAELKYISGFGNECSSEDPRCPGSLPEGQNNPQVCPYNLYAEQLSGSAFTCPRSTNKRSWLYRILPSVSHKPFESIDEGQVTHNWDEVDPDPNQLRWKPFEIPKASQKKVDFVSGLHTLCGAGDIKSNNGLAIHIFLCNTSMENRCFYNSDGDFLIVPQKGNLLIYTEFGKMLVQPNEICVIQRGMRFSIDVFEETRGYILEVYGVHFELPDLGPIGAKINGLANPRDFLIPIAWYEDRQVPGGYTVINKYQGKLFAAKQDVSPFNVVAWHGNYTPYKYNLKNFMVINSVAFDHADPSIFTVLTAKSVRPGVAIADFVIFPPRWGVADKTFRPPYYHRNCMSEFMGLIRGHYEAKQGGFLPGGGSLHSTMTPHGPDADCFEKASKVKLAPERIADGTMAFMFESSLSLAVTKWGLKASRCLDENYHKCWEPLKSHFTPNSRNPAEPN


2. Seconda mutazione: è un frameshift

In [18]:
other_mutations[1]

'A218fs'

In [29]:
# translate the dna sequence into a protein: for the FRAMESHIFTS

def translate_fs(seq): 

    table = { 
        'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M', 
        'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T', 
        'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K', 
        'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',                  
        'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L', 
        'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P', 
        'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q', 
        'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R', 
        'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V', 
        'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A', 
        'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E', 
        'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G', 
        'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S', 
        'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L', 
        'TAC':'Y', 'TAT':'Y', 'TAA':'_', 'TAG':'_', 
        'TGC':'C', 'TGT':'C', 'TGA':'_', 'TGG':'W', 
    } 
    protein ="" 
    for i in range(0, len(seq), 3): 
            codon = seq[i:i + 3] 
            if table[codon] == '_':
                break
            protein+= table[codon]
    return protein 

In [31]:
# A218fs --> c.652delG
# delete the nucleotide in position 652
dna_mutated = dna[:651] + dna[652:]

# obtain the protein sequence
protein = translate_fs(dna_mutated)

# see how long it is
len(protein)

227

In [32]:
# see what is in position 218 of hgd_protein, and what is in position 218 of protein
hgd_protein[217], protein[217]

('A', 'P')

perfetto, dato che: la mutazione A218fs sarebbe p.(Ala218Profs*11)

In [33]:
print(other_mutations[1] + ': ' + protein)

A218fs: MAELKYISGFGNECSSEDPRCPGSLPEGQNNPQVCPYNLYAEQLSGSAFTCPRSTNKRSWLYRILPSVSHKPFESIDEGQVTHNWDEVDPDPNQLRWKPFEIPKASQKKVDFVSGLHTLCGAGDIKSNNGLAIHIFLCNTSMENRCFYNSDGDFLIVPQKGNLLIYTEFGKMLVQPNEICVIQRGMRFSIDVFEETRGYILEVYGVHFELPDLGPIGPMAWPILVIS


In [34]:
other_mutations[2]

'D153fs'

In [35]:
# c.457dupG
# check if in position 457 of dna there is a G
dna[456:457]

'G'

In [36]:
# add a G in position 458, check length of dna
dna_mutated = dna[:457] + 'G' + dna[457:]
len(dna_mutated)

1336

In [37]:
protein = translate_fs(dna_mutated)
print(other_mutations[2] + ': ' + protein)
len(protein)

D153fs: MAELKYISGFGNECSSEDPRCPGSLPEGQNNPQVCPYNLYAEQLSGSAFTCPRSTNKRSWLYRILPSVSHKPFESIDEGQVTHNWDEVDPDPNQLRWKPFEIPKASQKKVDFVSGLHTLCGAGDIKSNNGLAIHIFLCNTSMENRCFYNSDGGLLDCSAEREPSHLHRVWQDACTAQ


177

In [38]:
other_mutations[3]

'E168*'

In [39]:
# check position 168 of hgd_protein
hgd_protein[167]

'E'

In [40]:
protein = hgd_protein[:167] 
print(other_mutations[3] + ': ' + protein)

E168*: MAELKYISGFGNECSSEDPRCPGSLPEGQNNPQVCPYNLYAEQLSGSAFTCPRSTNKRSWLYRILPSVSHKPFESIDEGQVTHNWDEVDPDPNQLRWKPFEIPKASQKKVDFVSGLHTLCGAGDIKSNNGLAIHIFLCNTSMENRCFYNSDGDFLIVPQKGNLLIYT


In [42]:
other_mutations[4]

'G115Mfs*'

In [43]:
# c.413_434+35del
# delete the nucleotides from 413 to 434
dna_mutated = dna[:412] + dna[434:]
protein = translate_fs(dna_mutated)
print(other_mutations[4] + ': ' + protein)
len(protein)

G115Mfs*: MAELKYISGFGNECSSEDPRCPGSLPEGQNNPQVCPYNLYAEQLSGSAFTCPRSTNKRSWLYRILPSVSHKPFESIDEGQVTHNWDEVDPDPNQLRWKPFEIPKASQKKVDFVSGLHTLCGAGDIKSNNGLAIHIFLYAFTIQMGTS


147

In [44]:
other_mutations[5]

'G11fs'

In [45]:
# c.31_32delGGinsATT
dna[30:32]

'GG'

In [46]:
# delete the nucleotides from 31 to 32 and insert ATT
dna_mutated = dna[:30] + 'ATT' + dna[32:]
protein = translate_fs(dna_mutated)
print(other_mutations[5] + ': ' + protein)

G11fs: MAELKYISGFIE


In [47]:
other_mutations[6]

'G372_P373delinsA'

In [48]:
# check positions 372 and 373 of hgd_protein
hgd_protein[371:373]

MutableSeq('GP')

In [49]:
# delete these amino acids and insert A
protein = hgd_protein[:371] + 'A' + hgd_protein[373:]
print(other_mutations[6] + ': ' + protein)
len(protein)

G372_P373delinsA: MAELKYISGFGNECSSEDPRCPGSLPEGQNNPQVCPYNLYAEQLSGSAFTCPRSTNKRSWLYRILPSVSHKPFESIDEGQVTHNWDEVDPDPNQLRWKPFEIPKASQKKVDFVSGLHTLCGAGDIKSNNGLAIHIFLCNTSMENRCFYNSDGDFLIVPQKGNLLIYTEFGKMLVQPNEICVIQRGMRFSIDVFEETRGYILEVYGVHFELPDLGPIGANGLANPRDFLIPIAWYEDRQVPGGYTVINKYQGKLFAAKQDVSPFNVVAWHGNYTPYKYNLKNFMVINSVAFDHADPSIFTVLTAKSVRPGVAIADFVIFPPRWGVADKTFRPPYYHRNCMSEFMGLIRGHYEAKQGGFLPGGGSLHSTMTPHADADCFEKASKVKLAPERIADGTMAFMFESSLSLAVTKWGLKASRCLDENYHKCWEPLKSHFTPNSRNPAEPN


444

In [51]:
dna[1114:1117]

'GAC'

In [52]:
# check if it is exact: c.1115_1117delGAC
dna_mutated = dna[:1114] + dna[1117:]
protein2 = translate_fs(dna_mutated)
protein == protein2

True

In [53]:
other_mutations[7]

'H371fs'

In [55]:
# c.1111dupC
dna_mutated = dna[:1111] + 'C' + dna[1111:]
protein = translate_fs(dna_mutated)
print(other_mutations[7] + ': ' + protein)

H371fs: MAELKYISGFGNECSSEDPRCPGSLPEGQNNPQVCPYNLYAEQLSGSAFTCPRSTNKRSWLYRILPSVSHKPFESIDEGQVTHNWDEVDPDPNQLRWKPFEIPKASQKKVDFVSGLHTLCGAGDIKSNNGLAIHIFLCNTSMENRCFYNSDGDFLIVPQKGNLLIYTEFGKMLVQPNEICVIQRGMRFSIDVFEETRGYILEVYGVHFELPDLGPIGANGLANPRDFLIPIAWYEDRQVPGGYTVINKYQGKLFAAKQDVSPFNVVAWHGNYTPYKYNLKNFMVINSVAFDHADPSIFTVLTAKSVRPGVAIADFVIFPPRWGVADKTFRPPYYHRNCMSEFMGLIRGHYEAKQGGFLPGGGSLHSTMTPPWT


In [56]:
other_mutations[8]

'His371Profs'

In [57]:
other_mutations[9]

'M339fs'

In [58]:
# c.1017_1019delGAGinsTA
dna[1016:1019]

'GAG'

In [59]:
# del GAG ins TA
dna_mutated = dna[:1016] + 'TA' + dna[1019:]
protein = translate_fs(dna_mutated)
print(other_mutations[9] + ': ' + protein)

M339fs: MAELKYISGFGNECSSEDPRCPGSLPEGQNNPQVCPYNLYAEQLSGSAFTCPRSTNKRSWLYRILPSVSHKPFESIDEGQVTHNWDEVDPDPNQLRWKPFEIPKASQKKVDFVSGLHTLCGAGDIKSNNGLAIHIFLCNTSMENRCFYNSDGDFLIVPQKGNLLIYTEFGKMLVQPNEICVIQRGMRFSIDVFEETRGYILEVYGVHFELPDLGPIGANGLANPRDFLIPIAWYEDRQVPGGYTVINKYQGKLFAAKQDVSPFNVVAWHGNYTPYKYNLKNFMVINSVAFDHADPSIFTVLTAKSVRPGVAIADFVIFPPRWGVADKTFRPPYYHRNCIMSSWDSSEVTMRQSKVGSCQGEGVYTAQ


In [61]:
other_mutations[10]

'Q29fs'

In [62]:
# c.85delC
dna_mutated = dna[:85] + dna[86:]
protein = translate_fs(dna_mutated)
print(other_mutations[10] + ': ' + protein)

Q29fs: MAELKYISGFGNECSSEDPRCPGSLPEGRIILRSAPTISMLSSSQDRLSLVHGAPIREAGCIGFYLQFLTSPLNPLTKAKSLTTGMKLILILTSLDGNHLRFQKHLRRK


In [65]:
other_mutations[12]

'R321*'

In [67]:
hgd_protein[320]

'R'

In [68]:
protein = hgd_protein[:320] 
print(other_mutations[12] + ': ' + protein)

R321*: MAELKYISGFGNECSSEDPRCPGSLPEGQNNPQVCPYNLYAEQLSGSAFTCPRSTNKRSWLYRILPSVSHKPFESIDEGQVTHNWDEVDPDPNQLRWKPFEIPKASQKKVDFVSGLHTLCGAGDIKSNNGLAIHIFLCNTSMENRCFYNSDGDFLIVPQKGNLLIYTEFGKMLVQPNEICVIQRGMRFSIDVFEETRGYILEVYGVHFELPDLGPIGANGLANPRDFLIPIAWYEDRQVPGGYTVINKYQGKLFAAKQDVSPFNVVAWHGNYTPYKYNLKNFMVINSVAFDHADPSIFTVLTAKSVRPGVAIADFVIFPP


In [70]:
other_mutations[14]

'S59fs'

In [71]:
# c.175delA
# delete the nucleotide in position 175
dna_mutated = dna[:174] + dna[175:]
protein = translate_fs(dna_mutated)
print(other_mutations[14] + ': ' + protein)

S59fs: MAELKYISGFGNECSSEDPRCPGSLPEGQNNPQVCPYNLYAEQLSGSAFTCPRSTNKRAGCIGFYLQFLTSPLNPLTKAKSLTTGMKLILILTSLDGNHLRFQKHLRRK


In [73]:
other_mutations[16]

'W60*'

In [74]:
hgd_protein[59]

'W'

In [75]:
protein = hgd_protein[:59]
print(other_mutations[16] + ': ' + protein)

W60*: MAELKYISGFGNECSSEDPRCPGSLPEGQNNPQVCPYNLYAEQLSGSAFTCPRSTNKRS


In [8]:
# G205V
# check if in position 205 of hgd_protein there is a G
hgd_protein[204]

'G'

In [9]:
# substitute G with V
protein = hgd_protein[:204] + 'V' + hgd_protein[205:]
print('G205V: ' + protein)

G205V: MAELKYISGFGNECSSEDPRCPGSLPEGQNNPQVCPYNLYAEQLSGSAFTCPRSTNKRSWLYRILPSVSHKPFESIDEGQVTHNWDEVDPDPNQLRWKPFEIPKASQKKVDFVSGLHTLCGAGDIKSNNGLAIHIFLCNTSMENRCFYNSDGDFLIVPQKGNLLIYTEFGKMLVQPNEICVIQRGMRFSIDVFEETRGYILEVYVVHFELPDLGPIGANGLANPRDFLIPIAWYEDRQVPGGYTVINKYQGKLFAAKQDVSPFNVVAWHGNYTPYKYNLKNFMVINSVAFDHADPSIFTVLTAKSVRPGVAIADFVIFPPRWGVADKTFRPPYYHRNCMSEFMGLIRGHYEAKQGGFLPGGGSLHSTMTPHGPDADCFEKASKVKLAPERIADGTMAFMFESSLSLAVTKWGLKASRCLDENYHKCWEPLKSHFTPNSRNPAEPN


In [11]:
# L353Q
# check if in position 353 of hgd_protein there is a L
hgd_protein[352]

'K'

In [12]:
# substitute K with Q
protein = hgd_protein[:352] + 'Q' + hgd_protein[353:]
print('K353Q: ' + protein)

K353Q: MAELKYISGFGNECSSEDPRCPGSLPEGQNNPQVCPYNLYAEQLSGSAFTCPRSTNKRSWLYRILPSVSHKPFESIDEGQVTHNWDEVDPDPNQLRWKPFEIPKASQKKVDFVSGLHTLCGAGDIKSNNGLAIHIFLCNTSMENRCFYNSDGDFLIVPQKGNLLIYTEFGKMLVQPNEICVIQRGMRFSIDVFEETRGYILEVYGVHFELPDLGPIGANGLANPRDFLIPIAWYEDRQVPGGYTVINKYQGKLFAAKQDVSPFNVVAWHGNYTPYKYNLKNFMVINSVAFDHADPSIFTVLTAKSVRPGVAIADFVIFPPRWGVADKTFRPPYYHRNCMSEFMGLIRGHYEAQQGGFLPGGGSLHSTMTPHGPDADCFEKASKVKLAPERIADGTMAFMFESSLSLAVTKWGLKASRCLDENYHKCWEPLKSHFTPNSRNPAEPN
