In [1]:
import requests
import pandas as pd
import plotly.graph_objects as go
from collections import deque

In [2]:
def make_lookup(file_path='../data/amino_acid_properties.csv'):
    """Build per-property lookup tables for amino acids from a CSV file.

    Parameters
    ----------
    file_path : str or path-like, optional
        CSV file with one row per amino acid. It must contain a
        '1-letter code' column (used as the index) and one column for every
        name in ``names_of_properties`` below. Defaults to the path this
        notebook originally hard-coded, so ``make_lookup()`` keeps working.

    Returns
    -------
    dict
        Nested mapping ``lookup[property_name][one_letter_code] -> value``.
        Empty CSV cells come back as NaN.

    Raises
    ------
    KeyError
        If one of the required property columns is missing from the CSV.
    """
    names_of_properties = [
        'Molecular Weight', 'Residue Weight', 'pka1', 'pka2', 'pkaX', 'pI',
        'hydropathy index (Kyte-Doolittle method)', 'Accessible surface'
    ]
    df_properties = pd.read_csv(file_path, index_col='1-letter code')

    # Fail early with a message naming every absent column instead of
    # surfacing a bare KeyError for only the first one.
    missing = [prop for prop in names_of_properties if prop not in df_properties.columns]
    if missing:
        raise KeyError(f'property columns missing from {file_path}: {missing}')

    return {prop: df_properties[prop].to_dict() for prop in names_of_properties}
    

In [3]:
# Build the per-property lookup tables from the amino acid CSV.
lookup = make_lookup()
# Spot-check one entry: the Kyte-Doolittle hydropathy value stored under 'M'.
lookup['hydropathy index (Kyte-Doolittle method)']['M']

1.9

In [9]:
class Protein():
    """Fetch a protein sequence from UniProt and profile a per-residue property.

    Typical use: ``get_data`` downloads the FASTA record, ``map_property``
    (raw) or ``rolling_mean`` (smoothed) turns the sequence into a list of
    numbers, and ``make_plot`` draws that list with plotly.

    Attributes
    ----------
    lookup : dict
        ``lookup[property_name][one_letter_code] -> value``.
    Id, url, fasta, sequence :
        Accession, request URL, raw FASTA text and residue string of the
        last ``get_data`` call (``None`` before the first call).
    id, seq :
        Legacy spellings set by the original constructor; kept in sync with
        ``Id`` and ``sequence``.
    prop, prop_values, window_size :
        Name of the last mapped property, its values, and the smoothing
        window (``None`` when the values are unsmoothed).
    """

    def __init__(self, lookup):
        """Store the property lookup; every other attribute starts as None."""
        self.lookup = lookup
        # The methods read `Id` and `sequence`; initialise those names so
        # they exist before get_data() has been called.
        self.Id = self.url = self.fasta = self.sequence = None
        self.prop_values = self.prop = self.window_size = None
        # Legacy attribute names, retained for backward compatibility.
        self.id = self.seq = None

    def get_data(self, Id, timeout=30):
        """Download the FASTA record for ``Id`` and return its sequence.

        Parameters
        ----------
        Id : str
            Accession appended to the UniProt URL.
        timeout : float, optional
            Seconds passed to ``requests.get`` so a stalled connection
            cannot hang the notebook.

        Returns
        -------
        str
            All non-header FASTA lines, stripped and concatenated.

        Raises
        ------
        requests.HTTPError
            If the server answers with a 4xx/5xx status.
        """
        self.Id = self.id = Id
        self.url = 'https://www.uniprot.org/uniprot/' + self.Id + '.fasta'
        res = requests.get(self.url, timeout=timeout)
        # Without this an HTML error page would be parsed as a sequence.
        res.raise_for_status()
        self.fasta = res.text
        self.sequence = ''.join(
            line.strip()
            for line in self.fasta.split('\n')
            if not line.startswith('>')  # skip the FASTA header line
        )
        self.seq = self.sequence
        # Values computed for a previously loaded protein are now stale.
        self.prop_values = self.prop = self.window_size = None
        return self.sequence

    def _require_sequence(self):
        """Raise a clear error when no sequence has been loaded yet."""
        if self.sequence is None:
            raise RuntimeError('No sequence loaded; call get_data() first.')

    def map_property(self, selected_property='hydropathy index (Kyte-Doolittle method)'):
        """Translate the sequence into a list of property values.

        Parameters
        ----------
        selected_property : str
            Key into ``self.lookup``.

        Returns
        -------
        list
            One value per residue, also stored in ``self.prop_values``.
            ``self.window_size`` is reset to ``None`` (unsmoothed).

        Raises
        ------
        RuntimeError
            If no sequence has been loaded.
        KeyError
            If the property, or a residue letter of the sequence, is not
            present in the lookup.
        """
        self._require_sequence()
        prop_dic = self.lookup[selected_property]
        self.prop = selected_property
        self.window_size = None
        self.prop_values = [prop_dic[amino_acid] for amino_acid in self.sequence]
        return self.prop_values

    def rolling_mean(self, selected_property='hydropathy index (Kyte-Doolittle method)', window_size=5):
        """Compute the sliding-window mean of a property along the sequence.

        Parameters
        ----------
        selected_property : str
            Key into ``self.lookup``.
        window_size : int
            Number of consecutive residues averaged per output value;
            must be at least 1.

        Returns
        -------
        list
            ``len(sequence) - window_size + 1`` means (an empty list when
            the sequence is shorter than the window), also stored in
            ``self.prop_values``.

        Raises
        ------
        ValueError
            If ``window_size`` is smaller than 1.
        """
        if window_size < 1:
            raise ValueError(f'window_size must be >= 1, got {window_size}')

        prop_values_unsmoothed = self.map_property(selected_property)
        self.prop_values = []
        self.window_size = window_size

        # Pre-fill with the first window_size - 1 values; each append then
        # completes a full window and, via maxlen, evicts the oldest value.
        window = deque(prop_values_unsmoothed[:window_size - 1], maxlen=window_size)
        for value in prop_values_unsmoothed[window_size - 1:]:
            window.append(value)
            self.prop_values.append(sum(window) / window_size)
        return self.prop_values

    def make_plot(self):
        """Show a plotly line chart of ``self.prop_values`` against index.

        The title is the protein Id, extended with the window size when the
        values were smoothed; the y-axis is labelled with the property name.

        Raises
        ------
        RuntimeError
            If no property values have been computed yet.
        """
        if self.prop_values is None:
            raise RuntimeError(
                'No property values to plot; call map_property() or rolling_mean() first.'
            )

        data = [
            go.Scatter(x=list(range(0, len(self.prop_values))),
                       y=self.prop_values
                       )
            ]
        # Id is None when the sequence was assigned without get_data().
        plot_title = self.Id if self.Id is not None else ''
        if self.window_size is not None:
            plot_title += f' with window size of {self.window_size}'
        layout = {
            'title': {
                'text': plot_title
            },
            'yaxis': {
                'title': self.prop
            },
            'xaxis': {
                'title': 'position'
            }
        }

        fig = go.Figure(data=data, layout=layout)
        fig.show()

In [12]:
# Accession of the protein to fetch from UniProt.
Id = 'P32249'
lookup = make_lookup()
# Property to profile; must be one of the keys of `lookup`.
prop = 'pI'
# Alternative property (disabled):
# prop = 'hydropathy index (Kyte-Doolittle method)'

prot = Protein(lookup)
seq = prot.get_data(Id)
# Raw, unsmoothed per-residue values of the selected property.
prop_values = prot.map_property(prop)
# Optional smoothing (disabled): rolling mean with a window size of 15.
# prop_values_smooted = prot.rolling_mean(prop, 15)
prot.make_plot()

In [6]:
# Load the amino acid property table, indexed by one-letter code, for inspection.
file_path = '../data/amino_acid_properties.csv'
properties_df = pd.read_csv(file_path, index_col='1-letter code')

In [7]:
# Display the loaded property table.
properties_df

Unnamed: 0_level_0,Name,3-letter code,Molecular Weight,Molecular Formula,Residue Formula,Residue Weight,pka1,pka2,pkaX,pI,hydropathy index (Kyte-Doolittle method),Accessible surface
1-letter code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
A,Alanine,Ala,89.1,C3H7NO2,C3H5NO,71.08,2.34,9.69,,6.0,1.8,44.1
R,Arginine,Arg,174.2,C6H14N4O2,C6H12N4O,156.19,2.17,9.04,12.48,10.76,-4.5,159.2
N,Asparagine,Asn,132.12,C4H8N2O3,C4H6N2O2,114.11,2.02,8.8,,5.41,-3.5,80.8
D,Aspartic acid,Asp,133.11,C4H7NO4,C4H5NO3,115.09,1.88,9.6,3.65,2.77,-3.5,76.3
C,Cysteine,Cys,121.16,C3H7NO2S,C3H5NOS,103.15,1.96,10.28,8.18,5.07,2.5,56.4
E,Glutamic acid,Glu,147.13,C5H9NO4,C5H7NO3,129.12,2.19,9.67,4.25,3.22,-3.5,99.2
Q,Glutamine,Gln,146.15,C5H10N2O3,C5H8N2O2,128.13,2.17,9.13,,5.65,-3.5,100.6
G,Glycine,Gly,75.07,C2H5NO2,C2H3NO,57.05,2.34,9.6,,5.97,-0.4,0.0
H,Histidine,His,155.16,C6H9N3O2,C6H7N3O,137.14,1.82,9.17,6.0,7.59,-3.2,98.2
I,Isoleucine,Ile,131.18,C6H13NO2,C6H11NO,113.16,2.36,9.6,,6.02,4.5,90.9


In [8]:
# List the column names available in the property table.
properties_df.columns

Index(['Name', '3-letter code', 'Molecular Weight', 'Molecular Formula',
       'Residue Formula', 'Residue Weight', 'pka1', 'pka2', 'pkaX', 'pI',
       'hydropathy index (Kyte-Doolittle method)', 'Accessible surface'],
      dtype='object')