In [57]:
#import course;course.header()

## A) get data
Go to UniProt and download the FASTA protein sequence for "G-protein coupled receptor 183" (UniProt accession "P32249").

## B) Create a protein class

that can be initialized by a name (or ID), sequence and a metrics dictionary, like:

In [58]:
# Example layout of the metrics dictionary: metric name -> {amino-acid key -> value}.
# The "..." strings are placeholders, not real values.
metrics = {
    "hydropathy": {"A" : "..."},
    "pI": {"A": "..."},
}

In [71]:
import pandas as pd
import plotly.graph_objects as go
import numpy as np
from collections import deque

class Protein(object):
    """A named protein sequence whose per-residue metrics can be plotted.

    Attributes:
        name: Name or identifier of the protein; used as the plot title.
        sequence: Amino-acid sequence as a string of 1-letter codes.
        metrics: Mapping ``{metric name: {1-letter code: value}}`` used by
            :meth:`plot`. Defaults to the class-level ``aa_metrics`` table.
    """

    # Default per-residue metrics, loaded once when the class is defined.
    # Forward slashes keep the relative path valid on Windows, Linux and macOS;
    # a backslash-separated path is read as one literal file name on POSIX.
    aa_df = pd.read_csv("../../data/amino_acid_properties.csv")
    aa_df = aa_df.rename(columns={"hydropathy index (Kyte-Doolittle method)": "hydropathy"})
    aa_df = aa_df.set_index("1-letter code")
    # Nested dict: {column name: {1-letter code: value}}.
    aa_metrics = aa_df.to_dict("dict")

    def __init__(self, name, sequence, metrics=None):
        """Store the protein name, its sequence and the metrics to plot.

        Args:
            name (str): Name or identifier of the protein.
            sequence (str): Amino-acid sequence in 1-letter codes.
            metrics (dict, optional): ``{metric name: {1-letter code: value}}``.
                Defaults to the class-level ``aa_metrics`` table.
        """
        self.name = name
        self.sequence = sequence
        self.metrics = self.aa_metrics if metrics is None else metrics

    def plot(self, metric="hydropathy", window_size=5):
        """Create a plotly bar chart of a metric along the sequence.

        The value shown at each position is the mean of the metric over a
        trailing window: the residue at that position and up to
        ``window_size - 1`` residues before it. Positions near the start of
        the sequence are averaged over the residues seen so far.

        Args:
            metric (str, optional): Key of the metrics dictionary.
                Defaults to "hydropathy".
            window_size (int, optional): Size of the sliding window.
                Defaults to 5.

        Returns:
            plotly.graph_objects.Figure: Bar chart titled with the protein
            name; x is the 0-based sequence position, y the windowed mean.

        Raises:
            ValueError: If ``window_size`` is smaller than 1.
            KeyError: If ``metric`` is unknown, or the sequence contains a
                residue that has no value for ``metric``.
        """
        if window_size < 1:
            raise ValueError(
                f"window_size must be a positive integer, got {window_size!r}"
            )
        if metric not in self.metrics:
            raise KeyError(
                f"Unknown metric {metric!r}; available metrics: {list(self.metrics)}"
            )

        values_by_residue = self.metrics[metric]
        # Report every unsupported residue at once instead of failing on the first.
        unknown_residues = sorted(set(self.sequence) - set(values_by_residue))
        if unknown_residues:
            raise KeyError(
                f"No {metric!r} value for residue(s) {unknown_residues} in {self.name!r}"
            )
        translation = [values_by_residue[aa] for aa in self.sequence]

        # A bounded deque drops the oldest value automatically once it is full.
        window = deque(maxlen=window_size)
        mean_values = []
        for value in translation:
            window.append(value)
            mean_values.append(np.mean(list(window)))

        positions = list(range(len(self.sequence)))
        fig = go.Figure(data=[go.Bar(x=positions, y=mean_values)])
        fig.update_layout(
            title=self.name,
            template="seaborn",
            xaxis_title="Amino acid position (0-based)",
            yaxis_title=f"{metric} (mean over a {window_size}-residue window)",
        )
        return fig


In [72]:
# Read the first record of the FASTA file: header lines start with ">",
# the remaining lines hold the sequence split across several rows.
sequence_lines = []
with open("P32249.fasta", "r") as input_file:
    for line in input_file:
        line = line.strip()
        if not line:
            continue
        if line.startswith(">"):
            if sequence_lines:
                # A second record begins here; keep only the first one.
                break
            continue
        sequence_lines.append(line)

protein_1_sequence = "".join(sequence_lines)
if not protein_1_sequence:
    raise ValueError("No sequence found in P32249.fasta")

protein_1 = Protein("P32249", protein_1_sequence)


In [73]:
# Show the profile for the default metric ("hydropathy") averaged over a 50-residue window.
protein_1.plot(window_size=50)

## C) Plot

Create a method that has the following signature:

In [63]:
def plot(self, metric="hydropathy", window_size=5):
    """Create a plotly figure of a per-residue metric along the protein sequence.

    The title of the figure contains the protein name,
    the x axis is the amino acid position (int) and
    the y axis shows the metric at each given position.
    A window size can be specified to average the metric using a sliding window.

    Args:
        metric (str, optional): Key of the metrics dictionary the class was initialized with.
            Defaults to "hydropathy".
        window_size (int, optional): Size of the sliding window. Defaults to 5.

    Returns:
        fig: The plotly figure object (intended result; see the note below).
    """
    # NOTE(review): signature template only -- `fig` is never assigned in this body,
    # so calling this function as written raises NameError.
    
    return fig


## E) What do you observe?
Describe the pattern you see. What could be the reason for the pattern?

You can see hydrophilic and hydrophobic areas in the protein. Amino acids with similar properties are often close to each other and form distinct areas. Overall, the protein appears to be more hydrophobic than hydrophilic. These hydrophobic amino acids will be located on the inside of the protein.

# Optional

### Retrieve sequence programmatically
* Write a function that takes the uniprot identifier as kwarg and returns the sequence using
    * the buffered sequence from e2
    * using urllib
    * using an alternative?

### Plot annotations into the hydropathy plot
* Plot protein topology features into the same plot 
