In [None]:
# %load topics.py
import pandas as pd
import psutil

# Allow up to 300 characters per cell when DataFrames are displayed.
pd.set_option("display.max_colwidth" , 300)

# High-level schedule: one row per entry with a weekday name and its headline topic.
# NOTE(review): the row with empty strings presumably separates week one from
# week two; the second week's topics are mostly still empty.
df_high_level = pd.DataFrame(
    data=[
        {'day': 'Monday', 'Topic': 'Check-In, recaps and functions'},
        {'day': 'Tuesday', 'Topic': 'Coding philosophy, data flow and some more useful std modules'},
        {'day': 'Wednesday', 'Topic': 'Test driven development, python module, sphinx'},
        {'day': 'Thursday', 'Topic': 'OOP - Object oriented programming'},
        {'day': 'Friday', 'Topic': 'Q&A and code clean up'},
        {'day': '', 'Topic': ''},
        {'day': 'Monday', 'Topic': ''},
        {'day': 'Tuesday', 'Topic': ''},
        {'day': 'Wednesday', 'Topic': ''},
        {'day': 'Thursday', 'Topic': ''},
        {'day': 'Friday', 'Topic': 'Q&A and Tutorium'},


    ]
)

# Detailed topic list: one row per topic. 'day' is an integer day number;
# display_topics() selects rows by comparing this column with its `day` argument.
df_details = pd.DataFrame(
    data=[
        {'day': 1, 'Topic': 'Check-in'},
        {'day': 1, 'Topic': 'Procedural stuff'},
        {'day': 1, 'Topic': "python basic in 5'"},
        {'day': 1, 'Topic': 'lists and generators'},
        {'day': 1, 'Topic': 'bisect module'},
        # ----------------------------
        {'day': 2, 'Topic': 'Functions'},
        {'day': 2, 'Topic': 'Zen of Python and general coding philosophy'},
        {'day': 2, 'Topic': 'csv module'},
        {'day': 2, 'Topic': 'Collections module'},
        {'day': 2, 'Topic': 'Exercises 1 & 2'},
        # ----------------------------
        {'day': 3, 'Topic': 'Discussion of Excercises 1 & 2'},
        {'day': 3, 'Topic': 'Basic plotting with plotly'},
        {'day': 3, 'Topic': 'Exercises 3'},
        # -----------------------------
        {'day': 4, 'Topic': 'Discussion of Excercises 3'},
        {'day': 4, 'Topic': "String format"},
        {'day': 4, 'Topic': 'dicts'},
        {'day': 4, 'Topic': 'itertools'},
        # -----------------------------
        {'day': 5, 'Topic': "OOP"},
        # {'day': 3, 'Topic': 'data flow'},
        {'day': 6, 'Topic': "Basic Python package"},
        {'day': 6, 'Topic': "Test Driven development"},
        {'day': 6, 'Topic': "Auto documentation with Sphinx"},
        # -----------------------------
    ]
)


def display_topics(day=1, df=None):
    """Return the topics listed for one day.

    Parameters
    ----------
    day : value compared against the 'day' column (default 1).
    df : DataFrame with 'day' and 'Topic' columns; when None, the
        module-level ``df_details`` table is used.

    Returns
    -------
    DataFrame holding the 'day' and 'Topic' columns of the matching rows,
    limited to the first 20 rows.
    """
    source = df_details if df is None else df
    rows_for_day = source['day'] == day
    return source.loc[rows_for_day, ['day', 'Topic']].head(20)


# Day 4
## Overview

In [None]:
display_topics(day=4)

# Discussion of the exercises



In [None]:
# Parse the FASTA file into `seqs`: full header line (including the leading ">")
# mapped to the concatenated sequence of that record.
seqs = {}
current_id = None
current_chunks = []  # sequence lines of the record currently being read
with open("../data/uniprot-filtered-proteome%3AUP000005640+AND+reviewed%3Ayes+AND+organism%3A%22Hom--.fasta") as fasta_file:
    for line in fasta_file:
        if line.startswith(">"):
            # A new header closes the previous record (if there was one).
            if current_id is not None:
                seqs[current_id] = "".join(current_chunks)
            current_chunks = []
            current_id = line.strip()
        else:
            # Collect the pieces and join once per record instead of
            # growing a string with += on every line.
            current_chunks.append(line.strip())

# The last record has no following header, so store it here. Skip this when
# no header was seen at all (e.g. empty file), otherwise `seqs` would get a
# meaningless `None` key.
if current_id is not None:
    seqs[current_id] = "".join(current_chunks)
    

In [None]:
import csv

# Build a lookup: 1-letter amino acid code -> hydropathy index as float.
hydropathy_lookup = {}

# The csv module expects files to be opened with newline="" so that it can
# handle line endings (including newlines inside quoted fields) itself.
with open("../data/amino_acid_properties.csv", newline="") as aap:
    csv_dr = csv.DictReader(aap)
    for line_dict in csv_dr:
        aa = line_dict["1-letter code"]
        hpi = line_dict["hydropathy index (Kyte-Doolittle method)"]
        try:
            hydropathy_lookup[aa] = float(hpi)
        except ValueError:
            # Deliberately skip rows whose hydropathy entry is not a number
            # (presumably amino acids without a value in the table).
            pass

hydropathy_lookup

In [None]:
# Sneak preview to pandas ! :)
# Same lookup as a one-liner: index the table by 1-letter code, pick the
# hydropathy column and turn that single column into a dict.
aa_df = pd.read_csv("../data/amino_acid_properties.csv")
hydropathy_by_code = aa_df.set_index("1-letter code")['hydropathy index (Kyte-Doolittle method)']
hydropathy_lookup = hydropathy_by_code.to_dict()
hydropathy_lookup

In [None]:
from collections import deque
import plotly.graph_objects as go

# Print every FASTA header that contains one of these substrings
# (a header is printed once per matching substring).
ids_of_interest = ["TMEM63B", "TM9SF", "MS4A1"]
for identifier in seqs:
    for ioi in ids_of_interest:
        if ioi not in identifier:
            continue
        print(identifier)

In [None]:
# An empty deque that will hold at most 5 items (maxlen=5).
test = deque([], maxlen=5) 

In [None]:
# Append one element on the right and show the current length.
test.append(">>>")
len(test)

In [None]:
# += extends the deque on the right. For a deque created with a maxlen, items
# are discarded from the left once it is full, so the length never exceeds maxlen.
test += [1,2,3,4,5,6,7,]
len(test)

In [None]:

identifier = ">sp|Q5J8X5|M4A13_HUMAN Membrane-spanning 4-domains subfamily A member 13 OS=Homo sapiens OX=9606 GN=MS4A13 PE=2 SV=2"

# identifier = ">sp|Q92544|TM9S4_HUMAN Transmembrane 9 superfamily member 4 OS=Homo sapiens OX=9606 GN=TM9SF4 PE=1 SV=2"

# One trace per window size: moving average of the hydropathy values over the
# last `window_size` residues. No x values are given, so each average is
# plotted at the index of the LAST residue in its window.
data = []
for window_size in [5, 10, 20, 50]:
    averages_hydropathy_values = []
    # A deque with maxlen drops the oldest value automatically once it is full.
    av_window = deque([], maxlen=window_size)
    for aa in seqs[identifier]:
        hydropathy_value = hydropathy_lookup[aa]

        av_window.append(hydropathy_value)
        # Until the window is full the average runs over fewer values.
        averages_hydropathy_values.append(sum(av_window) / len(av_window))

    data.append(
        go.Scatter(
            y=averages_hydropathy_values,
            name=window_size
        )
    )

fig = go.Figure(
    data=data,
    layout={
        "title": {
            "text": identifier,
            "font_size": 20
        },
        "xaxis": {"title": {"text": "index of the last residue in the window"}},
        "yaxis": {"title": {"text": "mean hydropathy index"}},
    }
)
fig.update_layout(template='plotly_dark')
fig.show()

## Note: the maximum shifts between window size 5 and, e.g., window size 20.
* Q: Why is that?
* Q: How can it be fixed?

In [None]:

identifier = ">sp|Q5J8X5|M4A13_HUMAN Membrane-spanning 4-domains subfamily A member 13 OS=Homo sapiens OX=9606 GN=MS4A13 PE=2 SV=2"

# identifier = ">sp|Q92544|TM9S4_HUMAN Transmembrane 9 superfamily member 4 OS=Homo sapiens OX=9606 GN=TM9SF4 PE=1 SV=2"


def centered_moving_average(values, window_size):
    """Moving average over the last `window_size` values, placed at the window centre.

    Parameters
    ----------
    values : iterable of numbers.
    window_size : maximum number of values averaged at once.

    Returns
    -------
    (positions, averages) : two lists of equal length. ``averages[i]`` is the
    mean of the values currently in the window, ``positions[i]`` the centre of
    that window on the input index axis (a half-integer when the window holds
    an even number of values).
    """
    window = deque([], maxlen=window_size)
    positions = []
    averages = []
    for pos, value in enumerate(values):
        window.append(value)
        # After the append the window covers indices pos - len(window) + 1 .. pos,
        # so its centre lies (len(window) - 1) / 2 to the left of pos. Computing
        # the offset before the append with int(len / 2) yields the same x value
        # twice when an even-sized window becomes full.
        positions.append(pos - (len(window) - 1) / 2)
        averages.append(sum(window) / len(window))
    return positions, averages


data = []
for window_size in [5, 10, 20, 50]:
    positions, averages_hydropathy_values = centered_moving_average(
        (hydropathy_lookup[aa] for aa in seqs[identifier]),
        window_size,
    )

    data.append(
        go.Scatter(
            x=positions,
            y=averages_hydropathy_values,
            name=window_size
        )
    )

fig = go.Figure(
    data=data,
    layout={
        "title": {
            "text": identifier,
            "font_size": 20
        },
        "xaxis": {"title": {"text": "sequence position (window centre)"}},
        "yaxis": {"title": {"text": "mean hydropathy index"}},
    }
)
fig.update_layout(template='plotly_dark')
fig.show()

# String format

A powerful technique for producing well-formatted strings. The reference can be found [here](https://docs.python.org/3.4/library/string.html#format-specification-mini-language).

In [None]:
"{0}".format(42.0030937)    # cast float to string; without a format spec the result is the same as str(42.0030937)

In [4]:
"{0:.0f}".format(42.0030937)    # cast float to string and round it: fixed-point with 0 decimals -> '42' (a spec like 7.4f would mean width 7 with 4 decimals)

'42'

In [None]:
# Align text in longer context
# Spec layout is [align][width].[precision][type]: ^, < or > picks the alignment
# inside a field of width 17, and .4f prints the float with 4 decimals.
print("Centered      {0:^17.4f}".format(42.0030937))    # center text within longer area
print("Aligned left  {0:<17.4f}".format(42.0030937))    # align left
print("Aligned right {0:>17.4f}".format(42.0030937))    # align right

In [None]:
# Fill empty with placeholder, here _
# The character directly in front of the alignment symbol is used as fill character.
print("Centered      {0:_^17.4f}".format(42.0030937))    # center text within longer area
print("Aligned left  {0:_<17.4f}".format(42.0030937))    # align left
print("Aligned right {0:_>17.4f}".format(42.0030937))    # align right

In [None]:
"{0:7.4f} and not {1:7.4f}".format(42.0030937, 2)    # positional arguments are addressed by their index: {0}, {1}

In [None]:
"{first:7.4f} and not {second:7.4f}".format(first=42.0030937, second=2)    # keyword arguments are addressed by their name

In [None]:
# * unpacks the list into the positional arguments {0} and {1}.
string_elements = [42.0030937, 2]
"{0:7.4f} and not {1:7.4f}".format(*string_elements) # use an (unpacked) list of elements

In [None]:
# ** unpacks the dict into keyword arguments, matched to the field names by key.
string_elements = {'first': 42.00309377849553, 'second': 2}
"{first:7.4f} and not {second:7.4f}".format(**string_elements)  # use a (unpacked) dict

In [None]:
counters = {"warnings": 12, "critical_errors": 0, "passed evaluations": 1002}

# Note counters dict can carry much more than we actually display in our formatted string
# 0>4d: right-align the integer in a field of width 4 and pad with zeros.
# The formatted string is stored and printed explicitly: a bare expression that
# is not the last line of a cell is evaluated but never displayed.
summary = "During run time we found {warnings:0>4d} warnings and {critical_errors:0>4d} critial errors".format(**counters)
print(summary)
print(f"During run time we found {counters['warnings']:0>4d}")

In [None]:
# use different format types ... and yes there are many .. here with an exponent
# 1.3E: scientific notation with 3 decimals; the integer is converted to float before formatting.
# The field name is the dict key "passed evaluations" (including the space).
"During run time {passed evaluations:1.3E} evaluations passed without any issue".format(**counters)

In [None]:
# Python 3.6 introduced another way to use format strings (f-strings): adding an f
# in front of the string literal allows local variables to be inserted directly.
# The format spec after the colon works the same way as with str.format().

warnings = 12
critical_errors = 0

f"During run time we found {warnings:0>4d} warnings and {critical_errors:0>4d} critial errors"

# Dicts

Dictionaries (dicts for short) are lookup data structures made of key-value pairs. Keys have to be hashable (i.e. no dicts, sets or lists; tuples, floats, ints and strings work, for example) and values can be of any type.

**NOTE** dicts used to be unordered, so it is best not to assume they are ordered, or to use `OrderedDict` from `collections` directly (more on collections later).

Lookup a value using its key in a dict is fast!

One can iterate over a dict's
* keys via d.keys()
* values via d.values()
* or pairs of key, values via d.items()

In [None]:
a = {"King": "Arthur", "Uni": "Hd"}


In [None]:
# items() yields (key, value) pairs, unpacked here into two loop variables.
for key, value in a.items():
    print(f'Key {key} points to value {value}')

Given that `d` is a dictionary with unknown content, how would you code the following?
* get value for key "A", and if not available return "unknown amino acid"
* set value for key "A" to "Alanine" if not already in d

How to merge two dicts ?

In [None]:
# update() merges b into a in place; for keys present in both dicts
# (here "King") the value from b replaces the one in a.
b = {"Language": "English", "King": "Nobody!"}
a.update(b)
a

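### Note: Prior to Python 3.6 dicts are not ordered! 
Don't *ASSUME* an order: in CPython 3.6 the insertion order is only an implementation detail, and the language guarantees it from Python 3.7 on. So either limit your module to Python 3.7+ or use `OrderedDict` to be on the safe side.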

In [None]:
from collections import OrderedDict as odict
# An OrderedDict remembers the order in which keys were inserted.
# Gotcha: when it is built from a plain dict literal it takes over whatever
# order that dict has; on interpreters with unordered dicts, pass a list of
# (key, value) tuples instead.
a = odict({"King": "Arthur", "Uni": "Hd"})
b = odict({"Language": "English", "King": "Nobody!"})
print(a, b)

In [None]:
# In-place merge as with plain dicts: a key that already exists keeps its
# position and only gets the new value, new keys are appended at the end.
a.update(b)
a

In [None]:
# Show the docstring of the pop method.
a.pop.__doc__
# Alternative in IPython/Jupyter: "a.pop?" (or "?a.pop")

In [None]:
# q: how to rename a key within a dict
a['Old King'] = a.pop('King')
a
# Note: changes position :)

In [None]:
# dir() lists the attribute names of the bound method object a.pop.
print(dir(a.pop))

# Iteration helpers

How would you code a loop that generates out of an iterable \['A', 'B', 'C' \]
the following 
* AB, AC, BC
* AA, AB, AC, BA, BB, BC, CA, CB, CC



In [None]:
# Hand-written version of "all unordered pairs of distinct elements":
# every pair is normalised to a sorted tuple so that (A, B) and (B, A)
# count as the same pair and are printed only once.
a = ['A', 'B', 'C']
already_seen = set()
for e1 in a:
    for e2 in a:
        pair_key = tuple(sorted((e1, e2)))
        if e1 != e2 and pair_key not in already_seen:
            print(e1, e2)
            already_seen.add(pair_key)

# [itertools](https://docs.python.org/3.7/library/itertools.html)

Python iterator helpers for efficient and **readable** looping.

In [None]:
from itertools import product

# Cartesian product of `a` with itself: all ordered pairs, including pairs of
# an element with itself (assumes `a` is still the list ['A', 'B', 'C']).
list(product(a, repeat=2))

In [None]:
from itertools import combinations

# All 2-element combinations of `a`: each unordered pair once, no element
# paired with itself (assumes `a` is still the list ['A', 'B', 'C']).
list(combinations(a, 2))

#### Exercise No. 4

Building on exercise 2, visualize each question for all 20k+ FASTA entries in one plotly plot. Identify the proteins with, e.g., the smallest or largest mass by hovering over the points. The plots should give you an idea of the distribution for a given observation / question.
