In [1]:
from collections import Counter, defaultdict
from itertools import chain, combinations, product
import numpy as np
import pandas as pd

# Exercise 0

In [2]:
bdb = """
TSVKTYAKFVTH
TSVKTYAKFSTH
TSVKTYAKFVTH
LSVKKYPKYVVQ
SSVKKYPKYSVL
"""

bdb = bdb.strip().split()
bdb

['TSVKTYAKFVTH',
 'TSVKTYAKFSTH',
 'TSVKTYAKFVTH',
 'LSVKKYPKYVVQ',
 'SSVKKYPKYSVL']

## AA Frequencies

### Absolute Observed AA Frequencies

In [3]:
observed_frequencies = Counter(chain.from_iterable(bdb))
observed_frequencies

Counter({'A': 3,
         'F': 3,
         'H': 3,
         'K': 12,
         'L': 2,
         'P': 2,
         'Q': 1,
         'S': 8,
         'T': 9,
         'V': 10,
         'Y': 7})

In [4]:
keys = sorted(observed_frequencies.keys())
keys

['A', 'F', 'H', 'K', 'L', 'P', 'Q', 'S', 'T', 'V', 'Y']

In [5]:
key2pos = {b:a for a,b in enumerate(keys)}
key2pos

{'A': 0,
 'F': 1,
 'H': 2,
 'K': 3,
 'L': 4,
 'P': 5,
 'Q': 6,
 'S': 7,
 'T': 8,
 'V': 9,
 'Y': 10}

There are $n \times m$ observed frequencies for $n$ strings of length $m$

In [6]:
total = sum(observed_frequencies.values())
total

60

### Relative Observed AA Frequencies

Relative frequencies are calculated as $p_a=\frac{f_a}{\sum_if_i}$

In [7]:
observed_relative_frequencies = {k: v / total for k, v in observed_frequencies.items()}
observed_relative_frequencies

{'A': 0.05,
 'F': 0.05,
 'H': 0.05,
 'K': 0.2,
 'L': 0.03333333333333333,
 'P': 0.03333333333333333,
 'Q': 0.016666666666666666,
 'S': 0.13333333333333333,
 'T': 0.15,
 'V': 0.16666666666666666,
 'Y': 0.11666666666666667}

## Pair Frequencies

### Observed Absolute Pair Frequencies

In [8]:
observed_pair_frequencies = np.zeros(shape=(len(keys), len(keys)))

for a, b in combinations(bdb, 2):
    for x_y in zip(a, b):
        x, y = sorted(x_y)
        observed_pair_frequencies[key2pos[x], key2pos[y]] += 1
        observed_pair_frequencies[key2pos[y], key2pos[x]] += 1
    
pd.DataFrame(observed_pair_frequencies, columns=keys, index=keys, dtype=int)

Unnamed: 0,A,F,H,K,L,P,Q,S,T,V,Y
A,3,0,0,0,0,6,0,0,0,0,0
F,0,3,0,0,0,0,0,0,0,0,6
H,0,0,3,0,3,0,3,0,0,0,0
K,0,0,0,21,0,0,0,0,6,0,0
L,0,0,0,0,0,0,1,1,3,0,0
P,0,0,0,0,0,1,0,0,0,0,0
Q,0,0,0,0,0,0,0,0,0,0,0
S,0,0,0,0,0,0,0,11,3,6,0
T,0,0,0,0,0,0,0,0,9,6,0
V,0,0,0,0,0,0,0,0,0,14,0


The number of total pairs is $m \times {n \choose 2}$ for $n$ strings of length $m$

In [9]:
total_pairs = observed_pair_frequencies.sum()
total_pairs

120.0

### Observed Relative Pair Frequencies

Relative pair frequencies are calculated as $p_{ab}=\frac{f_{ab}}{\sum_{i,j}f_{ij}}=\frac{\text{frequency of pair} ab}{\text{sum of frequencies of all pairs}}$

In [10]:
observed_relative_pair_frequencies = observed_pair_frequencies / observed_pair_frequencies.sum()

pd.DataFrame((100 * observed_relative_pair_frequencies.round(2)).astype(int), columns=keys, index=keys, dtype=int)

Unnamed: 0,A,F,H,K,L,P,Q,S,T,V,Y
A,2,0,0,0,0,5,0,0,0,0,0
F,0,2,0,0,0,0,0,0,0,0,5
H,0,0,2,0,2,0,2,0,0,0,0
K,0,0,0,18,0,0,0,0,5,0,0
L,0,0,0,0,0,0,1,1,2,0,0
P,0,0,0,0,0,1,0,0,0,0,0
Q,0,0,0,0,0,0,0,0,0,0,0
S,0,0,0,0,0,0,0,9,2,5,0
T,0,0,0,0,0,0,0,0,8,5,0
V,0,0,0,0,0,0,0,0,0,12,0


### Expected AA Pair Frequencies

In [11]:
expected_pair_probability = np.zeros(shape=(len(keys), len(keys)))

for (i, a), (j, b) in product(enumerate(keys), repeat=2):
    if a == b:
        expected_pair_probability[i, j] = observed_relative_frequencies[a] * observed_relative_frequencies[b]
    else:
        expected_pair_probability[i, j] = 2 * observed_relative_frequencies[a] * observed_relative_frequencies[b]

pd.DataFrame((100 * expected_pair_probability.round(2)).astype(int), columns=keys, index=keys, dtype=int)

Unnamed: 0,A,F,H,K,L,P,Q,S,T,V,Y
A,0,1,1,2,0,0,0,1,2,2,1
F,1,0,1,2,0,0,0,1,2,2,1
H,1,1,0,2,0,0,0,1,2,2,1
K,2,2,2,4,1,1,1,5,6,7,5
L,0,0,0,1,0,0,0,1,1,1,1
P,0,0,0,1,0,0,0,1,1,1,1
Q,0,0,0,1,0,0,0,0,0,1,0
S,1,1,1,5,1,1,0,2,4,4,3
T,2,2,2,6,1,1,0,4,2,5,3
V,2,2,2,7,1,1,1,4,5,3,4


## Log Odds Ratio and Final Score Matrix

In [12]:
final_score = (2 * np.log2((observed_relative_pair_frequencies + 0.5) / (expected_pair_probability + 1))).round(1)

pd.DataFrame(final_score, columns=keys, index=keys, dtype=int)

Unnamed: 0,A,F,H,K,L,P,Q,S,T,V,Y
A,-1,-2,-2,-2,-2,-1,-2,-2,-2,-2,-2
F,-2,-1,-2,-2,-2,-2,-2,-2,-2,-2,-1
H,-2,-2,-1,-2,-1,-2,-1,-2,-2,-2,-2
K,-2,-2,-2,-1,-2,-2,-2,-2,-1,-2,-2
L,-2,-2,-2,-2,-2,-2,-2,-2,-1,-2,-2
P,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2
Q,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2
S,-2,-2,-2,-2,-2,-2,-2,-1,-2,-1,-2
T,-2,-2,-2,-2,-2,-2,-2,-2,-1,-1,-2
V,-2,-2,-2,-2,-2,-2,-2,-2,-2,-1,-2
