###

# GOR Method Implementation

In [5]:
import pandas as pd
import numpy as np
import math
import os

1. Build a GOR propensity matrix from the files in the propensity folder (for H,E and '-' categories)
2. Create an inference method that predicts the secondary structure at each site of a protein sequence using the GOR method
3. Compare against secondary structure sequences from PDB

## Create GOR Propensity Matrices

### Create Count Tables

Output is three tables: one each for H, E, and '-'
Each table has one row per window position relative to the focal residue (17 rows, offsets -8 to +8) and one column per amino acid (the X padding column is kept but zeroed out before frequencies are calculated).
I'm open to some swaps on that output, but I suspect it will make life a little easier

In [19]:
# One-letter amino-acid codes, listed in the same order as the rows of the table
# in the original GOR paper. The trailing 'X' is a padding symbol used to handle
# sequence edges; its counts are zeroed before frequencies are calculated.
AA_LETTERS = list('GAVLISTDENQKHRFYWCMPX')
# Secondary-structure labels used as keys for the per-structure tables.
STRUCTURE_KEYS = list('HE-')

In [2]:
def init_count_table():
    """Return an all-zero count table with 17 rows (labelled 0..16) and one column per entry of AA_LETTERS."""
    window_rows = np.arange(17)
    return pd.DataFrame(0, index=window_rows, columns=AA_LETTERS)

def init_class_counts():
    """Return a dict holding one independent, zeroed count table for each of 'H', 'E' and '-'."""
    return {label: init_count_table() for label in ('H', 'E', '-')}

In [6]:
# Global accumulator: one zeroed count table per structure class ('H', 'E', '-').
count_tables = init_class_counts()

In [None]:
def count_seq_file(seqfile):
    """Count residues in a 17-wide window around every position of one file.

    The file's first line is read as the structure string and its second line
    as the amino-acid sequence. The sequence is padded with eight 'X'
    characters on each side so that a full window exists for every position.

    Parameters
    ----------
    seqfile : path of the file to read.

    Returns
    -------
    dict mapping 'H', 'E' and '-' to a count table in which row ``j`` /
    column ``aa`` holds how often residue ``aa`` appeared at window row ``j``
    (row 8 is the focal residue) for positions labelled with that structure.
    """
    tables = init_class_counts()
    with open(seqfile, 'r') as handle:
        lines = handle.readlines()
    structure = lines[0].strip()
    sequence = lines[1].strip()

    flank = 'X' * 8
    padded = flank + sequence + flank

    for center, category in enumerate(structure):
        # Slice starting at `center` in the padded string spans offsets -8..+8
        # around the residue at `center` in the unpadded sequence.
        for row, residue in enumerate(padded[center:center + 17]):
            tables[category].loc[row, residue] += 1
    return tables


In [8]:
# Accumulate the window counts of every training file into `count_tables`.
directory = r"train/"
# Start from fresh tables so that re-running this cell does not double-count.
count_tables = init_class_counts()
for filename in sorted(os.listdir(directory)):
    filepath = os.path.join(directory, filename)
    # os.listdir also returns sub-directories (e.g. .ipynb_checkpoints),
    # which cannot be opened as sequence files.
    if not os.path.isfile(filepath):
        continue
    seq_class_counts = count_seq_file(filepath)
    for key in seq_class_counts.keys():
        count_tables[key] += seq_class_counts[key]

In [None]:
# Zero the 'X' padding column before computing frequencies.
# dict.copy() is shallow: it would share the DataFrames with `count_tables`,
# so each table is copied explicitly to leave the raw counts untouched.
trimmed_count_tables = {key: table.copy() for key, table in count_tables.items()}
for key in trimmed_count_tables:
    trimmed_count_tables[key]["X"] = 0

### Create Frequency Tables from Count Tables

In [39]:
# Sum of row 8 (the focal position of the 17-wide window) of the helix count table.
trimmed_count_tables['H'].loc[8].sum()

70340

In [42]:
# Per-structure totals taken from row 8 (the focal position) of each count table,
# then converted to the fraction of all counted residues in each structure class.
structure_totals = {
    state: trimmed_count_tables[state].loc[8].sum()
    for state in STRUCTURE_KEYS
}
n_residues = sum(structure_totals.values())
structure_tot_freqs = {
    state: total / n_residues
    for state, total in structure_totals.items()
}
structure_tot_freqs

{'H': 0.3549997224198929, 'E': 0.22097899980317046, '-': 0.4240212777769366}

In [88]:
# Total count in each window row of the helix table (the 'X' column has been zeroed,
# so rows other than the focal row 8 can have smaller totals).
trimmed_count_tables['H'].sum(axis=1)

0     68073
1     68500
2     68916
3     69317
4     69671
5     69974
6     70199
7     70326
8     70340
9     70322
10    70155
11    69867
12    69494
13    69066
14    68592
15    68082
16    67541
dtype: int64

In [96]:
# Sanity check: after dividing each row by its own total, every row should sum to 1.
trimmed_count_tables['H'].div(trimmed_count_tables['H'].sum(axis=1),axis=0).sum(axis=1)

0     1.0
1     1.0
2     1.0
3     1.0
4     1.0
5     1.0
6     1.0
7     1.0
8     1.0
9     1.0
10    1.0
11    1.0
12    1.0
13    1.0
14    1.0
15    1.0
16    1.0
dtype: float64

In [None]:
# Build the per-structure frequency tables: each row of a count table is divided
# by its own total, so every window position becomes a distribution over residues.
freq_tables = {
    key: trimmed_count_tables[key].div(trimmed_count_tables[key].sum(axis=1), axis=0)
    for key in STRUCTURE_KEYS
}
freq_tables['H']

### Create Information Tables From Frequency Tables

In [None]:
def compute_info_table(freq, freq_total):
    """Build one information table per structure class.

    For every structure key ``s`` in ``freq``, window offset ``m`` in -8..8 and
    amino acid ``aa`` in AA_LETTERS, the entry is ``log2(f_s / f_t)`` where
    ``f_s = freq[s].loc[m + 8, aa]`` and ``f_t = freq_total.loc[m + 8, aa]``.
    Entries for which either frequency is zero are left at 0 to avoid log(0).

    Parameters
    ----------
    freq : dict
        Maps each structure key to a DataFrame with rows labelled 0..16
        (window positions) and AA_LETTERS as columns.
    freq_total : pandas.DataFrame
        Table with the same row/column layout as the entries of ``freq``
        (presumably the frequencies over all structures -- confirm with caller).

    Returns
    -------
    dict
        Maps each key of ``freq`` to a DataFrame indexed by window offset
        (-8..8) with AA_LETTERS as columns.
    """
    window_list = [-8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8]
    info = {}

    for s in freq:
        info_s = pd.DataFrame(0.0, index=window_list, columns=AA_LETTERS)
        for offset in window_list:
            # Frequency tables are indexed 0..16; offset -8 corresponds to row 0.
            row = offset + 8
            for aa in AA_LETTERS:
                f_s = freq[s].loc[row, aa]
                f_t = freq_total.loc[row, aa]

                # Avoid log(0): the entry keeps its initial value of 0.
                if f_s == 0 or f_t == 0:
                    continue
                info_s.loc[offset, aa] = math.log2(f_s / f_t)
        info[s] = info_s
    return info

## Inference Method

In [None]:
def predict_secondary_structure(sequence, info_tables, window=17):
    """Predict a secondary-structure label for every residue of a sequence.

    For each residue, the information values of all residues inside the window
    centred on it are summed per structure class, and the class with the
    highest total is chosen (ties go to the first of helix, strand, coil).

    Parameters
    ----------
    sequence : str
        Amino-acid sequence; it is upper-cased before scoring. Residues not in
        AA_LETTERS and window positions outside the sequence are skipped.
    info_tables : dict
        Maps structure keys to DataFrames indexed by window offset with
        amino-acid columns. Must contain 'H' and 'E'; the coil table may be
        keyed either 'C' or '-' ('C' is used when present).
    window : int, default 17
        Window width; offsets run from ``-(window // 2)`` to ``window // 2``.

    Returns
    -------
    str
        One structure label per residue, using the keys of ``info_tables``.
    """
    half_window = window // 2
    sequence = sequence.upper()
    length = len(sequence)

    # The count/frequency tables in this notebook label coil as '-', whereas
    # this function originally assumed 'C'; accept whichever key is supplied.
    coil = 'C' if 'C' in info_tables else '-'
    states = ('H', 'E', coil)

    pred_ss = []
    for i in range(length):
        scores = {s: 0.0 for s in states}

        # Slide the window centred on residue i.
        for offset in range(-half_window, half_window + 1):
            j = i + offset

            # Skip positions that fall outside the sequence.
            if j < 0 or j >= length:
                continue

            aa = sequence[j]
            if aa not in AA_LETTERS:
                continue

            # Add this residue's contribution for each structure class.
            for s in states:
                scores[s] += info_tables[s].loc[offset, aa]

        # Choose the structure with the highest total score.
        pred_ss.append(max(scores, key=scores.get))

    return ''.join(pred_ss)

## Evaluate Against PDB Files

In [None]:
# Code Goes Here