###

# GOR Method Implementation

In [3]:
import pandas as pd
import numpy as np

1. Build a GOR propensity matrix from the files in the propensity folder (one each for the H, E and '-' categories)
2. Create an inference method that predicts the secondary-structure category of each site in a protein sequence via the GOR method
3. Compare against secondary structure sequences from PDB

## Create GOR Propensity Matrices

### Create Frequency Tables

In [46]:
# One-letter amino-acid codes, listed in the same order as the table in the original GOR paper.
# The trailing 'X' is a padding symbol used to handle sequence edges; it is dropped before
# frequencies are calculated.
AA_LETTERS = list('GAVLISTDENQKHRFYWCMPX')

In [59]:
def init_count_table():
    """Return a fresh all-zero count table.

    The table has 17 rows (labelled 0-16) and one column per entry of
    ``AA_LETTERS``.
    """
    row_labels = np.arange(17)
    empty_table = pd.DataFrame(0, index=row_labels, columns=AA_LETTERS)
    return empty_table

def init_class_counts():
    """Return a dict with one independent zeroed count table per class: 'H', 'E' and '-'."""
    class_labels = ('H', 'E', '-')
    # Each label gets its own table object so the classes never share counts.
    return {label: init_count_table() for label in class_labels}

In [60]:
# Notebook-level set of zeroed count tables, one per structure class ('H', 'E', '-').
# NOTE(review): in the code shown here count_seq_file builds and returns its own tables and
# never writes to this variable, so it stays all zeros unless results are added explicitly.
count_tables = init_class_counts()

In [None]:
def count_seq_file(seqfile, verbose=True):
    """Tally the residues found in the window around every position of one file.

    The first line of ``seqfile`` is read as the structure string (one class
    label per residue) and the second line as the amino-acid sequence. The
    sequence is padded with 'X' on both sides so that every residue has a
    full window. For each residue, every letter in its window is counted in
    the table belonging to that residue's structure class.

    Parameters
    ----------
    seqfile : str or path-like
        File holding the structure string on line 1 and the sequence on line 2.
    verbose : bool, default True
        Print the structure and sequence that were read. Pass ``False`` when
        looping over many files to keep the output readable.

    Returns
    -------
    dict
        Tables keyed like the result of ``init_class_counts()``, holding the
        counts for this file only. Row ``j`` is window position ``j`` (the
        focal residue sits on the middle row); columns are residue letters.

    Raises
    ------
    IndexError
        If the file has fewer than two lines.
    ValueError
        If the structure and sequence lines differ in length.
    KeyError
        If a structure label has no table or a residue letter has no column.
    """
    with open(seqfile, 'r') as f:
        lines = f.readlines()
    structure = lines[0].strip()
    sequence = lines[1].strip()
    if verbose:
        print(structure)
        print(sequence)

    # A misaligned pair would silently produce truncated or shifted windows
    # and corrupt the tables, so refuse it outright.
    if len(structure) != len(sequence):
        raise ValueError(
            f"{seqfile}: structure has {len(structure)} characters but "
            f"sequence has {len(sequence)}"
        )

    seq_class_counts = init_class_counts()

    # Take the window geometry from the tables themselves so it cannot drift
    # away from whatever init_count_table() defines.
    template = next(iter(seq_class_counts.values()))
    window_len = len(template.index)
    flank = window_len // 2
    column_of = {letter: k for k, letter in enumerate(template.columns)}

    padding = 'X' * flank
    padded_seq = padding + sequence + padding

    # Accumulate in plain integer arrays: incrementing DataFrame cells one at
    # a time through .loc is far slower and gives the same totals.
    tallies = {
        label: np.zeros(template.shape, dtype=np.int64)
        for label in seq_class_counts
    }
    for i, category in enumerate(structure):
        class_tally = tallies[category]
        # padded_seq[i + flank] is residue i, so it lands on the middle row.
        for j, residue in enumerate(padded_seq[i:i + window_len]):
            class_tally[j, column_of[residue]] += 1

    # Adding to the zeroed tables keeps their index and column labels.
    return {
        label: table + tallies[label]
        for label, table in seq_class_counts.items()
    }


In [62]:
## Test count_seq_file
test_file = "train/2.dssp"
# count_seq_file returns fresh tables for this file and does not touch the global
# count_tables, so inspect the returned value (the global would print all zeros).
test_counts = count_seq_file(test_file)
print(test_counts["-"])

----------EEEEEE--EEEE----EEEEEEEE----EEEEEE-----------HHHH-------EEEEE---EEEE----EEEEEEEEEE--EEEEEEEE----
AGEDVGAPPDHLWVHQEGIYRDEYQRTWVAVVEEETSFLRARVQQIQVPLGDAARPSHLLTSQLPLMWQLYPEERYMDNNSRLWQIQHHLMVRGVQELLLKLLPDD
    G  A  V  L  I  S  T  D  E  N  ...  K  H  R  F  Y  W  C  M  P  X
0   2  5  3  4  2  2  1  2  4  0  ...  0  2  3  1  1  1  0  0  4  8
1   2  5  4  5  3  1  0  1  5  0  ...  0  2  4  1  0  1  0  1  3  7
2   2  4  5  8  2  1  0  1  5  0  ...  0  3  4  0  2  1  0  1  2  6
3   2  3  5  7  2  1  1  1  3  0  ...  1  3  5  0  2  2  0  2  2  5
4   3  2  6  8  2  2  1  3  3  0  ...  1  2  3  0  2  2  0  1  2  4
5   3  2  5  8  1  2  1  4  5  1  ...  1  2  2  0  3  0  0  2  1  3
6   3  3  4  7  1  1  2  4  5  2  ...  1  2  1  0  2  0  0  2  4  2
7   3  4  3  6  1  3  2  5  6  2  ...  0  0  1  0  2  0  0  0  6  1
8   5  4  2  5  1  3  2  5  6  2  ...  0  0  4  1  1  0  0  0  6  0
9   5  3  3  5  2  3  3  5  3  1  ...  0  1  4  1  1  0  0  1  6  1
10  2  3  3  6  1  4  1  5  2  0  ... 

In [41]:
## Output is three tables: one each for H, E, and '-'
## Each table has a row for each window position relative to the focal residue (rows 0-16, focal residue at row 8) and a column for each AA
## I'm open to some changes to that output layout, but I suspect it will make life a little easier


### Create Information Tables From Frequency Tables

In [None]:
# Code Goes Here

## Inference Method

In [None]:
# Code Goes Here
        

## Evaluate Against PDB Files

In [None]:
# Code Goes Here