get some proteins sequences from one of jeppe's data files

In [1]:
import numpy as np
import torch
import imp
import re
from torch.nn.utils.rnn import pad_sequence

In [2]:
sample_file = '/Users/Deathvoodoo/Documents/openprotein/data/raw/protein_net_testfile.txt'

In [3]:
aa_id_dict = {'A': 0, 'C': 1, 'D': 2, 'E': 3, 'F': 4, 'G': 5, 'H': 6, 'I': 7,
              'K': 8, 'L': 9, 'M': 10, 'N': 11, 'P': 12, 'Q': 13, 'R': 14, 
              'S': 15, 'T': 16, 'V': 17, 'W': 18, 'Y': 19}


In [4]:
def aa_to_onehot(aa_str, aa_to_nr, mask=None):
    if mask!=None:
        mask_ind = np.asarray([x=='+' for x in mask])*1
        mask_ind = np.nonzero(mask_ind)
        aa_str = "".join([aa_str[x] for x in mask_ind[0]]) # because it gets put in another list
    init_array = np.zeros( (len(aa_to_nr.keys()), len(aa_str)) )
    for i,j in enumerate(aa_str):
        init_array[aa_to_nr[j], i] = 1
    return(init_array)

In [5]:
id_list = []
seqs = {}

In [6]:
with open(sample_file) as input:
    lines = input.readlines()
    curr_id = None
    for i, line in enumerate(lines):
        line = line.strip()
        if line == '[ID]':
            curr_id = lines[i+1].strip()
            id_list.append(curr_id)
            seqs[curr_id] = {}
            seqs[curr_id]['primary'] = lines[i+3].strip()
        if line == '[TERTIARY]':
            coords = []
            for j in range(3):
                coords.append(np.fromstring(lines[i+j+1], sep='\t'))
            seqs[curr_id]['tertiary'] = np.array(coords)
        if line == '[MASK]':
            seqs[curr_id]['mask'] = lines[i+1].strip()


In [7]:
def filter_coords(coords, mask):
    mask = np.array([x=='+' for x in mask])
    mask_stretched = np.repeat(mask, 3)
    coords_filt = coords[:, mask_stretched]
    return(coords_filt)

def filter_seqs(protein_dict):
    re_chainbreak = re.compile("\-*\+*\+\-+\+\+*\-*")
    keys_to_remove = []
    for key in protein_dict.keys():
        mask = protein_dict[key]['mask']
        if re_chainbreak.search(mask):
            keys_to_remove.append(key)
        else:
            coords = protein_dict[key]['tertiary']
            coords_filt = filter_coords(coords, mask)
            protein_dict[key]['tertiary'] = coords_filt
    for key in keys_to_remove:
        print(key, " has a chainbreak, removing...")
        del protein_dict[key]

def new_dihedral(p0, p1, p2, p3): # COPY PASTA'D FROM STACKEXCHANGE
    """Praxeolitic formula
    1 sqrt, 1 cross product"""

    b0 = -1.0*(p1 - p0)
    b1 = p2 - p1
    b2 = p3 - p2

    # normalize b1 so that it does not influence magnitude of vector
    # rejections that come next
    b1 /= np.linalg.norm(b1)

    # vector rejections
    # v = projection of b0 onto plane perpendicular to b1
    #   = b0 minus component that aligns with b1
    # w = projection of b2 onto plane perpendicular to b1
    #   = b2 minus component that aligns with b1
    v = b0 - np.dot(b0, b1)*b1
    w = b2 - np.dot(b2, b1)*b1

    # angle between v and w in a plane is the torsion angle
    # v and w may not be normalized but that's fine since tan is y/x
    x = np.dot(v, w)
    y = np.dot(np.cross(b1, v), w)
    #return np.degrees(np.arctan2(y, x))
    return np.arctan2(y, x)

def calc_angles(coords):
    N = coords.shape[1]
    angles_all = []
    for i in range(0, N-3, 3):
        psi = new_dihedral(*[coords[:, x] for x in range(i, i+4)])
        omega = new_dihedral(*[coords[:, x] for x in range(i+1, i+5)])
        phi = new_dihedral(*[coords[:, x] for x in range(i+2, i+6)])
        angles = [psi, omega, phi]
        angles_all.append(angles)
    return(np.array(angles_all))

In [8]:
#for key in seqs.keys():
#    print(key, seqs[key]['mask'])

In [9]:
filter_seqs(seqs)

TBM#T0366  has a chainbreak, removing...
TBM#T0365  has a chainbreak, removing...
TBM#T0364  has a chainbreak, removing...
TBM-hard#T0356  has a chainbreak, removing...
TBM#T0339  has a chainbreak, removing...
TBM#T0359  has a chainbreak, removing...
TBM#T0331  has a chainbreak, removing...
TBM#T0320  has a chainbreak, removing...
TBM-hard#T0321  has a chainbreak, removing...
TBM#T0313  has a chainbreak, removing...
TBM#T0330  has a chainbreak, removing...
TBM#T0301  has a chainbreak, removing...
TBM#T0379  has a chainbreak, removing...
TBM#T0305  has a chainbreak, removing...
TBM#T0378  has a chainbreak, removing...
TBM#T0288  has a chainbreak, removing...
FM#T0296  has a chainbreak, removing...
TBM-hard#T0316  has a chainbreak, removing...
TBM#T0292  has a chainbreak, removing...
TBM#T0293  has a chainbreak, removing...
TBM#T0341  has a chainbreak, removing...
TBM#T0291  has a chainbreak, removing...
FM#T0300  has a chainbreak, removing...
TBM-hard#T0347  has a chainbreak, removing..

In [10]:
len(seqs.keys())

66

In [11]:
seqs['TBM#T0285']['tertiary'].shape

(3, 297)

omega ca to ca, phi c to c, psi n to n, loop over every residue except first (or last) and calculate dihedral angle

In [12]:
test_coords = seqs['TBM#T0285']['tertiary']
test_angles = []
for i in range(0, 297-3, 3):
    psi = new_dihedral(*[test_coords[:, x] for x in range(i, i+4)])
    omega = new_dihedral(*[test_coords[:, x] for x in range(i+1, i+5)])
    phi = new_dihedral(*[test_coords[:, x] for x in range(i+2, i+6)])
    angles = [psi, omega, phi]
    test_angles.append(angles)
    

In [13]:
new_dihedral(*[test_coords[:, x] for x in range(0,0+4)])

-0.7731468104941539

In [14]:
np.array(test_angles).shape

(98, 3)

In [43]:
# check how big the proteins are, dont wanna pad a million zeroes
keylengths=[len(seqs[key]['primary']) for key in seqs.keys()]
max(keylengths)

530

In [36]:
max_length = 1000
sequence_list = []
angle_list = []
for key in seqs.keys():
    sequence = aa_to_onehot(seqs[key]['primary'], aa_id_dict, seqs[key]['mask'])
    coords = seqs[key]['tertiary']
    angles = calc_angles(coords).T
    if max_length-sequence.shape[0] < 0:
        print(key, ' exceeds max length: ', max_length, ' , skipping...')
        continue
    else:
        sequence_padded = np.pad(sequence, pad_width=((0,0), (0, max_length-sequence.shape[1])), constant_values=0)
        sequence_list.append(sequence_padded)
        angles_padded = np.pad(angles, pad_width=((0,0), (0, max_length-sequence.shape[1])), constant_values=0)
        angle_list.append(angles_padded)

In [37]:
np.array(sequence_list).shape

(66, 20, 1000)

In [38]:
np.array(angle_list).shape

(66, 3, 999)

In [40]:
np.max(np.array(angle_list))

3.1415832609565535

In [19]:
np.pad(aa_to_onehot(seqs['TBM#T0285']['primary'], aa_id_dict, seqs['TBM#T0285']['mask']),
       pad_width=((0,0), (0,1000)), constant_values=0 ).sum() # making sure nothing weird goes on

99.0

In [20]:
#for i,j in zip(sequence_list, [seqs[key]['tertiary'].shape[1]/3 for key in seqs.keys()]):
#    print(i.sum(), j) # sanity check

In [21]:
def split_inds(N, train_fraction, validation_fraction):
    indices = np.arange(N)
    train_to = int(np.floor(len(indices)*train_fraction))
    train_inds = indices[0: train_to]
    val_to = int(np.floor(len(indices)*(train_fraction+validation_fraction)))
    val_inds = indices[train_to: val_to]
    test_inds = indices[val_to:]
    return(train_inds, val_inds, test_inds)

In [110]:
sequence_tensor = torch.tensor(sequence_list).float()
sequence_tensor = sequence_tensor.unsqueeze(1)
print(sequence_tensor.shape)

torch.Size([66, 1, 20, 1000])


In [111]:
angle_tensor = torch.tensor(angle_list).float()
angle_tensor = angle_tensor.unsqueeze(1)
print(angle_tensor.shape)

torch.Size([66, 1, 3, 999])


In [112]:
sequence_tensor.dtype

torch.float32

In [113]:
train_inds, val_inds, test_inds = split_inds(66, 0.8, 0.1)

train_data = [sequence_tensor[train_inds, :, :, :], angle_tensor[train_inds, :, :, :]]
val_data = [sequence_tensor[val_inds, :, :, :], angle_tensor[val_inds, :, :, :]]
test_data = [sequence_tensor[test_inds, :, :, :], angle_tensor[test_inds, :, :, :]]

In [114]:
from torch.utils.data import Dataset
class proteindataset(Dataset):
    def __init__(self, seqs, angles):
        self.sequences = seqs
        self.angles = angles

    def __len__(self):
        return self.sequences.shape[0]

    def __getitem__(self, idx):
        return [self.sequences[idx, :, :, :], self.angles[idx, :, :, :]]

In [115]:
train_dataset = proteindataset(train_data[0], train_data[1])
val_dataset = proteindataset(val_data[0], val_data[1])
test_dataset = proteindataset(test_data[0], test_data[1])

In [116]:
trainloader = torch.utils.data.DataLoader(train_dataset, batch_size=4,
                                          shuffle=True, num_workers=2)
valloader = torch.utils.data.DataLoader(val_dataset, batch_size=4,
                                         shuffle=False, num_workers=2)
testloader = torch.utils.data.DataLoader(test_dataset, batch_size=4,
                                         shuffle=False, num_workers=2)

In [122]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 6, (20, 4)) #in, out, kernel size (can be 1 number)
        self.conv2 = nn.Conv2d(6, 12, (1, 4))
        self.deconv1 = nn.ConvTranspose2d(in_channels=12, out_channels=6, kernel_size=(1, 4))
        self.deconv2 = nn.ConvTranspose2d(in_channels=6, out_channels=3, kernel_size=(1, 3))

    def forward(self, x):
        #print('input shape: ', x.shape)
        conv1_out = F.relu(self.conv1(x))
        #print('conv1 out shape: ', conv1_out.shape)
        conv2_out = F.relu(self.conv2(conv1_out))
        #print('conv2 out shape: ', conv2_out.shape)
        deconv1_out = F.relu(self.deconv1(conv2_out))
        #print('deconv1 out shape: ', deconv1_out.shape)
        deconv2_out = self.deconv2(deconv1_out)
        #print('deconv2 out shape: ', deconv2_out.shape)
        return deconv2_out


net = Net()

In [158]:
criterion = nn.MSELoss()
optimizer = optim.Adam(net.parameters(), lr=1e-4)

prints_per_epoch = 3

verbose_k = np.floor(len(trainloader)/prints_per_epoch)
print(verbose_k)

losses = []
iterations = []
best_loss = None

for epoch in range(10):

    running_loss = 0.0
    for i, data in enumerate(trainloader):
    #for i in range(66):
        #sequence = train_data[0][i, :, :, :].unsqueeze(0)
        #true_angles = train_data[1][i, :, :, :].unsqueeze(0)
        sequence, true_angles = data
        #print(sequence.shape, true_angles.shape)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        predicted_angles = net(sequence)

        loss = criterion(predicted_angles, true_angles)
        loss.backward()
        optimizer.step()

        # print statistics, should add validation loss
        running_loss += loss.item()
        
        if (i+1) % verbose_k == 0:   
            losses.append(running_loss/verbose_k)
            true_iter = len(trainloader)*epoch + i
            iterations.append(true_iter)
            
            if best_loss == None:
                best_loss = running_loss/verbose_k
            else:
                if running_loss/verbose_k <= min(losses):
                    print('new best loss, saving..')
                    best_loss = running_loss/verbose_k
                    torch.save(net.state_dict(), 'best_fcn_parameters.pt')
            
            print('epoch: {}, iteration: {}] loss: {}'.format(epoch, i, running_loss/verbose_k))
            running_loss = 0.0

print('Finished Training')

4.0
epoch: 0, iteration: 3] loss: 0.8308959752321243
0.8133204430341721 0.8133204430341721
new best loss, saving..
epoch: 0, iteration: 7] loss: 0.8133204430341721
0.8099523782730103 0.8099523782730103
new best loss, saving..
epoch: 0, iteration: 11] loss: 0.8099523782730103
0.7754370272159576 0.7754370272159576
new best loss, saving..
epoch: 1, iteration: 3] loss: 0.7754370272159576
0.7317431718111038 0.7317431718111038
new best loss, saving..
epoch: 1, iteration: 7] loss: 0.7317431718111038
0.9498272091150284 0.7317431718111038
epoch: 1, iteration: 11] loss: 0.9498272091150284
1.0188564360141754 0.7317431718111038
epoch: 2, iteration: 3] loss: 1.0188564360141754
0.7587235197424889 0.7317431718111038
epoch: 2, iteration: 7] loss: 0.7587235197424889
0.6247512698173523 0.6247512698173523
new best loss, saving..
epoch: 2, iteration: 11] loss: 0.6247512698173523
0.8720566481351852 0.6247512698173523
epoch: 3, iteration: 3] loss: 0.8720566481351852
0.8232081383466721 0.6247512698173523
epo

In [144]:
len(trainloader)

13

In [145]:
[len(x) for x in trainloader]

[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]

In [146]:
[x.shape for x in train_data]

[torch.Size([52, 1, 20, 1000]), torch.Size([52, 1, 3, 999])]

In [147]:
train_data[1][0, :, : ,:].shape

torch.Size([1, 3, 999])

In [159]:
best_loss

0.6247512698173523

In [160]:
min(losses)

0.6247512698173523