In [2]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [3]:
train_raw = pd.read_csv('../data/train_MPRA.txt', delimiter='\t', header=None)
test_raw = pd.read_csv('../data/test_MPRA.txt', delimiter='\t', header=None)
train_sol = pd.read_csv('../data/trainsolutions.txt', delimiter='\t', header=None)
train_raw.head()
strand_length = 295


In [4]:
# Get our x and y data
train_scores = np.array(train_raw.iloc[:, 2:297]) #Dimensions are 8000 (samples) by 295 (SHARPR scores per nucleotide)
raw_dna_strands_train = [list(train_raw[1][i]) for i in range(len(train_raw))] #List of lists holding DNA strands separated by character. Size 8000 lists each of length 290
embedded_dna_strands_train = [np.column_stack((np.array(pd.get_dummies(pd.concat([pd.Series(raw_dna_strands_train[i]), pd.Series(["A", "C", "T", "G"])]), dtype='int'))[:-4], np.arange(295))) for i in range(len(train_raw))] #One hot encoded dna strands, list of 8000 matrices, each (295,5)
embedded_dna_strands_train = [embedded_dna_strands_train[i] for i in range(len(embedded_dna_strands_train)) if not ("N" in raw_dna_strands_train[i])]
train_scores  = [train_scores[i] for i in range(len(raw_dna_strands_train)) if not ("N" in raw_dna_strands_train[i])]
#Repeat for test data
raw_dna_strands_test = [list(test_raw[1][i]) for i in range(len(test_raw))] #List of lists holding DNA strands separated by character. Size 8000 lists each of length 290
embedded_dna_strands_test = [np.column_stack((np.array(pd.get_dummies(pd.concat([pd.Series(raw_dna_strands_test[i]), pd.Series(["A", "C", "T", "G"])]), dtype='int'))[:-4], np.arange(295))) for i in range(len(test_raw))]
embedded_dna_strands_test = [embedded_dna_strands_test[i] for i in range(len(embedded_dna_strands_test)) if not ("N" in raw_dna_strands_test[i])]

In [5]:
def create_positional_encoding(sequence_length, encoding_dim):
    """
    Create sinusoidal positional encodings for a sequence
    
    Args:
        sequence_length: Length of the sequence
        encoding_dim: Number of dimensions for positional encoding
    """
    # Create position vector (0, 1, 2, ...)
    position = torch.arange(sequence_length).unsqueeze(1)
    
    # Create scaling factor for different dimensions
    div_term = torch.exp(torch.arange(0, encoding_dim, 2) * (-np.log(10000.0) / encoding_dim))
    
    # Create empty encoding matrix
    pos_encoding = torch.zeros(sequence_length, encoding_dim)
    
    # Fill with sine and cosine values
    pos_encoding[:, 0::2] = torch.sin(position * div_term)
    pos_encoding[:, 1::2] = torch.cos(position * div_term)
    
    return pos_encoding

# Convert your embeddings  # Your original embeddings
pos_dim = 16  # Number of positional encoding dimensions

# Convert to tensor if not already
if not isinstance(embedded_dna_strands_train[0], torch.Tensor):
    embedded_dna_strands_train = [torch.tensor(emb, dtype=torch.float32) for emb in embedded_dna_strands_train]

# Get nucleotide part (first 4 dimensions)
nucleotide_encodings = [emb[:, :4] for emb in embedded_dna_strands_train]

# Create positional encodings for sequence length
pos_encodings = create_positional_encoding(embedded_dna_strands_train[0].shape[0], pos_dim)

# Combine nucleotide encodings with positional encodings
final_embeddings = [torch.cat([nuc, pos_encodings], dim=1) for nuc in nucleotide_encodings]

# Stack if you need them in a single tensor
final_embeddings = torch.stack(final_embeddings)

In [6]:
#Add column with unique identifier for each nucleotide (sequence + location)
train_sol[3] = [str(train_sol.iloc[i, 1][5:]).zfill(4) + str(train_sol.iloc[i,2]).zfill(3) for i in range(len(train_sol))]

#Split by activators and repressors
train_sol_act = train_sol[train_sol[0] == 'A'][3]
train_sol_rep = train_sol[train_sol[0] == 'R'][3]

### ML Model

In [7]:
class DNADataset(Dataset):
    def __init__(self, embedded_dna_strands, train_scores):
        self.x = torch.tensor(embedded_dna_strands, dtype=torch.float32) # Convert x and y to tensors
        self.y = torch.tensor(train_scores, dtype=torch.float32)

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]

In [8]:
class SelfAttentionFeedForward(nn.Module):
    #Initialize hyperparameters and NN matrices
    def __init__(self, attention_size, seq_len, embed_size, hidden_size, hidden_layers, lr, train_len, num_heads):
        super().__init__()
        self.attention_size = attention_size
        self.embed_size = embed_size
        self.hidden_size = hidden_size
        self.hidden_layers = hidden_layers
        self.lr = lr
        self.train_len = train_len
        self.seq_len = seq_len
        self.num_heads = num_heads
        #self.dropout_rate = dropout_rate 

        self.initAttention()
        self.initFFN()

        self.optimizer = torch.optim.Adam(
            self.parameters(),
            lr=self.lr,
            amsgrad=True,
        )

    #Initialize our weight matrices as torch objects, allows them to be automatically optimized
    def initAttention(self):
        head_size = self.attention_size // self.num_heads
        self.W_Q = nn.ModuleList([nn.Linear(self.embed_size, head_size, bias=True) for _ in range(self.num_heads)])
        self.W_K = nn.ModuleList([nn.Linear(self.embed_size, head_size, bias=True) for _ in range(self.num_heads)])
        self.W_V = nn.ModuleList([nn.Linear(self.embed_size, head_size, bias=True) for _ in range(self.num_heads)])
        self.W_O = nn.Linear(self.attention_size, self.attention_size)
        self.input_proj = nn.Linear(self.embed_size, self.attention_size)

            # Xavier initialization
        for layer in self.W_Q + self.W_K + self.W_V:
            nn.init.xavier_uniform_(layer.weight)
        nn.init.xavier_uniform_(self.W_O.weight)

        
    
    #Initialize Feed Forward layers, based on however many hidden layers we want
    def initFFN(self):


        self.layer_norm1 = nn.LayerNorm(self.attention_size)
        self.layer_norm2 = nn.LayerNorm(self.attention_size)
        
        
        self.ffn = nn.Sequential(
            nn.Linear(self.attention_size, self.hidden_size),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(self.hidden_size, self.hidden_size),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(self.hidden_size, self.attention_size)
        )

        self.dropout = nn.Dropout(0.1)
        self.output_proj = nn.Linear(self.attention_size, 1)
        self.criterion = nn.MSELoss() # Switch to mean squared error instead of simple norm (this is better apparently?)
    
    def custom_loss(self, pred, target):
        mse = self.criterion(pred, target)
        variance_penalty = -0.01 * torch.var(pred, dim=1).mean()  # Encourage variation
        return mse + variance_penalty

    def forward(self, x):
        # x of size                                               (batch_size, sequence_length, embedding_size)
        if x.shape[-1] != self.embed_size:
            raise ValueError
        
        residual = self.input_proj(x)

        head_outputs = []
        for head in range(self.num_heads):
            queries = self.W_Q[head](x) #                                   (batch_size, sequence_length, attention_size)
            keys = self.W_K[head](x) #                                      (batch_size, sequence_length, attention_size)
            values = self.W_V[head](x) #                                    (batch_size, sequence_length, attention_size)

            # Scale to prevent overflow errors, divide by square root of attention dimension
            scale = torch.sqrt(torch.Tensor([queries.size(-1)]))

            #Compute attention and then normalize
            attention = torch.bmm(queries, keys.transpose(1,2)) / scale #                  (batch_size, seq_len, seq_len)
            weights = torch.nn.functional.dropout(torch.nn.functional.softmax(attention, dim=2), p=0.1, training=self.training) # Apply this per sample

            # Use as weights for values
            context = torch.bmm(weights, values) #    (batch_size, attention_size, sequence_length)
            head_outputs.append(context)

        # Combine heads
        multi_head = torch.cat(head_outputs, dim=-1)
        attention_output = self.W_O(multi_head)

        # Add first layernorm + residual (add initial info)
        x = self.layer_norm1(attention_output + residual)

        # Run through all FFN layers
        ffn_output = self.ffn(x)

        # Add second layernorm + residual
        x = self.layer_norm2(ffn_output + residual)

        #Output projection
        x = self.dropout(x)
        x = self.output_proj(x)

        # Return prediction with added b term
        return x.squeeze(-1)

    def train_step(self, x, y):
        self.optimizer.zero_grad()
        pred = self(x)
        loss = self.custom_loss(pred, y)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.parameters(), max_norm=1.0) # This is for stability
        self.optimizer.step()
        return loss.item() # Diagnostic info

    def train(self, dataloader):
        losses = []
        for epoch in range(self.train_len):
            epoch_loss = 0
            for x_batch, y_batch in dataloader:
                loss = self.train_step(x_batch, y_batch)
                epoch_loss += loss
            avg_loss = epoch_loss / len(dataloader)
            losses.append(avg_loss)
            if (epoch + 1) % 1 == 0:
                print(f"Epoch {epoch+1}/{self.train_len}, Loss: {avg_loss:.4f}")
        return losses
            

In [9]:

dataset = DNADataset(final_embeddings, train_scores)

# Create a DataLoader for batching, shuffling, and parallel data loading
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

model = SelfAttentionFeedForward(60, 295, 20, 30, 1, 1e-1, 20, 5) # (attention_size, seq_len, embed_size, hidden_size, hidden_layers, lr, train_len, num_heads)
model.train(dataloader)


  self.x = torch.tensor(embedded_dna_strands, dtype=torch.float32) # Convert x and y to tensors
  self.y = torch.tensor(train_scores, dtype=torch.float32)


Epoch 1/20, Loss: 1.0252
Epoch 2/20, Loss: 0.3525


KeyboardInterrupt: 

In [10]:
test_predictions = model.forward(torch.Tensor(final_embeddings)).detach().numpy()


: 

In [108]:
test_predictions[2]

array([ 0.14745839,  0.00160678, -0.34630793, -0.07004087, -0.02158372,
        0.04086192,  0.05970462,  0.09347995,  0.09696658,  0.10687621,
        0.10852177,  0.10981353,  0.11047013,  0.11082776,  0.11095841,
        0.11100657,  0.11104329,  0.11100514,  0.11102183,  0.11103423,
        0.11102946,  0.11103041,  0.11101754,  0.11100753,  0.11100896,
        0.11099847,  0.11100467,  0.11100371,  0.11099799,  0.11100133,
        0.1110018 ,  0.11099561,  0.1109999 ,  0.11099608,  0.11099942,
        0.11100037,  0.11099561,  0.1110018 ,  0.11099656,  0.11099608,
        0.11099704,  0.11099608,  0.11099513,  0.11099751,  0.11099751,
        0.11099561,  0.11099799,  0.11099608,  0.11099656,  0.1109937 ,
        0.11099561,  0.11099561,  0.11099418,  0.11099418,  0.11099418,
        0.11099561,  0.11099513,  0.11099465,  0.11099513,  0.11099561,
        0.11099513,  0.11099513,  0.11099704,  0.11099656,  0.11099656,
        0.11099656,  0.11099656,  0.11099656,  0.11099656,  0.11

In [85]:
r = test_predictions.reshape(7720*295)
r.sort()

In [87]:
r[:-10]

array([-0.05639719, -0.05639719, -0.05639719, ..., -0.05638735,
       -0.05638735, -0.05638735], shape=(2277390,), dtype=float32)