In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from collections import Counter
from rich import print
from sklearn.metrics import precision_score, recall_score, mean_absolute_error


In [2]:

class GenomicTokenizer:
    """Split a sequence string into fixed-width n-gram tokens.

    Args:
        ngram: Number of characters in every token.
        stride: Distance between the start positions of consecutive tokens.
            Not used when ``ngram == 1``, where each character is a token.
    """

    def __init__(self, ngram=5, stride=2):
        self.ngram = ngram
        self.stride = stride

    def tokenize(self, t):
        """Return the upper-cased n-gram tokens of ``t``.

        Only complete windows of ``ngram`` characters are emitted, so an
        input that is empty or shorter than ``ngram`` yields an empty list
        instead of raising ``IndexError``.
        """
        t = t.upper()
        if self.ngram == 1:
            return list(t)
        # The last valid window starts at len(t) - ngram; bounding the range
        # there keeps every slice full-width without a post-filter.
        last_start = len(t) - self.ngram
        return [t[i:i + self.ngram] for i in range(0, last_start + 1, self.stride)]


class GenomicVocab:
    """Two-way mapping between tokens and integer ids.

    ``itos`` holds the tokens ordered by id; ``stoi`` is the reverse lookup
    derived from it.
    """

    def __init__(self, itos):
        self.itos = itos
        lookup = {}
        for index, token in enumerate(self.itos):
            lookup[token] = index
        self.stoi = lookup

    @classmethod
    def create(cls, tokens, max_vocab, min_freq):
        """Build a vocabulary from an iterable of tokens.

        Id 0 is reserved for ``'<pad>'``. It is followed by up to
        ``max_vocab - 1`` of the most frequent tokens, keeping only those
        that occur at least ``min_freq`` times.
        """
        counts = Counter(tokens)
        kept = []
        for token, count in counts.most_common(max_vocab - 1):
            if count >= min_freq:
                kept.append(token)
        return cls(['<pad>'] + kept)


class SiRNADataset(Dataset):
    """Dataset yielding the encoded sequence columns and target of one row.

    Args:
        df: Frame holding the sequence columns and ``mRNA_remaining_pct``.
        columns: Names of the sequence columns to encode, in output order.
        vocab: Object exposing a ``stoi`` token-to-id mapping.
        tokenizer: Object exposing ``tokenize(str) -> list``.
        max_len: Fixed length every encoded sequence is padded or cut to.
    """

    def __init__(self, df, columns, vocab, tokenizer, max_len):
        self.df = df
        self.columns = columns
        self.vocab = vocab
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(list of encoded sequences, float target)`` for row ``idx``."""
        record = self.df.iloc[idx]

        encoded_columns = []
        for name in self.columns:
            encoded_columns.append(self.tokenize_and_encode(record[name]))

        label = torch.tensor(record['mRNA_remaining_pct'], dtype=torch.float)
        return encoded_columns, label

    def tokenize_and_encode(self, seq):
        """Encode ``seq`` as a ``max_len``-long tensor of token ids.

        A string containing spaces is split on whitespace (modified
        sequence); any other string goes through the tokenizer. Tokens
        missing from the vocabulary share id 0 with padding.
        """
        pad_id = 0
        tokens = seq.split() if ' ' in seq else self.tokenizer.tokenize(seq)

        # Cut to max_len first, then right-pad whatever room is left.
        ids = [self.vocab.stoi.get(tok, pad_id) for tok in tokens][:self.max_len]
        ids.extend([pad_id] * (self.max_len - len(ids)))
        return torch.tensor(ids, dtype=torch.long)



In [3]:

class SiRNAModel(nn.Module):
    """Bidirectional GRU regressor over two token-id sequences.

    Both sequences share one embedding table and one GRU. The GRU output at
    the final time step of each sequence is concatenated and mapped to a
    single value by a linear layer.

    Args:
        vocab_size: Number of rows in the embedding table (id 0 is padding).
        embed_dim: Embedding width.
        hidden_dim: GRU hidden size per direction.
        n_layers: Number of stacked GRU layers.
        dropout: Dropout probability used between GRU layers and on the
            per-sequence features.
    """

    def __init__(self, vocab_size, embed_dim=200, hidden_dim=256, n_layers=3, dropout=0.5):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.gru = nn.GRU(embed_dim, hidden_dim, n_layers, bidirectional=True, batch_first=True, dropout=dropout)
        # 2 directions x 2 sequence columns -> hidden_dim * 4 input features.
        self.fc = nn.Linear(hidden_dim * 4, 1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Predict one value per batch row.

        Args:
            x: Iterable of two ``(batch, seq_len)`` integer tensors.

        Returns:
            Tensor of shape ``(batch,)``. Only the trailing feature axis is
            squeezed, so a batch of one stays 1-D instead of collapsing to a
            0-d scalar.
        """
        outputs = []
        for seq in x:
            states, _ = self.gru(self.embedding(seq))
            # Features are taken from the last time step of the GRU output.
            outputs.append(self.dropout(states[:, -1, :]))

        features = torch.cat(outputs, dim=1)
        return self.fc(features).squeeze(-1)


def calculate_metrics(y_true, y_pred, threshold=30):
    """Compute the combined MAE / F1 validation score.

    The score is
    ``0.5 * (1 - mae / 100) + 0.5 * (1 - range_mae / 100) * f1`` where

    * ``mae`` is the mean absolute error over all samples,
    * ``range_mae`` is the mean absolute error over samples whose prediction
      lies in ``[0, threshold]`` (100 when there are none),
    * ``f1`` combines precision and recall of the binary event
      ``value < threshold``.

    Args:
        y_true: Array-like of observed values.
        y_pred: Array-like of predicted values, same length as ``y_true``.
        threshold: Cut-off that defines the positive class.

    Returns:
        The score as a float.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    mae = np.mean(np.abs(y_true - y_pred))

    y_true_binary = (y_true < threshold).astype(int)
    y_pred_binary = (y_pred < threshold).astype(int)

    mask = (y_pred >= 0) & (y_pred <= threshold)
    range_mae = mean_absolute_error(y_true[mask], y_pred[mask]) if mask.sum() > 0 else 100

    precision = precision_score(y_true_binary, y_pred_binary, average='binary')
    recall = recall_score(y_true_binary, y_pred_binary, average='binary')

    # With no true positives both precision and recall are 0; define F1 as 0
    # there rather than dividing by zero and propagating NaN into the score.
    pr_sum = precision + recall
    f1 = 2 * precision * recall / pr_sum if pr_sum > 0 else 0.0

    score = (1 - mae / 100) * 0.5 + (1 - range_mae / 100) * f1 * 0.5
    return score



def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=50, device='cuda'):
    """Train ``model`` and return the weights of its best validation epoch.

    After every epoch the model is evaluated on ``val_loader`` and scored
    with ``calculate_metrics``. Whenever the score improves, a snapshot of
    the state dict is stored.

    Args:
        model: Module called with a list of input tensors.
        train_loader: Iterable of ``(inputs, targets)`` training batches,
            where ``inputs`` is a list of tensors.
        val_loader: Iterable of ``(inputs, targets)`` validation batches.
        criterion: Loss callable ``criterion(outputs, targets)``.
        optimizer: Optimizer bound to ``model``'s parameters.
        num_epochs: Number of passes over ``train_loader``.
        device: Device the model and batches are moved to.

    Returns:
        A deep copy of the state dict from the best-scoring epoch, or
        ``None`` if no epoch scored above ``-inf``.
    """
    import copy

    model.to(device)
    best_score = -float('inf')
    best_model = None

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0
        for inputs, targets in tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}'):
            inputs = [x.to(device) for x in inputs]
            targets = targets.to(device)

            optimizer.zero_grad()
            # Align with the targets so a batch of one, which the model may
            # squeeze to a 0-d tensor, still matches the loss shapes.
            outputs = model(inputs).reshape(targets.shape)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        model.eval()
        val_loss = 0
        val_preds = []
        val_targets = []

        with torch.no_grad():
            for inputs, targets in val_loader:
                inputs = [x.to(device) for x in inputs]
                targets = targets.to(device)
                outputs = model(inputs).reshape(targets.shape)
                loss = criterion(outputs, targets)
                val_loss += loss.item()
                val_preds.extend(outputs.cpu().numpy())
                val_targets.extend(targets.cpu().numpy())

        train_loss /= len(train_loader)
        val_loss /= len(val_loader)

        val_preds = np.array(val_preds)
        val_targets = np.array(val_targets)
        score = calculate_metrics(val_targets, val_preds)

        print(f'Epoch {epoch+1}/{num_epochs}')
        print(f'Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')
        print(f'Learning Rate: {optimizer.param_groups[0]["lr"]:.6f}')
        print(f'Validation Score: {score:.4f}')

        if score > best_score:
            best_score = score
            # state_dict() hands out references to the live parameters, and a
            # shallow .copy() keeps that aliasing, so later optimizer steps
            # would overwrite the "best" weights. Deep-copy to freeze them.
            best_model = copy.deepcopy(model.state_dict())
            print(f'New best model found with socre: {best_score:.4f}')

    return best_model

def evaluate_model(model, test_loader, device='cuda'):
    """Score ``model`` on ``test_loader`` and print the result.

    Args:
        model: Module called with a list of input tensors.
        test_loader: Iterable of ``(inputs, target)`` batches, where
            ``inputs`` is a list of tensors.
        device: Device the model and inputs are moved to.

    Returns:
        The score computed by ``calculate_metrics`` (also printed).
    """
    # Inputs are moved to `device` below, so the model has to live there too.
    model.to(device)
    model.eval()
    predictions = []
    targets = []

    with torch.no_grad():
        for inputs, target in test_loader:
            inputs = [x.to(device) for x in inputs]
            outputs = model(inputs)
            # reshape(-1) keeps a batch of one iterable even when the model
            # returns it as a 0-d tensor.
            predictions.extend(outputs.reshape(-1).cpu().numpy())
            targets.extend(target.reshape(-1).numpy())

    y_pred = np.array(predictions)
    y_test = np.array(targets)

    score = calculate_metrics(y_test, y_pred)
    print(f"Test Score: {score:.4f}")
    return score



In [5]:

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load data
train_data = pd.read_csv('../train_data.csv')
columns = ['siRNA_antisense_seq', 'modified_siRNA_antisense_seq_list']

# Drop rows that lack a sequence column or the regression target.
train_data = train_data.dropna(subset=columns + ['mRNA_remaining_pct'])
train_data, val_data = train_test_split(train_data, test_size=0.1, random_state=42)

# Create vocabulary
tokenizer = GenomicTokenizer(ngram=3, stride=1)


def _split_sequence(seq):
    """Return the tokens of one sequence string.

    Strings containing spaces (modified sequences) are split on whitespace;
    all other strings go through the n-gram tokenizer.
    """
    return seq.split() if ' ' in seq else tokenizer.tokenize(seq)


# Tokenise every training sequence once and reuse the result for both the
# vocabulary and the maximum length.
token_lists = [_split_sequence(seq) for col in columns for seq in train_data[col]]

all_tokens = [token for tokens in token_lists for token in tokens]
vocab = GenomicVocab.create(all_tokens, max_vocab=10000, min_freq=1)

# Find max sequence length (==25 in this case)
max_len = max(len(tokens) for tokens in token_lists)




In [6]:
### Load trained GRU model
# Embedding table size the model is built with; it has to match the
# checkpoint for load_state_dict to succeed.
GRU_VOCAB_SIZE = 92
gru_model = SiRNAModel(GRU_VOCAB_SIZE)
# The model is never moved off the CPU in this notebook, so map the
# checkpoint to CPU. This also lets weights saved on a GPU load on a machine
# without CUDA.
gru_model.load_state_dict(torch.load('../GRU_weights', map_location='cpu')) # Trained GRU model
gru_model.eval()


SiRNAModel(
  (embedding): Embedding(92, 200, padding_idx=0)
  (gru): GRU(200, 256, num_layers=3, batch_first=True, dropout=0.5, bidirectional=True)
  (fc): Linear(in_features=1024, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [7]:
import pandas as pd

# Stack the training rows and the submission rows into one frame.
df_original = pd.read_csv("../train_data.csv")
n_original = df_original.shape[0]
df_submit = pd.read_csv("../sample_submission.csv")
df = pd.concat([df_original, df_submit], axis=0).reset_index(drop=True)

all_dataset = SiRNADataset(df, columns, vocab, tokenizer, max_len)
# batch_size equals the row count and shuffle is off, so the loader yields a
# single batch holding every row in frame order.
all_loader = DataLoader(all_dataset, batch_size=df.shape[0], shuffle=False)
x, y = next(iter(all_loader))



In [8]:
###
def get_GRU_output(x, model):
    """Return the concatenated last-time-step GRU outputs of two sequences.

    Puts ``model`` in eval mode, runs ``x[0]`` and ``x[1]`` through the
    model's ``embedding`` and ``gru`` submodules, and joins the GRU output
    at the final time step of each along the feature axis.

    Args:
        x: Sequence whose first two items are integer tensors accepted by
            ``model.embedding``.
        model: Module exposing ``embedding`` and ``gru`` submodules.

    Returns:
        Tensor with one row per batch element.
    """
    model.eval()
    first_states, _ = model.gru(model.embedding(x[0]))
    second_states, _ = model.gru(model.embedding(x[1]))
    last_steps = [first_states[:, -1, :], second_states[:, -1, :]]
    return torch.cat(last_steps, dim=1)

# Rows pushed through the model per forward pass.
CHUNK_SIZE = 100
# get_GRU_output returns the same concatenated features that feed the final
# linear layer, so its width equals fc.in_features (1024 for this model).
n_gru_features = gru_model.fc.in_features

gru_feature_1 = np.zeros((df.shape[0], n_gru_features))
gru_feature_2 = np.zeros((df.shape[0], 1))

with torch.no_grad():
    for start in range(0, len(y), CHUNK_SIZE):
        stop = start + CHUNK_SIZE
        temp_x = [x[0][start:stop], x[1][start:stop]]
        # reshape(-1) keeps a final chunk of one row 1-D.
        gru_feature_2[start:stop, 0] = gru_model(temp_x).reshape(-1).numpy()
        gru_feature_1[start:stop] = get_GRU_output(temp_x, gru_model).numpy()

In [9]:
gru_feature = np.concatenate([np.array(df['id'], dtype=np.int64).reshape(df.shape[0],1), gru_feature_2, gru_feature_1], axis=1)
gru_feature_names = ['GRU_feature_'+str(i+1) for i in range(gru_feature_1.shape[1])]
gru_feature_df = pd.DataFrame(gru_feature, columns=['id', 'GRU_predict'] + gru_feature_names)
# np.concatenate promotes the int64 ids to float64 alongside the float
# features; cast the column back so ids display and export as integers.
gru_feature_df['id'] = gru_feature_df['id'].astype(np.int64)
gru_feature_df


Unnamed: 0,id,GRU_predict,GRU_feature_1,GRU_feature_2,GRU_feature_3,GRU_feature_4,GRU_feature_5,GRU_feature_6,GRU_feature_7,GRU_feature_8,...,GRU_feature_1015,GRU_feature_1016,GRU_feature_1017,GRU_feature_1018,GRU_feature_1019,GRU_feature_1020,GRU_feature_1021,GRU_feature_1022,GRU_feature_1023,GRU_feature_1024
0,7.0,33.832245,-1.000000,-1.000000,-3.731370e-03,-0.000202,-0.999999,-1.000000,-1.192093e-07,1.000000,...,-0.999999,0.000001,0.000000e+00,0.678871,-1.000000,-1.000000,0.000000e+00,0.000000e+00,0.010479,0.999814
1,16.0,30.202850,-0.214032,-1.000000,-6.271336e-01,-0.046766,0.040665,-0.999848,-9.706432e-02,0.882791,...,-0.999901,0.000000,0.000000e+00,0.999990,-1.000000,-1.000000,0.000000e+00,0.000000e+00,0.001998,-0.999935
2,17.0,34.311020,-0.997860,-1.000000,8.344650e-07,-0.994178,-0.999998,-0.999971,0.000000e+00,1.000000,...,-0.999996,0.000019,0.000000e+00,0.820488,-0.999999,-1.000000,0.000000e+00,0.000000e+00,0.052048,0.999945
3,22.0,60.863113,-1.000000,-1.000000,-5.203009e-02,-0.959726,-0.009937,-1.000000,-6.341934e-05,1.000000,...,0.998358,0.000583,1.490116e-05,-0.991066,-0.999998,-1.000000,-1.192093e-07,-6.451774e-01,0.009498,-0.999946
4,35.0,22.256744,-0.001306,-0.990601,-1.000000e+00,-0.000002,0.005046,-0.738691,8.090854e-03,0.061507,...,-0.991022,0.000000,0.000000e+00,1.000000,-1.000000,-1.000000,0.000000e+00,0.000000e+00,0.001862,-0.999841
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30651,59038.0,24.268742,-0.068692,-0.907492,-2.798337e-02,-0.001813,0.000003,-0.999626,-1.144409e-05,0.975849,...,-0.999998,0.000095,0.000000e+00,0.019696,-0.999973,-1.000000,0.000000e+00,0.000000e+00,0.091423,0.999998
30652,59042.0,45.576744,-0.270773,-1.000000,-9.663385e-01,-0.162008,-0.000003,-0.999982,-1.192093e-07,0.999949,...,0.999938,0.000108,4.768372e-07,-0.998831,-1.000000,-1.000000,0.000000e+00,-1.455724e-03,0.064839,-0.999992
30653,59050.0,88.484909,0.999995,-1.000000,-2.801418e-06,-0.212895,-1.000000,-1.000000,1.943111e-05,1.000000,...,0.922921,0.006740,1.192027e-02,-1.000000,-0.999826,-0.999891,-6.580353e-05,-2.756112e-01,0.890531,-0.997934
30654,59052.0,28.083324,-0.023629,-0.999626,-5.207360e-03,-0.000332,-0.000034,0.999846,8.344650e-07,0.026372,...,-0.999997,0.000004,0.000000e+00,0.999980,-1.000000,-1.000000,0.000000e+00,-1.192093e-07,0.042450,-0.977028


In [10]:
# GRU feature columns whose standard deviation (np.std, ddof=0) is below this
# value are dropped.
MIN_FEATURE_STD = 0.01

# Take the candidates from the frame itself rather than a fixed 1..1024 range,
# so re-running this cell after columns were dropped does not raise KeyError.
gru_feature_cols = [c for c in gru_feature_df.columns if c.startswith('GRU_feature_')]
no_signal_col = [c for c in gru_feature_cols if np.std(gru_feature_df[c]) < MIN_FEATURE_STD]

print( len(no_signal_col) )

gru_feature_df = gru_feature_df.drop(columns=no_signal_col)

In [11]:
# Export of the full feature frame, currently disabled:
# gru_feature_df.to_csv('../GRU_features.csv', index=False)

# Write only the row id and the GRU's scalar prediction.
predict_only_cols = ['id', 'GRU_predict']
gru_feature_df[predict_only_cols].to_csv('../GRU_features_predict_only.csv', index=False)