In [None]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader


In [None]:
# Load the labelled training rows and the submission template, then stack
# them so that training rows come first and submission rows follow. The
# positional slices at the bottom of this cell rely on that ordering.
df_original = pd.read_csv("../train_data.csv")
n_original = df_original.shape[0]
df_submit = pd.read_csv("../sample_submission.csv")
df = pd.concat([df_original, df_submit], axis=0).reset_index(drop=True)
n_rows_expected = df.shape[0]

# Features from mRNA-FM model
# `df` is the left operand so the inner join keeps df's row order
# (train rows, then submission rows) instead of the feature file's order.
df = df.merge(
    pd.read_csv("../mrnafm_features.csv", index_col=0),
    on='id'
)

# Features from RiNALMo model
df = df.merge(
    pd.read_csv("../RiNalMo_features.csv", index_col=0),
    on='id'
)

# If a feature file misses ids (rows dropped) or repeats ids (rows duplicated),
# the positional train/valid/test slices below would silently pick wrong rows.
assert df.shape[0] == n_rows_expected, (
    "Feature merge changed the number of rows: expected "
    + str(n_rows_expected) + ", got " + str(df.shape[0])
)

col_names = ['RiNALMo_feature_'+str(i) for i in range(1, 1281)] + ['mRNAFM_feature_'+str(i) for i in range(1,9)]

X = torch.tensor(df[col_names].values).float()
Y = torch.tensor(df[['mRNA_remaining_pct']].values).float()

# First n_train labelled rows are used for fitting, the remaining labelled
# rows for validation, and everything after n_original is the submission set.
n_train = 20000

X_train = X[:n_train,:]
Y_train = Y[:n_train,:]
X_valid = X[n_train:n_original,:]
Y_valid = Y[n_train:n_original,:]
X_test = X[n_original:,:]
Y_test = Y[n_original:,:]


In [None]:
class NNmodel(nn.Module):
    """One-hidden-layer regressor: Linear -> ReLU -> Linear with a single output.

    Args:
        input_dim: number of input features per sample.
        hidden_dim: width of the hidden layer (default 50).
    """

    def __init__(self, input_dim, hidden_dim=50):
        super().__init__()
        # Layers are created in this order so seeded initialisation is stable.
        self.linear1 = nn.Linear(in_features=input_dim, out_features=hidden_dim)
        self.linear2 = nn.Linear(in_features=hidden_dim, out_features=1)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return the model output for `x`; the last dimension has size 1."""
        hidden = self.relu(self.linear1(x))
        return self.linear2(hidden)

def trainer(model, loss_fn, optimizer, train_loader, test_loader=None, n_epochs=50):
    """Train `model` in place and periodically report the validation loss.

    Args:
        model: module called as `model(X_batch)` to produce predictions.
        loss_fn: callable `loss_fn(pred, target)` returning a scalar tensor.
        optimizer: optimizer that updates the parameters of `model`.
        train_loader: iterable of `(X_batch, Y_batch)` pairs used for fitting.
        test_loader: optional iterable of `(X_batch, Y_batch)` pairs; when
            given, the per-sample average loss over it is printed on every
            5th epoch (epochs 0, 5, 10, ...).
        n_epochs: number of passes over `train_loader`.

    Returns:
        None. The model is updated in place and progress is printed.
    """
    # A loss with reduction='sum' already returns a per-batch total; any other
    # setting (default 'mean') returns a per-batch average that must be
    # re-weighted by the batch size before dividing by the sample count.
    reduction = getattr(loss_fn, 'reduction', 'mean')

    for epoch in range(n_epochs):
        model.train()
        for X_temp, Y_temp in train_loader:
            Y_pred = model(X_temp)
            loss = loss_fn(Y_pred, Y_temp)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        if test_loader is not None and epoch % 5 == 0:
            model.eval()
            loss_test = 0.0
            n_sample = 0
            # No gradients are needed for evaluation; skip graph construction.
            with torch.no_grad():
                for X_temp, Y_temp in test_loader:
                    batch_size = X_temp.shape[0]
                    batch_loss = loss_fn(model(X_temp), Y_temp).item()
                    if reduction != 'sum':
                        batch_loss = batch_loss * batch_size
                    loss_test = loss_test + batch_loss
                    n_sample = n_sample + batch_size
            # An empty validation loader yields no samples: nothing to report.
            if n_sample > 0:
                print('Epoch '+str(epoch) +', Validation Loss = ' + str(loss_test/n_sample))

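# Baseline for comparison: RMSE on the validation set when always predicting
# the mean of the training targets (uncomment to evaluate).
# torch.sqrt(torch.mean((torch.mean(Y_train)-Y_valid)**2))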


In [None]:
# Seed torch's global RNG so weight initialisation and the shuffled batch
# order are reproducible across kernel restarts.
torch.manual_seed(0)

model = NNmodel(
    input_dim = X.shape[1],
    hidden_dim = 4
)

loss_fn = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters())

# Reshuffle the training rows every epoch so mini-batches do not always see
# the rows in file order; the validation loader stays in a fixed order.
train_dataloader = DataLoader(TensorDataset(X_train, Y_train), batch_size=50, shuffle=True)
valid_dataloader = DataLoader(TensorDataset(X_valid, Y_valid), batch_size=50)


In [None]:
# Fit on the training split; the validation loss is printed every 5th epoch.
trainer(
    model=model,
    loss_fn=loss_fn,
    optimizer=optimizer,
    train_loader=train_dataloader,
    test_loader=valid_dataloader,
    n_epochs=100,
)


In [None]:
# Refit from scratch on every labelled row (training + validation splits).
# NOTE: this rebinds X_train / Y_train, so re-running an earlier cell that
# uses them after this one will see the full labelled set.
X_train = X[:n_original,:]
Y_train = Y[:n_original,:]

# Seed torch's global RNG so weight initialisation and the shuffled batch
# order are reproducible across kernel restarts.
torch.manual_seed(0)

model = NNmodel(
    input_dim = X.shape[1],
    hidden_dim = 4
)

loss_fn = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters())

# Reshuffle the rows every epoch so mini-batches do not follow file order.
train_dataloader = DataLoader(TensorDataset(X_train, Y_train), batch_size=50, shuffle=True)

# No validation loader here: all labelled rows are used for fitting.
trainer(model, loss_fn, optimizer, train_dataloader, n_epochs=100)



In [None]:

# Inference only: switch to eval mode and disable autograd tracking so the
# result does not require grad and can be converted to NumPy directly.
model.eval()
with torch.no_grad():
    Y_pred = model(X)


In [None]:
# Tensor.numpy() raises on a tensor that requires grad, and wrapping the call
# in torch.no_grad() does not change an already-computed tensor, so detach it
# explicitly (a no-op if it is already detached).
temp = pd.DataFrame(
    Y_pred.detach().cpu().numpy(),
    columns=['Pretrained_feature_predict']
)
# Assign ids by position: row i of the predictions corresponds to row i of df.
temp['id'] = df['id'].values


In [None]:
# Write the predictions next to the other feature files; the row index is
# kept as the first (unnamed) column.
temp.to_csv('../pretrained_feature_predict.csv', index=True)