In [2]:
import numpy as np
import pandas as pd
from NCPR_functions import load_NCPR, train_test_split

from river import metrics, preprocessing, stream, linear_model, tree, ensemble, compat, compose
from sklearn import datasets
import sklearn
import model_to_river
import my_pipeline
from ensemble_class import EnsembleModel
from torch import nn, optim
import torch

In [3]:
# Load the NCPR data from its three source files, then split it into
# train / test features and targets using the project helpers.
dict_data, NCPR_df = load_NCPR(
    'data/NCPR_bert.npz',
    'data/uniprot-NCPR.tab',
    'data/uniprot-NCPR.fasta',
)
xtrain, ytrain, xtest, ytest = train_test_split(dict_data, NCPR_df)

def _split_stream(features, targets):
    """Wrap one split in a river stream with one name per feature column."""
    # Names must be sized by the number of columns, not the number of rows.
    n_features = len(features[0]) if len(features) else 0
    names = ['x{}'.format(j) for j in range(n_features)]
    return stream.iter_array(features, targets, feature_names=names)


def river_nn(X, y, net, metric=None):
    """Train a river-wrapped PyTorch regressor online and score the last split.

    Every split except the last one in ``X`` / ``y`` is streamed through a
    ``StandardScaler -> PyTorch2RiverRegressor`` pipeline one sample at a
    time. The last split is used only for evaluation.

    Parameters
    ----------
    X : sequence of 2-D array-likes
        One feature matrix per split (rows are samples, columns are features).
    y : sequence of 1-D array-likes
        Targets aligned with the splits in ``X``.
    net : torch.nn.Module
        Network handed to ``compat.PyTorch2RiverRegressor``; it is trained
        with ``nn.MSELoss`` and SGD (``lr=1e-3``, ``batch_size=2``).
    metric : river metric, optional
        Metric updated with ``(target, prediction)`` on the last split.
        Defaults to a fresh ``metrics.MAE()``: the wrapped model is a
        regressor, so an exact-match metric such as accuracy is not
        meaningful for its continuous output.

    Returns
    -------
    The value of ``metric.get()`` after the evaluation split.
    """
    model = compose.Pipeline(
        preprocessing.StandardScaler(),
        compat.PyTorch2RiverRegressor(
            net=net,
            loss_fn=nn.MSELoss(),
            optimizer=optim.SGD(net.parameters(), lr=1e-3),
            batch_size=2
            )
        )
    # The river pipeline is not a torch module, so there is no `.to(device)`
    # to call on it.
    # NOTE(review): the compat wrapper appears to build its input tensors on
    # the CPU, so `net` is presumably expected to live on the CPU as well --
    # confirm against the installed river version.
    if metric is None:
        metric = metrics.MAE()

    # Training: every split but the last. The pipeline scales each sample
    # itself inside learn_one, so the raw feature dict is passed directly
    # (transforming it first would scale the input twice).
    for i in range(len(X) - 1):
        for data, target in _split_stream(X[i], y[i]):
            # Do not rebind `model` to the return value of learn_one:
            # recent river versions return None from it.
            model.learn_one(data, target)

    # Evaluation on the held-out last split.
    for data, target in _split_stream(X[-1], y[-1]):
        y_pred = model.predict_one(data)  # regressors expose predict_one only
        metric.update(target, y_pred)
    return metric.get()

In [18]:
class MyIterableDataset(torch.utils.data.IterableDataset):
    """Iterable dataset yielding ``(X[i], y[i])`` pairs by position.

    ``X`` and ``y`` are stored as given. The length of the dataset, and the
    range of indices visited during iteration, are both taken from ``y``.
    """

    def __init__(self, X, y):
        self.X, self.y = X, y

    def __len__(self):
        """Number of samples, defined as ``len(y)``."""
        return len(self.y)

    def __iter__(self):
        """Yield one ``(features, target)`` tuple per index of ``y``."""
        yield from ((self.X[pos], self.y[pos]) for pos in range(len(self.y)))

In [19]:
# Wrap each split in the iterable dataset defined above.
train_data = MyIterableDataset(X=xtrain, y=ytrain)
test_data = MyIterableDataset(X=xtest, y=ytest)

In [23]:
# Both loaders use the same batch size; name it once instead of repeating it.
BATCH_SIZE = 16
train_loader = torch.utils.data.DataLoader(train_data, batch_size=BATCH_SIZE)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=BATCH_SIZE)

In [19]:
# Feed-forward network: 768 inputs -> 900 -> 1100 -> 1274 outputs, with a
# ReLU and 10% dropout after the first layer.
net = nn.Sequential(
    nn.Linear(768, 900),
    nn.ReLU(),
    nn.Dropout(0.1),
    nn.Linear(900, 1100),
    # NOTE(review): there is no activation between this layer and the next,
    # so the two Linear layers compose into a single affine map -- confirm
    # that this is intended.
    nn.Linear(1100, 1274),
)
# The splits bundled as lists, training split first and test split last.
# NOTE(review): presumably the layout river_nn expects (it trains on every
# entry but the last and evaluates on the last) -- confirm.
X = [xtrain, xtest]
y = [ytrain, ytest]

In [None]:
# Training setup: device, loss and optimizer shared by the training and
# evaluation cells. (`optim` is already imported in the imports cell at the
# top of the notebook, so it is not re-imported here.)

# Use the first GPU when one is available and fall back to the CPU otherwise,
# so the notebook still runs on machines without CUDA. The name `cuda0` is
# kept because the following cells refer to it.
cuda0 = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
net = net.to(device=cuda0)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
net.train()

In [None]:
# Supervised training: 10 passes over `train_loader`.
# The network and loss are bound to `net` and `criterion` in the setup cell;
# no `model` or `loss_fn` is defined anywhere in this notebook.
# NOTE(review): assumes the loader yields float features and integer class
# targets compatible with `net` and CrossEntropyLoss -- confirm the dtypes.
net.train()  # re-enable dropout in case the evaluation cell ran before this one
for epoch in range(10):
    for data, target in train_loader:
        optimizer.zero_grad()

        data = data.to(cuda0)
        target = target.to(cuda0)
        # Forward pass
        output = net(data)

        # Calculate loss
        loss = criterion(output, target)

        # Backward pass
        loss.backward()

        # Weight update
        optimizer.step()
    # Reports the loss of the last batch of the epoch, not an epoch average.
    print('Train Epoch: %d  Loss: %.4f' % (epoch + 1,  loss.item()))

In [14]:
# Evaluate on the test split: average per-sample loss and top-1 accuracy.
# The network and loss are bound to `net` and `criterion` in the setup cell;
# no `model` or `loss_fn` is defined anywhere in this notebook.
net.eval()

test_loss = 0
correct = 0

# Turning off automatic differentiation
with torch.no_grad():
    for data, target in test_loader:
        data = data.to(cuda0)
        target = target.to(cuda0)

        output = net(data)
        # `criterion` returns the mean loss over the batch. Weight it by the
        # batch size so that dividing by the dataset size below yields a true
        # per-sample average (the last batch may be smaller than the others).
        test_loss += criterion(output, target).item() * target.size(0)
        pred = output.argmax(dim=1, keepdim=True)  # Get the index of the max class score
        correct += pred.eq(target.view_as(pred)).sum().item()

n_test = len(test_loader.dataset)
test_loss /= n_test

print('Test set: Average loss: %.4f, Accuracy: %d/%d (%.4f)' %
      (test_loss, correct, n_test,
       100. * correct / n_test))

NameError: name 'river_nn' is not defined