## CPU training

In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
import torch.autograd.profiler as profiler

class ParamsDownloader:
    """Reads the four saved parameter arrays of the network from a directory.

    On construction the files ``ln1b.npy``, ``ln1w.npy``, ``ln2b.npy`` and
    ``ln2w.npy`` are loaded from ``data_path`` and exposed as the attributes
    ``ln1b``, ``ln1w``, ``ln2b`` and ``ln2w``.
    """

    def __init__(self, data_path):
        self.data_path = data_path
        self.load_data()

    def load_data(self):
        """(Re)load every parameter array from ``self.data_path``."""
        # Same files, same order as before; one attribute per file stem.
        for stem in ('ln1b', 'ln1w', 'ln2b', 'ln2w'):
            setattr(self, stem, np.load(f'{self.data_path}/{stem}.npy'))
        


class Model(nn.Module):
    """Two-layer perceptron (Linear -> ReLU -> Linear) initialised from NumPy arrays.

    Args:
        ln1w: weight of the first layer; its shape is read as
            ``(H1, N)``, i.e. ``(out_features, in_features)`` as ``nn.Linear``
            stores it.
        ln1b: bias of the first layer.
        ln2w: weight of the second layer; its shape is read as ``(H2, H1)``.
        ln2b: bias of the second layer.
    """

    def __init__(self, ln1w, ln1b, ln2w, ln2b):
        super().__init__()
        # nn.Linear keeps weights as (out_features, in_features). The first
        # layer must be unpacked the same way as the second one, otherwise
        # l1.in_features / l1.out_features (and l2.in_features) are swapped.
        H1, N = ln1w.shape
        H2, _ = ln2w.shape

        self.l1 = nn.Linear(N, H1)
        # Install the loaded values as the layer's parameters (cast to float32).
        self.l1.weight = nn.Parameter(torch.from_numpy(ln1w).float())
        self.l1.bias = nn.Parameter(torch.from_numpy(ln1b).float())

        self.l2 = nn.Linear(H1, H2)
        self.l2.weight = nn.Parameter(torch.from_numpy(ln2w).float())
        self.l2.bias = nn.Parameter(torch.from_numpy(ln2b).float())

    def forward(self, x):
        """Return ``(y1, y1_relu, y2)``: first-layer output, its ReLU, second-layer output."""
        y1 = self.l1(x)
        y1_relu = F.relu(y1)
        y2 = self.l2(y1_relu)
        return y1, y1_relu, y2
    
    
# Training data: features as stored on disk, labels cast to int64 class indices
# as required by CrossEntropyLoss.
X_train = np.load('../dataset/x_train.npy')
y_train = np.load('../dataset/y_train.npy').astype(np.int64)
print(X_train[0, 0:5])

N_EPOCHS = 10
BATCH_SIZE = 32
# shuffle=False keeps the batch order fixed from run to run.
train_loader = DataLoader(
    TensorDataset(torch.from_numpy(X_train).float(), torch.from_numpy(y_train).long()),
    batch_size=BATCH_SIZE,
    shuffle=False,
)
# Start from the parameters saved on disk rather than a random initialisation.
data_loader = ParamsDownloader('../with-torch-tests/trained-model-cpu')
model = Model(data_loader.ln1w, data_loader.ln1b, data_loader.ln2w, data_loader.ln2b)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss(reduction='mean')

for epoch in range(N_EPOCHS):
    for X, y in train_loader:
        optimizer.zero_grad()
        y1, y1_relu, y2 = model(X)
        loss = criterion(y2, y)
        loss.backward()
        optimizer.step()
    # NOTE: this is the loss of the last mini-batch of the epoch, not an epoch average.
    print(f"Epoch: {epoch}, Loss: {loss.item()}")

# Evaluation: no gradients are needed, so skip building the autograd graph
# for the whole test set.
x_test = np.load('../dataset/x_test.npy')
y_test = np.load('../dataset/y_test.npy')
with torch.no_grad():
    y1, y1_relu, y2 = model(torch.from_numpy(x_test).float())
y_pred = y2.argmax(dim=1).numpy()
accuracy = (y_pred == y_test).mean()
print(f"Accuracy: {accuracy}")

In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
import torch.autograd.profiler as profiler

class ParamsDownloader:
    """Reads the four saved parameter arrays of the network from a directory.

    On construction the files ``ln1b.npy``, ``ln1w.npy``, ``ln2b.npy`` and
    ``ln2w.npy`` are loaded from ``data_path`` and exposed as the attributes
    ``ln1b``, ``ln1w``, ``ln2b`` and ``ln2w``.
    """

    def __init__(self, data_path):
        self.data_path = data_path
        self.load_data()

    def load_data(self):
        """(Re)load every parameter array from ``self.data_path``."""
        # Same files, same order as before; one attribute per file stem.
        for stem in ('ln1b', 'ln1w', 'ln2b', 'ln2w'):
            setattr(self, stem, np.load(f'{self.data_path}/{stem}.npy'))
        

class Model(nn.Module):
    """Two-layer perceptron (Linear -> ReLU -> Linear) initialised from NumPy arrays.

    Args:
        ln1w: weight of the first layer; its shape is read as
            ``(H1, N)``, i.e. ``(out_features, in_features)`` as ``nn.Linear``
            stores it.
        ln1b: bias of the first layer.
        ln2w: weight of the second layer; its shape is read as ``(H2, H1)``.
        ln2b: bias of the second layer.
    """

    def __init__(self, ln1w, ln1b, ln2w, ln2b):
        super().__init__()
        # nn.Linear keeps weights as (out_features, in_features). The first
        # layer must be unpacked the same way as the second one, otherwise
        # l1.in_features / l1.out_features (and l2.in_features) are swapped.
        H1, N = ln1w.shape
        H2, _ = ln2w.shape

        self.l1 = nn.Linear(N, H1)
        # Install the loaded values as the layer's parameters (cast to float32).
        self.l1.weight = nn.Parameter(torch.from_numpy(ln1w).float())
        self.l1.bias = nn.Parameter(torch.from_numpy(ln1b).float())

        self.l2 = nn.Linear(H1, H2)
        self.l2.weight = nn.Parameter(torch.from_numpy(ln2w).float())
        self.l2.bias = nn.Parameter(torch.from_numpy(ln2b).float())

    def forward(self, x):
        """Return ``(y1, y1_relu, y2)``: first-layer output, its ReLU, second-layer output."""
        y1 = self.l1(x)
        y1_relu = F.relu(y1)
        y2 = self.l2(y1_relu)
        return y1, y1_relu, y2
    
    
# Training data: features as stored on disk, labels cast to int64 class indices
# as required by CrossEntropyLoss.
X_train = np.load('../dataset/x_train.npy')
y_train = np.load('../dataset/y_train.npy').astype(np.int64)

N_EPOCHS = 10
BATCH_SIZE = 32
# shuffle=False keeps the batch order fixed from run to run.
train_loader = DataLoader(
    TensorDataset(torch.from_numpy(X_train).float(), torch.from_numpy(y_train).long()),
    batch_size=BATCH_SIZE,
    shuffle=False,
)
# Start from the parameters saved on disk rather than a random initialisation.
data_loader = ParamsDownloader('../with-torch-tests/trained-model-cpu')
model = Model(data_loader.ln1w, data_loader.ln1b, data_loader.ln2w, data_loader.ln2b)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss(reduction='mean')

for epoch in range(N_EPOCHS):
    # One profiler session per epoch, so each table below covers exactly one pass
    # over the training set.
    with profiler.profile(record_shapes=True, use_cuda=False) as prof:
        for X, y in train_loader:
            optimizer.zero_grad()
            y1, y1_relu, y2 = model(X)
            loss = criterion(y2, y)
            loss.backward()
            optimizer.step()
        # NOTE: loss of the last mini-batch of the epoch, not an epoch average.
        print(f"epoch: {epoch+1}, loss: {loss.item()}")
    print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))

# Evaluation: no gradients are needed, so skip building the autograd graph
# for the whole test set.
x_test = np.load('../dataset/x_test.npy')
y_test = np.load('../dataset/y_test.npy')
with torch.no_grad():
    y1, y1_relu, y2 = model(torch.from_numpy(x_test).float())
y_pred = y2.argmax(dim=1).numpy()
accuracy = (y_pred == y_test).mean()
print(f"Accuracy: {accuracy}")


## GPU training

In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
import torch.autograd.profiler as profiler
import time

# Run on CUDA when PyTorch reports it as available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

class ParamsDownloader:
    """Reads the four saved parameter arrays of the network from a directory.

    On construction the files ``ln1b.npy``, ``ln1w.npy``, ``ln2b.npy`` and
    ``ln2w.npy`` are loaded from ``data_path`` and exposed as the attributes
    ``ln1b``, ``ln1w``, ``ln2b`` and ``ln2w``.
    """

    def __init__(self, data_path):
        self.data_path = data_path
        self.load_data()

    def load_data(self):
        """(Re)load every parameter array from ``self.data_path``."""
        # Same files, same order as before; one attribute per file stem.
        for stem in ('ln1b', 'ln1w', 'ln2b', 'ln2w'):
            setattr(self, stem, np.load(f'{self.data_path}/{stem}.npy'))

class Model(nn.Module):
    """Two-layer perceptron (Linear -> ReLU -> Linear) initialised from NumPy arrays.

    The loaded parameters are placed on the notebook-level ``device``.

    Args:
        ln1w: weight of the first layer; its shape is read as
            ``(H1, N)``, i.e. ``(out_features, in_features)`` as ``nn.Linear``
            stores it.
        ln1b: bias of the first layer.
        ln2w: weight of the second layer; its shape is read as ``(H2, H1)``.
        ln2b: bias of the second layer.
    """

    def __init__(self, ln1w, ln1b, ln2w, ln2b):
        super().__init__()
        # nn.Linear keeps weights as (out_features, in_features). The first
        # layer must be unpacked the same way as the second one, otherwise
        # l1.in_features / l1.out_features (and l2.in_features) are swapped.
        H1, N = ln1w.shape
        H2, _ = ln2w.shape

        self.l1 = nn.Linear(N, H1)
        # Install the loaded values (cast to float32, moved to `device`) as parameters.
        self.l1.weight = nn.Parameter(torch.from_numpy(ln1w).float().to(device))
        self.l1.bias = nn.Parameter(torch.from_numpy(ln1b).float().to(device))

        self.l2 = nn.Linear(H1, H2)
        self.l2.weight = nn.Parameter(torch.from_numpy(ln2w).float().to(device))
        self.l2.bias = nn.Parameter(torch.from_numpy(ln2b).float().to(device))

    def forward(self, x):
        """Return ``(y1, y1_relu, y2)``: first-layer output, its ReLU, second-layer output."""
        y1 = self.l1(x)
        y1_relu = F.relu(y1)
        y2 = self.l2(y1_relu)
        return y1, y1_relu, y2
    
# Load the training set on the host; each batch is moved to `device` inside the loop.
X_train = np.load('../dataset/x_train.npy')
y_train = np.load('../dataset/y_train.npy').astype(np.int64)

print(X_train[0, 0:5])

N_EPOCHS = 20
BATCH_SIZE = 256

# shuffle=False keeps the batch order fixed from run to run.
train_loader = DataLoader(TensorDataset(torch.from_numpy(X_train).float(),
                                        torch.from_numpy(y_train).long()),
                          batch_size=BATCH_SIZE, shuffle=False)

# Start from the parameters saved on disk rather than a random initialisation.
data_loader = ParamsDownloader('../with-torch-tests/trained-model')
model = Model(data_loader.ln1w, data_loader.ln1b, data_loader.ln2w, data_loader.ln2b).to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss(reduction='mean')

total_time = 0.0
# Training loop
for epoch in range(N_EPOCHS):
    # perf_counter is a monotonic clock intended for measuring intervals.
    start_time = time.perf_counter()
    for X, y in train_loader:
        optimizer.zero_grad()
        X = X.to(device)
        y = y.to(device)
        y1, y1_relu, y2 = model(X)
        loss = criterion(y2, y)
        loss.backward()
        optimizer.step()
    # CUDA kernels are launched asynchronously: wait for them before reading the
    # clock so the measurement covers the work actually done in this epoch.
    if device.type == 'cuda':
        torch.cuda.synchronize()
    # Measure once, so the printed per-epoch values add up to the reported total.
    epoch_time = time.perf_counter() - start_time
    total_time += epoch_time
    # NOTE: loss of the last mini-batch of the epoch, not an epoch average.
    print(f"Epoch: {epoch+1}, Loss: {loss.item()}")
    print(f"Time taken: {epoch_time}")
print(f"Total time taken: {total_time}")
print(f"Average time per epoch: {total_time / N_EPOCHS} seconds" )
# Testing the model
x_test = np.load('../dataset/x_test.npy')
y_test = np.load('../dataset/y_test.npy')

# No gradients are needed for evaluation, so skip building the autograd graph.
with torch.no_grad():
    y1, y1_relu, y2 = model(torch.from_numpy(x_test).float().to(device))
y_pred = y2.argmax(dim=1).cpu().numpy()
accuracy = (y_pred == y_test).mean()
print(f"Accuracy: {accuracy}")


Using device: cuda
[ 1.2802037   0.1930291   0.14813042 -0.9187077   0.39928705]


  _torch_pytree._register_pytree_node(


Epoch: 1, Loss: 17.527576446533203
Time taken: 0.7082126140594482
Epoch: 2, Loss: 11.733003616333008
Time taken: 0.3717939853668213
Epoch: 3, Loss: 9.270295143127441
Time taken: 0.3654367923736572
Epoch: 4, Loss: 7.723914623260498
Time taken: 0.3745746612548828
Epoch: 5, Loss: 6.6460981369018555
Time taken: 0.47765040397644043
Epoch: 6, Loss: 5.852162837982178
Time taken: 0.4715690612792969
Epoch: 7, Loss: 5.230144500732422
Time taken: 0.4310479164123535
Epoch: 8, Loss: 4.732577323913574
Time taken: 0.39620208740234375
Epoch: 9, Loss: 4.3370442390441895
Time taken: 0.40636658668518066
Epoch: 10, Loss: 4.010096549987793
Time taken: 0.40944695472717285
Epoch: 11, Loss: 3.7300851345062256
Time taken: 0.3954811096191406
Epoch: 12, Loss: 3.4881057739257812
Time taken: 0.3741786479949951
Epoch: 13, Loss: 3.283463478088379
Time taken: 0.4808011054992676
Epoch: 14, Loss: 3.105936288833618
Time taken: 0.40611791610717773
Epoch: 15, Loss: 2.9462108612060547
Time taken: 0.3911290168762207
Epoch: 

In [14]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
import torch.autograd.profiler as profiler

# Run on CUDA when PyTorch reports it as available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

class ParamsDownloader:
    """Reads the four saved parameter arrays of the network from a directory.

    On construction the files ``ln1b.npy``, ``ln1w.npy``, ``ln2b.npy`` and
    ``ln2w.npy`` are loaded from ``data_path`` and exposed as the attributes
    ``ln1b``, ``ln1w``, ``ln2b`` and ``ln2w``.
    """

    def __init__(self, data_path):
        self.data_path = data_path
        self.load_data()

    def load_data(self):
        """(Re)load every parameter array from ``self.data_path``."""
        # Same files, same order as before; one attribute per file stem.
        for stem in ('ln1b', 'ln1w', 'ln2b', 'ln2w'):
            setattr(self, stem, np.load(f'{self.data_path}/{stem}.npy'))

class Model(nn.Module):
    """Two-layer perceptron (Linear -> ReLU -> Linear) initialised from NumPy arrays.

    The loaded parameters are placed on the notebook-level ``device``.

    Args:
        ln1w: weight of the first layer; its shape is read as
            ``(H1, N)``, i.e. ``(out_features, in_features)`` as ``nn.Linear``
            stores it.
        ln1b: bias of the first layer.
        ln2w: weight of the second layer; its shape is read as ``(H2, H1)``.
        ln2b: bias of the second layer.
    """

    def __init__(self, ln1w, ln1b, ln2w, ln2b):
        super().__init__()
        # nn.Linear keeps weights as (out_features, in_features). The first
        # layer must be unpacked the same way as the second one, otherwise
        # l1.in_features / l1.out_features (and l2.in_features) are swapped.
        H1, N = ln1w.shape
        H2, _ = ln2w.shape

        self.l1 = nn.Linear(N, H1)
        # Install the loaded values (cast to float32, moved to `device`) as parameters.
        self.l1.weight = nn.Parameter(torch.from_numpy(ln1w).float().to(device))
        self.l1.bias = nn.Parameter(torch.from_numpy(ln1b).float().to(device))

        self.l2 = nn.Linear(H1, H2)
        self.l2.weight = nn.Parameter(torch.from_numpy(ln2w).float().to(device))
        self.l2.bias = nn.Parameter(torch.from_numpy(ln2b).float().to(device))

    def forward(self, x):
        """Return ``(y1, y1_relu, y2)``: first-layer output, its ReLU, second-layer output."""
        y1 = self.l1(x)
        y1_relu = F.relu(y1)
        y2 = self.l2(y1_relu)
        return y1, y1_relu, y2
    
# Load the training set on the host; each batch is moved to `device` inside the loop.
X_train = np.load('../dataset/x_train.npy')
y_train = np.load('../dataset/y_train.npy').astype(np.int64)

print(X_train[0, 0:5])

N_EPOCHS = 5
BATCH_SIZE = 1024

# shuffle=False keeps the batch order fixed from run to run.
train_loader = DataLoader(TensorDataset(torch.from_numpy(X_train).float(),
                                        torch.from_numpy(y_train).long()),
                          batch_size=BATCH_SIZE, shuffle=False)

# Start from the parameters saved on disk rather than a random initialisation.
data_loader = ParamsDownloader('../with-torch-tests/trained-model')
model = Model(data_loader.ln1w, data_loader.ln1b, data_loader.ln2w, data_loader.ln2b).to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss(reduction='mean')

# `device` falls back to the CPU when CUDA is missing, so only request CUDA
# profiling (and sort by CUDA time) when the run actually happens on a GPU.
profile_cuda = device.type == 'cuda'
profile_sort_key = "cuda_time_total" if profile_cuda else "cpu_time_total"

# Training loop
for epoch in range(N_EPOCHS):
    # One profiler session per epoch, so each table covers exactly one pass
    # over the training set.
    with profiler.profile(record_shapes=True, use_cuda=profile_cuda) as prof:
        for X, y in train_loader:
            optimizer.zero_grad()
            X = X.to(device)
            y = y.to(device)
            y1, y1_relu, y2 = model(X)
            loss = criterion(y2, y)
            loss.backward()
            optimizer.step()
    # NOTE: loss of the last mini-batch of the epoch, not an epoch average.
    print(f"Epoch: {epoch+1}, Loss: {loss.item()}")
    # Print profiling results
    print(prof.key_averages().table(sort_by=profile_sort_key, row_limit=10))

# Testing the model
x_test = np.load('../dataset/x_test.npy')
y_test = np.load('../dataset/y_test.npy')

# No gradients are needed for evaluation, so skip building the autograd graph.
with torch.no_grad():
    y1, y1_relu, y2 = model(torch.from_numpy(x_test).float().to(device))
y_pred = y2.argmax(dim=1).cpu().numpy()
accuracy = (y_pred == y_test).mean()
print(f"Accuracy: {accuracy}")


Using device: cuda
[ 1.2802037   0.1930291   0.14813042 -0.9187077   0.39928705]
Epoch: 1, Loss: 33.8510627746582
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
enumerate(DataLoader)#_SingleProcessDataLoaderIter._...        33.12%        1.251s        97.54%        3.683s     111.612ms     713.047ms        18.80%        3.682s     111.562ms            33  
                                           aten::select        32.75%        1.237s        33