## CPU training

In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
import torch.autograd.profiler as profiler

class ParamsDownloader:
    """Load ``ln1b``, ``ln1w``, ``ln2b`` and ``ln2w`` from ``.npy`` files in a directory.

    Each array is read with ``np.load`` at construction time and stored as an
    attribute of the same name.
    """

    # File stems, in the order they are read from disk.
    _PARAM_NAMES = ('ln1b', 'ln1w', 'ln2b', 'ln2w')

    def __init__(self, data_path):
        self.data_path = data_path
        self.load_data()

    def load_data(self):
        """Read ``<data_path>/<name>.npy`` for every parameter name."""
        for param_name in self._PARAM_NAMES:
            array = np.load(f'{self.data_path}/{param_name}.npy')
            setattr(self, param_name, array)
        


class Model(nn.Module):
    """Two-layer network (Linear -> ReLU -> Linear) initialised from NumPy arrays.

    Args:
        ln1w: First-layer weight, laid out like ``nn.Linear.weight``, i.e.
            ``(out_features, in_features)``.
        ln1b: First-layer bias.
        ln2w: Second-layer weight, ``(out_features, in_features)``.
        ln2b: Second-layer bias.

    The arrays are copied, so training the model never modifies the caller's
    NumPy buffers.
    """

    def __init__(self, ln1w, ln1b, ln2w, ln2b):
        super(Model, self).__init__()
        # The arrays are installed verbatim as nn.Linear weights, which are stored
        # as (out_features, in_features); read the layer sizes in that order so the
        # modules report the correct in_features/out_features.
        H1, N = ln1w.shape
        H2, _ = ln2w.shape

        # torch.tensor always copies; torch.from_numpy(...).float() would share
        # memory with a float32 input and let the optimizer overwrite it in place.
        self.l1 = nn.Linear(N, H1)
        self.l1.weight.data = torch.tensor(ln1w, dtype=torch.float32)
        self.l1.bias.data = torch.tensor(ln1b, dtype=torch.float32)

        self.l2 = nn.Linear(H1, H2)
        self.l2.weight.data = torch.tensor(ln2w, dtype=torch.float32)
        self.l2.bias.data = torch.tensor(ln2b, dtype=torch.float32)

    def forward(self, x):
        """Return ``(y1, relu(y1), y2)``: hidden pre-activation, activation, and output."""
        y1 = self.l1(x)
        y1_relu = F.relu(y1)
        y2 = self.l2(y1_relu)
        return y1, y1_relu, y2
    
    
# Load the training set; labels are cast to int64, the dtype CrossEntropyLoss
# expects for class-index targets.
X_train = np.load('../dataset/x_train.npy')
y_train = np.load('../dataset/y_train.npy').astype(np.int64)
print(X_train[0, 0:5])

N_EPOCHS = 10
BATCH_SIZE = 32
# shuffle=False keeps the batch order identical from run to run.
train_dataset = TensorDataset(torch.from_numpy(X_train).float(), torch.from_numpy(y_train).long())
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=False)
data_loader = ParamsDownloader('../with-torch-tests/trained-model-cpu')
model = Model(data_loader.ln1w, data_loader.ln1b, data_loader.ln2w, data_loader.ln2b)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss(reduction='mean')

for epoch in range(N_EPOCHS):
    for X, y in train_loader:
        optimizer.zero_grad()
        y1, y1_relu, y2 = model(X)
        loss = criterion(y2, y)
        loss.backward()
        optimizer.step()
    # Reports the loss of the last batch only; epochs are numbered from 1 to
    # match the other training cells in this notebook.
    print(f"Epoch: {epoch+1}, Loss: {loss.item()}")

x_test = np.load('../dataset/x_test.npy')
y_test = np.load('../dataset/y_test.npy')
# Inference needs no gradients; no_grad avoids building an autograd graph over
# the whole test set.
with torch.no_grad():
    y1, y1_relu, y2 = model(torch.from_numpy(x_test).float())
y_pred = y2.argmax(dim=1).numpy()
# NOTE(review): assumes y_test is a 1-D label vector; an (N, 1) array would
# broadcast against y_pred and skew the mean. Confirm against the dataset.
accuracy = (y_pred == y_test).mean()
print(f"Accuracy: {accuracy}")

In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
import torch.autograd.profiler as profiler

class ParamsDownloader:
    """Read the arrays ``ln1b``, ``ln1w``, ``ln2b`` and ``ln2w`` from ``data_path``.

    Loading happens in the constructor; every array ends up as an attribute
    named after its file stem.
    """

    def __init__(self, data_path):
        self.data_path = data_path
        self.load_data()

    def load_data(self):
        """Load each ``.npy`` file with ``np.load`` and bind it on the instance."""
        directory = self.data_path
        for stem in ('ln1b', 'ln1w', 'ln2b', 'ln2w'):
            setattr(self, stem, np.load(f'{directory}/{stem}.npy'))
        

class Model(nn.Module):
    """Two-layer network (Linear -> ReLU -> Linear) initialised from NumPy arrays.

    Args:
        ln1w: First-layer weight in ``nn.Linear.weight`` layout,
            ``(out_features, in_features)``.
        ln1b: First-layer bias.
        ln2w: Second-layer weight, ``(out_features, in_features)``.
        ln2b: Second-layer bias.

    All arrays are copied into the module, so later optimizer steps leave the
    caller's NumPy data untouched.
    """

    def __init__(self, ln1w, ln1b, ln2w, ln2b):
        super(Model, self).__init__()
        # Weights are used as-is by nn.Linear, whose weight is stored as
        # (out_features, in_features); unpack in that order so the layer
        # metadata matches the installed tensors.
        H1, N = ln1w.shape
        H2, _ = ln2w.shape

        # torch.tensor copies its input. from_numpy(...).float() would alias a
        # float32 array and training would then mutate it in place.
        self.l1 = nn.Linear(N, H1)
        self.l1.weight.data = torch.tensor(ln1w, dtype=torch.float32)
        self.l1.bias.data = torch.tensor(ln1b, dtype=torch.float32)

        self.l2 = nn.Linear(H1, H2)
        self.l2.weight.data = torch.tensor(ln2w, dtype=torch.float32)
        self.l2.bias.data = torch.tensor(ln2b, dtype=torch.float32)

    def forward(self, x):
        """Return ``(y1, relu(y1), y2)``: hidden pre-activation, activation, and output."""
        y1 = self.l1(x)
        y1_relu = F.relu(y1)
        y2 = self.l2(y1_relu)
        return y1, y1_relu, y2
    
    
# Load the training set; labels are cast to int64, the dtype CrossEntropyLoss
# expects for class-index targets.
X_train = np.load('../dataset/x_train.npy')
y_train = np.load('../dataset/y_train.npy').astype(np.int64)

N_EPOCHS = 10
BATCH_SIZE = 32
# shuffle=False keeps the batch order identical from run to run.
train_dataset = TensorDataset(torch.from_numpy(X_train).float(), torch.from_numpy(y_train).long())
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=False)
data_loader = ParamsDownloader('../with-torch-tests/trained-model-cpu')
model = Model(data_loader.ln1w, data_loader.ln1b, data_loader.ln2w, data_loader.ln2b)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss(reduction='mean')

for epoch in range(N_EPOCHS):
    # One profiler session per epoch, CPU timings only.
    with profiler.profile(record_shapes=True, use_cuda=False) as prof:
        for X, y in train_loader:
            optimizer.zero_grad()
            y1, y1_relu, y2 = model(X)
            loss = criterion(y2, y)
            loss.backward()
            optimizer.step()
    # Logged outside the profiler so the report covers only the training steps.
    print(f"epoch: {epoch+1}, loss: {loss.item()}")
    print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))

x_test = np.load('../dataset/x_test.npy')
y_test = np.load('../dataset/y_test.npy')
# Inference needs no gradients; no_grad avoids building an autograd graph over
# the whole test set.
with torch.no_grad():
    y1, y1_relu, y2 = model(torch.from_numpy(x_test).float())
y_pred = y2.argmax(dim=1).numpy()
# NOTE(review): assumes y_test is a 1-D label vector; an (N, 1) array would
# broadcast against y_pred and skew the mean. Confirm against the dataset.
accuracy = (y_pred == y_test).mean()
print(f"Accuracy: {accuracy}")


## GPU training

In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
import torch.autograd.profiler as profiler

# Set the device to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

class ParamsDownloader:
    """Load ``ln1b.npy``, ``ln1w.npy``, ``ln2b.npy`` and ``ln2w.npy`` from a directory.

    The arrays are read in the constructor and stored as the attributes
    ``ln1b``, ``ln1w``, ``ln2b`` and ``ln2w``.
    """

    def __init__(self, data_path):
        self.data_path = data_path
        self.load_data()

    def _read(self, stem):
        """Return the array stored in ``<data_path>/<stem>.npy``."""
        return np.load(f'{self.data_path}/{stem}.npy')

    def load_data(self):
        """Read the four parameter files, biases before weights for each layer."""
        self.ln1b = self._read('ln1b')
        self.ln1w = self._read('ln1w')
        self.ln2b = self._read('ln2b')
        self.ln2w = self._read('ln2w')

class Model(nn.Module):
    """Two-layer network (Linear -> ReLU -> Linear) initialised from NumPy arrays.

    Parameters are created on the module-level ``device`` defined earlier in
    this cell.

    Args:
        ln1w: First-layer weight in ``nn.Linear.weight`` layout,
            ``(out_features, in_features)``.
        ln1b: First-layer bias.
        ln2w: Second-layer weight, ``(out_features, in_features)``.
        ln2b: Second-layer bias.

    The arrays are copied, so training never modifies the caller's NumPy data
    (relevant when ``device`` is the CPU, where the tensors would otherwise
    share memory with float32 inputs).
    """

    def __init__(self, ln1w, ln1b, ln2w, ln2b):
        super(Model, self).__init__()
        # nn.Linear stores its weight as (out_features, in_features) and the
        # arrays are installed verbatim, so read the layer sizes in that order.
        H1, N = ln1w.shape
        H2, _ = ln2w.shape

        # torch.tensor copies the data and places it on `device` in one step.
        self.l1 = nn.Linear(N, H1)
        self.l1.weight.data = torch.tensor(ln1w, dtype=torch.float32, device=device)
        self.l1.bias.data = torch.tensor(ln1b, dtype=torch.float32, device=device)

        self.l2 = nn.Linear(H1, H2)
        self.l2.weight.data = torch.tensor(ln2w, dtype=torch.float32, device=device)
        self.l2.bias.data = torch.tensor(ln2b, dtype=torch.float32, device=device)

    def forward(self, x):
        """Return ``(y1, relu(y1), y2)``: hidden pre-activation, activation, and output."""
        y1 = self.l1(x)
        y1_relu = F.relu(y1)
        y2 = self.l2(y1_relu)
        return y1, y1_relu, y2
    
# Load data and move to device
X_train = np.load('../dataset/x_train.npy')
y_train = np.load('../dataset/y_train.npy').astype(np.int64)

print(X_train[0, 0:5])

N_EPOCHS = 10
BATCH_SIZE = 32

# The whole training set is placed on `device` up front, so batches need no
# per-step transfer. shuffle=False keeps the batch order identical between runs.
train_dataset = TensorDataset(torch.from_numpy(X_train).float().to(device),
                              torch.from_numpy(y_train).long().to(device))
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=False)

data_loader = ParamsDownloader('../with-torch-tests/trained-model-cpu')
model = Model(data_loader.ln1w, data_loader.ln1b, data_loader.ln2w, data_loader.ln2b).to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss(reduction='mean')

# Training loop
for epoch in range(N_EPOCHS):
    for X, y in train_loader:
        optimizer.zero_grad()
        y1, y1_relu, y2 = model(X)
        loss = criterion(y2, y)
        loss.backward()
        optimizer.step()
    # Reports the loss of the last batch of the epoch.
    print(f"Epoch: {epoch+1}, Loss: {loss.item()}")

# Testing the model
x_test = np.load('../dataset/x_test.npy')
y_test = np.load('../dataset/y_test.npy')

# Inference needs no gradients; no_grad avoids keeping an autograd graph for
# the whole test set in device memory.
with torch.no_grad():
    y1, y1_relu, y2 = model(torch.from_numpy(x_test).float().to(device))
y_pred = y2.argmax(dim=1).cpu().numpy()
# NOTE(review): assumes y_test is a 1-D label vector; an (N, 1) array would
# broadcast against y_pred and skew the mean. Confirm against the dataset.
accuracy = (y_pred == y_test).mean()
print(f"Accuracy: {accuracy}")


In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
import torch.autograd.profiler as profiler

# Set the device to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

class ParamsDownloader:
    """Read ``ln1b``, ``ln1w``, ``ln2b`` and ``ln2w`` from ``.npy`` files under ``data_path``.

    The constructor triggers the load; each array is then available as an
    attribute with the same name as its file stem.
    """

    def __init__(self, data_path):
        self.data_path = data_path
        self.load_data()

    def load_data(self):
        """Load the four arrays with ``np.load`` and attach them to the instance."""
        base = self.data_path
        bias_1 = np.load(f'{base}/ln1b.npy')
        weight_1 = np.load(f'{base}/ln1w.npy')
        bias_2 = np.load(f'{base}/ln2b.npy')
        weight_2 = np.load(f'{base}/ln2w.npy')
        self.ln1b, self.ln1w = bias_1, weight_1
        self.ln2b, self.ln2w = bias_2, weight_2

class Model(nn.Module):
    """Two-layer network (Linear -> ReLU -> Linear) initialised from NumPy arrays.

    Parameters are created on the module-level ``device`` defined earlier in
    this cell.

    Args:
        ln1w: First-layer weight in ``nn.Linear.weight`` layout,
            ``(out_features, in_features)``.
        ln1b: First-layer bias.
        ln2w: Second-layer weight, ``(out_features, in_features)``.
        ln2b: Second-layer bias.

    The arrays are copied rather than aliased, so optimizer updates never
    write back into the caller's NumPy buffers.
    """

    def __init__(self, ln1w, ln1b, ln2w, ln2b):
        super(Model, self).__init__()
        # The arrays become nn.Linear weights unchanged, and nn.Linear keeps
        # its weight as (out_features, in_features); unpack accordingly so
        # in_features/out_features describe the real layers.
        H1, N = ln1w.shape
        H2, _ = ln2w.shape

        # torch.tensor copies the data and places it on `device` in one step.
        self.l1 = nn.Linear(N, H1)
        self.l1.weight.data = torch.tensor(ln1w, dtype=torch.float32, device=device)
        self.l1.bias.data = torch.tensor(ln1b, dtype=torch.float32, device=device)

        self.l2 = nn.Linear(H1, H2)
        self.l2.weight.data = torch.tensor(ln2w, dtype=torch.float32, device=device)
        self.l2.bias.data = torch.tensor(ln2b, dtype=torch.float32, device=device)

    def forward(self, x):
        """Return ``(y1, relu(y1), y2)``: hidden pre-activation, activation, and output."""
        y1 = self.l1(x)
        y1_relu = F.relu(y1)
        y2 = self.l2(y1_relu)
        return y1, y1_relu, y2
    
# Load data and move to device
X_train = np.load('../dataset/x_train.npy')
y_train = np.load('../dataset/y_train.npy').astype(np.int64)

print(X_train[0, 0:5])

N_EPOCHS = 10
BATCH_SIZE = 32

# The whole training set is placed on `device` up front, so batches need no
# per-step transfer. shuffle=False keeps the batch order identical between runs.
train_dataset = TensorDataset(torch.from_numpy(X_train).float().to(device),
                              torch.from_numpy(y_train).long().to(device))
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=False)

data_loader = ParamsDownloader('../with-torch-tests/trained-model-cpu')
model = Model(data_loader.ln1w, data_loader.ln1b, data_loader.ln2w, data_loader.ln2b).to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss(reduction='mean')

# Record CUDA kernel timings only when the model actually runs on a GPU, and
# rank the report by the clock that matters for the active device. Without
# use_cuda the profiler measures only host-side time for GPU runs.
profile_cuda = device.type == 'cuda'
profile_sort_key = "cuda_time_total" if profile_cuda else "cpu_time_total"

# Training loop
for epoch in range(N_EPOCHS):
    with profiler.profile(record_shapes=True, use_cuda=profile_cuda) as prof:
        for X, y in train_loader:
            optimizer.zero_grad()
            y1, y1_relu, y2 = model(X)
            loss = criterion(y2, y)
            loss.backward()
            optimizer.step()
    print(f"Epoch: {epoch+1}, Loss: {loss.item()}")
    # Print profiling results
    print(prof.key_averages().table(sort_by=profile_sort_key, row_limit=10))

# Testing the model
x_test = np.load('../dataset/x_test.npy')
y_test = np.load('../dataset/y_test.npy')

# Inference needs no gradients; no_grad avoids keeping an autograd graph for
# the whole test set in device memory.
with torch.no_grad():
    y1, y1_relu, y2 = model(torch.from_numpy(x_test).float().to(device))
y_pred = y2.argmax(dim=1).cpu().numpy()
# NOTE(review): assumes y_test is a 1-D label vector; an (N, 1) array would
# broadcast against y_pred and skew the mean. Confirm against the dataset.
accuracy = (y_pred == y_test).mean()
print(f"Accuracy: {accuracy}")


Using device: cpu
[0.78198105 0.4786313  0.7654065  0.05289226 0.815995  ]


  _torch_pytree._register_pytree_node(


Epoch: 0, Loss: 3.9396262168884277


KeyboardInterrupt: 

In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
import torch.autograd.profiler as profiler

# test kernels speed
repeats = 100

# Set the device to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Request CUDA timings only when a GPU is in use; on a CPU-only machine fall
# back to CPU timings so the tables stay meaningful.
use_cuda = device.type == 'cuda'
sort_key = "cuda_time_total" if use_cuda else "cpu_time_total"


def profile_kernel(label, fn):
    """Run ``fn`` ``repeats`` times under the autograd profiler and print a report.

    Args:
        label: Heading printed above the profiler table.
        fn: Zero-argument callable that executes the operation being measured.

    Returns:
        The profiler object, for further inspection of the recorded events.
    """
    with profiler.profile(record_shapes=True, use_cuda=use_cuda) as prof:
        for _ in range(repeats):
            fn()
    print(label)
    print(prof.key_averages().table(sort_by=sort_key, row_limit=10))
    return prof


# Inputs are allocated directly on `device` to skip a host-side copy.

# softmax
x = torch.randn(10000, 10000, device=device)
prof = profile_kernel("SOFTMAX", lambda: F.log_softmax(x, dim=1))


# NLLLoss (measured together with the log_softmax that feeds it)
x = torch.randn(10000, 10000, device=device)
y = torch.randint(0, 10000, (10000,), device=device)
criterion = nn.NLLLoss()
prof = profile_kernel("NLLLoss", lambda: criterion(F.log_softmax(x, dim=1), y))


# cross entropy
x = torch.randn(10000, 10000, device=device)
y = torch.randint(0, 10000, (10000,), device=device)
criterion = nn.CrossEntropyLoss()
prof = profile_kernel("CrossEntropyLoss", lambda: criterion(x, y))


# Linear layer
x = torch.randn(2048, 2048, device=device)
linear = nn.Linear(2048, 2048).to(device)
prof = profile_kernel("Linear", lambda: linear(x))


# relu
x = torch.randn(10000, 10000, device=device)
prof = profile_kernel("ReLU", lambda: F.relu(x))

Using device: cuda
SOFTMAX
----------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                  Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
----------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
     aten::log_softmax         4.88%       1.955ms       100.00%      40.026ms     400.260us     590.000us         0.07%     822.587ms       8.226ms           100  
    aten::_log_softmax        95.12%      38.071ms        95.12%      38.071ms     380.710us     821.997ms        99.93%     821.997ms       8.220ms           100  
----------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  -----------