In [4]:
import unicodedata
import re
from tqdm import tqdm
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s


def load_lines(path):
    sentences = []
    with open(path, encoding='utf8') as f:
        for line in tqdm(f):
            sentences.append(normalizeString(line))
    return sentences

In [5]:
!tar -xvzf /kaggle/input/europarl/de-en\ \(1\).tgz -C /kaggle/working/

europarl-v7.de-en.de
europarl-v7.de-en.en


In [6]:

# Loading the data
de_data = load_lines('/kaggle/working/europarl-v7.de-en.de')
en_data = load_lines('/kaggle/working/europarl-v7.de-en.en')




1920209it [02:28, 12939.96it/s]
1920209it [02:09, 14880.87it/s]


In [7]:
SOS = 0
EOS = 1
PAD = 2
en_word_count = [3]
de_word_count = [3]
en_word_counter = {}
de_word_counter = {}
en_word_index = {}
de_word_index = {}

In [8]:
def tokenize(sentence, word_counter, word_index, word_count, MAX_LEN=10):
    split_sentence = [word for word in sentence.split(' ')]
    tokenized = [SOS]
    if len(split_sentence)>MAX_LEN:
        return None
    for i in split_sentence[:MAX_LEN]:
        if i in word_index:
            word_counter[i] += 1
            tokenized.append(word_index[i])
        else:
            word_index[i] = word_count[0]
            word_count[0] += 1
            word_counter[i] = 1
            tokenized.append(word_index[i])
    tokenized.append(EOS)
    tokenized += [PAD]*(MAX_LEN - len(split_sentence))
    return tokenized

In [9]:
from tqdm import tqdm
de_data_tokenized = []
en_data_tokenized = []
de_test_data = []
en_test_data = []
import random
for i in tqdm(range(len(de_data))):
    tokens_ret_de = tokenize(de_data[i], de_word_counter, de_word_index, de_word_count)
    tokens_ret_en = tokenize(en_data[i], en_word_counter, en_word_index, en_word_count)
    if (tokens_ret_de != None) and (tokens_ret_en != None):
        if random.random() < 0.05:
            de_test_data.append(tokens_ret_de)
            en_test_data.append(tokens_ret_en)
        else:
            en_data_tokenized.append(tokens_ret_en)
            de_data_tokenized.append(tokens_ret_de)
    


print(de_data_tokenized[:10])
en_word_count = en_word_count[0]
de_word_count = de_word_count[0]

100%|██████████| 1920209/1920209 [00:18<00:00, 105436.23it/s]

[[0, 3, 4, 5, 1, 2, 2, 2, 2, 2, 2, 2], [0, 6, 7, 8, 9, 10, 11, 12, 10, 13, 14, 1], [0, 19, 20, 21, 22, 14, 1, 2, 2, 2, 2, 2], [0, 19, 20, 21, 22, 14, 1, 2, 2, 2, 2, 2], [0, 19, 20, 23, 1, 2, 2, 2, 2, 2, 2, 2], [0, 24, 25, 26, 27, 16, 28, 6, 29, 30, 14, 1], [0, 16, 31, 4, 32, 33, 34, 35, 14, 1, 2, 2], [0, 50, 51, 52, 53, 54, 55, 1, 2, 2, 2, 2], [0, 50, 49, 45, 56, 57, 16, 58, 40, 59, 55, 1], [0, 60, 61, 62, 63, 16, 64, 14, 1, 2, 2, 2]]





In [10]:
print(len(de_data_tokenized))
print(len(en_data_tokenized))

188878
188878


In [11]:
import torch
def gen_batches(input_data, output_data, batch_size):
    batches = []
    for i in tqdm(range(0, len(input_data), batch_size)):
        count = min(len(input_data) - batch_size, batch_size)
        input_tensor = torch.LongTensor(input_data[i:i+count][:]).cuda()
        output_tensor = torch.LongTensor(output_data[i:i+count][:]).cuda()
        batches.append([input_tensor, output_tensor])
    return batches

In [12]:
data_loader = gen_batches(en_data_tokenized, de_data_tokenized, 64)
test_data_loader = gen_batches(en_test_data,de_test_data,64)

100%|██████████| 2952/2952 [00:03<00:00, 784.43it/s] 
100%|██████████| 156/156 [00:00<00:00, 2982.60it/s]


In [13]:
import torch 
import torch.nn as nn

class MultiHeadAttentionLayer(nn.Module):
    def __init__(self, hidden_size, n_heads, dropout):
        super().__init__()
        
        assert hidden_size % n_heads == 0
        
        self.hidden_size = hidden_size
        self.n_heads = n_heads
        self.head_size = hidden_size // n_heads
        
        self.fc_query = nn.Linear(hidden_size, hidden_size)
        self.fc_key = nn.Linear(hidden_size, hidden_size)
        self.fc_value = nn.Linear(hidden_size, hidden_size)
        self.fc_out = nn.Linear(hidden_size, hidden_size)
    
        self.dp = nn.Dropout(dropout)
        
        self.coefficient = torch.sqrt(torch.FloatTensor([self.head_size])).cuda()
        
    def forward(self, query, key, value, mask=None):
        b_size = query.shape[0]
   
        query_output = self.fc_query(query)
        key_output = self.fc_key(key)
        value_output = self.fc_value(value)
     
        query_output = query_output.view(b_size, -1, self.n_heads, self.head_size).permute(0, 2, 1, 3)
        key_output = key_output.view(b_size, -1, self.n_heads, self.head_size).permute(0, 2, 1, 3)
        value_output = value_output.view(b_size, -1, self.n_heads, self.head_size).permute(0, 2, 1, 3)
      
        energy = torch.matmul(query_output, key_output.permute(0, 1, 3, 2)) / self.coefficient
        if mask is not None:
            energy = energy.masked_fill(mask == 0, -1e10)
        
        attention = torch.softmax(energy, dim = -1)    
        output = torch.matmul(self.dp(attention), value_output)
        output = output.permute(0, 2, 1, 3).contiguous()
        output = output.view(b_size, -1, self.hidden_size)  
        output = self.fc_out(output)
        return output, attention



class FeedForwardLayer(nn.Module):
    def __init__(self, hidden_size, ff_size, dropout):
        super().__init__()

        self.ff_layer = nn.Sequential(
            nn.Linear(hidden_size, ff_size),
            nn.ReLU(),
            
            nn.Dropout(dropout),
            nn.Linear(ff_size, hidden_size)
        )
        
    def forward(self, input):
        output = self.ff_layer(input)
        return output

class EncoderLayer(nn.Module):
    def __init__(self, hidden_size, n_heads, ff_size,  dropout):
        super().__init__()
        
        self.self_atten = MultiHeadAttentionLayer(hidden_size, n_heads, dropout)
        self.self_atten_norm = nn.LayerNorm(hidden_size)
        self.ff_layer = FeedForwardLayer(hidden_size, ff_size, dropout)
        self.dp = nn.Dropout(dropout)
        self.ff_layer_norm = nn.LayerNorm(hidden_size)
        
    def forward(self, input, input_mask):
        #self attention
        atten_result, _ = self.self_atten(input, input, input, input_mask)
        
        atten_norm = self.self_atten_norm(input + self.dp(atten_result))
        ff_result = self.ff_layer(atten_norm)
        
        output = self.ff_layer_norm(atten_norm + self.dp(ff_result))
        return output

class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size, n_layers, n_heads, ff_size,dropout, MAX_LENGTH=100):
        super().__init__()

        
        self.te = nn.Embedding(input_size, hidden_size)
        self.pe = nn.Embedding(MAX_LENGTH, hidden_size)
        
        encoding_layers = []
        for _ in range(n_layers):
            encoding_layers.append(EncoderLayer(hidden_size, n_heads, ff_size, dropout))
        self.encode_sequence = nn.Sequential(*encoding_layers)
        
        self.dp = nn.Dropout(dropout)
        
        self.coefficient = torch.sqrt(torch.FloatTensor([hidden_size])).cuda()
        
    def forward(self, input, input_mask):
        b_size = input.shape[0]
        input_size = input.shape[1]
        
        pos = torch.arange(0, input_size).unsqueeze(0).repeat(b_size, 1).cuda()
        input = self.dp((self.te(input) * self.coefficient) + self.pe(pos))

        for layer in self.encode_sequence:
            input = layer(input, input_mask)
  
        return input

class DecoderLayer(nn.Module):
    def __init__(self, hidden_size, n_heads, ff_size, dropout):
        super().__init__()
        
        self.self_atten = MultiHeadAttentionLayer(hidden_size, n_heads, dropout)
        self.self_atten_norm = nn.LayerNorm(hidden_size)
        self.encoder_atten = MultiHeadAttentionLayer(hidden_size, n_heads, dropout)
        self.encoder_atten_norm = nn.LayerNorm(hidden_size)
        self.ff_layer = FeedForwardLayer(hidden_size, ff_size, dropout)
        self.ff_layer_norm = nn.LayerNorm(hidden_size)
        self.dp = nn.Dropout(dropout)
        
    def forward(self, target, encoded_input, target_mask, input_mask):
        #self attention
        atten_result, _ = self.self_atten(target, target, target, target_mask)
        
        atten_norm = self.self_atten_norm(target + self.dp(atten_result))

        atten_encoded, attention = self.encoder_atten(atten_norm, encoded_input, encoded_input, input_mask)
        
        encoded_norm = self.encoder_atten_norm(atten_norm + self.dp(atten_encoded))

        ff_result = self.ff_layer(encoded_norm)

        output = self.ff_layer_norm(encoded_norm + self.dp(ff_result))

        return output, attention

class Decoder(nn.Module):
    def __init__(self, output_size, hidden_size, n_layers, n_heads, ff_size, dropout, MAX_LENGTH=100):
        super().__init__()
        
        self.te = nn.Embedding(output_size, hidden_size)
        self.pe = nn.Embedding(MAX_LENGTH, hidden_size)

        decoding_layers = []
        for _ in range(n_layers):
            decoding_layers.append(DecoderLayer(hidden_size, n_heads, ff_size, dropout))
        
        self.decode_sequence = nn.Sequential(*decoding_layers) 
        
        self.fc_out = nn.Linear(hidden_size, output_size)
        
        self.dp = nn.Dropout(dropout)
        
        self.coefficient = torch.sqrt(torch.FloatTensor([hidden_size])).cuda()
        
    def forward(self, target, encoded_input, target_mask, input_mask):    
        b_size = target.shape[0]
        target_size = target.shape[1]
        
        pos = torch.arange(0, target_size).unsqueeze(0).repeat(b_size, 1).cuda()
        target = self.dp((self.te(target) * self.coefficient) + self.pe(pos))
        for layer in self.decode_sequence:
            target, attention = layer(target, encoded_input, target_mask, input_mask)

        output = self.fc_out(target)
        return output, attention

class Transformer(nn.Module):
    def __init__(self, encoder, decoder, padding_index=0):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.padding_index = padding_index
        
    def make_input_mask(self, input):

        input_mask = (input != self.padding_index).unsqueeze(1).unsqueeze(2)
        return input_mask
    
    def make_target_mask(self, target):

        target_pad_mask = (target != self.padding_index).unsqueeze(1).unsqueeze(2)
        target_sub_mask = torch.tril(torch.ones((target.shape[1], target.shape[1]))).bool().cuda()
        target_mask = target_pad_mask & target_sub_mask
        return target_mask

    def forward(self, input, target):   
        input_mask = self.make_input_mask(input)
        target_mask = self.make_target_mask(target)

        #encoder feed through
        encoded_input = self.encoder(input, input_mask)

        #decoder feed_through
        output, attention = self.decoder(target, encoded_input, target_mask, input_mask)

        return output, attention

In [14]:
# print(en_word_index)
print(de_word_count)

encoder_part = Encoder(en_word_count, 64, 3, 8, 128, 0.1)
decoder_part = Decoder(de_word_count, 64, 3, 8, 128, 0.1)

transformer = Transformer(encoder_part, decoder_part, PAD).cuda()

54433


In [15]:
def initialize_weights(model):
        if hasattr(model, 'weight') and model.weight.dim() > 1:
            nn.init.xavier_uniform_(model.weight.data)

import torch.optim as optim
transformer.apply(initialize_weights)
loss_func = nn.CrossEntropyLoss(ignore_index=PAD)
optimizer = optim.Adam(transformer.parameters(), lr=0.0005)

In [16]:
def evaluate():
    shuffle(test_data_loader)
    test_loss = 0
    print("evaluating on test data")
    for input, target in tqdm(test_data_loader):
        
        #pass through transformer
        output, _ = transformer(input, target[:,:-1])
        output_dim = output.shape[-1]

        #flatten and omit SOS from target
        output = output.contiguous().view(-1, output_dim)
        target = target[:,1:].contiguous().view(-1)

        #loss
        loss = loss_func(output, target)
        test_loss += loss
               
    return test_loss.item()

In [25]:
import os

# Define the path for saving the model
model_dir = '/kaggle/working/'
model_filename = os.path.join(model_dir, 'transformer_model_{}.pt'.format(epoch))




NameError: name 'epoch' is not defined

In [17]:
def calculate_accuracy(output, target):
    # Convert output probabilities to predicted class (argmax over output dimension)
    predictions = output.argmax(dim=1, keepdim=True)
    # Compare with actual classes
    correct = predictions.eq(target.view_as(predictions)).sum()
    # Calculate accuracy
    acc = correct.float() / target.shape[0]
    return acc


In [18]:
import time
from random import shuffle


import torch

# Function to convert data to tensor and move to GPU if available
def to_tensor_and_device(data):
    if not torch.is_tensor(data):
        data = torch.LongTensor(data)
    if torch.cuda.is_available():
        data = data.cuda()
    return data

# Convert and move your data
en_test_data = to_tensor_and_device(en_test_data)
de_test_data = to_tensor_and_device(de_test_data)

for epoch in range(40):
    shuffle(data_loader)
    shuffle(test_data_loader)
    train_loss = 0
    train_acc = 0
    start_time = time.time()
    
    for input, target in tqdm(data_loader):
        optimizer.zero_grad()

        # Pass through transformer
        output, _ = transformer(input, target[:,:-1])
        output_dim = output.shape[-1]

        # Flatten and omit SOS from target
        output = output.contiguous().view(-1, output_dim)
        target = target[:,1:].contiguous().view(-1)

        # Loss
        loss = loss_func(output, target)

        # Backpropagation
        loss.backward()
        nn.utils.clip_grad_norm_(transformer.parameters(), 1)
        optimizer.step()

        train_loss += loss.item()
        train_acc += calculate_accuracy(output, target).item()

    train_loss /= len(data_loader)
    train_acc /= len(data_loader)
    end_time = int(time.time() - start_time)

    

   
    print(f'Epoch: {epoch}, Time: {end_time}s, Estimated {(20-epoch)*end_time} seconds remaining.')
    print(f'\tTraining Loss: {train_loss:.4f}, Training Accuracy: {train_acc * 100:.2f}%')

    # Evaluate on test data
    test_loss = 0
    test_acc = 0
    with torch.no_grad():
        for input, target in tqdm(test_data_loader):

            # Pass through transformer
            output, _ = transformer(input, target[:,:-1])
            output_dim = output.shape[-1]

            # Flatten and omit SOS from target
            output = output.contiguous().view(-1, output_dim)
            target = target[:,1:].contiguous().view(-1)

            # Loss
            loss = loss_func(output, target)
            test_loss += loss.item()
            test_acc += calculate_accuracy(output, target).item()

    test_loss /= len(test_data_loader)
    test_acc /= len(test_data_loader)
    print(f'\tTest Loss: {test_loss:.4f}, Test Accuracy: {test_acc * 100:.2f}%')

print('Training finished!')

100%|██████████| 2952/2952 [01:13<00:00, 40.20it/s]


Epoch: 0, Time: 73s, Estimated 1460 seconds remaining.
	Training Loss: 4.2732, Training Accuracy: 27.90%


100%|██████████| 156/156 [00:01<00:00, 108.16it/s]


	Test Loss: 3.3129, Test Accuracy: 35.43%


100%|██████████| 2952/2952 [01:11<00:00, 41.09it/s]


Epoch: 1, Time: 71s, Estimated 1349 seconds remaining.
	Training Loss: 2.9130, Training Accuracy: 38.27%


100%|██████████| 156/156 [00:01<00:00, 97.33it/s] 


	Test Loss: 2.8211, Test Accuracy: 39.68%


100%|██████████| 2952/2952 [01:12<00:00, 40.96it/s]


Epoch: 2, Time: 72s, Estimated 1296 seconds remaining.
	Training Loss: 2.4456, Training Accuracy: 41.68%


100%|██████████| 156/156 [00:01<00:00, 107.54it/s]


	Test Loss: 2.6105, Test Accuracy: 41.53%


100%|██████████| 2952/2952 [01:11<00:00, 41.13it/s]


Epoch: 3, Time: 71s, Estimated 1207 seconds remaining.
	Training Loss: 2.1534, Training Accuracy: 43.75%


100%|██████████| 156/156 [00:01<00:00, 107.20it/s]


	Test Loss: 2.5018, Test Accuracy: 42.68%


100%|██████████| 2952/2952 [01:11<00:00, 41.09it/s]


Epoch: 4, Time: 71s, Estimated 1136 seconds remaining.
	Training Loss: 1.9523, Training Accuracy: 45.16%


100%|██████████| 156/156 [00:01<00:00, 104.03it/s]


	Test Loss: 2.4472, Test Accuracy: 43.13%


100%|██████████| 2952/2952 [01:12<00:00, 40.99it/s]


Epoch: 5, Time: 72s, Estimated 1080 seconds remaining.
	Training Loss: 1.8040, Training Accuracy: 46.22%


100%|██████████| 156/156 [00:01<00:00, 108.45it/s]


	Test Loss: 2.4149, Test Accuracy: 43.51%


100%|██████████| 2952/2952 [01:11<00:00, 41.10it/s]


Epoch: 6, Time: 71s, Estimated 994 seconds remaining.
	Training Loss: 1.6888, Training Accuracy: 47.07%


100%|██████████| 156/156 [00:01<00:00, 106.79it/s]


	Test Loss: 2.4024, Test Accuracy: 43.90%


100%|██████████| 2952/2952 [01:11<00:00, 41.06it/s]


Epoch: 7, Time: 71s, Estimated 923 seconds remaining.
	Training Loss: 1.5984, Training Accuracy: 47.78%


100%|██████████| 156/156 [00:01<00:00, 103.78it/s]


	Test Loss: 2.3869, Test Accuracy: 44.16%


100%|██████████| 2952/2952 [01:12<00:00, 40.97it/s]


Epoch: 8, Time: 72s, Estimated 864 seconds remaining.
	Training Loss: 1.5257, Training Accuracy: 48.40%


100%|██████████| 156/156 [00:01<00:00, 107.57it/s]


	Test Loss: 2.4059, Test Accuracy: 44.09%


100%|██████████| 2952/2952 [01:11<00:00, 41.19it/s]


Epoch: 9, Time: 71s, Estimated 781 seconds remaining.
	Training Loss: 1.4649, Training Accuracy: 48.95%


100%|██████████| 156/156 [00:01<00:00, 108.33it/s]


	Test Loss: 2.4053, Test Accuracy: 44.30%


100%|██████████| 2952/2952 [01:11<00:00, 41.29it/s]


Epoch: 10, Time: 71s, Estimated 710 seconds remaining.
	Training Loss: 1.4143, Training Accuracy: 49.46%


100%|██████████| 156/156 [00:01<00:00, 105.06it/s]


	Test Loss: 2.4255, Test Accuracy: 44.35%


100%|██████████| 2952/2952 [01:12<00:00, 40.93it/s]


Epoch: 11, Time: 72s, Estimated 648 seconds remaining.
	Training Loss: 1.3773, Training Accuracy: 49.81%


100%|██████████| 156/156 [00:01<00:00, 103.09it/s]


	Test Loss: 2.4243, Test Accuracy: 44.37%


100%|██████████| 2952/2952 [01:11<00:00, 41.10it/s]


Epoch: 12, Time: 71s, Estimated 568 seconds remaining.
	Training Loss: 1.3417, Training Accuracy: 50.18%


100%|██████████| 156/156 [00:01<00:00, 108.13it/s]


	Test Loss: 2.4363, Test Accuracy: 44.43%


100%|██████████| 2952/2952 [01:11<00:00, 41.20it/s]


Epoch: 13, Time: 71s, Estimated 497 seconds remaining.
	Training Loss: 1.3151, Training Accuracy: 50.47%


100%|██████████| 156/156 [00:01<00:00, 105.75it/s]


	Test Loss: 2.4460, Test Accuracy: 44.46%


100%|██████████| 2952/2952 [01:12<00:00, 40.99it/s]


Epoch: 14, Time: 72s, Estimated 432 seconds remaining.
	Training Loss: 1.2904, Training Accuracy: 50.71%


100%|██████████| 156/156 [00:01<00:00, 108.90it/s]


	Test Loss: 2.4649, Test Accuracy: 44.39%


100%|██████████| 2952/2952 [01:11<00:00, 41.13it/s]


Epoch: 15, Time: 71s, Estimated 355 seconds remaining.
	Training Loss: 1.2710, Training Accuracy: 50.93%


100%|██████████| 156/156 [00:01<00:00, 108.17it/s]


	Test Loss: 2.4765, Test Accuracy: 44.42%


100%|██████████| 2952/2952 [01:11<00:00, 41.09it/s]


Epoch: 16, Time: 71s, Estimated 284 seconds remaining.
	Training Loss: 1.2526, Training Accuracy: 51.12%


100%|██████████| 156/156 [00:01<00:00, 104.79it/s]


	Test Loss: 2.4825, Test Accuracy: 44.29%


100%|██████████| 2952/2952 [01:11<00:00, 41.06it/s]


Epoch: 17, Time: 71s, Estimated 213 seconds remaining.
	Training Loss: 1.2369, Training Accuracy: 51.33%


100%|██████████| 156/156 [00:01<00:00, 108.38it/s]


	Test Loss: 2.4801, Test Accuracy: 44.54%


100%|██████████| 2952/2952 [01:11<00:00, 41.11it/s]


Epoch: 18, Time: 71s, Estimated 142 seconds remaining.
	Training Loss: 1.2235, Training Accuracy: 51.46%


100%|██████████| 156/156 [00:01<00:00, 106.25it/s]


	Test Loss: 2.4987, Test Accuracy: 44.36%


100%|██████████| 2952/2952 [01:11<00:00, 41.05it/s]


Epoch: 19, Time: 71s, Estimated 71 seconds remaining.
	Training Loss: 1.2100, Training Accuracy: 51.59%


100%|██████████| 156/156 [00:01<00:00, 104.53it/s]


	Test Loss: 2.4986, Test Accuracy: 44.40%


100%|██████████| 2952/2952 [01:12<00:00, 40.99it/s]


Epoch: 20, Time: 72s, Estimated 0 seconds remaining.
	Training Loss: 1.1982, Training Accuracy: 51.75%


100%|██████████| 156/156 [00:01<00:00, 108.41it/s]


	Test Loss: 2.4994, Test Accuracy: 44.46%


100%|██████████| 2952/2952 [01:11<00:00, 41.11it/s]


Epoch: 21, Time: 71s, Estimated -71 seconds remaining.
	Training Loss: 1.1866, Training Accuracy: 51.87%


100%|██████████| 156/156 [00:01<00:00, 107.36it/s]


	Test Loss: 2.5107, Test Accuracy: 44.40%


100%|██████████| 2952/2952 [01:11<00:00, 41.03it/s]


Epoch: 22, Time: 71s, Estimated -142 seconds remaining.
	Training Loss: 1.1767, Training Accuracy: 51.99%


100%|██████████| 156/156 [00:01<00:00, 104.50it/s]


	Test Loss: 2.5123, Test Accuracy: 44.38%


100%|██████████| 2952/2952 [01:11<00:00, 41.00it/s]


Epoch: 23, Time: 72s, Estimated -216 seconds remaining.
	Training Loss: 1.1666, Training Accuracy: 52.09%


100%|██████████| 156/156 [00:01<00:00, 106.87it/s]


	Test Loss: 2.5122, Test Accuracy: 44.50%


100%|██████████| 2952/2952 [01:11<00:00, 41.08it/s]


Epoch: 24, Time: 71s, Estimated -284 seconds remaining.
	Training Loss: 1.1584, Training Accuracy: 52.18%


100%|██████████| 156/156 [00:01<00:00, 107.75it/s]


	Test Loss: 2.5045, Test Accuracy: 44.57%


100%|██████████| 2952/2952 [01:12<00:00, 40.98it/s]


Epoch: 25, Time: 72s, Estimated -360 seconds remaining.
	Training Loss: 1.1504, Training Accuracy: 52.27%


100%|██████████| 156/156 [00:01<00:00, 107.11it/s]


	Test Loss: 2.5121, Test Accuracy: 44.44%


100%|██████████| 2952/2952 [01:11<00:00, 41.03it/s]


Epoch: 26, Time: 71s, Estimated -426 seconds remaining.
	Training Loss: 1.1429, Training Accuracy: 52.34%


100%|██████████| 156/156 [00:01<00:00, 104.27it/s]


	Test Loss: 2.5179, Test Accuracy: 44.40%


100%|██████████| 2952/2952 [01:11<00:00, 41.08it/s]


Epoch: 27, Time: 71s, Estimated -497 seconds remaining.
	Training Loss: 1.1352, Training Accuracy: 52.46%


100%|██████████| 156/156 [00:01<00:00, 108.04it/s]


	Test Loss: 2.5175, Test Accuracy: 44.51%


100%|██████████| 2952/2952 [01:11<00:00, 41.31it/s]


Epoch: 28, Time: 71s, Estimated -568 seconds remaining.
	Training Loss: 1.1285, Training Accuracy: 52.51%


100%|██████████| 156/156 [00:01<00:00, 105.27it/s]


	Test Loss: 2.5252, Test Accuracy: 44.44%


100%|██████████| 2952/2952 [01:11<00:00, 41.07it/s]


Epoch: 29, Time: 71s, Estimated -639 seconds remaining.
	Training Loss: 1.1226, Training Accuracy: 52.60%


100%|██████████| 156/156 [00:01<00:00, 106.03it/s]


	Test Loss: 2.5379, Test Accuracy: 44.38%


100%|██████████| 2952/2952 [01:11<00:00, 41.32it/s]


Epoch: 30, Time: 71s, Estimated -710 seconds remaining.
	Training Loss: 1.1166, Training Accuracy: 52.66%


100%|██████████| 156/156 [00:01<00:00, 106.54it/s]


	Test Loss: 2.5406, Test Accuracy: 44.46%


100%|██████████| 2952/2952 [01:11<00:00, 41.08it/s]


Epoch: 31, Time: 71s, Estimated -781 seconds remaining.
	Training Loss: 1.1115, Training Accuracy: 52.72%


100%|██████████| 156/156 [00:01<00:00, 104.65it/s]


	Test Loss: 2.5449, Test Accuracy: 44.50%


100%|██████████| 2952/2952 [01:12<00:00, 40.96it/s]


Epoch: 32, Time: 72s, Estimated -864 seconds remaining.
	Training Loss: 1.1065, Training Accuracy: 52.78%


100%|██████████| 156/156 [00:01<00:00, 108.45it/s]


	Test Loss: 2.5431, Test Accuracy: 44.34%


100%|██████████| 2952/2952 [01:11<00:00, 41.05it/s]


Epoch: 33, Time: 71s, Estimated -923 seconds remaining.
	Training Loss: 1.1024, Training Accuracy: 52.82%


100%|██████████| 156/156 [00:01<00:00, 109.41it/s]


	Test Loss: 2.5433, Test Accuracy: 44.38%


100%|██████████| 2952/2952 [01:11<00:00, 41.17it/s]


Epoch: 34, Time: 71s, Estimated -994 seconds remaining.
	Training Loss: 1.0956, Training Accuracy: 52.92%


100%|██████████| 156/156 [00:01<00:00, 106.02it/s]


	Test Loss: 2.5509, Test Accuracy: 44.43%


100%|██████████| 2952/2952 [01:11<00:00, 41.06it/s]


Epoch: 35, Time: 71s, Estimated -1065 seconds remaining.
	Training Loss: 1.0912, Training Accuracy: 52.95%


100%|██████████| 156/156 [00:01<00:00, 106.96it/s]


	Test Loss: 2.5431, Test Accuracy: 44.42%


100%|██████████| 2952/2952 [01:11<00:00, 41.23it/s]


Epoch: 36, Time: 71s, Estimated -1136 seconds remaining.
	Training Loss: 1.0869, Training Accuracy: 53.01%


100%|██████████| 156/156 [00:01<00:00, 104.30it/s]


	Test Loss: 2.5582, Test Accuracy: 44.54%


100%|██████████| 2952/2952 [01:11<00:00, 41.13it/s]


Epoch: 37, Time: 71s, Estimated -1207 seconds remaining.
	Training Loss: 1.0837, Training Accuracy: 53.04%


100%|██████████| 156/156 [00:01<00:00, 103.97it/s]


	Test Loss: 2.5541, Test Accuracy: 44.51%


100%|██████████| 2952/2952 [01:11<00:00, 41.05it/s]


Epoch: 38, Time: 71s, Estimated -1278 seconds remaining.
	Training Loss: 1.0799, Training Accuracy: 53.08%


100%|██████████| 156/156 [00:01<00:00, 101.95it/s]


	Test Loss: 2.5544, Test Accuracy: 44.44%


100%|██████████| 2952/2952 [01:11<00:00, 41.10it/s]


Epoch: 39, Time: 71s, Estimated -1349 seconds remaining.
	Training Loss: 1.0761, Training Accuracy: 53.12%


100%|██████████| 156/156 [00:01<00:00, 106.42it/s]

	Test Loss: 2.5527, Test Accuracy: 44.37%
Training finished!





In [19]:
import gc
gc.collect()


0