In [19]:
from transformers import (
    AdamW,
    BertConfig,
    BertTokenizer,
    BertForSequenceClassification,
    get_linear_schedule_with_warmup, 
    XLMRobertaForSequenceClassification,
    AutoTokenizer, AutoModelForMaskedLM
)
import numpy as np
import os
import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import time

In [2]:
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')
model = AutoModelForMaskedLM.from_pretrained("xlm-roberta-base")

In [7]:
authors = np.repeat([1, 3, 2, 2, 3, 3, 2, 2, 3, 5, 4, 5, 3, 5, 2, 5, 6, 3, 5], 3)
#huggingface, google, deepl
translators = np.repeat([1, 2, 3],19)
fullds =[] 
directory = "russian short stories"
directories = ['init translations', 'google translation', 'deepl translations']
#directory = "test translation"
#files = ["Сказка о лысом пророке Елисее chiornyi.txt"]
for directory in directories:
    #print(directory)
    for file in os.listdir(directory):
#for file in files:
        filename = os.fsdecode(file)
        if filename.endswith(".txt"):
            #print(filename)
            #f = open(f"{directory}/{filename}", encoding='utf-8') #'r',
            f = open(f"{directory}/{filename}", 'r',encoding='utf-8')
            text = f.read()
            f.close()
            fullds.append({'text':text})
            #fullds.append(text)
translated_df = pd.DataFrame(fullds)

In [8]:
intrain = [i % 9 > 2 for i in range(translated_df.size)]
notintrain = [not element for element in intrain]
train_df = translated_df[intrain]
test_df = translated_df[notintrain]
codestrain = authors[intrain]
#codestrain = translators[intrain]
codestest = authors[notintrain]
#codestest = translators[notintrain]

In [9]:
MAX_LEN = 128
def get_encodings(texts):
    token_ids = []
    for text in texts:
        token_id = tokenizer.encode(text, 
                                    add_special_tokens=True,
                                    truncation=True,
                                    max_length=MAX_LEN,
                                    padding='max_length')
                                    #pad_to_max_length=True)
        token_ids.append(token_id)
    return token_ids

def get_attention_masks(padded_encodings):
    attention_masks = []
    for encoding in padded_encodings:
        attention_mask = [int(token_id > 0) for token_id in encoding]
        attention_masks.append(attention_mask)
    return attention_masks

In [10]:
train_encodings = get_encodings(train_df.text.values)
train_attention_masks = get_attention_masks(train_encodings)

test_encodings = get_encodings(test_df.text.values)
test_attention_masks = get_attention_masks(test_encodings)

In [15]:
batch_size = 4
batch_size = 1
#batch_size = 512
# Load input data into tensors
train_input_ids = torch.tensor(train_encodings)
train_masks = torch.tensor(train_attention_masks)
train_labels = torch.tensor(codestrain)

test_input_ids = torch.tensor(test_encodings)
test_masks = torch.tensor(test_attention_masks)
test_labels = torch.tensor(codestest)

# Create the DataLoader and Sampler for both sets.
train_data = TensorDataset(train_input_ids, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

test_data = TensorDataset(test_input_ids, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

In [16]:
lr = 4e-5
eps = 1e-8
epochs = 10

# Note: AdamW is a class from the huggingface library (as opposed to pytorch) 
# I believe the 'W' stands for 'Weight Decay fix"
optimizer = torch.optim.AdamW(model.parameters(), lr=lr, eps=eps)

In [17]:
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [20]:
#              Training
train_loss_values = []
test_loss_values = []
epochs = 8
for epoch_i in range(0, epochs):
    print('\n======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    t0 = time.time()
    model.train()
    train_loss, train_accuracy = 0, 0
    for step, batch in enumerate(train_dataloader):
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
        input_ids, input_masks, labels = tuple(t for t in batch)# .to(device)
        model.zero_grad()     
        print(input_ids.shape)
        print(labels.shape)
        #outputs = model(torch.LongTensor(input_ids), attention_mask=torch.LongTensor(input_masks), labels=labels) #complains here
        outputs = model(input_ids.to(torch.long), attention_mask=input_masks.to(torch.long), labels=labels.to(torch.long))
        
        # Get the "logits" output by the model. The "logits" are the output
        # values prior to applying an activation function like the softmax.
        loss = outputs[0]
        logits = outputs[1]

        logits = logits.detach().cpu().numpy()
        labels = labels.detach().cpu().numpy()
        
        train_loss += loss.item()

        # Calculate the accuracy for this batch of test sentences.
        batch_accuracy = flat_accuracy(logits, labels)
        # Accumulate the total accuracy.
        train_accuracy += batch_accuracy
        loss.backward()

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        
        optimizer.step()
        #scheduler.step()

    avg_train_loss = train_loss / len(train_dataloader)   
    avg_train_accuracy = train_accuracy / len(train_dataloader)        
    # Store the loss value for plotting the learning curve.
    train_loss_values.append(avg_train_loss)
    print("\n  Accuracy: {0:.3f}".format(avg_train_accuracy))
    print("  Average training loss: {0:.3f}".format(avg_train_loss))
    #print("  Training epoch took: {:}".format(format_time(time.time() - t0)))
    #  Test
    # After the completion of each training epoch, measure our performance on our test set.
    t0 = time.time()
    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()
    test_loss, test_accuracy = 0, 0

    for batch in test_dataloader:
        
        input_ids, input_masks, labels = tuple(t for t in batch) #.to(device)
        
        # Telling the model not to compute or store gradients, saving memory and
        # speeding up validation
        with torch.no_grad():        
            outputs = model(input_ids.to(torch.long), attention_mask=input_masks.to(torch.long), labels=labels.to(torch.long))
            #outputs = model(input_ids, attention_mask=input_masks,labels=labels)
        
        loss = outputs[0]
        logits = outputs[1]

        logits = logits.detach().cpu().numpy()
        labels = labels.to('cpu').numpy()

        batch_accuracy = flat_accuracy(logits, labels)
        test_accuracy += batch_accuracy
        test_loss += loss.item()

    avg_test_loss = test_loss / len(test_dataloader)
    avg_test_accuracy = test_accuracy / len(test_dataloader)

    test_loss_values.append(avg_test_loss)

    # Report the final accuracy for this validation run.
    print("  Accuracy: {0:.3f}".format(avg_test_accuracy))
    print("  Average eval loss: {0:.3f}".format(avg_test_loss))
    #print("  Testing took: {:}".format(format_time(time.time() - t0)))

print("\nTraining complete!")


torch.Size([1, 128])
torch.Size([1])


ValueError: Expected input batch_size (128) to match target batch_size (1).