<a href="https://colab.research.google.com/github/Urdatorn/automatisk-metrisk-analys/blob/master/bert_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers

from transformers import BertTokenizer, BertForTokenClassification, Trainer, TrainingArguments
import torch

# Load the pre-trained model and tokenizer.
# num_labels=3 sizes the classification head for the three classes used by
# the label mapping in encode_examples ('-', 'u' and ' ').
model_name = "cabrooks/LOGION-base"
model = BertForTokenClassification.from_pretrained(model_name, num_labels=3)
tokenizer = BertTokenizer.from_pretrained(model_name)

# Prepare your datasets.
# Both files are opened for reading by encode_examples, so they must already
# exist at these paths (e.g. uploaded to the Colab runtime) before it is called.
train_path = "/content/iliad_train.txt"
valid_path = "/content/iliad_valid.txt"

# This function will tokenize the text and create the labels accordingly
def encode_examples(text_file, tokenizer, max_length=128):
    """Tokenize each line of *text_file* and build fixed-length label rows.

    Every non-blank line is expected to have the form ``[text], [labels]``.
    The text part is passed to *tokenizer*; the label part is read character
    by character and mapped with ``{'-': 0, 'u': 1, ' ': 2}`` (characters not
    in the mapping are dropped).

    Args:
        text_file: Path of the UTF-8 file to read.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            truncation=True, padding='max_length', return_tensors="pt")``.
        max_length: Length every tokenized input and every label row is
            padded or truncated to. Defaults to 128.

    Returns:
        A ``(tokenized_inputs, labels)`` tuple: a list with one tokenizer
        result per example, and a list of label-id lists, each exactly
        ``max_length`` long.

    Raises:
        ValueError: If a non-blank line does not contain exactly one
            ``'], ['`` separator.
    """
    # Built once instead of once per line; ' ' doubles as the padding label.
    label_mapping = {'-': 0, 'u': 1, ' ': 2}
    pad_label = label_mapping[' ']

    tokenized_inputs = []
    labels = []

    with open(text_file, "r", encoding="utf-8") as f:
        for line_number, raw_line in enumerate(f, start=1):
            line = raw_line.strip()
            if not line:
                # Blank lines (e.g. a trailing one) carry no example.
                continue

            parts = line.split('], [')
            if len(parts) != 2:
                raise ValueError(
                    f"{text_file}, line {line_number}: expected "
                    f"'[text], [labels]', got {line!r}"
                )
            text, label_str = parts
            text = text.replace('[', '').replace(']', '').strip()
            label_str = label_str.replace(']', '').strip()

            inputs = tokenizer(
                text,
                max_length=max_length,
                truncation=True,
                padding='max_length',
                return_tensors="pt",
            )

            # Truncate as well as pad: without the slice, a label string longer
            # than max_length would yield a row longer than the tokenized input.
            label_ids = [label_mapping[ch] for ch in label_str if ch in label_mapping]
            label_ids = label_ids[:max_length]
            # NOTE(review): labels are per character of the label string, while
            # the tokenizer output is presumably per word piece plus special
            # tokens; no alignment between the two is done here - confirm that
            # this is intended.
            labels.append(label_ids + [pad_label] * (max_length - len(label_ids)))
            tokenized_inputs.append(inputs)

    return tokenized_inputs, labels

# Encode both splits; each call returns (tokenized inputs, label-id rows).
train_tokenized, train_labels = encode_examples(train_path, tokenizer)
valid_tokenized, valid_labels = encode_examples(valid_path, tokenizer)

# Convert to torch dataset
class GreekMeterDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with label rows.

    *encodings* may be given in either of two layouts:

    * a mapping of feature name -> per-example values (anything exposing
      ``.items()``, such as a dict of lists), or
    * a sequence with one mapping per example, which is what
      ``encode_examples`` returns. In this layout a leading axis of size 1
      (as produced by ``return_tensors="pt"`` on a single text) is dropped so
      every feature is one-dimensional per example.

    *labels* is a sequence with one label row per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _to_tensor(value):
        """Return *value* as a new tensor without the copy-construct warning."""
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return a dict of feature tensors plus ``'labels'`` for example *idx*."""
        if hasattr(self.encodings, "items"):
            # Column layout: pick row idx out of every feature.
            item = {
                key: self._to_tensor(val[idx])
                for key, val in self.encodings.items()
            }
        else:
            # Per-example layout: the idx-th element is itself a mapping.
            item = {}
            for key, val in self.encodings[idx].items():
                tensor = self._to_tensor(val)
                if tensor.dim() > 1 and tensor.size(0) == 1:
                    # Drop the batch axis the tokenizer added for a single text.
                    tensor = tensor.squeeze(0)
                item[key] = tensor
        item['labels'] = self._to_tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of examples (one per label row)."""
        return len(self.labels)

# Wrap the encoded splits so they can be indexed example by example
train_dataset = GreekMeterDataset(train_tokenized, train_labels)
valid_dataset = GreekMeterDataset(valid_tokenized, valid_labels)

# Settings for the fine-tuning run, collected in one place
training_config = {
    'output_dir': './results',            # where checkpoints are written
    'num_train_epochs': 3,                # passes over the training set
    'per_device_train_batch_size': 16,    # training batch size per device
    'per_device_eval_batch_size': 64,     # evaluation batch size per device
    'warmup_steps': 500,                  # learning-rate warmup steps
    'weight_decay': 0.01,                 # weight-decay strength
    'logging_dir': './logs',              # where logs are stored
    'logging_steps': 10,                  # log every 10 steps
}
training_args = TrainingArguments(**training_config)

# Hand model, settings and both datasets to the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
)

# Run fine-tuning
trainer.train()

Collecting transformers
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m54.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.19.0-py3-none-any.whl (311 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.2/311.2 kB[0m [31m34.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m95.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m77.8 MB/s[0m eta [36m0:00:00[0m
Col

Downloading (…)lve/main/config.json:   0%|          | 0.00/755 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/452M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at cabrooks/LOGION-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading (…)okenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/530k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/800k [00:00<?, ?B/s]

FileNotFoundError: ignored

In [None]:
from transformers import BertTokenizer, BertForTokenClassification
import torch

# Tokenizer of the original pre-trained model (must be the one used for training)
model_name = "cabrooks/LOGION-base"
tokenizer = BertTokenizer.from_pretrained(model_name)

# Fine-tuned weights are read from the checkpoint directory (replace with yours)
checkpoint = "/content/results/checkpoint-2526"
model = BertForTokenClassification.from_pretrained(checkpoint)

# The verse to scan (replace with your text)
text_to_scan = "ὣς εἰπὼν σάκος εἷλε τετυγμένον υἷος ἑοῖο"

# Encode the verse as a batch of one
inputs = tokenizer(text_to_scan, return_tensors="pt", padding=True, truncation=True)

# Forward pass without gradient tracking
with torch.no_grad():
    outputs = model(**inputs)

# Highest-scoring class id at every token position
predictions = outputs.logits.argmax(dim=-1)

# Class ids back to metrical symbols (inverse of the training mapping)
label_mapping = {0: '-', 1: 'u', 2: ' '}
predicted_labels = [label_mapping[class_id] for class_id in predictions[0].tolist()]

# Token strings for the same positions, to pair with the labels
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

# Keep only (token, label) pairs for real tokens, dropping [CLS], [SEP], [PAD]
special_tokens = tokenizer.all_special_tokens
predictions_aligned = [
    pair for pair in zip(tokens, predicted_labels) if pair[0] not in special_tokens
]

# Concatenate the kept labels into the scanned meter
scanned_meter = ''.join(symbol for _, symbol in predictions_aligned)
print(scanned_meter)

 -- uu -u u-uu 


In [None]:
# Show each real token next to its predicted label, skipping special tokens
hidden_tokens = set(tokenizer.all_special_tokens)
for token, label in zip(tokens, predicted_labels):
    if token in hidden_tokens:
        continue
    print(f"{token}: {label}")

ως:  
ειπω: -
##ν: -
σακο:  
##ς: u
ει: u
##λε:  
τετ: -
##υ: u
##γμενο:  
##ν: u
υιος: -
ε: u
##οι: u
##ο:  


In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive.
# This will prompt for authorization to access your Google Drive
drive.mount('/content/drive')

# Copy the checkpoint folder (recursively) into the top level of MyDrive
!cp -r "/content/results/checkpoint-2526" "/content/drive/MyDrive/"

# Optionally, list the copied directory to confirm the copy
!echo "Copied Files:"
!ls "/content/drive/MyDrive/checkpoint-2526"

Mounted at /content/drive
Copied Files:
config.json	   optimizer.pt   scheduler.pt	      training_args.bin
model.safetensors  rng_state.pth  trainer_state.json
