In [None]:
import torch
from transformers import BertTokenizer, BertForMaskedLM, DataCollatorForLanguageModeling
from datasets import load_dataset
from transformers import Trainer, TrainingArguments
import accelerate

In [None]:
# `tf` was referenced here without ever being imported, which raised a
# NameError on a fresh kernel. Import TensorFlow locally and tolerate its
# absence so this purely informational cell cannot break "Run All".
try:
    import tensorflow as tf
except ImportError:
    tf = None

# Report the GPU only when TensorFlow is importable and actually lists one.
tf_device = '/GPU:0' if tf is not None and tf.config.list_physical_devices('GPU') else '/CPU:0'
print(tf_device)

In [None]:
# Select the PyTorch device: CUDA when available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
# Asking for the name of device 0 raises on hosts without CUDA, which defeated
# the CPU fallback above. Only query it when a CUDA device was selected; the
# expression stays last in the cell so the notebook still displays the result.
torch.cuda.get_device_name(0) if device.type == "cuda" else "cpu (no CUDA device visible)"

In [None]:
# Tokenizer matching the German cased BERT checkpoint used for pre-training.
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path="bert-base-german-cased",
)

In [None]:
def load_and_prepare_dataset(dataset_path, text_tokenizer=None, max_length=512):
    """Load a plain-text corpus and tokenize it for masked-LM training.

    Args:
        dataset_path: Value passed to `load_dataset` as the "train" entry of
            `data_files` (the text file(s) to read).
        text_tokenizer: Callable tokenizer applied to each batch of text.
            When omitted, the notebook-level `tokenizer` is used, which is
            what this function always did before the parameter existed.
        max_length: Length every example is truncated and padded to.
            Defaults to 512, the value that used to be hard-coded.

    Returns:
        Whatever `dataset.map(...)` returns for the loaded dataset after the
        tokenizer has been applied in batched mode.
    """
    # Resolve the tokenizer explicitly so the dependency on notebook state is
    # visible and can be overridden by the caller.
    active_tokenizer = tokenizer if text_tokenizer is None else text_tokenizer

    # The 'text' loader reads the file(s) into a dataset with a "text" column.
    dataset = load_dataset('text', data_files={'train': dataset_path})

    def tokenize_function(examples):
        # The special-tokens mask is requested so the MLM collator can use it.
        return active_tokenizer(
            examples["text"],
            return_special_tokens_mask=True,
            truncation=True,
            padding="max_length",
            max_length=max_length,
        )

    return dataset.map(tokenize_function, batched=True)

In [None]:
# Relative path of the cleaned plain-text corpus.
dataset_path = '../dataset/clean_text_data.txt'

# Load and tokenize the corpus; the "train" split is what the Trainer consumes.
tokenized_datasets = load_and_prepare_dataset(dataset_path=dataset_path)

In [None]:
# Collator for masked-language-model training: masking is applied when each
# batch is assembled, with a masking probability of 0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

In [None]:
# Start from the pretrained German cased BERT checkpoint with a masked-LM head.
model = BertForMaskedLM.from_pretrained(
    pretrained_model_name_or_path="bert-base-german-cased",
)
# Place the model on the selected device. Kept as the cell's last expression,
# so the notebook also displays the returned module.
model.to(device)

In [None]:
# Training schedule and checkpointing policy for the pre-training run.
training_args = TrainingArguments(
    # Checkpoint location; existing contents of the directory may be overwritten.
    output_dir="./bert-german-checkpoints",
    overwrite_output_dir=True,
    # Write a checkpoint every 10k steps and cap how many are kept on disk.
    save_steps=10_000,
    save_total_limit=2,
    # Schedule: three passes over the data, 16 examples per device per step.
    num_train_epochs=3,
    per_device_train_batch_size=16,
    # Evaluation/prediction should report the loss only.
    prediction_loss_only=True,
)

In [None]:
# Wire settings, model and data together. Only a training split is passed;
# no evaluation dataset is configured in this cell.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=tokenized_datasets["train"],
    data_collator=data_collator,
)

In [None]:
# Run the training loop with the arguments, data and collator given to `trainer`.
# As the last expression of the cell, the return value is displayed by the notebook.
trainer.train()

In [None]:
# Save the trained model to a local directory.
# NOTE(review): only `model` is saved here; the tokenizer is not written to
# this directory — confirm that is intended.
model.save_pretrained("./bert-german-science")

In [None]:
# Import only the class this cell needs. The previous wildcard import
# (`from transformers import *`) flooded the notebook namespace and hid where
# names came from.
from transformers import TFBertForSequenceClassification

# Load the sequence-classification model from a local directory.
# NOTE(review): this rebinds `model`, which until here referred to the
# BertForMaskedLM instance; a later cell rebinds `model` again — confirm which
# object is meant to be uploaded.
model = TFBertForSequenceClassification.from_pretrained('../models/G-SciEdBERT_model_combined')

In [None]:
# Shell escape: log in to the Hugging Face Hub from the notebook.
# NOTE(review): presumably required before the `push_to_hub` calls in the
# following cells; the login is interactive — confirm it works in this environment.
!huggingface-cli login

In [None]:
# Reload a masked-LM model from a local directory; this replaces whatever
# `model` referred to before.
# NOTE(review): the path differs from "./bert-german-science", where the
# trained model was saved earlier in this notebook — confirm this is the
# intended checkpoint.
model = BertForMaskedLM.from_pretrained('../models/G-SciEdBert')

In [None]:
# Upload the current `tokenizer` object to the Hub repository.
# NOTE(review): when run top to bottom, `tokenizer` here is still the instance
# created near the top of the notebook; the cell that re-creates it with
# `do_lower_case=True` comes after this push — confirm which one should be uploaded.
tokenizer.push_to_hub("ai4stem-uga/G-SciEdBERT-scoring")

In [None]:
# Re-create the tokenizer from the same checkpoint, this time with lowercasing enabled.
# NOTE(review): `do_lower_case=True` is combined with a *cased* checkpoint and
# differs from the tokenizer used for training earlier in this notebook —
# confirm this is intentional.
tokenizer = BertTokenizer.from_pretrained('bert-base-german-cased', do_lower_case=True)

In [None]:
# Upload the current `model` object to the same Hub repository the tokenizer
# was pushed to.
# NOTE(review): `model` is a BertForMaskedLM at this point while the repository
# name ends in "-scoring" — confirm the intended model is being pushed.
model.push_to_hub("ai4stem-uga/G-SciEdBERT-scoring")