In [1]:
print("hi")

hi


In [2]:
import os
import glob
from tqdm import tqdm
import re
import math

In [3]:
class CFG:
    DEBUG = False
    
    PDFS_FOLDER = '/kaggle/input/harry-potter-books-in-pdf-1-7/HP books/'
    PDFS_PATHS = glob.glob(PDFS_FOLDER + '*.pdf')[0] if DEBUG else glob.glob(PDFS_FOLDER + '*.pdf')
    
    ### Text dataframe
    MIN_LENGTH = 400 # min number of characters per chunk
    MAX_LENGTH = 800 # max number of characters per chunk
    
    ### model
    MODEL_NAME = 'bert-base-uncased' # 'roberta-base'
    
    ### split
    VALID_SIZE = 0.2
    SEED = 88    
    
    ### training
    LR = 2e-5
    EPOCHS = 2 if DEBUG else 3
    BATCH_SIZE = 16
    
    ### path to save tokenizer files
    TOKENIZER_SAVE_PATH = './hp-tokenizer-'
    
    ### path to save model files
    MODEL_SAVE_PATH = './hp-model-'
    
    ### path to save txt files
    TXT_SAVE_PATH = './text-files/'

In [4]:
model_card = 'bert-base-uncased'

# Domain-pre-training corpora
dpt_corpus_train = 'data_prep/pubmed_abstracts_1.txt'
dpt_corpus_train_data_selected = 'data/dapt_train_data_selected.txt'  # Optional: If you want to use data selection for DAPT
dpt_corpus_val = 'data/pubmed_abstracts_2.txt'  # Optional: If you want to use a validation set for DAPT

# Fine-tuning corpora
# If there are multiple downstream NLP tasks/corpora, you can concatenate those files together
ft_corpus_train = 'data/ft_train.txt'

# how should the ft_train.txt look like? ans = "Each line in the ft_train.txt file should represent a single training example for the fine-tuning task. The format of each line will depend on the specific NLP task you are working on. Here are some common formats for different tasks:
# 1. Text Classification:
#    Each line contains the text followed by a tab and then the label.
#    Example:
#    ```
#    This is a positive review.    positive
#    This is a negative review.    negative
#    ```
# should it be a csv or txt?
# ans = "The ft_train.txt file can be either a plain text file (.txt) or a CSV file (.csv), depending on your preference and the tools you are using. If you choose to use a CSV file, make sure to properly format it with commas separating the fields and include headers if necessary. For example, a CSV format for text classification might look like this:
# ```
#     text,label
#     This is a positive review.,positive
#     This is a negative review.,negative
#     ```

In [5]:
from transformers import (
    AutoModelForMaskedLM, AutoTokenizer, pipeline,
    LineByLineTextDataset, DataCollatorForLanguageModeling,
    Trainer, TrainingArguments
)

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
from transformers import AutoModelForMaskedLM, AutoTokenizer


In [7]:

model = AutoModelForMaskedLM.from_pretrained(model_card) #


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
model.config

BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "dtype": "float32",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.57.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [9]:
tokenizer = AutoTokenizer.from_pretrained(CFG.MODEL_NAME) # load tokenizer

model.config.save_pretrained(f'{CFG.TOKENIZER_SAVE_PATH}{CFG.MODEL_NAME}') # save tokenizer config
tokenizer.save_pretrained(f'{CFG.TOKENIZER_SAVE_PATH}{CFG.MODEL_NAME}'); # save tokenizer

glob.glob(f'{CFG.TOKENIZER_SAVE_PATH}{CFG.MODEL_NAME}/*')

['./hp-tokenizer-bert-base-uncased\\config.json',
 './hp-tokenizer-bert-base-uncased\\special_tokens_map.json',
 './hp-tokenizer-bert-base-uncased\\tokenizer.json',
 './hp-tokenizer-bert-base-uncased\\tokenizer_config.json',
 './hp-tokenizer-bert-base-uncased\\vocab.txt']

In [10]:
from transformers import LineByLineTextDataset


In [11]:
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from transformers import pipeline
from transformers import LineByLineTextDataset

In [12]:
from transformers import (
    AutoModelForMaskedLM, AutoTokenizer, pipeline,
    LineByLineTextDataset, DataCollatorForLanguageModeling,
    Trainer, TrainingArguments
)

In [13]:
train_dataset = LineByLineTextDataset(
    tokenizer = tokenizer,
    file_path = "data_prep/pubmed_abstracts_1.txt", # train text file here
    block_size = 256
)

valid_dataset = LineByLineTextDataset(
    tokenizer = tokenizer,
    file_path = "data_prep/pubmed_abstracts_2.txt", # valid text file here
    block_size = 256
)



In [14]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [15]:
training_args = TrainingArguments(
    output_dir = f"{CFG.MODEL_SAVE_PATH}{CFG.MODEL_NAME}-ckpts-trainer", # path to save checkpoint files during training
    overwrite_output_dir = True,
    learning_rate = CFG.LR,
    num_train_epochs = CFG.EPOCHS,
    per_device_train_batch_size = CFG.BATCH_SIZE,
    per_device_eval_batch_size = CFG.BATCH_SIZE,
    eval_strategy = 'steps',
    save_total_limit = 1,
    eval_steps = 10 if CFG.DEBUG else 100,
    metric_for_best_model = 'eval_loss',
    greater_is_better = False,
    load_best_model_at_end = True,
    prediction_loss_only = True,
    report_to = "none"
)

trainer = Trainer(
    model = model,
    args = training_args,
    data_collator = data_collator,
    train_dataset = train_dataset,
    eval_dataset = valid_dataset
)

In [16]:
import transformers
print(transformers.__file__)
print(transformers.__version__) #4.57.0


k:\ner\nerenv_py311\Lib\site-packages\transformers\__init__.py
4.57.0


In [17]:
train_results = trainer.train()
trainer.save_model(f'{CFG.MODEL_SAVE_PATH}{CFG.MODEL_NAME}-best') # path to save checkpoint files after training is done



Step,Training Loss,Validation Loss
100,No log,1.768546


KeyboardInterrupt: 