In [1]:
# !pip install transformers
# !pip install datasets
# !pip install trl
# !pip install peft
# !pip install matplotlib
# !pip install accelerate
# INSTALL TORCH MANUALLY
# https://pytorch.org/get-started/locally/

In [2]:
def read_txt(file_path):
    with open(file_path, "r") as file:
        text = file.read()
    return text


def write_file(file_path, data):
    with open(file_path, "w") as f:
        for datum in data:
            f.write(datum)
            f.write("\n")

In [3]:
# import torch
# from transformers import AutoModelForCausalLM, AutoTokenizer

# torch.set_default_device("cuda")
# model = AutoModelForCausalLM.from_pretrained("microsoft/phi-1_5", trust_remote_code=True, torch_dtype="auto")
# tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1_5", trust_remote_code=True, torch_dtype="auto")
# inputs = tokenizer(
#     "Write a detailed analogy between mathematics and a lighthouse.", return_tensors="pt", return_attention_mask=False
# )

# outputs = model.generate(**inputs, max_length=200)
# text = tokenizer.batch_decode(outputs)[0]
# print(text)

In [4]:
from transformers import TextDataset, DataCollatorForLanguageModeling


def load_dataset(file_path, tokenizer, block_size=128):
    dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size,
    )
    return dataset


def load_data_collator(tokenizer, mlm=False):
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=mlm,
    )
    return data_collator

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
import re

text_data_1 = read_txt("./lotr/the-fellowship-of-the-ring.txt")
text_data_2 = read_txt("./lotr/the-two-towers.txt")
text_data_3 = read_txt("./lotr/the-return-of-the-king.txt")

regex_pattern = r"\b[^.!?]+[.!?]+"
sentence_data_1 = re.findall(regex_pattern, text_data_1)
sentence_data_2 = re.findall(regex_pattern, text_data_2)
sentence_data_3 = re.findall(regex_pattern, text_data_3)
sentence_data = sentence_data_1 + sentence_data_2 + sentence_data_3

longest_sentence = 0
min_words_in_sentence = 0  # 10
max_string_sentence_len = 1024  # 512

clean_data = list()
for sentence in sentence_data:
    word_list = sentence.split()
    clean_sentence = " ".join(word_list)

    if len(word_list) > min_words_in_sentence:
        if len(clean_sentence) < max_string_sentence_len:
            clean_data.append(clean_sentence)

    if len(clean_sentence) > longest_sentence:
        longest_sentence = len(clean_sentence)

write_file("./lotr/all.txt", clean_data)

print("FIRST INDEX")
print(clean_data[0])
print("SECOND INDEX")
print(clean_data[1])
print("HOW MANY SENTENCES")
print(len(clean_data))
print("LONGEST SENTENCE")
print(longest_sentence)

FIRST INDEX
Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone, Nine for Mortal Men doomed to die, One for the Dark Lord on his dark throne In the Land of Mordor where the Shadows lie.
SECOND INDEX
One Ring to rule them all, One Ring to find them, One Ring to bring them all and in the darkness bind them In the Land of Mordor where the Shadows lie.
HOW MANY SENTENCES
35506
LONGEST SENTENCE
631


In [6]:
from transformers import (
    Trainer,
    TrainingArguments,
    TextDataset,
    DataCollatorForLanguageModeling,
)
from transformers import AdamW
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import Trainer, TrainingArguments
from datasets import Dataset, concatenate_datasets, load_dataset
import torch
import torch.nn.functional as F
from datasets import load_dataset
from accelerate import Accelerator
import os
import tempfile
import gc
import json

# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"
# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "garbage_collection_threshold:0.6,max_split_size_mb:128"
# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "backend:cudaMallocAsync"
# os.environ["PYTORCH_NO_CUDA_MEMORY_CACHING"] = "1"
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

torch.cuda.empty_cache()
gc.collect()

device = "cuda" if torch.cuda.is_available() else "cpu"
print(torch.cuda.get_device_name())

accelerator = Accelerator()
device = accelerator.device

# model_name = "microsoft/phi-1_5"
model_name = "bert-base-cased"

NVIDIA GeForce RTX 3070


In [7]:
# dataset = load_dataset(
#     "text",
#     data_files={
#         "train": [
#             "./lotr/all.txt",
#             # "./lotr/the-fellowship-of-the-ring.txt",
#             # "./lotr/the-return-of-the-king.txt",
#             # "./lotr/the-two-towers.txt",
#         ]
#     },
# )

# encoded_dataset = dataset.map(
#     lambda x: tokenizer.encode_plus(
#         x["text"],
#         add_special_tokens=True,
#         max_length=1024,
#         padding="max_length",
#         truncation=True,
#     ),
#     batched=True,
# )

In [8]:
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, torch_dtype="auto", device_map="auto")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_special_tokens({"pad_token": "[PAD]"})


def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

Using eos_token, but it is not set yet.


In [9]:
train_dataset = load_dataset("text", data_files={"train": ["./lotr/all.txt"]})

tokenized_train_dataset = train_dataset.map(lambda examples: tokenize_function(examples))
display(tokenized_train_dataset)

# data = torch.utils.data.DataLoader(train_dataset, shuffle=True)
# display(data)

Downloading and preparing dataset text/default to C:/Users/D4nk/.cache/huggingface/datasets/text/default-2d8d0ef27730de4f/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2...


Downloading data files: 100%|██████████| 1/1 [00:00<?, ?it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 166.01it/s]
                                                        

Dataset text downloaded and prepared to C:/Users/D4nk/.cache/huggingface/datasets/text/default-2d8d0ef27730de4f/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2. Subsequent calls will reuse this data.


100%|██████████| 1/1 [00:00<00:00, 99.98it/s]
                                                                   

DatasetDict({
    train: Dataset({
        features: ['text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 35506
    })
})

In [11]:
model = AutoModelForCausalLM.from_pretrained(
    model_name, trust_remote_code=True, torch_dtype="auto"
)  # , device_map="auto"
model.to(device)

If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertLMHeadModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertLMHeadModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertLMHeadModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertLMHeadModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_a

In [12]:
### NOT USED
# data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
# train_loader = DataLoader(encoded_dataset["train"], batch_size=8, shuffle=True)
# eval_loader = DataLoader(encoded_dataset["test"], batch_size=8)
# optimizer = AdamW(model.parameters(), lr=5e-5)
###

### https://github.com/huggingface/accelerate/tree/main
# model = torch.nn.Transformer()
# optimizer = torch.optim.Adam(model.parameters())

# dataset = load_dataset("my_dataset")
# data = torch.utils.data.DataLoader(tokenized_train_dataset["train"], shuffle=False)

# model, optimizer, data = accelerator.prepare(model, optimizer, data)
###

args = TrainingArguments(
    output_dir="./output",
    # num_train_epochs=3,
    # per_device_train_batch_size=8,
    # per_device_eval_batch_size=8,
    # warmup_steps=500,
    # weight_decay=0.01,
    # logging_dir="./logs",
    # logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_train_dataset["train"],
    # eval_dataset=dataset["test"],
    # optimizers=optimizer
    # data_collator=data_collator,
    # tokenizer=tokenizer,
)

trainer.train()

# model.train()
# for epoch in range(10):
#     index = 0
#     for datum in data:
#         display(datum)
#         # source, targets = datum
#         # source = source.to(device)
#         # targets = targets.to(device)
#         optimizer.zero_grad()
#         # output = model(source)
#         # loss = F.cross_entropy(output, targets)
#         # datum["input_ids"][index]
#         output = model(datum["text"])
#         loss = F.cross_entropy(output, datum["input_ids"][index])
#         accelerator.backward(loss)
#         optimizer.step()

  0%|          | 0/13317 [00:00<?, ?it/s]

ValueError: The model did not return a loss from the inputs, only the following keys: logits. For reference, the inputs it received are input_ids,token_type_ids,attention_mask.

In [None]:
# model_checkpoint = "bert-base-uncased"

# tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, truncation=True, padding='max_length', return_special_tokens_mask=True)

# text_df = pd.DataFrame({'Text':text})

# # set up train and eval dataset
# train_size=0.8
# train_dataset = text_df.sample(frac=train_size,random_state=200)
# test_dataset = text_df.drop(train_dataset.index).reset_index(drop=True)
# train_dataset = train_dataset.reset_index(drop=True)

# print("defined training and test set")
# def tokenize(text_df, tokenizer):
#     tokenized_inputs = tokenizer(text_df["Text"], is_split_into_words=False, padding='max_length',
#                                  truncation=True,
#                                  return_special_tokens_mask=True)# , return_tensors="pt").to(device) #commented out bc gives errors
#     return tokenized_inputs

# train_data = Dataset.from_pandas(train_dataset).map(tokenize,
#     fn_kwargs={'tokenizer':tokenizer},
#     remove_columns=['Text'])
# #train_data.set_format("torch")
# test_data = Dataset.from_pandas(test_dataset).map(tokenize,
#     fn_kwargs={'tokenizer':tokenizer},
#     remove_columns=['Text'])
# #test_data.set_format("torch")
# print("tokenized data")

# test_labels = Dataset.from_pandas(pd.DataFrame({'labels':test_data['input_ids'].copy()}))
# train_labels = Dataset.from_pandas(pd.DataFrame({'labels':train_data['input_ids'].copy()}))
# test_data = concatenate_datasets([test_data, test_labels], axis=1)
# train_data = concatenate_datasets([train_data, train_labels], axis=1)


# #initiating model
# model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
# model.to(device)
# data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer)
# args = TrainingArguments(
#     save_path),
#     evaluation_strategy="steps",
#     save_strategy="epoch",
#     learning_rate=1e-3,
#     num_train_epochs=1,
#     weight_decay=0.01,
#     push_to_hub=False,
#     per_device_train_batch_size = 8,#256,
#     per_device_eval_batch_size = 8,#256,
#     logging_steps=50,
#     eval_steps = 50,
#     save_total_limit = 3, #saves only last 3 checkpoints
#     gradient_accumulation_steps=32,#64,
#     gradient_checkpointing=True,
#     fp16=True,
#     optim="adafactor"
# )

# trainer = Trainer(
#     model=model,
#     args=args,
#     train_dataset=train_data,
#     eval_dataset=test_data,
#     data_collator=data_collator,
#     tokenizer=tokenizer,
# )

# train_result = trainer.train()

In [None]:
# train_file_path = "./lotr/the-fellowship-of-the-ring.txt"
# model_name = "microsoft/phi-1_5"
# output_dir = "./output"
# overwrite_output_dir = True
# per_device_train_batch_size = 8
# num_train_epochs = 50.0
# save_steps = 50000

# train(
#     train_file_path=train_file_path,
#     model_name=model_name,
#     output_dir=output_dir,
#     overwrite_output_dir=overwrite_output_dir,
#     per_device_train_batch_size=per_device_train_batch_size,
#     num_train_epochs=num_train_epochs,
#     save_steps=save_steps,
# )

In [None]:
# train_file_path = "./lotr/the-fellowship-of-the-ring.txt"
# model_name = "microsoft/phi-1_5"
# output_dir = "./output"
# overwrite_output_dir = True
# per_device_train_batch_size = 8
# num_train_epochs = 50.0
# save_steps = 50000

# train(
#     train_file_path=train_file_path,
#     model_name=model_name,
#     output_dir=output_dir,
#     overwrite_output_dir=overwrite_output_dir,
#     per_device_train_batch_size=per_device_train_batch_size,
#     num_train_epochs=num_train_epochs,
#     save_steps=save_steps,
# )

In [None]:
# dataset = load_dataset(
#     "text",
#     data_files={
#         "train": [
#             "./lotr/the-fellowship-of-the-ring.txt",
#             "./lotr/the-return-of-the-king.txt",
#             "./lotr/the-two-towers.txt",
#         ]
#     },
# )


# def load_dataset(file_path, tokenizer, block_size=128):
#     dataset = TextDataset(
#         tokenizer=tokenizer,
#         file_path=file_path,
#         block_size=block_size,
#     )
#     return dataset


# print(f"Train dataset size: {len(dataset['train'])}")

# tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
# tokenized_datasets = dataset.map(tokenize_function, batched=True)
# small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
# small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))

In [None]:
# inputs = tokenizer(
#     "write me a lord of the rings style short story",
#     return_tensors="pt",
#     return_attention_mask=False,
# )
# outputs = model.generate(**inputs, max_length=200)
# text = tokenizer.batch_decode(outputs)[0]
# print(text)

In [None]:
# from peft import (
#     get_peft_config,
#     get_peft_model,
#     PromptTuningInit,
#     PromptTuningConfig,
#     TaskType,
#     PeftType,
#     PromptEncoderConfig,
#     PrefixTuningConfig,
#     LoraConfig,
#     PeftConfig,
# )

# lora_peft_config = LoraConfig()

# dynamic_padding = True


# def tokenize_func(examples):
#     return tokenizer(
#         examples["input"], truncation=True, max_length=1044
#     )  # max_length=512,  padding=True


# train_dataset_final = train_dataset.map(tokenize_func, batched=True)

In [None]:
# tokenizer.pad_token = tokenizer.eos_token
# model.resize_token_embeddings(len(tokenizer))
# model = prepare_model_for_int8_training(model)
# model = get_peft_model(model, lora_peft_config)
# training_args = model_training_args
# trainer = SFTTrainer(
#     model=model,
#     train_dataset=train_dataset_final,
#     dataset_text_field="text",
#     max_seq_length=1044,
#     tokenizer=tokenizer,
#     args=model_training_args,
#     packing=True,
#     peft_config=lora_peft_config,
# )
# trainer.train()