<a href="https://colab.research.google.com/github/gglchrm/tarot_NN/blob/main/LLaMa_finetuning_tarotByDendory.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers
!pip install accelerate
!pip install torch torchdata transformers datasets loralib peft pandas numpy

In [None]:
# Restart session here

In [None]:
from transformers import LlamaTokenizer, LlamaForCausalLM
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
import pandas as pd
import torch

# Run configuration shared by the cells below.
model_name = "openlm-research/open_llama_3b_v2"  # Base model to use
training_file = "tarot_readings.csv.1"  # CSV file to use
num_epochs = 3  # Number of passes over the training data
num_rows = 500  # Number of rows to use for training
# Prefer the first CUDA GPU, but fall back to CPU so the notebook still runs
# on a runtime without one instead of failing at the first `.to(device)`.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
def fine_tune_model(model, optimizer, batch, device):
    """Run one optimisation step on a single batch and return its loss.

    Args:
        model: Model to train. It is called with ``input_ids``,
            ``attention_mask`` and ``labels`` and must return an object
            exposing ``.loss``.
        optimizer: Optimizer that updates the model parameters.
        batch: Mapping holding tensors under 'input_ids', 'attention_mask'
            and 'target_ids'; 'target_attention_mask' is read only for
            encoder-decoder models.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        The loss of this batch as a Python float.
    """
    model.train()

    forward_kwargs = {
        "input_ids": batch['input_ids'].to(device),
        "attention_mask": batch['attention_mask'].to(device),
        "labels": batch['target_ids'].to(device),
    }
    # NOTE(review): for a causal LM the labels are presumably expected to have
    # the same shape as input_ids — confirm against the dataset builder.

    # Only encoder-decoder models take a decoder_attention_mask. Decoder-only
    # models such as LlamaForCausalLM have no such forward argument, so it is
    # forwarded only when the model's config says it is encoder-decoder.
    config = getattr(model, "config", None)
    if getattr(config, "is_encoder_decoder", False):
        forward_kwargs["decoder_attention_mask"] = batch['target_attention_mask'].to(device)

    outputs = model(**forward_kwargs)
    loss = outputs.loss

    # Clear gradients left from the previous step before accumulating new ones.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    return loss.item()

In [None]:
def tacot_reading(model, tokenizer, card1, card2, card3):
    """Generate, print and return a tarot reading for three cards.

    Args:
        model: Generative model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        card1: Name of the first card.
        card2: Name of the second card.
        card3: Name of the third card.

    Returns:
        The decoded text of the first generated sequence.
    """
    prompt = "Give me a one paragraph tarot reading if I pull the cards {}, {} and {}.".format(card1, card2, card3)

    inputs = tokenizer(prompt, return_tensors="pt")
    # The prompt ids must live on the same device as the model weights;
    # otherwise generation fails once the model has been moved to a GPU.
    input_ids = inputs["input_ids"].to(model.device)
    output_ids = model.generate(input_ids, max_new_tokens=1000)
    completion = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    print("Prompt: {}".format(prompt))
    print("Response: {}".format(completion))
    print()

    return completion

In [None]:
# Load the tokenizer and the base-model weights, then move the model to the
# configured device. The names `model_path` and `DEVICE` are not defined in the
# visible cells; the configuration cell defines `model_name` and `device`.
print("* Loading model [{}]...".format(model_name))
tokenizer = LlamaTokenizer.from_pretrained(model_name)
model = LlamaForCausalLM.from_pretrained(model_name).to(device)

In [None]:
# Baseline: sample three readings before any fine-tuning cell has run, so they
# can be compared with the post-training readings further down.
print("* Running 3 inferences (pre-training)...")
tacot_reading(model, tokenizer, card1="The moon", card2="Two of Swords", card3="Three of Wands")
tacot_reading(model, tokenizer, card1="The hermit", card2="Ace of Pentacles", card3="Judgement")
tacot_reading(model, tokenizer, card1="Seven of Cups", card2="The chariot", card3="King of Swords")

In [None]:
# Build the training dataset from the CSV file and wrap it in a DataLoader.
# NOTE(review): `create_tarot_dataset` is not defined in the visible cells —
# confirm it is defined elsewhere before this cell runs. fine_tune_model indexes
# each batch by string keys such as 'input_ids' and 'target_ids'.
print("* Creating dataset from [{}]...".format(training_file))
dataset = create_tarot_dataset(training_file, tokenizer, num_rows)
# Batches of 16 samples, drawn in shuffled order.
data_loader = DataLoader(dataset, batch_size=16, shuffle=True)

In [None]:
# Manual fine-tuning loop: one optimizer step per batch, reporting the mean
# batch loss at the end of every epoch.
print("* Training model for {} epochs..".format(num_epochs))
optimizer = Adam(model.parameters(), lr=1e-4)
for epoch_number in range(1, num_epochs + 1):
    batch_losses = [
        fine_tune_model(model, optimizer, batch, device)
        for batch in data_loader
    ]
    mean_loss = sum(batch_losses) / len(data_loader)
    print("Epoch {} average loss: {}".format(epoch_number, mean_loss))

In [None]:
# Repeat the same three prompts after the training loop so the readings can be
# compared with the pre-training ones.
print("* Running 3 inferences (post-training)...")
tacot_reading(model, tokenizer, card1="The moon", card2="Two of Swords", card3="Three of Wands")
tacot_reading(model, tokenizer, card1="The hermit", card2="Ace of Pentacles", card3="Judgement")
tacot_reading(model, tokenizer, card1="Seven of Cups", card2="The chariot", card3="King of Swords")

In [None]:
from transformers import TextDataset, DataCollatorForLanguageModeling

# Path of the raw text used for Trainer-based fine-tuning. The previous value
# 'taror_reading.csv.1' did not match the training file name used in the
# configuration cell ("tarot_readings.csv.1") and looks like a typo.
train_path = 'tarot_readings.csv.1'

# Tokenize the file and cut it into fixed blocks of 64 tokens.
# NOTE(review): TextDataset is reported as deprecated in recent transformers
# releases — confirm against the installed version.
train_dataset = TextDataset(tokenizer=tokenizer, file_path=train_path, block_size=64)

# mlm=False selects causal (next-token) language modeling rather than masked LM.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [None]:
from transformers import Trainer, TrainingArguments

# Hyper-parameters for the Trainer-based run. With a per-device batch size of 1
# and 8 gradient-accumulation steps, each optimizer update covers 8 samples
# per device.
training_args = TrainingArguments(
    output_dir="./finetuned",
    overwrite_output_dir=True,
    num_train_epochs=8,
    warmup_steps=10,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,
)

# Explicit AdamW optimizer; the learning-rate scheduler slot is left as None.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
    optimizers=(torch.optim.AdamW(model.parameters(), lr=1e-5), None),
)

In [None]:
# Start the Trainer-based fine-tuning run using the `trainer` built in the previous cell.
trainer.train()

In [None]:
# Example of probabilistic sampling with constraints (top-k / top-p, beams).
# A disabled draft previously built the prompt from three card names and a
# question via str.format; its template string was empty, so the fixed prompt
# below is used instead.
text = "Карта Таро \"Маг\" означает следующее:"
# `DEVICE` is not defined in the visible cells; use the `device` set in the
# configuration cell so the prompt lands on the same device as the model.
input_ids = tokenizer.encode(text, return_tensors="pt").to(device)
model.eval()
with torch.no_grad():
    out = model.generate(input_ids,
                        do_sample=True,
                        num_beams=5,
                        temperature=1.1,
                        top_p=0.9,
                        top_k=10,
                        max_length=100,
                        )

# Only the first returned sequence is shown, so decode just that one.
generated_text = tokenizer.decode(out[0])
print()
print(generated_text)

NameError: name 'tokenizer' is not defined