In [1]:
import torch
print(torch.cuda.is_available())  # Should return True if a GPU is available
print(torch.cuda.device_count())  # Number of available GPUs
print(torch.cuda.current_device())  # Index of the currently active GPU (usually 0)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
torch.cuda.empty_cache()

True
1
0
Using device: cuda


In [18]:
import pandas as pd
from datasets import Dataset
import pandas as pd
import re

# Specify the path to your JSON file
json_file_path = 'aquinas.json'

# Load the JSON file into a DataFrame
df = pd.read_json(json_file_path)

# Consolidate the article body
df["articleBody_Concat"] = df["articleBody"].apply(lambda x: x[-1] if len(x) > 0 else x)

# Clean up the question
df["articleTitle_Clean"] = df["articleTitle"].apply(lambda x: re.sub(r'Article\s\d+\.\s', '', x))

# Filter to relevant columns -- article Q and body A
df = df[["articleTitle_Clean", "articleBody_Concat"]].rename(columns = {"articleTitle_Clean":"question",
                                                                        "articleBody_Concat":"response"})

# Display the DataFrame
data = df.to_dict(orient='records')
data = [d for d in data if isinstance(d['response'], str) and len(d['response']) > 0]
print(data[0])

{'question': 'Whether, besides philosophy, any further doctrine is required?', 'response': 'I answer that, It was necessary for man\'s salvation that there should be a knowledge revealed by God besides philosophical science built up by human reason. Firstly, indeed, because man is directed to God, as to an end that surpasses the grasp of his reason: "The eye hath not seen, O God, besides Thee, what things Thou hast prepared for them that wait for Thee" (Isaiah 64:4). But the end must first be known by men who are to direct their thoughts and actions to the end. Hence it was necessary for the salvation of man that certain truths which exceed human reason should be made known to him by divine revelation. Even as regards those truths about God which human reason could have discovered, it was necessary that man should be taught by a divine revelation; because the truth about God such as reason could discover, would only be known by a few, and that after a long time, and with the admixture 

In [19]:
import torch
from torch.utils.data import Dataset

# Tokenize the data
def tokenize_function(examples):
    inputs = tokenizer(examples['question'], truncation=True, padding='max_length', max_length=512)
    outputs = tokenizer(examples['response'], truncation=True, padding='max_length', max_length=512)
    inputs['labels'] = outputs['input_ids']
    return inputs

from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForSeq2SeqLM

torch.cuda.empty_cache()
model_name = 'facebook/bart-base'
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)


tokenized_data = [tokenize_function(pair) for pair in data]

# Custom dataset class
class QADataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val) for key, val in self.encodings[idx].items()}
        return item

dataset = QADataset(tokenized_data)
dataset[0]



{'input_ids': tensor([    0, 18259,     6, 12035, 10561,     6,   143,   617, 26944,    16,
          1552,   116,     2,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,   

In [24]:
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=8,
    save_steps=10_000,
    save_total_limit=2,
    logging_dir='./logs',
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
)

# Train the model
trainer.train()


Step,Training Loss


KeyboardInterrupt: 

In [26]:
# Encode input text
model.to('cpu')

# Input text for translation
input_text = "Is the soul composed of the intellect and will?"

# Tokenize the input text
inputs = tokenizer(input_text, truncation=True, padding='max_length', max_length=512, return_tensors='pt')

# Generate translation
outputs = model.generate(**inputs, early_stopping=True, max_length=200)

# Decode the output text
translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(translated_text)



I answer that, As stated above (II-II:84:1), the soul is composed of the intellect and will.


In [27]:
output_model = 'aquinas-bart'
model.save_pretrained("./aquinas-bart")
tokenizer.save_pretrained("./aquinas-bart")


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


('./aquinas-bart\\tokenizer_config.json',
 './aquinas-bart\\special_tokens_map.json',
 './aquinas-bart\\vocab.json',
 './aquinas-bart\\merges.txt',
 './aquinas-bart\\added_tokens.json',
 './aquinas-bart\\tokenizer.json')

In [28]:
from huggingface_hub import notebook_login
import os

# Use this token: os.environ["HF_TOKEN"]
# on here at the CLI: huggingface-cli login

model.push_to_hub("bmconrad/aquinas-bart")
tokenizer.push_to_hub("bmconrad/aquinas-bart")

README.md:   0%|          | 0.00/177 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/177 [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/bmconrad/aquinas-bart/commit/cb5a13160de4760b57a587cdba6c906122cac4b8', commit_message='Upload tokenizer', commit_description='', oid='cb5a13160de4760b57a587cdba6c906122cac4b8', pr_url=None, pr_revision=None, pr_num=None)

In [12]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("bmconrad/aquinas-t5")
model = AutoModelForCausalLM.from_pretrained("bmconrad/aquinas-t5")

tokenizer_config.json:   0%|          | 0.00/496 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/946 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [15]:
# Define your prompt
prompt = "that righteousness might not be of the"

# Tokenize the input prompt
inputs = tokenizer(prompt, return_tensors="pt")

# Generate the text
outputs = model.generate(
    inputs.input_ids,
    max_length=20,            # Maximum length of generated text
    num_return_sequences=3,    # Number of sequences to generate
    no_repeat_ngram_size=2,    # Prevent repeating n-grams
    top_k=50,                  # Number of highest probability vocabulary tokens to keep for top-k-filtering
    top_p=0.95,                # If set to float < 1, only the most probable tokens with probabilities that add up to top_p are kept for generation
    temperature=0.7,           # The temperature of the sampling distribution
    do_sample=True             # Sampling or greedy decoding
)

for row in outputs:
    # Decode and print the generated text
    generated_text = tokenizer.decode(row, skip_special_tokens=True)
    print(generated_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


that righteousness might not be of the flesh, but of spirit: and righteousness by faith of God through
that righteousness might not be of the wicked: but he that believeth in his righteousness shall be saved
that righteousness might not be of the least degree, which is given unto a perfect man, but of
