In [1]:
from fine_tune import training_data
training_data

{'review': 'The ambiance was wonderful and the service was top-notch, but the pasta was unfortunately overcooked and bland. The dessert, however, a chocolate lava cake, was divine.', 'summary': 'Good service and dessert, but the main course was disappointing.'}


[{'review': 'The ambiance was wonderful and the service was top-notch, but the pasta was unfortunately overcooked and bland. The dessert, however, a chocolate lava cake, was divine.',
  'summary': 'Good service and dessert, but the main course was disappointing.'},
 {'review': 'I waited for 45 minutes just to be seated, and another hour for my food. The steak was tough and the fries were cold. I will not be coming back.',
  'summary': 'Long wait times and poor food quality led to a bad experience.'},
 {'review': 'Absolutely spectacular! From the moment we walked in, we were treated like royalty. Every dish was a masterpiece of flavor and presentation. A truly unforgettable night.',
  'summary': 'The restaurant offered an exceptional experience with outstanding food.'}]

In [2]:
from transformers import AutoTokenizer

# --- Add this new code below your existing training_data list ---

# 1. Load the tokenizer for our chosen model
model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# A tokenizer needs a padding token to make all sequences the same length.
# We'll use the "end of sentence" token as our padding token.
tokenizer.pad_token = tokenizer.eos_token

# 2. Format the data into a single string for each example
# We're creating a temporary list to hold our formatted text
formatted_texts = []
for item in training_data:
    text = f"Review: {item['review']}\nSummary: {item['summary']}"
    formatted_texts.append(text)

# 3. Tokenize the formatted text
# This converts our list of strings into a dictionary of numbers
tokenized_data = tokenizer(formatted_texts, padding=True, truncation=True, max_length=256)

# Let's inspect the output for our first review
print("\n--- Tokenizer Output ---")
print("These are the token IDs for the first review:")
print(tokenized_data['input_ids'][0])
print(tokenized_data['input_ids'][1])

  from .autonotebook import tqdm as notebook_tqdm



--- Tokenizer Output ---
These are the token IDs for the first review:
[14832, 25, 383, 4915, 3610, 373, 7932, 290, 262, 2139, 373, 1353, 12, 1662, 354, 11, 475, 262, 26296, 373, 12716, 25676, 46288, 290, 34377, 13, 383, 23084, 11, 2158, 11, 257, 11311, 28856, 12187, 11, 373, 11871, 13, 198, 22093, 25, 4599, 2139, 290, 23084, 11, 475, 262, 1388, 1781, 373, 17185, 13]
[14832, 25, 314, 13488, 329, 4153, 2431, 655, 284, 307, 21639, 11, 290, 1194, 1711, 329, 616, 2057, 13, 383, 26320, 373, 5802, 290, 262, 31757, 547, 4692, 13, 314, 481, 407, 307, 2406, 736, 13, 198, 22093, 25, 5882, 4043, 1661, 290, 3595, 2057, 3081, 2957, 284, 257, 2089, 1998, 13, 50256, 50256]


In [3]:
from transformers import AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import Dataset

# --- Add this new code below the tokenizer section ---

# 4. Prepare the data for the Trainer
# The Trainer API expects a Hugging Face Dataset object.
# First, we need to structure our tokenized data correctly.
input_data = []
for i in range(len(formatted_texts)):
    input_data.append({'input_ids': tokenized_data['input_ids'][i], 'attention_mask': tokenized_data['attention_mask'][i]})

# Create the Dataset object
train_dataset = Dataset.from_list(input_data)

# The model needs "labels" to calculate the loss. For language modeling,
# the labels are typically just the input_ids themselves.
def add_labels(examples):
    examples["labels"] = examples["input_ids"][:]
    return examples

train_dataset = train_dataset.map(add_labels)


# 5. Load the Model and Configure Training
model = AutoModelForCausalLM.from_pretrained(model_name)

# Define the training arguments. These are like the settings for our training run.
training_args = TrainingArguments(
    output_dir="./restaurant_summarizer", # Where to save the model
    num_train_epochs=3,                  # How many times to go through the data
    per_device_train_batch_size=1,       # How many examples to process at once
    logging_steps=1,                     # How often to log the training loss
)

# Create the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

# 6. Start Fine-Tuning!
print("\n--- Starting Fine-Tuning ---")
trainer.train()
print("--- Fine-Tuning Complete ---")

Map:   0%|          | 0/3 [00:00<?, ? examples/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Map: 100%|██████████| 3/3 [00:00<00:00, 364.30 examples/s]



--- Starting Fine-Tuning ---


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
1,3.8888
2,4.9552
3,3.7346
4,3.6147
5,3.9047
6,4.4546
7,3.3873
8,4.298
9,3.6767


--- Fine-Tuning Complete ---
