In [44]:
import pandas as pd
import requests
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

# Load DataFrame
df = pd.read_csv("extracted_ebooks_info.csv")

In [21]:
import pandas as pd
import requests
from transformers import GPT2Tokenizer
from datasets import Dataset

# Load the DataFrame (replace with actual file path or DataFrame loading code)
df = pd.read_csv('extracted_ebooks_info.csv')

# Function to fetch the text from the link
def fetch_text_from_link(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return ""

# Apply the function to the 'Plain Text Link' column to extract the actual text
df['text'] = df['Plain Text Link'].apply(fetch_text_from_link)

# Optionally, you can add metadata to the text (e.g., Author, Title) if needed
df['formatted_text'] = df.apply(lambda row: f"Author: {row['Author']}\nTitle: {row['Title']}\nRelease Date: {row['Release Date']}\nCategory: {row['Category']}\n{row['text']}", axis=1)

# Now, the DataFrame contains a 'formatted_text' column with the actual text data ready for tokenization


In [41]:
# Create a Dataset
dataset = Dataset.from_pandas(df[['formatted_text']])

# Load the GPT2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Step #5: Add special tokens to the tokenizer (if not already present)
special_tokens_dict = {'additional_special_tokens': ['[AUTHOR]', '[TITLE]', '[RELEASE_DATE]', '[CATEGORY]']}
tokenizer.add_tokens(special_tokens_dict['additional_special_tokens'])

# Resize model's token embeddings to account for the new special tokens
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))

Embedding(50261, 768)

In [54]:
# Tokenize the 'formatted_text' column
tokenizer.pad_token = tokenizer.eos_token
def tokenize_function(examples):
    # Tokenize and create the labels (input_ids == labels)
    encodings = tokenizer(examples['formatted_text'], truncation=True, padding="max_length", max_length=512)
    encodings['labels'] = encodings['input_ids']  # Set labels equal to input_ids for causal language modeling
    return encodings

# Apply tokenization
tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 437/437 [02:57<00:00,  2.47 examples/s]


In [55]:
# Split the tokenized dataset into training and validation sets using Hugging Face's `train_test_split`
split_dataset = tokenized_datasets.train_test_split(test_size=0.1)

# Convert to DatasetDict for compatibility with Trainer
dataset_dict = DatasetDict({
    'train': split_dataset['train'],
    'validation': split_dataset['test']
})

In [None]:
# Set up the training arguments
training_args = TrainingArguments(
    output_dir="./gpt2-finetuned",  # where to save model
    num_train_epochs=3,  # number of training epochs
    per_device_train_batch_size=2,  # batch size for training
    per_device_eval_batch_size=2,  # batch size for evaluation
    warmup_steps=500,  # number of warmup steps for learning rate scheduler
    weight_decay=0.01,  # strength of weight decay
    logging_dir="./logs",  # directory for storing logs
    logging_steps=10,  # log every 10 steps
    evaluation_strategy="steps",  # evaluate every X steps
    save_steps=500,  # save the model every 500 steps
    save_total_limit=2,  # limit the total number of saved checkpoints
)

In [57]:
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_dict['train'],
    eval_dataset=dataset_dict['validation'],
    tokenizer=tokenizer
)

  trainer = Trainer(


In [58]:
# Fine-tune the model
trainer.train()

# Save the fine-tuned model
trainer.save_model("./fine_tuned_gpt2")


  0%|          | 0/591 [05:39<?, ?it/s]         

{'loss': 4.9699, 'grad_norm': 30.966703414916992, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.05}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                

  0%|          | 0/591 [06:08<?, ?it/s]        
[A
[A

{'eval_loss': 4.2132954597473145, 'eval_runtime': 29.3998, 'eval_samples_per_second': 1.497, 'eval_steps_per_second': 0.748, 'epoch': 0.05}



  0%|          | 0/591 [07:04<?, ?it/s]         

{'loss': 4.8252, 'grad_norm': 14.857179641723633, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.1}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                

  0%|          | 0/591 [07:34<?, ?it/s]        
[A
[A

{'eval_loss': 4.076850414276123, 'eval_runtime': 30.0961, 'eval_samples_per_second': 1.462, 'eval_steps_per_second': 0.731, 'epoch': 0.1}



  0%|          | 0/591 [08:39<?, ?it/s]           

{'loss': 4.6046, 'grad_norm': 14.077605247497559, 'learning_rate': 3e-06, 'epoch': 0.15}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                  

  0%|          | 0/591 [09:16<?, ?it/s]        
[A
[A

{'eval_loss': 3.900312900543213, 'eval_runtime': 37.2505, 'eval_samples_per_second': 1.181, 'eval_steps_per_second': 0.591, 'epoch': 0.15}



  0%|          | 0/591 [10:18<?, ?it/s]           

{'loss': 4.1471, 'grad_norm': 13.649974822998047, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.2}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                  

  0%|          | 0/591 [10:53<?, ?it/s]        
[A
[A

{'eval_loss': 3.662198781967163, 'eval_runtime': 34.9805, 'eval_samples_per_second': 1.258, 'eval_steps_per_second': 0.629, 'epoch': 0.2}



  0%|          | 0/591 [11:57<?, ?it/s]           

{'loss': 3.8556, 'grad_norm': 11.377331733703613, 'learning_rate': 5e-06, 'epoch': 0.25}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                  

  0%|          | 0/591 [12:32<?, ?it/s]        
[A
[A

{'eval_loss': 3.317432165145874, 'eval_runtime': 35.1082, 'eval_samples_per_second': 1.253, 'eval_steps_per_second': 0.627, 'epoch': 0.25}



  0%|          | 0/591 [13:35<?, ?it/s]         

{'loss': 3.4283, 'grad_norm': 9.273543357849121, 'learning_rate': 6e-06, 'epoch': 0.3}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                

  0%|          | 0/591 [14:08<?, ?it/s]        
[A
[A

{'eval_loss': 3.028886556625366, 'eval_runtime': 32.4029, 'eval_samples_per_second': 1.358, 'eval_steps_per_second': 0.679, 'epoch': 0.3}



  0%|          | 0/591 [15:09<?, ?it/s]         

{'loss': 3.2189, 'grad_norm': 8.496755599975586, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.36}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                

  0%|          | 0/591 [15:43<?, ?it/s]        
[A
[A

{'eval_loss': 2.738224983215332, 'eval_runtime': 34.8336, 'eval_samples_per_second': 1.263, 'eval_steps_per_second': 0.632, 'epoch': 0.36}



  0%|          | 0/591 [16:48<?, ?it/s]         

{'loss': 2.8279, 'grad_norm': 7.37190580368042, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.41}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                

  0%|          | 0/591 [17:21<?, ?it/s]        
[A
[A

{'eval_loss': 2.5315096378326416, 'eval_runtime': 32.763, 'eval_samples_per_second': 1.343, 'eval_steps_per_second': 0.671, 'epoch': 0.41}



  0%|          | 0/591 [18:20<?, ?it/s]         

{'loss': 2.639, 'grad_norm': 6.901695251464844, 'learning_rate': 9e-06, 'epoch': 0.46}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                

  0%|          | 0/591 [18:52<?, ?it/s]        
[A
[A

{'eval_loss': 2.3466639518737793, 'eval_runtime': 31.8982, 'eval_samples_per_second': 1.379, 'eval_steps_per_second': 0.69, 'epoch': 0.46}



  0%|          | 0/591 [19:51<?, ?it/s]          

{'loss': 2.6392, 'grad_norm': 7.938910007476807, 'learning_rate': 1e-05, 'epoch': 0.51}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                 

  0%|          | 0/591 [20:22<?, ?it/s]        
[A
[A

{'eval_loss': 2.1952569484710693, 'eval_runtime': 31.806, 'eval_samples_per_second': 1.383, 'eval_steps_per_second': 0.692, 'epoch': 0.51}



  0%|          | 0/591 [21:21<?, ?it/s]          

{'loss': 2.3898, 'grad_norm': 11.806139945983887, 'learning_rate': 1.1000000000000001e-05, 'epoch': 0.56}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                 

  0%|          | 0/591 [21:53<?, ?it/s]        
[A
[A

{'eval_loss': 2.0760610103607178, 'eval_runtime': 31.6928, 'eval_samples_per_second': 1.388, 'eval_steps_per_second': 0.694, 'epoch': 0.56}



  0%|          | 0/591 [22:53<?, ?it/s]          

{'loss': 2.2601, 'grad_norm': 8.977227210998535, 'learning_rate': 1.2e-05, 'epoch': 0.61}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                 

  0%|          | 0/591 [23:25<?, ?it/s]        
[A
[A

{'eval_loss': 1.9626556634902954, 'eval_runtime': 31.7789, 'eval_samples_per_second': 1.385, 'eval_steps_per_second': 0.692, 'epoch': 0.61}



  0%|          | 0/591 [24:22<?, ?it/s]          

{'loss': 2.0972, 'grad_norm': 8.184053421020508, 'learning_rate': 1.3000000000000001e-05, 'epoch': 0.66}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                 

  0%|          | 0/591 [24:55<?, ?it/s]        
[A
[A

{'eval_loss': 1.8535517454147339, 'eval_runtime': 32.7666, 'eval_samples_per_second': 1.343, 'eval_steps_per_second': 0.671, 'epoch': 0.66}



  0%|          | 0/591 [25:54<?, ?it/s]          

{'loss': 1.9576, 'grad_norm': 7.1303300857543945, 'learning_rate': 1.4000000000000001e-05, 'epoch': 0.71}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                 

  0%|          | 0/591 [26:25<?, ?it/s]        
[A
[A

{'eval_loss': 1.7629368305206299, 'eval_runtime': 30.9657, 'eval_samples_per_second': 1.421, 'eval_steps_per_second': 0.71, 'epoch': 0.71}



  0%|          | 0/591 [27:26<?, ?it/s]          

{'loss': 1.8222, 'grad_norm': 10.007413864135742, 'learning_rate': 1.5e-05, 'epoch': 0.76}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                 

  0%|          | 0/591 [27:58<?, ?it/s]        
[A
[A

{'eval_loss': 1.6866081953048706, 'eval_runtime': 31.784, 'eval_samples_per_second': 1.384, 'eval_steps_per_second': 0.692, 'epoch': 0.76}



  0%|          | 0/591 [28:55<?, ?it/s]          

{'loss': 1.9855, 'grad_norm': 12.610899925231934, 'learning_rate': 1.6000000000000003e-05, 'epoch': 0.81}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                 

  0%|          | 0/591 [29:28<?, ?it/s]        
[A
[A

{'eval_loss': 1.6097537279129028, 'eval_runtime': 33.0521, 'eval_samples_per_second': 1.331, 'eval_steps_per_second': 0.666, 'epoch': 0.81}



  0%|          | 0/591 [30:32<?, ?it/s]          

{'loss': 1.7515, 'grad_norm': 7.718392848968506, 'learning_rate': 1.7000000000000003e-05, 'epoch': 0.86}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                 

  0%|          | 0/591 [31:06<?, ?it/s]        
[A
[A

{'eval_loss': 1.5396647453308105, 'eval_runtime': 33.8574, 'eval_samples_per_second': 1.3, 'eval_steps_per_second': 0.65, 'epoch': 0.86}



  0%|          | 0/591 [32:08<?, ?it/s]          

{'loss': 1.8306, 'grad_norm': 14.932454109191895, 'learning_rate': 1.8e-05, 'epoch': 0.91}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                 

  0%|          | 0/591 [32:41<?, ?it/s]        
[A
[A

{'eval_loss': 1.4818336963653564, 'eval_runtime': 33.037, 'eval_samples_per_second': 1.332, 'eval_steps_per_second': 0.666, 'epoch': 0.91}



  0%|          | 0/591 [33:42<?, ?it/s]          

{'loss': 1.4974, 'grad_norm': 12.523064613342285, 'learning_rate': 1.9e-05, 'epoch': 0.96}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                 

  0%|          | 0/591 [34:13<?, ?it/s]        
[A
[A

{'eval_loss': 1.4380919933319092, 'eval_runtime': 31.5057, 'eval_samples_per_second': 1.397, 'eval_steps_per_second': 0.698, 'epoch': 0.96}



  0%|          | 0/591 [35:07<?, ?it/s]          

{'loss': 1.6846, 'grad_norm': 8.553607940673828, 'learning_rate': 2e-05, 'epoch': 1.02}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                 

  0%|          | 0/591 [35:39<?, ?it/s]        
[A
[A

{'eval_loss': 1.4026708602905273, 'eval_runtime': 31.8971, 'eval_samples_per_second': 1.379, 'eval_steps_per_second': 0.69, 'epoch': 1.02}



  0%|          | 0/591 [36:35<?, ?it/s]          

{'loss': 1.6574, 'grad_norm': 12.443800926208496, 'learning_rate': 2.1e-05, 'epoch': 1.07}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                 

  0%|          | 0/591 [37:04<?, ?it/s]        
[A
[A

{'eval_loss': 1.3765207529067993, 'eval_runtime': 29.4179, 'eval_samples_per_second': 1.496, 'eval_steps_per_second': 0.748, 'epoch': 1.07}



  0%|          | 0/591 [37:59<?, ?it/s]          

{'loss': 1.6133, 'grad_norm': 9.425243377685547, 'learning_rate': 2.2000000000000003e-05, 'epoch': 1.12}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                 

  0%|          | 0/591 [38:28<?, ?it/s]        
[A
[A

{'eval_loss': 1.3406742811203003, 'eval_runtime': 29.3532, 'eval_samples_per_second': 1.499, 'eval_steps_per_second': 0.749, 'epoch': 1.12}



  0%|          | 0/591 [39:23<?, ?it/s]          

{'loss': 1.4344, 'grad_norm': 6.62037992477417, 'learning_rate': 2.3000000000000003e-05, 'epoch': 1.17}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                 

  0%|          | 0/591 [39:52<?, ?it/s]        
[A
[A

{'eval_loss': 1.3327058553695679, 'eval_runtime': 29.2739, 'eval_samples_per_second': 1.503, 'eval_steps_per_second': 0.752, 'epoch': 1.17}



  0%|          | 0/591 [40:46<?, ?it/s]          

{'loss': 1.4007, 'grad_norm': 4.541355133056641, 'learning_rate': 2.4e-05, 'epoch': 1.22}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                 

  0%|          | 0/591 [41:15<?, ?it/s]        
[A
[A

{'eval_loss': 1.313994288444519, 'eval_runtime': 28.9873, 'eval_samples_per_second': 1.518, 'eval_steps_per_second': 0.759, 'epoch': 1.22}



  0%|          | 0/591 [42:09<?, ?it/s]          

{'loss': 1.4762, 'grad_norm': 6.688156604766846, 'learning_rate': 2.5e-05, 'epoch': 1.27}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                 

  0%|          | 0/591 [42:39<?, ?it/s]        
[A
[A

{'eval_loss': 1.304343819618225, 'eval_runtime': 29.1127, 'eval_samples_per_second': 1.511, 'eval_steps_per_second': 0.756, 'epoch': 1.27}



  0%|          | 0/591 [43:33<?, ?it/s]          

{'loss': 1.3104, 'grad_norm': 5.723019123077393, 'learning_rate': 2.6000000000000002e-05, 'epoch': 1.32}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                 

  0%|          | 0/591 [44:03<?, ?it/s]        
[A
[A

{'eval_loss': 1.2760658264160156, 'eval_runtime': 29.4407, 'eval_samples_per_second': 1.495, 'eval_steps_per_second': 0.747, 'epoch': 1.32}



  0%|          | 0/591 [44:57<?, ?it/s]          

{'loss': 1.1048, 'grad_norm': 6.548392295837402, 'learning_rate': 2.7000000000000002e-05, 'epoch': 1.37}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                 

  0%|          | 0/591 [45:26<?, ?it/s]        
[A
[A

{'eval_loss': 1.254521131515503, 'eval_runtime': 29.2263, 'eval_samples_per_second': 1.505, 'eval_steps_per_second': 0.753, 'epoch': 1.37}



  0%|          | 0/591 [46:20<?, ?it/s]          

{'loss': 1.484, 'grad_norm': 5.043571949005127, 'learning_rate': 2.8000000000000003e-05, 'epoch': 1.42}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                 

  0%|          | 0/591 [46:49<?, ?it/s]        
[A
[A

{'eval_loss': 1.2518596649169922, 'eval_runtime': 29.1883, 'eval_samples_per_second': 1.507, 'eval_steps_per_second': 0.754, 'epoch': 1.42}



  0%|          | 0/591 [47:43<?, ?it/s]          

{'loss': 1.1455, 'grad_norm': 8.445704460144043, 'learning_rate': 2.9e-05, 'epoch': 1.47}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                 

  0%|          | 0/591 [48:13<?, ?it/s]        
[A
[A

{'eval_loss': 1.2365226745605469, 'eval_runtime': 29.55, 'eval_samples_per_second': 1.489, 'eval_steps_per_second': 0.745, 'epoch': 1.47}



  0%|          | 0/591 [49:09<?, ?it/s]          

{'loss': 1.1675, 'grad_norm': 6.766172885894775, 'learning_rate': 3e-05, 'epoch': 1.52}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                 

  0%|          | 0/591 [49:41<?, ?it/s]        
[A
[A

{'eval_loss': 1.2215361595153809, 'eval_runtime': 31.8068, 'eval_samples_per_second': 1.383, 'eval_steps_per_second': 0.692, 'epoch': 1.52}



  0%|          | 0/591 [50:36<?, ?it/s]          

{'loss': 1.2101, 'grad_norm': 5.030069351196289, 'learning_rate': 3.1e-05, 'epoch': 1.57}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                 

  0%|          | 0/591 [51:05<?, ?it/s]        
[A
[A

{'eval_loss': 1.2122708559036255, 'eval_runtime': 29.2292, 'eval_samples_per_second': 1.505, 'eval_steps_per_second': 0.753, 'epoch': 1.57}



  0%|          | 0/591 [52:00<?, ?it/s]          

{'loss': 1.2431, 'grad_norm': 4.910078525543213, 'learning_rate': 3.2000000000000005e-05, 'epoch': 1.62}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                 

  0%|          | 0/591 [52:31<?, ?it/s]        
[A
[A

{'eval_loss': 1.2149604558944702, 'eval_runtime': 31.6011, 'eval_samples_per_second': 1.392, 'eval_steps_per_second': 0.696, 'epoch': 1.62}



  0%|          | 0/591 [53:34<?, ?it/s]          

{'loss': 1.1588, 'grad_norm': 8.086282730102539, 'learning_rate': 3.3e-05, 'epoch': 1.68}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                 

  0%|          | 0/591 [54:09<?, ?it/s]        
[A
[A

{'eval_loss': 1.1996303796768188, 'eval_runtime': 35.071, 'eval_samples_per_second': 1.255, 'eval_steps_per_second': 0.627, 'epoch': 1.68}



  0%|          | 0/591 [55:12<?, ?it/s]          

{'loss': 1.2008, 'grad_norm': 4.994334697723389, 'learning_rate': 3.4000000000000007e-05, 'epoch': 1.73}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                 

  0%|          | 0/591 [55:44<?, ?it/s]        
[A
[A

{'eval_loss': 1.1953353881835938, 'eval_runtime': 32.8596, 'eval_samples_per_second': 1.339, 'eval_steps_per_second': 0.67, 'epoch': 1.73}



  0%|          | 0/591 [56:40<?, ?it/s]          

{'loss': 1.265, 'grad_norm': 7.364628791809082, 'learning_rate': 3.5e-05, 'epoch': 1.78}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                 

  0%|          | 0/591 [57:09<?, ?it/s]        
[A
[A

{'eval_loss': 1.1806330680847168, 'eval_runtime': 28.9508, 'eval_samples_per_second': 1.52, 'eval_steps_per_second': 0.76, 'epoch': 1.78}



  0%|          | 0/591 [58:03<?, ?it/s]          

{'loss': 1.0934, 'grad_norm': 6.48333740234375, 'learning_rate': 3.6e-05, 'epoch': 1.83}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                 

  0%|          | 0/591 [58:32<?, ?it/s]        
[A
[A

{'eval_loss': 1.1788538694381714, 'eval_runtime': 29.0507, 'eval_samples_per_second': 1.515, 'eval_steps_per_second': 0.757, 'epoch': 1.83}



  0%|          | 0/591 [59:26<?, ?it/s]          

{'loss': 1.3797, 'grad_norm': 7.552213668823242, 'learning_rate': 3.7e-05, 'epoch': 1.88}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                 

  0%|          | 0/591 [59:55<?, ?it/s]        
[A
[A

{'eval_loss': 1.1735936403274536, 'eval_runtime': 29.4355, 'eval_samples_per_second': 1.495, 'eval_steps_per_second': 0.747, 'epoch': 1.88}



  0%|          | 0/591 [1:00:49<?, ?it/s]        

{'loss': 1.0762, 'grad_norm': 6.152806758880615, 'learning_rate': 3.8e-05, 'epoch': 1.93}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                 

  0%|          | 0/591 [1:01:18<?, ?it/s]      
[A
[A

{'eval_loss': 1.1603676080703735, 'eval_runtime': 29.4076, 'eval_samples_per_second': 1.496, 'eval_steps_per_second': 0.748, 'epoch': 1.93}



  0%|          | 0/591 [1:02:12<?, ?it/s]        

{'loss': 1.282, 'grad_norm': 11.420744895935059, 'learning_rate': 3.9000000000000006e-05, 'epoch': 1.98}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                 

  0%|          | 0/591 [1:02:42<?, ?it/s]      
[A
[A

{'eval_loss': 1.1564639806747437, 'eval_runtime': 29.3958, 'eval_samples_per_second': 1.497, 'eval_steps_per_second': 0.748, 'epoch': 1.98}



  0%|          | 0/591 [1:03:33<?, ?it/s]        

{'loss': 1.0989, 'grad_norm': 6.430358409881592, 'learning_rate': 4e-05, 'epoch': 2.03}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                 

  0%|          | 0/591 [1:04:02<?, ?it/s]      
[A
[A

{'eval_loss': 1.154097318649292, 'eval_runtime': 29.1743, 'eval_samples_per_second': 1.508, 'eval_steps_per_second': 0.754, 'epoch': 2.03}



  0%|          | 0/591 [1:04:56<?, ?it/s]          

{'loss': 1.101, 'grad_norm': 6.321652412414551, 'learning_rate': 4.1e-05, 'epoch': 2.08}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                   

  0%|          | 0/591 [1:05:26<?, ?it/s]      
[A
[A

{'eval_loss': 1.1480485200881958, 'eval_runtime': 29.2479, 'eval_samples_per_second': 1.504, 'eval_steps_per_second': 0.752, 'epoch': 2.08}



  0%|          | 0/591 [1:06:19<?, ?it/s]          

{'loss': 1.1594, 'grad_norm': 5.721968173980713, 'learning_rate': 4.2e-05, 'epoch': 2.13}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                   

  0%|          | 0/591 [1:06:48<?, ?it/s]      
[A
[A

{'eval_loss': 1.1592764854431152, 'eval_runtime': 28.8516, 'eval_samples_per_second': 1.525, 'eval_steps_per_second': 0.763, 'epoch': 2.13}



  0%|          | 0/591 [1:07:42<?, ?it/s]          

{'loss': 0.9795, 'grad_norm': 5.2383856773376465, 'learning_rate': 4.3e-05, 'epoch': 2.18}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                   

  0%|          | 0/591 [1:08:11<?, ?it/s]      
[A
[A

{'eval_loss': 1.141778826713562, 'eval_runtime': 29.6549, 'eval_samples_per_second': 1.484, 'eval_steps_per_second': 0.742, 'epoch': 2.18}



  0%|          | 0/591 [1:09:04<?, ?it/s]          

{'loss': 1.0088, 'grad_norm': 5.766260147094727, 'learning_rate': 4.4000000000000006e-05, 'epoch': 2.23}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                   

  0%|          | 0/591 [1:09:33<?, ?it/s]      
[A
[A

{'eval_loss': 1.1457139253616333, 'eval_runtime': 29.0587, 'eval_samples_per_second': 1.514, 'eval_steps_per_second': 0.757, 'epoch': 2.23}



  0%|          | 0/591 [1:10:28<?, ?it/s]          

{'loss': 0.9999, 'grad_norm': 6.884564399719238, 'learning_rate': 4.5e-05, 'epoch': 2.28}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                   

  0%|          | 0/591 [1:10:57<?, ?it/s]      
[A
[A

{'eval_loss': 1.1371440887451172, 'eval_runtime': 29.1564, 'eval_samples_per_second': 1.509, 'eval_steps_per_second': 0.755, 'epoch': 2.28}



  0%|          | 0/591 [1:11:50<?, ?it/s]          

{'loss': 1.041, 'grad_norm': 4.660024166107178, 'learning_rate': 4.600000000000001e-05, 'epoch': 2.34}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                   

  0%|          | 0/591 [1:12:20<?, ?it/s]      
[A
[A

{'eval_loss': 1.1355899572372437, 'eval_runtime': 29.6696, 'eval_samples_per_second': 1.483, 'eval_steps_per_second': 0.741, 'epoch': 2.34}



  0%|          | 0/591 [1:13:13<?, ?it/s]          

{'loss': 1.1928, 'grad_norm': 8.649693489074707, 'learning_rate': 4.7e-05, 'epoch': 2.39}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                   

  0%|          | 0/591 [1:13:42<?, ?it/s]      
[A
[A

{'eval_loss': 1.1443477869033813, 'eval_runtime': 28.9605, 'eval_samples_per_second': 1.519, 'eval_steps_per_second': 0.76, 'epoch': 2.39}



  0%|          | 0/591 [1:14:37<?, ?it/s]          

{'loss': 1.0757, 'grad_norm': 4.943569660186768, 'learning_rate': 4.8e-05, 'epoch': 2.44}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                   

  0%|          | 0/591 [1:15:05<?, ?it/s]      
[A
[A

{'eval_loss': 1.1312355995178223, 'eval_runtime': 28.8602, 'eval_samples_per_second': 1.525, 'eval_steps_per_second': 0.762, 'epoch': 2.44}



  0%|          | 0/591 [1:15:59<?, ?it/s]          

{'loss': 1.1339, 'grad_norm': 6.466842174530029, 'learning_rate': 4.9e-05, 'epoch': 2.49}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                   

  0%|          | 0/591 [1:16:28<?, ?it/s]      
[A
[A

{'eval_loss': 1.1256303787231445, 'eval_runtime': 29.5127, 'eval_samples_per_second': 1.491, 'eval_steps_per_second': 0.745, 'epoch': 2.49}



  0%|          | 0/591 [1:17:22<?, ?it/s]          

{'loss': 1.0285, 'grad_norm': 5.981884956359863, 'learning_rate': 5e-05, 'epoch': 2.54}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                   

  0%|          | 0/591 [1:17:52<?, ?it/s]      
[A
[A

{'eval_loss': 1.1225203275680542, 'eval_runtime': 30.3687, 'eval_samples_per_second': 1.449, 'eval_steps_per_second': 0.724, 'epoch': 2.54}



  0%|          | 0/591 [1:18:48<?, ?it/s]          

{'loss': 1.077, 'grad_norm': 5.860714912414551, 'learning_rate': 4.4505494505494504e-05, 'epoch': 2.59}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                   

  0%|          | 0/591 [1:19:17<?, ?it/s]      
[A
[A

{'eval_loss': 1.1208128929138184, 'eval_runtime': 28.8841, 'eval_samples_per_second': 1.523, 'eval_steps_per_second': 0.762, 'epoch': 2.59}



  0%|          | 0/591 [1:20:11<?, ?it/s]          

{'loss': 1.0029, 'grad_norm': 6.450997829437256, 'learning_rate': 3.901098901098901e-05, 'epoch': 2.64}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                   

  0%|          | 0/591 [1:20:40<?, ?it/s]      
[A
[A

{'eval_loss': 1.1147303581237793, 'eval_runtime': 29.1478, 'eval_samples_per_second': 1.51, 'eval_steps_per_second': 0.755, 'epoch': 2.64}



  0%|          | 0/591 [1:21:33<?, ?it/s]          

{'loss': 0.9472, 'grad_norm': 4.971694469451904, 'learning_rate': 3.3516483516483513e-05, 'epoch': 2.69}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                   

  0%|          | 0/591 [1:22:01<?, ?it/s]      
[A
[A

{'eval_loss': 1.1109912395477295, 'eval_runtime': 28.5871, 'eval_samples_per_second': 1.539, 'eval_steps_per_second': 0.77, 'epoch': 2.69}



  0%|          | 0/591 [1:22:54<?, ?it/s]          

{'loss': 1.072, 'grad_norm': 4.803797245025635, 'learning_rate': 2.8021978021978025e-05, 'epoch': 2.74}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                   

  0%|          | 0/591 [1:23:23<?, ?it/s]      
[A
[A

{'eval_loss': 1.102827548980713, 'eval_runtime': 28.7297, 'eval_samples_per_second': 1.532, 'eval_steps_per_second': 0.766, 'epoch': 2.74}



  0%|          | 0/591 [1:24:16<?, ?it/s]          

{'loss': 1.0198, 'grad_norm': 5.427087783813477, 'learning_rate': 2.252747252747253e-05, 'epoch': 2.79}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                   

  0%|          | 0/591 [1:24:45<?, ?it/s]      
[A
[A

{'eval_loss': 1.1069703102111816, 'eval_runtime': 29.4439, 'eval_samples_per_second': 1.494, 'eval_steps_per_second': 0.747, 'epoch': 2.79}



  0%|          | 0/591 [1:25:38<?, ?it/s]          

{'loss': 1.0545, 'grad_norm': 4.155777931213379, 'learning_rate': 1.7032967032967035e-05, 'epoch': 2.84}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                   

  0%|          | 0/591 [1:26:06<?, ?it/s]      
[A
[A

{'eval_loss': 1.0993844270706177, 'eval_runtime': 28.5846, 'eval_samples_per_second': 1.539, 'eval_steps_per_second': 0.77, 'epoch': 2.84}



  0%|          | 0/591 [1:27:03<?, ?it/s]          

{'loss': 1.1951, 'grad_norm': 5.650148868560791, 'learning_rate': 1.153846153846154e-05, 'epoch': 2.89}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                   

  0%|          | 0/591 [1:27:33<?, ?it/s]      
[A
[A

{'eval_loss': 1.0960335731506348, 'eval_runtime': 30.5516, 'eval_samples_per_second': 1.44, 'eval_steps_per_second': 0.72, 'epoch': 2.89}



  0%|          | 0/591 [1:28:26<?, ?it/s]          

{'loss': 0.9972, 'grad_norm': 4.580667972564697, 'learning_rate': 6.043956043956044e-06, 'epoch': 2.94}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                   

  0%|          | 0/591 [1:28:56<?, ?it/s]      
[A
[A

{'eval_loss': 1.096256136894226, 'eval_runtime': 29.8274, 'eval_samples_per_second': 1.475, 'eval_steps_per_second': 0.738, 'epoch': 2.94}



  0%|          | 0/591 [1:29:49<?, ?it/s]          

{'loss': 0.908, 'grad_norm': 6.631828308105469, 'learning_rate': 5.494505494505495e-07, 'epoch': 2.99}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                   

  0%|          | 0/591 [1:30:18<?, ?it/s]      
[A
[A

{'eval_loss': 1.0958267450332642, 'eval_runtime': 28.6908, 'eval_samples_per_second': 1.534, 'eval_steps_per_second': 0.767, 'epoch': 2.99}



100%|██████████| 591/591 [1:25:39<00:00,  8.70s/it]


{'train_runtime': 5139.0251, 'train_samples_per_second': 0.229, 'train_steps_per_second': 0.115, 'train_loss': 1.73107911376582, 'epoch': 3.0}


In [67]:
# Encode the input prompt
input_text = "Once there was a tree"
inputs = tokenizer.encode(input_text, return_tensors="pt")

# Generate a response
outputs = model.generate(inputs, max_length=150, num_return_sequences=1, repetition_penalty=2.0, temperature=0.7)

# Decode the generated response
response = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Print the result
print(response)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Once there was a tree, and he had seen it before.
And then came the man who knew nothing of this thing; for that is what we have said in our book: "The Man Who Wasn't There." And so on to another day when they were gone together--and still no one could hear them out --but all these things happened at once with their own hands! They are not known by name or place but as if I'd been told from my mother's house where she kept her children alive (I don' t know how much). But now you see me coming home again? Why did anyone come back after dinner tonight?" Then asked Mr Browning-Smith : "'Why do people stay up here long enough?'
