In [10]:
from datasets import load_dataset

Tinyds = load_dataset("roneneldan/TinyStories")

In [4]:
# Check the structure of the train and validation splits
print("Train split example:", Tinyds['train'][0])  # Print the first example of the train split
print("Validation split example:", Tinyds['validation'][0])  # Print the first example of the validation0split

# Check the length of the splits to ensure they contain data
print("Train split length:", len(Tinyds['train']))
print("Validation split length:", len(Tinyds['validation']))


Train split example: {'text': 'One day, a little girl named Lily found a needle in her room. She knew it was difficult to play with it because it was sharp. Lily wanted to share the needle with her mom, so she could sew a button on her shirt.\n\nLily went to her mom and said, "Mom, I found this needle. Can you share it with me and sew my shirt?" Her mom smiled and said, "Yes, Lily, we can share the needle and fix your shirt."\n\nTogether, they shared the needle and sewed the button on Lily\'s shirt. It was not difficult for them because they were sharing and helping each other. After they finished, Lily thanked her mom for sharing the needle and fixing her shirt. They both felt happy because they had shared and worked together.'}
Validation split example: {'text': 'Spot. Spot saw the shiny car and said, "Wow, Kitty, your car is so bright and clean!" Kitty smiled and replied, "Thank you, Spot. I polish it every day."\n\nAfter playing with the car, Kitty and Spot felt thirsty. They found

In [9]:
from datasets import load_dataset


# Get the text from both the train and validation splits
train_texts = Tinyds['train']['text']  # Assuming 'text' is the column name
validation_texts = Tinyds['validation']['text']

# Check the first 3 texts in the train and validation splits
print("First 3 train texts:", train_texts[:3])
print("First 3 validation texts:", validation_texts[:3])

# Concatenate the texts from train and validation splits and also keep then separate in txt file

all_texts = train_texts + validation_texts

# Save a smaller portion to a .txt file to verify output
with open('tinystories_trainset100.txt', 'w') as f:
    for story in train_texts[:100]:  # Writing only the first 1ß0 stories
        f.write(story + '\n')  # Each story on a new line



First 3 train texts: ['One day, a little girl named Lily found a needle in her room. She knew it was difficult to play with it because it was sharp. Lily wanted to share the needle with her mom, so she could sew a button on her shirt.\n\nLily went to her mom and said, "Mom, I found this needle. Can you share it with me and sew my shirt?" Her mom smiled and said, "Yes, Lily, we can share the needle and fix your shirt."\n\nTogether, they shared the needle and sewed the button on Lily\'s shirt. It was not difficult for them because they were sharing and helping each other. After they finished, Lily thanked her mom for sharing the needle and fixing her shirt. They both felt happy because they had shared and worked together.', 'Once upon a time, there was a little car named Beep. Beep loved to go fast and play in the sun. Beep was a healthy car because he always had good fuel. Good fuel made Beep happy and strong.\n\nOne day, Beep was driving in the park when he saw a big tree. The tree had

In [11]:
import sentencepiece as spm

# Define paths and parameters
input_text = r"C:\Users\Daniel\Documents\Daniel_things\Universität_Halle\Data_Mining_und_maschinelle_Lernen\FinalProect\CHASESQLTtry2\Ngrams\tinystories_trainset100.txt"  # Path to the prepared text file
model_prefix = 'tinystories_tokenizer'  # Prefix for model files
vocab_size = 32768  # Vocabulary size (same as in Nguyen's model)

# Train SentencePiece tokenizer
spm.SentencePieceTrainer.Train(
    input=input_text,
    model_prefix=model_prefix,
    vocab_size=vocab_size,
    character_coverage=0.9995,  # Covers almost all characters
    model_type="unigram",  # Use "unigram" model type, or try "bpe" or "word" if needed
    user_defined_symbols=["<|endoftext|>"]  # Add special tokens if needed
)

print("Tokenizer training complete!")


RuntimeError: Internal: D:\a\sentencepiece\sentencepiece\src\trainer_interface.cc(662) [(trainer_spec_.vocab_size()) == (model_proto->pieces_size())] Vocabulary size too high (32768). Please set it to a value <= 1477.

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from datasets import load_dataset
import torch

# Load the GPT2 model (TinyStories-GPT2-3M)
model = GPT2LMHeadModel.from_pretrained("gpt2")  # Replace with the TinyStories-GPT2-3M model if it's available
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")  # This is GPT2's tokenizer

# Prepare dataset
dataset = load_dataset("roneneldan/TinyStories")
train_texts = dataset["train"]["text"]
val_texts = dataset["validation"]["text"]

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

train_dataset = dataset["train"].map(tokenize_function, batched=True)
val_dataset = dataset["validation"].map(tokenize_function, batched=True)

# Prepare for training
train_dataset = train_dataset.shuffle().select([i for i in list(range(10000))])  # Using a subset for testing
val_dataset = val_dataset.shuffle().select([i for i in list(range(1000))])

# Define DataLoader for batching during training
from torch.utils.data import DataLoader

train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=4)

# Prepare model and optimizer
model.to("cuda")  # Move model to GPU if available
optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)

# Training loop
epochs = 3
for epoch in range(epochs):
    model.train()
    for batch in train_dataloader:
        inputs = batch["input_ids"].to("cuda")
        labels = batch["labels"].to("cuda")
        optimizer.zero_grad()

        outputs = model(inputs, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch} completed with loss: {loss.item()}")
