In [None]:
!pip install -q -U transformers datasets

In [None]:
from transformers import AutoTokenizer
from typing import Any, List, Tuple, Optional
from datasets import load_dataset, Dataset

# Load the tokenizer for the model you want to use ("gpt2" in this notebook).
tokenizer = AutoTokenizer.from_pretrained("gpt2")  # Replace "gpt2" with the model name you want
# If the tokenizer defines no pad token, reuse its EOS token so that the
# padding='max_length' call in transformers_tokenizer_function has a token to pad with.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
def transformers_tokenizer_function(examples: Any, tokenizer: Any, columns_to_tokenize: List[str]) -> Any:
    """Tokenize the requested text columns of a batch.

    Args:
        examples: Mapping from column name to a list of texts (the batch that
            ``Dataset.map(..., batched=True)`` hands to its function).
        tokenizer: Callable tokenizer. It is invoked as
            ``tokenizer(texts, padding='max_length', truncation=True)`` and must
            return a mapping from output key to one entry per input text.
        columns_to_tokenize: Names of the columns in ``examples`` to tokenize.

    Returns:
        A dict with one list per tokenizer output key, each holding exactly one
        entry per input row. With a single column the tokenizer's own key names
        are used unchanged; with several columns every key is prefixed with its
        column name (``"<column>_<key>"``) so the columns do not collide.

    Raises:
        ValueError: If a requested column is not present in ``examples``.
    """
    # Validate up front so a bad column name fails before any tokenization work.
    for col in columns_to_tokenize:
        if col not in examples:
            raise ValueError(f"The specified column '{col}' is not in the dataset.")

    # Concatenating several columns under the same key would yield more rows
    # than the batch contains, which a batched map cannot accept; prefixing the
    # keys keeps every output column aligned with the input rows.
    prefix_keys = len(columns_to_tokenize) > 1

    tokenized_output = {}
    for col in columns_to_tokenize:
        # No return_tensors: the tokenizer already returns plain Python lists,
        # so no tensor framework is needed just to convert back to lists.
        tokenized_batch = tokenizer(examples[col], padding='max_length', truncation=True)
        for key, value in tokenized_batch.items():
            # Accept array-like values too, in case the tokenizer returns them.
            rows = value.tolist() if hasattr(value, "tolist") else list(value)
            out_key = f"{col}_{key}" if prefix_keys else key
            tokenized_output[out_key] = rows

    return tokenized_output
# Now we'll integrate the tokenization function into the preprocess_dataset function
def preprocess_dataset(
    dataset_name: str,
    tokenizer: AutoTokenizer,  # Initialized tokenizer object (e.g. from AutoTokenizer.from_pretrained)
    columns_to_tokenize: List[str],
    random_seed: Optional[int] = None,
    split_ratio: float = 0.8,
    limit: Optional[int] = None
) -> Tuple[Dataset, Dataset]:
    """Load a dataset, tokenize its 'train' split and divide it into train/test parts.

    Args:
        dataset_name: Name passed to ``load_dataset``.
        tokenizer: Initialized tokenizer, forwarded to
            ``transformers_tokenizer_function``.
        columns_to_tokenize: Column names to tokenize.
        random_seed: When given, the data is shuffled with this seed before the
            split; it is also passed as the seed of ``train_test_split``.
        split_ratio: Passed as ``train_size`` to ``train_test_split``.
        limit: When given, only the first ``limit`` rows (taken before any
            shuffling) are kept.

    Returns:
        ``(train, test)`` datasets produced by splitting the processed 'train' split.

    Raises:
        KeyError: If the loaded dataset has no 'train' split.
        ValueError: If a requested column is missing (raised by the tokenization step).
    """
    dataset = load_dataset(dataset_name)

    # Only the 'train' split is ever returned, so fail early if it is absent and
    # avoid tokenizing splits that would be thrown away.
    if 'train' not in dataset:
        raise KeyError(
            f"Dataset '{dataset_name}' has no 'train' split; available splits: {list(dataset.keys())}"
        )
    train_split = dataset['train']

    # Truncate before tokenizing: the same leading rows are kept, but the
    # tokenizer no longer runs over rows that are discarded right afterwards.
    if limit is not None:
        train_split = train_split.select(range(min(limit, len(train_split))))

    train_split = train_split.map(
        transformers_tokenizer_function,
        batched=True,
        fn_kwargs={"tokenizer": tokenizer, "columns_to_tokenize": columns_to_tokenize},
    )

    if random_seed is not None:
        train_split = train_split.shuffle(seed=random_seed)

    train_test_split = train_split.train_test_split(train_size=split_ratio, seed=random_seed)
    return train_test_split['train'], train_test_split['test']

# Example usage of the preprocess_dataset function.
column_names = ['task']  # Columns to tokenize; replace with the actual column names from your dataset

# Replace the dataset_name value with the name of your own dataset. The tokenizer
# passed here is the module-level `tokenizer` object loaded earlier in the notebook.
train_ds, test_ds = preprocess_dataset(
    dataset_name="TuringsSolutions/NYTWritingStyleGuide",
    tokenizer=tokenizer,
    columns_to_tokenize=column_names,
    random_seed=42,
    split_ratio=0.8
)

# Report how many rows ended up in each part of the split.
print(f"Training Dataset Size: {len(train_ds)}")
print(f"Testing Dataset Size: {len(test_ds)}")