In [None]:
!pip install -q -U datasets

In [None]:
from datasets import load_dataset, Dataset, DatasetDict
from typing import Any, Dict, Union, Optional
import logging

# Configure logging so that only records at WARNING level or above are emitted
logging.basicConfig(level=logging.WARNING)

# Type alias used as the (optional) return annotation of load_dataset_with_config:
# either a single Dataset or a DatasetDict
DatasetReturnType = Union[Dataset, DatasetDict]

def load_dataset_with_config(
    dataset_name: str,
    dataset_config: Union[str, Dict[str, Any], None] = None,
    split: Optional[str] = None,
    **kwargs: Any
) -> Optional[DatasetReturnType]:
    """
    Load a dataset through ``load_dataset``, returning None instead of raising.

    :param dataset_name: Name of the dataset handed to ``load_dataset``.
    :param dataset_config: Optional configuration. A string is forwarded as the second
                           positional argument, a dictionary is forwarded as ``data_files``,
                           and any other value is not forwarded at all.
    :param split: Value forwarded as ``split``; may be None.
    :param kwargs: Extra keyword arguments forwarded unchanged to ``load_dataset``.
    :return: Whatever ``load_dataset`` returns, or None if any exception is raised
             (the failure is logged at ERROR level).
    """
    try:
        # Dictionary configuration: forwarded as data_files
        if isinstance(dataset_config, dict):
            return load_dataset(dataset_name, data_files=dataset_config, split=split, **kwargs)

        # String configuration: forwarded positionally
        if isinstance(dataset_config, str):
            return load_dataset(dataset_name, dataset_config, split=split, **kwargs)

        # No usable configuration: rely on the defaults
        return load_dataset(dataset_name, split=split, **kwargs)
    except Exception as e:
        # Best-effort contract: report the problem and signal failure with None
        logging.error(f"Failed to load dataset '{dataset_name}': {e}")
        return None

# Example usage:
if __name__ == "__main__":
    # Load a dataset with a specific name and split; load_dataset_with_config
    # returns None instead of raising when loading fails.
    dataset = load_dataset_with_config('squad', split='validation[:10%]')

    # Compare against None explicitly: None is the only failure signal, whereas a
    # plain truthiness test could also treat a successfully loaded but empty
    # result as a failure.
    if dataset is not None:
        # Successful dataset retrieval, you can now work with the dataset
        print(f"Dataset loaded successfully: {dataset}")
    else:
        # Dataset loading failed, handle the situation appropriately
        print("Failed to load the dataset.")

In [None]:
from typing import List, Dict, Callable, Any, Optional

def filter_dataset(dataset: List[Dict[str, Any]], predicate: Callable[[Dict[str, Any]], bool]) -> Optional[List[Dict[str, Any]]]:
    """
    Keep the samples of a dataset for which a predicate holds.

    :param dataset: The samples to examine, given as a list of dictionaries.
    :param predicate: Called once per sample, in order; a truthy result keeps the sample,
                      a falsy result drops it.
    :return: A new list with the kept samples in their original order, or None if
             iterating the dataset or calling the predicate raises an exception.
    """
    kept = []
    try:
        # Walk the samples in order and collect the ones the predicate accepts
        for record in dataset:
            if predicate(record):
                kept.append(record)
    except Exception as e:
        # Report the problem on stdout and signal failure with None
        print(f"An error occurred while filtering the dataset: {e}")
        return None
    return kept

# Example usage:
if __name__ == "__main__":
    # Three sample records to run the filter on
    example_dataset = [
        {'name': 'Alice', 'age': 30},
        {'name': 'Bob', 'age': 25},
        {'name': 'Charlie', 'age': 35},
    ]

    # Predicate: accept only samples whose 'age' is strictly greater than 30
    def age_predicate(sample):
        return sample['age'] > 30

    # Apply the predicate to every record
    filtered_dataset = filter_dataset(example_dataset, age_predicate)

    if filtered_dataset is None:
        # filter_dataset signals failure with None
        print("Failed to filter the dataset.")
    else:
        print("Filtered dataset:")
        for sample in filtered_dataset:
            print(sample)

In [None]:
from typing import Callable, List
from datasets import Dataset

def filter_dataset(dataset: Dataset, predicate: Callable[[dict], bool]) -> List[dict]:
    """
    Collect the samples of a dataset that satisfy a predicate.

    :param dataset: The dataset to iterate over.
    :param predicate: Called once per sample, in iteration order; a truthy result keeps the sample.
    :return: A list with the kept samples, in iteration order. Exceptions raised while
             iterating or by the predicate are not caught.
    """
    return [row for row in dataset if predicate(row)]

In [None]:
from datasets import load_dataset, DatasetDict, Dataset, logging
from typing import Any, Dict, Optional, Tuple
from pprint import pprint

# Set the logging level to warning to avoid too much verbosity.
# NOTE: `logging` here is the name imported from `datasets` by this cell's import line,
# not the standard-library module; that import rebinds the global name `logging`.
logging.set_verbosity_warning()

# Define a function to safely load a dataset
def safe_load_dataset(name: str, split: Optional[str] = None) -> Tuple[Optional[Dataset], Optional[Dict[str, Any]]]:
    """
    Load a dataset and its metadata without letting exceptions escape.

    :param name: The name of the dataset, forwarded to ``load_dataset``.
    :param split: The split to load, forwarded as ``split``; may be None.
    :return: A ``(dataset, info)`` tuple. ``info`` is the dataset's ``info`` object converted
             to a plain dictionary when the loaded object is a ``Dataset``, and None for any
             other loaded object. If anything raises, the error is printed and
             ``(None, None)`` is returned.
    """
    # Imported locally so this block does not depend on the cell's import lines
    from dataclasses import asdict

    try:
        # Attempt to load the dataset
        dataset = load_dataset(name, split=split)
        # DatasetInfo is a dataclass and has no ``dict`` attribute; reading ``.info.dict``
        # raised AttributeError, which the handler below swallowed, so every successful
        # load was reported as a failure. asdict() produces the intended dictionary.
        info = asdict(dataset.info) if isinstance(dataset, Dataset) else None
        return dataset, info
    except Exception as e:
        # Handle exceptions that may occur during dataset loading
        print(f"An error occurred while loading the dataset: {e}")
        return None, None

# Define a function to print various information about a dataset
def print_dataset_info(dataset: Dataset) -> None:
    """
    Print an overview of a dataset to stdout.

    The overview consists of the dataset itself, its length, its first item, the
    slice ``[10:12]``, its column names and features, and the length of the
    ``'context'`` value of every sample.

    :param dataset: The dataset to describe. Every sample must provide a ``'context'`` key.
    :return: None.
    """
    # Print basic information about the dataset
    print("👉 Dataset:")
    print(dataset)
    print(f"👉 Dataset length: {len(dataset)}")

    # Print the first item of the dataset; indexing an empty dataset would raise
    print("\n👉 First item 'dataset[0]':")
    if len(dataset) > 0:
        pprint(dataset[0])
    else:
        print("(dataset is empty)")

    # Print a slice of the dataset
    print("\n👉 Slice of the two items 'dataset[10:12]':")
    pprint(dataset[10:12])

    # Print column names and features
    print("\nColumn names:")
    pprint(dataset.column_names)
    print("Features:")
    pprint(dataset.features)

    # Print the length of each context string in the dataset.
    # A plain loop is used because this is a pure side effect: routing it through
    # dataset.map() would run a transformation whose result is thrown away.
    print("\nLength of context strings in the dataset:")
    for example in dataset:
        print(len(example['context']), end=',')
    # Terminate the comma-separated line so later output starts on a fresh line
    print()

# Main function to execute the dataset operations
def main():
    """
    Load the first 10% of the 'squad' validation split and print its details.

    Nothing is printed here when either the dataset or its info dictionary
    comes back as None from safe_load_dataset.
    """
    loaded, details = safe_load_dataset('squad', split='validation[:10%]')

    # Bail out unless both the dataset and its info are available
    if loaded is None or details is None:
        return

    pprint(details)
    print_dataset_info(loaded)

if __name__ == "__main__":
    main()

In [None]:
import re

def contains_keyword(sample: dict, keyword: str = "election") -> bool:
    """
    Tell whether a sample's title (or, failing that, its text) mentions a keyword.

    :param sample: A dictionary representing one sample. The ``"title"`` field is searched
                   when present; otherwise the ``"text"`` field is used, so samples that
                   only carry a ``"text"`` column no longer fail with KeyError.
    :param keyword: Regular-expression pattern to look for, matched case-insensitively.
                    Defaults to ``"election"``.
    :return: True if the pattern occurs in the searched field, False otherwise.
    :raises KeyError: If the sample has neither a ``"title"`` nor a ``"text"`` field.
    """
    # Prefer "title" to stay compatible with samples that have one
    for field in ("title", "text"):
        if field in sample:
            return re.search(keyword, sample[field], re.IGNORECASE) is not None
    raise KeyError("sample has neither a 'title' nor a 'text' field")

# Load the "ag_news" test split and keep only the samples accepted by contains_keyword
dataset = load_dataset("ag_news", split="test")
filtered_samples = filter_dataset(dataset, contains_keyword)
# Report how many samples were kept
print(len(filtered_samples))

In [None]:
from typing import Optional, Union
from datasets import DatasetInfo

def get_dataset_info(dataset_name: str,
                     with_details: Optional[bool] = True) -> Union[str, DatasetInfo]:
    """
    Retrieve the details of a dataset by name.

    The dataset is resolved with ``datasets.load_dataset_builder``, whose ``info``
    attribute carries the dataset's metadata.

    :param dataset_name: The name of the dataset to retrieve.
    :param with_details: Whether to retrieve the full dataset details or just the name. Defaults to True.
    :return: The name of the dataset if with_details is False, or a DatasetInfo object containing the full dataset details if with_details is True.
    :raises ValueError: If no dataset with that name can be found.
    """
    # Imported locally: this cell only imports DatasetInfo from datasets
    from datasets import load_dataset_builder

    try:
        # Resolving the builder also validates that the dataset exists
        builder = load_dataset_builder(dataset_name)
    except FileNotFoundError as err:
        # Keep ValueError as the "unknown dataset" signal, as list.index() raised before
        raise ValueError(f"'{dataset_name}' is not a known dataset") from err

    return builder.info if with_details else dataset_name

In [None]:
# Fetch the dataset information for 'squad' and print what extract_attribute
# returns for its 'license' attribute.
# NOTE(review): extract_attribute is not defined or imported in the visible source —
# confirm where it comes from before running this cell.
squad_dataset_name = 'squad'
squad_dataset_info = get_dataset_info(squad_dataset_name, with_details=True)
squad_license = extract_attribute(squad_dataset_info, 'license')
print(squad_license)

In [None]:
from typing import List, Optional, Union
from datasets import Dataset, DatasetDict

def load_dataset(dataset_name: str,
                 dataset_config: Optional[Union[str, dict]] = None,
                 split: Optional[str] = None,
                 **kwargs) -> Union[Dataset, DatasetDict]:
    """
    Download and load a dataset by name and split.

    This function has the same name as ``datasets.load_dataset``, so it reaches the
    library function through the ``datasets`` module; calling the bare name from
    inside this body would call this wrapper again and never terminate.

    :param dataset_name: The name of the dataset to load.
    :param dataset_config: The configuration of the dataset. A string is forwarded as the
                           configuration ``name``; a dictionary is expanded into keyword
                           arguments (explicit ``kwargs`` win on duplicate keys); None
                           forwards no configuration.
    :param split: The split of the dataset to load. If None, the default split is used.
    :param kwargs: Additional keyword arguments to pass to ``datasets.load_dataset``.
    :return: A Dataset or DatasetDict object containing the loaded dataset.
    :raises ValueError: If dataset_config is not a string, a dictionary, or None.
    """
    # Reject unsupported configuration types before doing any work
    if dataset_config is not None and not isinstance(dataset_config, (str, dict)):
        raise ValueError("dataset_config must be a string, a dictionary, or None.")

    # Local, aliased import: the module attribute cannot be shadowed by this function
    import datasets as hf_datasets

    if dataset_config is None:
        return hf_datasets.load_dataset(dataset_name, split=split, **kwargs)

    if isinstance(dataset_config, str):
        return hf_datasets.load_dataset(dataset_name, name=dataset_config, split=split, **kwargs)

    # Dictionary configuration: the library has no ``config`` parameter, so the
    # entries are passed as individual keyword arguments instead.
    merged_kwargs = {**dataset_config, **kwargs}
    return hf_datasets.load_dataset(dataset_name, split=split, **merged_kwargs)