In [None]:
from pathlib import Path
from typing import Dict, Any, List,Union,Optional
from datasets import (load_dataset, 
                      DatasetDict,
                      concatenate_datasets
                      )


# Load the dataset
# Maps a lower-cased file suffix to the name of the `datasets` builder that
# reads it ("jsonl" and "txt" are not builder names themselves).
_SUFFIX_TO_BUILDER = {
    ".csv": "csv",
    ".json": "json",
    ".jsonl": "json",
    ".parquet": "parquet",
    ".txt": "text",
}

# Absolute tolerance used when checking that the split ratios sum to 1, so
# that float rounding (e.g. 0.7 + 0.2 + 0.1) does not reject valid ratios.
_RATIO_SUM_TOLERANCE = 1e-9


def _detect_builder(files):
    """Return the builder name for the first file with a supported suffix, or None."""
    for file in files:
        builder = _SUFFIX_TO_BUILDER.get(Path(str(file)).suffix.lower())
        if builder is not None:
            return builder
    return None


def _validate_split_ratios(split_ratios):
    """Unpack (train, test, eval) ratios and raise ValueError if they are invalid."""
    train_size, test_size, eval_size = split_ratios
    ratios = (train_size, test_size, eval_size)
    in_range = all(0.0 < ratio < 1.0 for ratio in ratios)
    if not in_range or abs(sum(ratios) - 1.0) > _RATIO_SUM_TOLERANCE:
        raise ValueError("Split ratios must be between 0 and 1 and sum up to 1.")
    return ratios


def load_and_prepare_dataset(
    input_source: Union[str, Path, Dict[str, List[Union[str, Path]]]],
    split_ratios: tuple = (0.8, 0.1, 0.1),
    seed: int = 42,
    streaming: bool = False
) -> DatasetDict:
    """
    Load a dataset and make sure it exposes "train", "test" and "eval" splits.

    :param input_source: One of:
        * a dataset name or a path to a folder (passed to ``load_dataset`` as a string),
        * a path to a single existing csv/json/jsonl/parquet/txt file,
        * a dictionary mapping a split name to a file or a list of files.
    :param split_ratios: ``(train, test, eval)`` ratios. Each must lie strictly
        between 0 and 1 and together they must sum to 1. Only used when
        ``streaming`` is False.
    :param seed: Random seed for the splits (default is 42).
    :param streaming: Forwarded to ``load_dataset``. When True, no validation of
        ``split_ratios`` and no re-splitting is performed and the loaded splits
        are returned as they are.
    :return: A DatasetDict. When not streaming and any of "train", "test" or
        "eval" is missing, all loaded splits are concatenated and re-split
        according to ``split_ratios``; otherwise the loaded splits are returned
        unchanged.
    :raises ValueError: If ``input_source`` has an unsupported type, if no
        supported file format is found for a split's files, or (when not
        streaming) if ``split_ratios`` is invalid.

    Example 1 (explicit files per split; nothing is re-split because all three
    splits are present):
        dataset_dict = load_and_prepare_dataset({
            'train': ['train_file_1.csv', 'train_file_2.csv'],
            'test': ['test_file.csv'],
            'eval': ['eval_file.csv']
        })
        # DatasetDict with one Dataset under each of 'train', 'test', 'eval'

    Example 2 (dataset name with only a 'train' split of 153 rows):
        dataset = load_and_prepare_dataset('fka/awesome-chatgpt-prompts')
        DatasetDict({
            train: Dataset({features: ['act', 'prompt'], num_rows: 122})
            test: Dataset({features: ['act', 'prompt'], num_rows: 15})
            eval: Dataset({features: ['act', 'prompt'], num_rows: 16})
        })

    Example 3 (local folder):
        dataset = load_and_prepare_dataset('/content/awesome-chatgpt-prompts')
    """
    # Validate before loading so that a bad ratio fails fast instead of after
    # a potentially long download. Streaming never splits, so it is skipped.
    if not streaming:
        train_size, test_size, eval_size = _validate_split_ratios(split_ratios)

    if isinstance(input_source, (str, Path)):
        # load_dataset expects a string, not a Path object.
        source = str(input_source)
        # A single data file has to go through its format builder; a dataset
        # name or folder is handed to load_dataset directly.
        builder = _detect_builder([source]) if Path(source).is_file() else None
        if builder is not None:
            loaded = load_dataset(builder, data_files=source, streaming=streaming)
        else:
            loaded = load_dataset(source, streaming=streaming)
        dataset = DatasetDict(loaded)
    elif isinstance(input_source, dict):
        splits = {}
        for split, files in input_source.items():
            # Accept a single file as well as a list; iterating a bare string
            # would otherwise walk over its characters.
            if isinstance(files, (str, Path)):
                files = [files]
            file_list = [str(file) for file in files]
            builder = _detect_builder(file_list)
            if builder is None:
                raise ValueError(f"No supported file format detected for files: {files}")
            # split="train" returns the dataset itself rather than a
            # {"train": ...} wrapper, which would nest one DatasetDict per split.
            splits[split] = load_dataset(
                builder, data_files=file_list, split="train", streaming=streaming
            )
        dataset = DatasetDict(splits)
    else:
        raise ValueError("Input source should be a dataset name, path to a folder, a single file, multiple files, or a dictionary.")

    if streaming:
        return dataset

    if all(name in dataset for name in ("train", "test", "eval")):
        return dataset

    # At least one split is missing: pool everything that was loaded and
    # derive all three splits from the pooled data.
    full_dataset = concatenate_datasets(list(dataset.values()))
    train_holdout = full_dataset.train_test_split(train_size=train_size, seed=seed)
    # The "test" half of this second split becomes "eval", so its share of the
    # hold-out must be the eval ratio (not the test ratio).
    test_eval = train_holdout["test"].train_test_split(
        test_size=eval_size / (test_size + eval_size), seed=seed
    )

    return DatasetDict({
        "train": train_holdout["train"],
        "test": test_eval["train"],
        "eval": test_eval["test"]
    })

In [None]:


def get_number_of_trainable_parameters(model):
    r"""
    Count and print the trainable and total parameters of ``model``.

    Args:
        model: Any object exposing ``named_parameters()`` that yields
            ``(name, param)`` pairs, where each ``param`` provides ``numel()``
            and ``requires_grad``.

    Returns:
        tuple: ``(trainable_params, all_param)``.

    The counts and the trainable percentage are also printed. A model without
    any parameters reports a percentage of 0.0 instead of dividing by zero.
    """
    # note: same as PeftModel.get_nb_trainable_parameters
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        num_params = param.numel()
        # if using DS Zero 3 and the weights are initialized empty
        if num_params == 0 and hasattr(param, "ds_numel"):
            num_params = param.ds_numel

        # Due to the design of 4bit linear layers from bitsandbytes
        # one needs to multiply the number of parameters by 2 to get
        # the correct number of parameters
        if param.__class__.__name__ == "Params4bit":
            num_params = num_params * 2

        all_param += num_params
        if param.requires_grad:
            trainable_params += num_params

    # Guard the ratio: an empty model has all_param == 0.
    percentage = 100 * (trainable_params / all_param) if all_param else 0.0
    print("Total no of training_parameters:",trainable_params)
    print("Total no of parameters is :",all_param)
    print("percantage of trainable parameters is",percentage)
    return trainable_params, all_param

In [None]:
import yaml
from transformers import (
    PreTrainedTokenizer,
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    BitsAndBytesConfig,
    SchedulerType,
    TrainingArguments,
    default_data_collator,
    get_scheduler,      
    set_seed,
)

# Define a function to read arguments from a YAML file
def load_arguments_from_yaml(yaml_file_path:str):
    """
    Read arguments from a YAML file.

    Args:
        yaml_file_path (str): Path of the YAML file to read.

    Returns:
        The parsed YAML content. An empty dictionary is returned when the
        file is empty or when it cannot be parsed (the parse error is printed,
        not raised, so loading stays best-effort).
    """
    with open(yaml_file_path, 'r', encoding='utf-8') as stream:
        try:
            arguments = yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            print(exc)
            arguments = {}
    # safe_load yields None for an empty document; normalise it so callers
    # get the same empty dictionary as in the parse-error case.
    if arguments is None:
        return {}
    return arguments


def load_model(model_name_or_path: Union[str,List]) -> AutoModelForCausalLM:
    """
    Load a pretrained causal language model.

    Args:
        model_name_or_path: Identifier handed unchanged to
            ``AutoModelForCausalLM.from_pretrained`` (a model name or a path).

    Returns:
        The object returned by ``AutoModelForCausalLM.from_pretrained``.
    """
    return AutoModelForCausalLM.from_pretrained(model_name_or_path)

def create_tokenizer(
    tokenizer_name_or_path: Union[str,List] ) -> AutoTokenizer:
    """
    Load a tokenizer and fill in any missing special tokens with its EOS token.

    Args:
        tokenizer_name_or_path: Identifier handed unchanged to
            ``AutoTokenizer.from_pretrained`` (a tokenizer name or a path).

    Returns:
        The loaded tokenizer. Every special token among pad, bos, eos, unk,
        sep, cls and mask whose ``<name>_id`` is None has been set to the
        tokenizer's EOS token as it was right after loading.
    """
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)

    token_names = (
        'pad_token',
        'bos_token',
        'eos_token',
        'unk_token',
        'sep_token',
        'cls_token',
        'mask_token',
    )
    # Resolve the fallback for every name up front, before any token is
    # assigned, so all of them refer to the EOS token of the freshly loaded
    # tokenizer.
    fallbacks = {name: tokenizer.eos_token for name in token_names}

    for name, fallback in fallbacks.items():
        already_set = getattr(tokenizer, f"{name}_id") is not None
        if already_set:
            continue
        setattr(tokenizer, name, fallback)

    return tokenizer

In [1]:
import yaml
from typing import Any, Dict
from pathlib import Path

def load_yaml(file_path: Path) -> Dict[str, Any]:
    """
    Loads a YAML file and returns its contents as a dictionary.

    Parameters:
    - file_path (Path): Path to the YAML file. A plain string is accepted as
      well and converted to a Path.

    Returns:
    - Dict[str, Any]: The parameters from the YAML file, or an empty
      dictionary when the file is empty.

    Raises:
    - FileNotFoundError: If the YAML file does not exist at the specified path.
    - ValueError: If the file extension is not .yml or .yaml (case-insensitive).
    - TypeError: If the top level of the YAML document is not a dictionary.
    - yaml.YAMLError: If the YAML file contains invalid syntax.
    """
    # Coerce so that callers passing a string path do not fail on `.exists()`.
    file_path = Path(file_path)

    if not file_path.exists():
        raise FileNotFoundError(f"The file '{file_path}' was not found.")

    if file_path.suffix.lower() not in ('.yml', '.yaml'):
        raise ValueError(f"The file '{file_path}' does not have a .yml or .yaml extension.")

    # Keep the try body down to the parsing step, the only one that can raise
    # a YAMLError.
    try:
        with file_path.open('r', encoding='utf-8') as file:
            parameters = yaml.safe_load(file)
    except yaml.YAMLError as e:
        raise yaml.YAMLError(f"Error parsing YAML file: {e}") from e

    if parameters is None:
        # In case the YAML file is empty, return an empty dictionary
        return {}
    if not isinstance(parameters, dict):
        raise TypeError("The top level of the YAML file should be a dictionary.")
    return parameters

# Example usage:
# Loads the pre-training configuration from the hard-coded relative path below
# and prints it. The path is resolved against the current working directory.
if __name__ == "__main__":
    # Any failure (missing file, wrong extension, invalid YAML, ...) is
    # reported as a message instead of a traceback.
    try:
        config_path = Path('file_operations-/File_and_Operations/transformers_peft_trl/Fine-tuning/General_fine_tuning/Pre_training.yml')
        config_params = load_yaml(config_path)
        print("YAML file loaded successfully:")
        print(config_params)
    except Exception as e:
        print(f"An error occurred: {e}")


YAML file loaded successfully:
{'description': 'Pre-training on the custom dataset custom model.', 'model_name_or_path': '', 'output_dir': '', 'dataset_name_or_path': '', 'tokenizer_name_or_path': '', 'config_name': '', 'max_seq_length': 1024, 'tokenizer_name': '', 'chars_per_token': 3.5, 'fim_rate': 0.5, 'fim_spm_rate': 0.5, 'seed': 0}


In [7]:
import argparse
# Text shown as the program description in the --help output.
description = ""


def parse_arguments(argv=None) -> argparse.Namespace:
    """
    Parses command-line arguments.

    Parameters:
    - argv: Optional list of argument strings to parse. When None (the
      default), argparse reads the arguments from ``sys.argv[1:]``. Passing
      an explicit list makes the function usable from notebooks and tests.

    Returns:
    - argparse.Namespace: An object containing the parsed command-line
      arguments ``yaml_path`` and ``data_name`` (each None when not given).
    """
    parser = argparse.ArgumentParser(
        description=f"""
               {description}
                   """)
    parser.add_argument('--yaml_path', type=str, help="Path to the YAML file to load.")
    parser.add_argument('--data_name', type=str, help="Name or path of the dataset to use.")

    return parser.parse_args(argv)


| Class                                      | Used For                                    |
|--------------------------------------------|---------------------------------------------|
| AggregationStrategy                        | Strategy for handling grouped token outputs. |
| Any                                        | Denotes any type - type hint.                |
| ArgumentHandler                            | Parses custom command-line arguments.        |
| AudioClassificationPipeline                | Pipeline for audio classification tasks.     |
| AutoConfig                                 | Automatically loads model configurations.    |
| AutoFeatureExtractor                       | Automatically creates feature extractors.    |
| AutoImageProcessor                         | Automatically processes images for models.   |
| AutoModel                                  | Generic model loading (auto-detection).      |
| AutoModelForAudioClassification            | Audio classification model loading.          |
| AutoModelForCTC                            | Models for Connectionist Temporal Classification. |
| AutoModelForCausalLM                       | Causal language modeling autoloading.        |
| AutoModelForDepthEstimation                | Depth estimation model autoloading.          |
| AutoModelForDocumentQuestionAnswering      | Document QA model autoloading.               |
| AutoModelForImageClassification            | Image classification model autoloading.      |
| AutoModelForImageSegmentation              | Image segmentation model autoloading.        |
| AutoModelForImageToImage                   | Image-to-image tasks model autoloading.      |
| AutoModelForMaskGeneration                 | Mask generation model autoloading.           |
| AutoModelForMaskedLM                       | Masked language modeling autoloading.        |
| AutoModelForObjectDetection                | Object detection model autoloading.          |
| AutoModelForQuestionAnswering              | Question answering model autoloading.        |
| AutoModelForSemanticSegmentation           | Semantic segmentation model autoloading.     |
| AutoModelForSeq2SeqLM                      | Sequence-to-sequence autoloading.            |
| AutoModelForSequenceClassification         | Sequence classification model autoloading.    |
| AutoModelForSpeechSeq2Seq                  | Speech sequence-to-sequence autoloading.     |
| AutoModelForTableQuestionAnswering         | Table QA model autoloading.                  |
| AutoModelForTextToSpectrogram              | Text to spectrogram model autoloading.       |
| AutoModelForTextToWaveform                 | Text to waveform model autoloading.          |
| AutoModelForTokenClassification            | Token classification model autoloading.      |
| AutoModelForVideoClassification            | Video classification model autoloading.      |
| AutoModelForVision2Seq                     | Vision to sequence tasks autoloading.        |
| AutoModelForVisualQuestionAnswering        | Visual QA model autoloading.                 |
| AutoModelForZeroShotImageClassification    | Zero-shot image classification autoloading.  |
| AutoModelForZeroShotObjectDetection        | Zero-shot object detection autoloading.      |
| AutoTokenizer                              | Automatically creates tokenizers.            |
| AutomaticSpeechRecognitionPipeline         | ASR pipeline tasks.                          |
| BaseImageProcessor                         | Base class for image processing.             |
| CONFIG_NAME                                | Constant for configuration file name.        |
| Conversation                               | Handles conversation data for models.        |
| ConversationalPipeline                     | Pipeline for conversational tasks.           |
| CsvPipelineDataFormat                      | CSV format handler for pipeline data.        |
| DepthEstimationPipeline                    | Pipeline for depth estimation tasks.         |
| Dict                                       | Dictionary type - type hint.                 |
| DocumentQuestionAnsweringPipeline          | Pipeline for document QA tasks.              |
| FEATURE_EXTRACTOR_MAPPING                  | Mapping of feature extractors.               |
| FeatureExtractionPipeline                  | Pipeline for feature extraction tasks.       |
| FillMaskPipeline                           | Pipeline for fill-mask tasks.                |
| HUGGINGFACE_CO_RESOLVE_ENDPOINT            | Endpoint for resolving Hugging Face hub.     |
| IMAGE_PROCESSOR_MAPPING                    | Mapping of image processors.                 |
| ImageClassificationPipeline                | Pipeline for image classification tasks.     |
| ImageSegmentationPipeline                  | Pipeline for image segmentation tasks.       |
| ImageToImagePipeline                       | Pipeline for image-to-image tasks.           |
| ImageToTextPipeline                        | Pipeline for image-to-text tasks.            |
| JsonPipelineDataFormat                     | JSON format handler for pipeline data.       |
| List                                       | List type - type hint.                       |
| MULTI_MODEL_CONFIGS                        | Configurations for multi-model support.      |
| MaskGenerationPipeline                     | Pipeline for mask generation tasks.          |
| NO_FEATURE_EXTRACTOR_TASKS                 | Tasks without feature extractor.             |
| NO_IMAGE_PROCESSOR_TASKS                   | Tasks without image processor.               |
| NO_TOKENIZER_TASKS                         | Tasks without tokenizer.                     |
| NerPipeline                                | Pipeline for named entity recognition.       |
| ObjectDetectionPipeline                    | Pipeline for object detection tasks.         |
| Optional                                   | Optional type - type hint.                   |
| PIPELINE_REGISTRY                          | Registry of pipeline tasks.                  |
| Path                                       | Filesystem path type - type hint.            |
| PipedPipelineDataFormat                    | Piped format for pipeline data.              |
| Pipeline                                   | Base class for all pipelines.                |
| PipelineDataFormat                         | Handler for pipeline data formats.           |
| PipelineException                          | Custom exception for pipeline errors.        |
| PipelineRegistry                           | Manages registered pipeline tasks.           |
| PreTrainedFeatureExtractor                 | Base for pretrained feature extractors.      |
| PreTrainedTokenizer                        | Base for pretrained tokenizers.              |
| PretrainedConfig                           | Base for model configurations.               |
| QuestionAnsweringArgumentHandler           | QA argument parser for pipeline.             |
| QuestionAnsweringPipeline                  | Pipeline for question answering tasks.       |
| SUPPORTED_TASKS                            | Supported tasks for pipelines.               |
| SummarizationPipeline                      | Pipeline for text summarization tasks.       |
| TASK_ALIASES                               | Aliases for different tasks.                 |
| TFAutoModel                                | TensorFlow model autoloading.                |
| TFAutoModelForCausalLM                     | TF causal LM model autoloading.              |
| TFAutoModelForImageClassification          | TF image classification model autoloading.   |
| TFAutoModelForMaskedLM                     | TF masked LM model autoloading.              |
| TFAutoModelForQuestionAnswering            | TF question answering model autoloading.     |
| TFAutoModelForSeq2SeqLM                    | TF seq2seq LM model autoloading.             |
| TFAutoModelForSequenceClassification       | TF sequence classification autoloading.      |
| TFAutoModelForTableQuestionAnswering       | TF table QA model autoloading.               |
| TFAutoModelForTokenClassification          | TF token classification model autoloading.   |
| TFAutoModelForVision2Seq                   | TF vision to sequence model autoloading.     |
| TFAutoModelForZeroShotImageClassification  | TF zero-shot image classification autoloading. |
| TOKENIZER_MAPPING                          | Mapping of tokenizers.                       |
| TYPE_CHECKING                              | True only during static type checking.       |
| TableQuestionAnsweringArgumentHandler      | Table QA argument parser for pipeline.       |
| TableQuestionAnsweringPipeline             | Pipeline for table QA tasks.                 |
| Text2TextGenerationPipeline                | Pipeline for text-to-text generation.        |
| TextClassificationPipeline                 | Pipeline for text classification tasks.      |
| TextGenerationPipeline                     | Pipeline for text generation tasks.          |
| TextToAudioPipeline                        | Pipeline for text-to-audio tasks.            |
| TokenClassificationArgumentHandler         | Token classification argument parser.        |
| TokenClassificationPipeline                | Pipeline for token classification tasks.     |
| TranslationPipeline                        | Pipeline for translation tasks.              |
| Tuple                                      | Tuple type - type hint.                      |
| Union                                      | Union type - type hint.                      |
| VideoClassificationPipeline                | Pipeline for video classification tasks.     |
| VisualQuestionAnsweringPipeline            | Pipeline for visual QA tasks.                |
| ZeroShotAudioClassificationPipeline        | Pipeline for zero-shot audio classification. |
| ZeroShotClassificationArgumentHandler      | Zero-shot classification argument parser.    |
| ZeroShotClassificationPipeline             | Pipeline for zero-shot classification tasks. |
| ZeroShotImageClassificationPipeline        | Pipeline for zero-shot image classification. |
| ZeroShotObjectDetectionPipeline            | Pipeline for zero-shot object detection.     |
| __builtins__                               | Built-in objects in the module.              |
| __cached__                                 | Cached file path for the module.             |
| __doc__                                    | Module documentation string.                 |
| __file__                                   | Path to the module file.                     |
| __loader__                                 | Loader for the module.                       |
| __name__                                   | Name of the module.                          |
| __package__                                | Package name for the module.                 |
| __path__                                   | Path for package module search.              |
| __spec__                                   | Specification for the module.                |
| audio_classification                       | Module for audio classification functions.   |
| audio_utils                                | Utilities for audio processing.              |
| automatic_speech_recognition               | ASR module functions and utilities.          |
| base                                       | Base module for classes and functions.       |
| cached_file                                | Function to cache files.                     |


In [6]:

import argparse
import inspect
from typing import Callable

def display_function_details(func: Callable):
    """
    Print the name, signature and docstring of ``func``.

    Args:
        func: The object to describe. When it is None, a short notice is
            printed instead.

    The name falls back to ``repr(func)`` for objects without ``__name__``
    (for example ``functools.partial`` objects). The signature and the
    docstring are reported as "Not available" when they cannot be obtained.
    """
    if func is None:
        print("The specified function does not exist or is not callable.")
        return

    # Not every callable carries __name__, so do not assume it exists.
    func_name = getattr(func, "__name__", None) or repr(func)

    # inspect.signature raises ValueError when no signature can be found and
    # TypeError when the object type is not supported.
    try:
        sig = inspect.signature(func)
    except (ValueError, TypeError):
        sig = "Not available"

    docstring = inspect.getdoc(func) or "Not available"

    print(f"Function: {func_name}")
    print(f"Signature: {sig}")
    print(f"Docstring:\n{docstring}")
    print("-" * 80)

def explore_specific_function(module, function_name: str):
    """
    Look up ``function_name`` on ``module`` and display its details.

    Args:
        module: Object whose attribute is looked up (a module, class, ...).
        function_name: Name of the attribute to look up.

    A notice is printed when the attribute is missing or is not callable;
    otherwise the attribute is passed to ``display_function_details``.
    """
    candidate = getattr(module, function_name, None)

    # Guard clause: a missing attribute resolves to None, which is not callable.
    if not callable(candidate):
        print(f"No callable function named '{function_name}' found in the module.")
        return

    display_function_details(candidate)

# Skill 13, 22: Writing Modular Code and Understanding Namespaces
# Example: look up `add_argument` on argparse.ArgumentParser and print its
# name, signature and docstring.
if __name__ == "__main__":
    function_to_explore = 'add_argument'
    # A class is passed as the "module": the lookup is a plain getattr.
    explore_specific_function(argparse.ArgumentParser, function_to_explore)

Function: add_argument
Signature: (self, *args, **kwargs)
Docstring:
add_argument(dest, ..., name=value, ...)
add_argument(option_string, option_string, ..., name=value, ...)
--------------------------------------------------------------------------------
