In [None]:
import inspect
from transformers import TrainingArguments

# Introspect the initializer of TrainingArguments to discover its parameters.
args_spec = inspect.getfullargspec(TrainingArguments.__init__)

# The first positional entry is 'self'; everything after it is a real parameter.
parameter_names = args_spec.args[1:]

print("Arguments for TrainingArguments:")
for arg in parameter_names:
    print(arg)


In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Smoke test: load a causal LM in 8-bit and generate a short completion.
MAX_NEW_TOKENS = 128
model_name = 'decapoda-research/llama-7b-hf'

text = 'Hamburg is in which country?\n'
tokenizer = AutoTokenizer.from_pretrained(model_name)
input_ids = tokenizer(text, return_tensors="pt").input_ids

# Free memory (in whole GB) reported for the current CUDA device.
free_in_GB = int(torch.cuda.mem_get_info()[0]/1024**3)
# Leave 2 GB of headroom per GPU. The same budget is applied to every
# visible GPU, i.e. this assumes the GPUs have similar free memory.
per_gpu_budget = f'{free_in_GB-2}GB'

n_gpus = torch.cuda.device_count()
max_memory = {i: per_gpu_budget for i in range(n_gpus)}

model = AutoModelForCausalLM.from_pretrained(
  model_name,
  device_map='auto',
  load_in_8bit=True,
  max_memory=max_memory
)
# The prompt must live on the same device as the model's first layer, and
# `max_new_tokens` bounds only the generated continuation (`max_length`
# would also count the prompt tokens against the limit).
generated_ids = model.generate(input_ids.to(model.device), max_new_tokens=MAX_NEW_TOKENS)
print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))

In [1]:
import inspect
from transformers import TrainingArguments

# Retrieve the full argument specification for the TrainingArguments initializer
args_spec = inspect.getfullargspec(TrainingArguments.__init__)

# args[0] is 'self', which is not an actual parameter, so it is skipped.
param_names = args_spec.args[1:]

print("Arguments for TrainingArguments:")
for arg in param_names:
    print(arg)

# Second listing: the same parameters annotated with their default values.
# `defaults` lines up with the LAST len(defaults) parameters, so the offset
# must be computed against the list that already excludes 'self'; computing
# it against the full list shifts every default by one parameter.
defaults = args_spec.defaults or ()
defaults_offset = len(param_names) - len(defaults)
for idx, arg in enumerate(param_names):
    if idx >= defaults_offset:
        default = defaults[idx - defaults_offset]
        print(f"{arg} (default: {default})")
    else:
        # Parameter without a default value
        print(arg)

Arguments for TrainingArguments:
output_dir
overwrite_output_dir
do_train
do_eval
do_predict
evaluation_strategy
prediction_loss_only
per_device_train_batch_size
per_device_eval_batch_size
per_gpu_train_batch_size
per_gpu_eval_batch_size
gradient_accumulation_steps
eval_accumulation_steps
eval_delay
learning_rate
weight_decay
adam_beta1
adam_beta2
adam_epsilon
max_grad_norm
num_train_epochs
max_steps
lr_scheduler_type
lr_scheduler_kwargs
warmup_ratio
warmup_steps
log_level
log_level_replica
log_on_each_node
logging_dir
logging_strategy
logging_first_step
logging_steps
logging_nan_inf_filter
save_strategy
save_steps
save_total_limit
save_safetensors
save_on_each_node
save_only_model
no_cuda
use_cpu
use_mps_device
seed
data_seed
jit_mode_eval
use_ipex
bf16
fp16
fp16_opt_level
half_precision_backend
bf16_full_eval
fp16_full_eval
tf32
local_rank
ddp_backend
tpu_num_cores
tpu_metrics_debug
debug
dataloader_drop_last
eval_steps
dataloader_num_workers
past_index
run_name
disable_tqdm
remove_u

In [None]:
import os
import warnings
from dataclasses import dataclass, field
from enum import Enum
from typing import Any, Dict, List, Optional, Union

import torch
from accelerate import Accelerator
from dataclass_csv import DataclassReader
from datasets import DatasetDict, load_dataset
from peft import (
    LoraConfig,
    PeftConfig,
    PeftModel,
    TaskType,
    get_peft_model,
    prepare_model_for_int8_training,
    prepare_model_for_kbit_training,
)
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    Trainer,
    TrainingArguments,
    default_data_collator,
)

In [None]:
def advanced_data_loader(input: Union[str, Dict[str, str]], format: Optional[str] = None, split_ratios: Optional[Dict[str, float]] = None, seed: Optional[int] = None) -> Optional[DatasetDict]:
    """
    Loads a dataset from a given input path or dictionary specifying file paths and splits it.

    The 'train' split of the loaded dataset is re-split into 'train', 'test' and 'eval'.
    The 'train' ratio itself is not used: the training share is whatever remains after
    the 'test' and 'eval' shares are removed.

    :param input: A string representing the dataset name or directory, or a dictionary containing file paths.
    :param format: The format of the dataset if loading from a file ('csv' or 'json' with a dict of
        files, 'text' with a directory string, or None to load `input` by name).
    :param split_ratios: A dictionary with keys 'train', 'test', and 'eval' containing split ratios.
        'test' and 'eval' must both be positive and sum to less than 1.
    :param seed: Optional seed forwarded to both shuffled splits so they are reproducible.
    :return: A loaded and split dataset or None in case of failure (a warning is emitted).
    """
    if split_ratios is None:
        split_ratios = {'train': 0.8, 'test': 0.1, 'eval': 0.1}

    # Validate the ratios up front: a zero share would otherwise surface later
    # as a ZeroDivisionError or an error from train_test_split.
    test_ratio = split_ratios.get('test', 0)
    eval_ratio = split_ratios.get('eval', 0)
    holdout_ratio = test_ratio + eval_ratio
    if test_ratio <= 0 or eval_ratio <= 0 or holdout_ratio >= 1:
        warnings.warn("Invalid split_ratios: 'test' and 'eval' must be positive and sum to less than 1.")
        return None

    try:
        # Load the dataset
        if isinstance(input, dict) and format in ['csv', 'json']:
            dataset = load_dataset(format, data_files=input)
        elif isinstance(input, str) and format == 'text':
            dataset = load_dataset(format, data_dir=input)
        elif isinstance(input, str) and format is None:
            dataset = load_dataset(input)
        else:
            warnings.warn("Invalid input or format. Please provide a valid dataset name, directory, or file paths.")
            return None
    except FileNotFoundError as e:
        warnings.warn(str(e))
        return None

    # Everything below re-splits the 'train' split; without it there is nothing to split.
    if 'train' not in dataset:
        warnings.warn("Loaded dataset has no 'train' split to divide into train/test/eval.")
        return None

    # First carve off the combined test+eval share, then divide that share
    # between 'test' and 'eval' in proportion to their ratios.
    split_dataset = dataset['train'].train_test_split(test_size=holdout_ratio, seed=seed)
    test_eval_dataset = split_dataset['test'].train_test_split(test_size=eval_ratio / holdout_ratio, seed=seed)
    dataset = DatasetDict({
        'train': split_dataset['train'],
        'test': test_eval_dataset['train'],
        'eval': test_eval_dataset['test']
    })

    print("Splits: ", dataset.keys())
    print("Columns: ", {split: dataset[split].column_names for split in dataset.keys()})
    return dataset

In [None]:
import shutil
import subprocess
import urllib.request
import webbrowser
from pathlib import Path

# The URL of the PDF file
url = "https://arxiv.org/pdf/2401.07324v1.pdf"
# Local copy of the PDF: pdf2htmlEX converts a file on disk, not a URL
pdf_path = Path(url.rsplit("/", 1)[-1])
# The name of the output HTML file
output = "output.html"

# Download the PDF first (an explicit User-Agent avoids servers that reject
# the default urllib client).
request = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
with urllib.request.urlopen(request, timeout=60) as response, open(pdf_path, "wb") as pdf_file:
    shutil.copyfileobj(response, pdf_file)

# Run the pdf2htmlEX tool on the local PDF; check=True raises
# CalledProcessError on failure instead of opening a missing/stale file.
subprocess.run(["pdf2htmlEX", str(pdf_path), output], check=True)

# Open the output HTML file in the default browser (as an absolute file:// URI)
webbrowser.open(Path(output).resolve().as_uri())

In [None]:
def create_tokenizer(tokenizer_name_or_path: str = 'gpt2') -> AutoTokenizer:
    """
    Load a tokenizer and fill in any special-token ids that are unset.

    The padding id falls back to the end-of-sequence id; every other missing
    id (bos, eos, unk, sep, cls, mask) then falls back to the padding id.
    The fallbacks are applied in that order, so later ones see earlier results.

    :param tokenizer_name_or_path: Name or path passed to AutoTokenizer.from_pretrained.
    :return: The loaded tokenizer with the fallbacks applied.
    """
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)

    # (attribute to fill, attribute to copy from) -- order matters.
    fallback_plan = (
        ("pad_token_id", "eos_token_id"),
        ("bos_token_id", "pad_token_id"),
        ("eos_token_id", "pad_token_id"),
        ("unk_token_id", "pad_token_id"),
        ("sep_token_id", "pad_token_id"),
        ("cls_token_id", "pad_token_id"),
        ("mask_token_id", "pad_token_id"),
    )
    for missing_attr, source_attr in fallback_plan:
        if getattr(tokenizer, missing_attr) is None:
            setattr(tokenizer, missing_attr, getattr(tokenizer, source_attr))

    return tokenizer


In [None]:
# Run configuration. The empty strings are placeholders -- TODO: fill them in
# before running the cells below.
model_name_or_path = ''
TEXT_COLUMN = ''
LABEL_COLUMN = ''
# Must be an int: the preprocessing function uses it in padding arithmetic
# and slicing. 512 is a starting value -- adjust to the data/model.
MAX_LENGTH = 512
BATCH_SIZE = 16
tokenizer_name_or_path = ''
# Fall back to the model's own tokenizer when no separate tokenizer is given.
tokenizer = create_tokenizer(tokenizer_name_or_path or model_name_or_path)


In [None]:
def Pre_Preocessing_function(
    Dataset: Union[Dict[str, List[Any]], Any],
    Columns: Optional[List[str]] = None,
    tokenizer: Optional[Any] = None,
    label_column: Optional[str] = None,
    max_length: Optional[int] = None,
):
    """
    Tokenize prompt/target pairs for causal-LM fine-tuning.

    For every row a prompt is built by concatenating " {column} : {value} : "
    for each selected column; the target is the row's label followed by the
    tokenizer's EOS id. Prompt and target ids are concatenated, prompt
    positions are masked with -100 in the labels, and every sequence is
    left-padded and then cut to `max_length`.

    :param Dataset: Either a mapping of column name -> list of values (what
        `map(batched=True)` passes), or a mapping with a 'train' entry exposing
        `column_names`, in which case that split is processed.
    :param Columns: Columns used to build the prompt. When None, every column
        except the label column is used (the label must not leak into the prompt).
    :param tokenizer: Callable tokenizer returning "input_ids" and
        "attention_mask"; when None the module-level `tokenizer` is used.
    :param label_column: Column holding the targets; defaults to the
        module-level LABEL_COLUMN.
    :param max_length: Final sequence length; defaults to the module-level MAX_LENGTH.
    :return: The tokenizer output for the prompts with padded "input_ids" and
        "attention_mask" tensors plus a "labels" list of tensors.
    :raises ValueError: If no tokenizer is available or no prompt column remains.
    :raises TypeError: If the effective max_length is not an int.
    """
    # Support both the dataset-dict form and a plain batch mapping.
    train_split = Dataset.get("train") if hasattr(Dataset, "get") else None
    batch = train_split if hasattr(train_split, "column_names") else Dataset

    # Resolve defaults from module-level configuration so the function can be
    # passed to `map` without extra arguments.
    if tokenizer is None:
        tokenizer = globals().get("tokenizer")
        if tokenizer is None:
            raise ValueError("No tokenizer given and no module-level 'tokenizer' is defined.")
    if label_column is None:
        label_column = LABEL_COLUMN
    if max_length is None:
        max_length = MAX_LENGTH
    if not isinstance(max_length, int):
        raise TypeError(f"max_length must be an int, got {type(max_length).__name__}")

    available_columns = getattr(batch, "column_names", None)
    if available_columns is None:
        available_columns = list(batch.keys())
    if Columns is None:
        feature_columns = [name for name in available_columns if name != label_column]
    else:
        feature_columns = list(Columns)
    if not feature_columns:
        raise ValueError("No columns available to build the model inputs from.")

    # One prompt per row, combining all selected columns (not just the last one).
    column_values = [batch[name] for name in feature_columns]
    inputs = [
        "".join(f" {name} : {value} : " for name, value in zip(feature_columns, row))
        for row in zip(*column_values)
    ]
    targets = [str(x) for x in batch[label_column]]

    model_inputs = tokenizer(inputs)
    labels = tokenizer(targets, add_special_tokens=False)  # don't add bos token because we concatenate with inputs

    for i in range(len(inputs)):
        prompt_ids = model_inputs["input_ids"][i]
        answer_ids = labels["input_ids"][i] + [tokenizer.eos_token_id]
        sample_input_ids = prompt_ids + answer_ids
        # -100 on the prompt positions keeps them out of the loss.
        label_input_ids = [-100] * len(prompt_ids) + answer_ids
        attention = [1] * len(sample_input_ids)

        # Left-pad to max_length; a negative count yields no padding and the
        # slice below then truncates over-long sequences from the right.
        pad_count = max_length - len(sample_input_ids)
        model_inputs["input_ids"][i] = torch.tensor(
            ([tokenizer.pad_token_id] * pad_count + sample_input_ids)[:max_length]
        )
        model_inputs["attention_mask"][i] = torch.tensor(
            ([0] * pad_count + attention)[:max_length]
        )
        labels["input_ids"][i] = torch.tensor(
            ([-100] * pad_count + label_input_ids)[:max_length]
        )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs
    

In [None]:
# TODO: replace the empty placeholder with a dataset name or directory.
dataset = advanced_data_loader('')
# The loader returns None (after a warning) on failure; stop here with a
# clear message instead of failing later with an AttributeError on None.
if dataset is None:
    raise ValueError("advanced_data_loader returned None; check the dataset name/path and format.")

In [None]:
# Tokenize every split. `map` only passes the batch itself to the function,
# so its remaining arguments are supplied through `fn_kwargs`.
processed_datasets = dataset.map(
    Pre_Preocessing_function,
    batched=True,
    num_proc=1,
    fn_kwargs={"Columns": None, "tokenizer": tokenizer},
    remove_columns=dataset["train"].column_names,
    load_from_cache_file=False,
    desc="Running tokenizer on dataset",
)
train_dataset = processed_datasets["train"]
train_dataloader = DataLoader(
    train_dataset, shuffle=True, collate_fn=default_data_collator, batch_size=BATCH_SIZE, pin_memory=True
)

In [None]:
# 4-bit NF4 quantisation with double quantisation; matmuls run in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
model = AutoModelForCausalLM.from_pretrained(model_name_or_path, quantization_config=bnb_config)
# Quantised models must be prepared for training before adapters are attached.
model = prepare_model_for_kbit_training(model)

# LoRA adapter configuration for causal language modelling.
# `modules_to_save` is intentionally not set: the previous value named
# classification-head modules ("classifier/score", "pooler"), which do not
# exist in a causal LM.
# NOTE(review): `fan_in_fan_out=True` is meant for layers that store weights
# as (fan_in, fan_out), e.g. GPT-2's Conv1D -- confirm it matches the model.
# NOTE(review): `target_modules` is left to PEFT's per-architecture defaults;
# set it explicitly if the chosen model has no default mapping.
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=64,
    lora_alpha=256,
    lora_dropout=0.2,
    fan_in_fan_out=True,
    bias="all",
    init_lora_weights="gaussian",
)
model = get_peft_model(model, peft_config)
