In [None]:
!pip install -q -U transformers

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import os

# Fetch each Vicuna checkpoint from the Hugging Face Hub and write a local copy.
# Entries are (hub repository id, local target directory); the 13B model is
# handled first, then the 7B model.
for model_name, output_path in (
    ("TheBloke/vicuna-13B-1.1-HF", "./local_vicuna13B"),
    ("TheBloke/vicuna-7B-1.1-HF", "./local_vicuna7B"),
):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)

    # Tokenizer files and model weights share the same directory.
    os.makedirs(output_path, exist_ok=True)
    tokenizer.save_pretrained(output_path)
    model.save_pretrained(output_path, max_shard_size="100GB")

In [None]:
import os
import logging
from transformers import AutoTokenizer, AutoModelForCausalLM

# Set up logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

# Define model names and paths
models_dir = "path/to/models"  # Path to directory containing the pre-trained models
models = [
    {"name": "TheBloke/vicuna-13B-1.1-HF", "size": "13B"},
    {"name": "TheBloke/vicuna-7B-1.1-HF", "size": "7B"},
]

# Load and save the pre-trained models.
# The loop variable is deliberately NOT called `model`: the loaded network is
# bound to `model` inside the loop, and reusing the name would replace the
# metadata dict before the log messages read its "size" entry.
for model_info in models:
    model_name = model_info["name"]
    # "size" already carries the "B" suffix (e.g. "13B"), so none is appended.
    model_size = model_info["size"]
    output_path = os.path.join(models_dir, f"local_vicuna_{model_size}")

    try:
        logging.info("Loading and saving %s model...", model_size)
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(model_name)

        # Save the tokenizer and model to the output path
        os.makedirs(output_path, exist_ok=True)
        tokenizer.save_pretrained(output_path)
        model.save_pretrained(output_path, max_shard_size="100GB")

        logging.info("Saved %s model to %s.", model_size, output_path)
    except Exception:
        # Best-effort: a failure on one model is logged (with traceback) and
        # the loop moves on to the next entry.
        logging.exception("Error loading and saving %s model", model_size)

In this Python script, we use the HuggingFace transformers library to preprocess (tokenize) a text dataset for further natural language processing tasks and to measure the lengths of the resulting token sequences. This is especially useful when working with transformer models, which require their input text to be tokenized in a very specific way. Here is a detailed explanation of the code:

# `Primary libraries used`:

- transformers to access pre-trained models and tokenizers.
- argparse to manage command-line arguments.
- pandas and numpy for data processing and computations.
- tqdm for a progress display during loop execution.
- os for environment variables and path manipulations.

### Command-Line Arguments:

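The script starts by defining command-line arguments using the argparse library. Two arguments are required: a model name ("-m"/"--model_name") and a directory containing the dataset ("-d"/"--data_dir"). The remaining arguments are optional and fall back to default values when they aren't provided: the training file name ("-t"/"--train_file", default "train.csv"), the validation file name ("-v"/"--valid_file", default "validation.csv"), a pad token id ("--pad_token_id", no default), and a maximum sequence length ("--max_length", default 128).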

Access Token:
An access token is obtained from the environment variable "HF_TOKEN". This is particularly useful when you need to authenticate your script with the Hugging Face Model Hub.

## `Tokenizer`:
We then get the tokenizer corresponding to the model name provided as an argument. This tokenizer is fetched from the pre-trained models available through the transformers library. Authentication with the Hugging Face Hub is enabled when the tokenizer is loaded, so that gated or private models can be accessed.

# `Preparing Dataset`:
We read the training and validation files (named "train.csv" and "validation.csv" by default) from the provided directory using pandas. These dataframes are then concatenated to form a combined dataframe, which must contain a 'text' column.

# `Tokenization & Calculation`:
The whole 'text' column of the combined dataframe is passed to the tokenizer in a single batched call rather than row by row, which yields the input IDs for every text. We then calculate the length (or size) of each tokenized input. If a pad token id is provided, the script checks that it does not appear in the tokenized input.

# `Token size statistics`:
Finally, we find the maximum, minimum, mean, and median size of all tokenized inputs in the dataset and print these statistics. This analysis is important to understand the distribution of lengths of the tokenized sequences in your dataset.

The script ends by printing these calculated statistics. These statistics give insights into the sizes of the tokenized texts and can be useful when defining the maximum sequence length for your transformer model.



In [None]:
from transformers import AutoTokenizer, BatchEncoding
import argparse
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import logging
from multiprocessing import Pool

# Command-line arguments setup
parser = argparse.ArgumentParser()
parser.add_argument("-m", "--model_name", type=str, required=True)
parser.add_argument("-d", "--data_dir", type=str, required=True)
parser.add_argument("-t", "--train_file", type=str, default="train.csv")
parser.add_argument("-v", "--valid_file", type=str, default="validation.csv")
parser.add_argument("--pad_token_id", type=int, default=None)
parser.add_argument("--max_length", type=int, default=128)
args = parser.parse_args()

# Logging setup
logging.basicConfig(level=logging.INFO)

# Fetch access token. An unset or empty HF_TOKEN becomes None, which leaves
# credential resolution to the library instead of sending an empty token.
access_token = os.getenv("HF_TOKEN") or None

# Initialize tokenizer, actually handing over the token read above
tokenizer = AutoTokenizer.from_pretrained(args.model_name, token=access_token, use_fast=True)

# Load datasets
dataset_dir = os.path.realpath(args.data_dir)
train_file = os.path.join(dataset_dir, args.train_file)
valid_file = os.path.join(dataset_dir, args.valid_file)

try:
    train_df = pd.read_csv(train_file)
    valid_df = pd.read_csv(valid_file)
except FileNotFoundError:
    logging.error("One of the data files does not exist.")
    raise

combined_df = pd.concat([train_df, valid_df])

if 'text' not in combined_df:
    logging.error("Column named 'text' does not exist in the dataset.")
    raise ValueError("Column named 'text' does not exist in the dataset.")

# Clean text data, removing unnecessary characters, handling special cases, etc.
# Add your text cleaning logic here

# Missing cells are read by pandas as NaN (a float), which the tokenizer cannot
# encode, so they are dropped; everything else is coerced to str.
texts = combined_df['text'].dropna().astype(str).tolist()
if not texts:
    logging.error("The 'text' column contains no usable rows.")
    raise ValueError("The 'text' column contains no usable rows.")

# Tokenize the whole column in one batched call.
# Padding and truncation are switched off on purpose: with
# padding='max_length' and truncation=True every sequence comes back exactly
# max_length tokens long, which makes the statistics below meaningless (and
# padding fails outright for tokenizers that define no pad token).
encoded_inputs: BatchEncoding = tokenizer(texts, truncation=False, padding=False)

# Explicit check instead of `assert`, which is stripped under `python -O`.
if args.pad_token_id is not None and any(
    args.pad_token_id in input_ids for input_ids in encoded_inputs['input_ids']
):
    raise ValueError("Pad token is in tokenized text.")

# Calculate token sizes (true, unpadded lengths)
all_sizes = [len(input_ids) for input_ids in encoded_inputs['input_ids']]

logging.info("Max size: %s", max(all_sizes))
logging.info("Min size: %s", min(all_sizes))
logging.info("Mean size: %s", np.mean(all_sizes))
logging.info("Median size: %s", np.median(all_sizes))

# Report how many texts would be cut off at the requested maximum length.
num_too_long = sum(size > args.max_length for size in all_sizes)
logging.info(
    "Sequences longer than max_length=%s: %s of %s",
    args.max_length, num_too_long, len(all_sizes),
)

In [None]:
import pandas as pd
import os
import re

# --- Constants ---

# CSV file names, used for both the input and the output directory
TRAIN_FILENAME = 'train.csv'
VALIDATION_FILENAME = 'validation.csv'

# Directory the processed CSV files are written to
OUTPUT_DIR = './OPT/'

# Regex matching the literal "<|endoftext|>" marker, and its replacement text
END_OF_TEXT_PATTERN = re.compile(r'<\|endoftext\|>')
EMPTY_STRING = ''

def read_csv_with_error_handling(file_path):
    """Read a CSV file that must provide a string-typed 'text' column.

    Args:
        file_path: Path of the CSV file to load.

    Returns:
        The loaded DataFrame. When the file is missing, has no 'text'
        column, or the column is not of string type, the problem is printed
        and an empty DataFrame is returned instead; no exception leaves this
        function for those three cases.
    """
    try:
        if os.path.isfile(file_path):
            frame = pd.read_csv(file_path)
        else:
            raise FileNotFoundError(f"File not found: {file_path}")

        # The column has to exist before its dtype can be inspected.
        has_text_column = 'text' in frame.columns
        if not has_text_column:
            raise KeyError(f"'text' column is missing from the input file {file_path}")

        text_is_string = pd.api.types.is_string_dtype(frame['text'])
        if not text_is_string:
            raise TypeError(f"'text' column must be of string type in the file {file_path}")
    except (FileNotFoundError, KeyError, TypeError) as err:
        # Report the problem and fall back to an empty frame.
        print(err)
        frame = pd.DataFrame()

    return frame

def replace_empty_strings(df, column_name='text'):
    """Remove END_OF_TEXT_PATTERN matches from one column of ``df`` in place.

    Args:
        df: DataFrame to modify.
        column_name: Column to clean. If ``df`` has no such column, nothing
            is changed.

    Returns:
        None; the cleaned values are written back into ``df``.
    """
    if column_name not in df.columns:
        return

    cleaned = df[column_name].str.replace(END_OF_TEXT_PATTERN, EMPTY_STRING, regex=True)
    df[column_name] = cleaned

def save_to_csv(df, file_path):
    """Write ``df`` to ``file_path`` as CSV, leaving out the index column."""
    df.to_csv(path_or_buf=file_path, index=False)

def process_and_save_files(input_dir, output_dir):
    """Clean the training and validation CSV files and write the results.

    Each file is read from ``input_dir``, passed through
    ``replace_empty_strings`` and saved under the same name in
    ``output_dir``.

    Args:
        input_dir: Directory holding TRAIN_FILENAME and VALIDATION_FILENAME.
        output_dir: Directory for the processed files; created if missing.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Training data is handled first, then validation data.
    for csv_name in (TRAIN_FILENAME, VALIDATION_FILENAME):
        frame = read_csv_with_error_handling(os.path.join(input_dir, csv_name))
        replace_empty_strings(frame)
        save_to_csv(frame, os.path.join(output_dir, csv_name))

# Script entry point: read the CSV files from ./GPT and write the processed
# copies to the directory named by OUTPUT_DIR.
if __name__ == '__main__':
    input_dir = os.path.join('.', 'GPT')
    # OUTPUT_DIR is passed on as a canonical path (os.path.realpath).
    output_dir = os.path.realpath(OUTPUT_DIR)
    process_and_save_files(input_dir, output_dir)

# inference

In [None]:
! pip install -q -U deepspeed

In [None]:
! pip install -q -U transformers

In [None]:
import os
import deepspeed
import torch
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForCausalLM

# Text generation with a Hugging Face pipeline whose model is wrapped by
# DeepSpeed inference.

# Set the model name directly since we won't be using command-line arguments
model_name = 'EleutherAI/gpt-j-6B'

# Assuming LOCAL_RANK and WORLD_SIZE are set outside of this cell
# (presumably by the distributed launcher — TODO confirm). When they are not
# set, the values fall back to rank 0 and a world size of 1.
local_rank = int(os.getenv('LOCAL_RANK', '0'))
world_size = int(os.getenv('WORLD_SIZE', '1'))

# Load the model with float16 weights and build a text-generation pipeline
# on the device whose index is this process's local rank.
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)
tokenizer = AutoTokenizer.from_pretrained(model_name)
generator = pipeline('text-generation', model=model, tokenizer=tokenizer, device=local_rank, torch_dtype=torch.float16)

# Initialize DeepSpeed Inference: the pipeline's model is replaced by the
# object returned from deepspeed.init_inference.
# NOTE(review): mp_size is presumably the model-parallel degree, and mp_size /
# replace_method may be deprecated spellings in current DeepSpeed releases —
# verify against the installed version's documentation.
generator.model = deepspeed.init_inference(generator.model,
                                           mp_size=world_size,
                                           dtype=torch.half,
                                           replace_method='auto',
                                           max_tokens=2048,
                                           replace_with_kernel_inject=True)
# Requires a CUDA device to be available.
torch.cuda.synchronize()

# Generate some text
# min_length and max_length are both 2047, which presumably pins the length
# of the sampled sequence — confirm against the generation documentation.
input_text = "DeepSpeed is"
generated_text = generator(input_text, do_sample=True, max_length=2047, min_length=2047, top_k=50, top_p=0.95, temperature=0.9)

# Printing the output: only when torch.distributed is not initialized, or on
# rank 0 when it is, so the text is printed once rather than by every process.
if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0:
    print(generated_text)