In [1]:
import argparse
import bitsandbytes as bnb
from datasets import load_dataset
from functools import partial
import os
from peft import PeftModel
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed, Trainer, TrainingArguments, BitsAndBytesConfig, \
    DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import load_dataset

In [2]:
!huggingface-cli login --token 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

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/yb970/.cache/huggingface/token
Login successful


In [3]:
# Load the tokenizer that belongs to the base checkpoint.
base_model = "meta-llama/Llama-2-7b-hf"
tokenizer = AutoTokenizer.from_pretrained(
    base_model,
    trust_remote_code=True,
)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [4]:
# Hub identifiers: the base checkpoint and the LoRA adapter to merge into it.
base_model = "meta-llama/Llama-2-7b-hf"
new_model = "brettbbb/llama2_finetune_synthetic_medium"

# Load the base checkpoint in FP16. Note that `base_model` is rebound here
# from the identifier string to the loaded model object.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model,
    torch_dtype=torch.float16,
    device_map="auto",
    return_dict=True,
    low_cpu_mem_usage=True,
)

# Attach the LoRA adapter, then merge its weights into the base model.
model = PeftModel.from_pretrained(base_model, new_model).merge_and_unload()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
# Load the dataset from the Hugging Face Hub (served from the local cache when present).
# `load_dataset` is already imported in the notebook's import cell, so it is not re-imported here.
dataset = load_dataset("BENBENBENb/sythetic_casual_relation_medium_scale")

Found cached dataset json (/home/yb970/.cache/huggingface/datasets/BENBENBENb___json/BENBENBENb--sythetic_casual_relation_medium_scale-82ba2ae7f2379a93/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)


  0%|          | 0/3 [00:00<?, ?it/s]

In [6]:
# Show the splits with their column names and row counts.
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'date_publish', 'outlet', 'headline', 'lead', 'body', 'authors', 'domain', 'url', 'political_leaning', 'n_words', 'n_tokens', 'GPT_causal_graph'],
        num_rows: 660
    })
    validation: Dataset({
        features: ['id', 'date_publish', 'outlet', 'headline', 'lead', 'body', 'authors', 'domain', 'url', 'political_leaning', 'n_words', 'n_tokens', 'GPT_causal_graph'],
        num_rows: 220
    })
    test: Dataset({
        features: ['id', 'date_publish', 'outlet', 'headline', 'lead', 'body', 'authors', 'domain', 'url', 'political_leaning', 'n_words', 'n_tokens', 'GPT_causal_graph'],
        num_rows: 220
    })
})


In [7]:
def create_prompt_formats(sample):
    """
    Build the instruction/context/response prompt for one sample.

    Reads ``sample['body']`` (the context) and ``sample['GPT_causal_graph']``
    (the response), stores the assembled text under ``sample['formatted_prompt']``
    and returns the same (mutated) sample.
    """
    instruction_text = "Below is an excerpt from a news article. Return the cause and effect phrases."

    # The prompt is four sections joined by single spaces.
    sections = (
        f"### Instruction: {instruction_text}",
        f"### Context: {sample['body']}",
        f"### Response: {sample['GPT_causal_graph']}",
        "### End",
    )
    sample["formatted_prompt"] = " ".join(sections)

    return sample

In [8]:
def get_max_length(model, default_max_length=1024):
    """
    Return the maximum sequence length found on ``model.config``.

    The config attributes ``n_positions``, ``max_position_embeddings`` and
    ``seq_length`` are checked in that order; the first truthy value is returned.

    :param model: Object exposing a ``config`` attribute
    :param default_max_length (int): Value returned when none of the attributes
        is present (or all of them are falsy). Defaults to 1024.
    :return: The length found on the config, or ``default_max_length``
    """
    conf = model.config
    for length_setting in ("n_positions", "max_position_embeddings", "seq_length"):
        max_length = getattr(conf, length_setting, None)
        if max_length:
            print(f"Found max length: {max_length}")
            return max_length

    print(f"Using default max length: {default_max_length}")
    return default_max_length


def preprocess_batch(batch, tokenizer, max_length):
    """
    Tokenize the ``formatted_prompt`` entries of a batch.

    The tokenizer is called with truncation and padding enabled and with
    ``max_length`` as the length limit; its result is returned unchanged.
    """
    prompts = batch["formatted_prompt"]
    tokenizer_options = dict(max_length=max_length, truncation=True, padding=True)
    return tokenizer(prompts, **tokenizer_options)


# SOURCE https://github.com/databrickslabs/dolly/blob/master/training/trainer.py
def preprocess_dataset(tokenizer: AutoTokenizer, max_length: int, seed, dataset):
    """Format & tokenize the dataset so it is ready for training.

    Steps: add a ``formatted_prompt`` to every sample, tokenize it in batches,
    drop samples whose prompt reaches ``max_length`` tokens, then shuffle.

    :param tokenizer (AutoTokenizer): Model Tokenizer
    :param max_length (int): Maximum number of tokens to emit from tokenizer
    :param seed: Seed forwarded to ``shuffle``
    :param dataset: Dataset object providing ``map``, ``filter`` and ``shuffle``
        (e.g. the result of ``load_dataset``)
    :return: The formatted, tokenized, filtered and shuffled dataset
    """

    def _fits_in_context(sample):
        # preprocess_batch pads every sequence of a batch to a common length, so
        # len(input_ids) is the padded length. Measuring that would drop *every*
        # sample of a batch as soon as one of them was truncated at max_length.
        # Count the real (non-padding) tokens via the attention mask instead, and
        # fall back to the raw length if the tokenizer returned no mask.
        mask = sample.get("attention_mask")
        n_tokens = sum(mask) if mask is not None else len(sample["input_ids"])
        return n_tokens < max_length

    # Add prompt to each sample
    print("Preprocessing dataset...")
    dataset = dataset.map(create_prompt_formats)

    # Tokenize the formatted prompts batch by batch (original columns are kept)
    _preprocessing_function = partial(preprocess_batch, max_length=max_length, tokenizer=tokenizer)
    dataset = dataset.map(
        _preprocessing_function,
        batched=True,
    )

    # Filter out samples whose prompt reached max_length (i.e. was truncated)
    dataset = dataset.filter(_fits_in_context)

    # Shuffle dataset
    dataset = dataset.shuffle(seed=seed)

    return dataset

In [9]:
## Preprocess dataset
# Seed forwarded to the shuffle step of preprocess_dataset.
seed = 1
# Length limit taken from the model's config.
max_length = get_max_length(model)
# `dataset` is rebound to its formatted + tokenized version.
dataset = preprocess_dataset(
    tokenizer=tokenizer,
    max_length=max_length,
    seed=seed,
    dataset=dataset,
)

Loading cached processed dataset at /home/yb970/.cache/huggingface/datasets/BENBENBENb___json/BENBENBENb--sythetic_casual_relation_medium_scale-82ba2ae7f2379a93/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-6473ccb92de423f8.arrow
Loading cached processed dataset at /home/yb970/.cache/huggingface/datasets/BENBENBENb___json/BENBENBENb--sythetic_casual_relation_medium_scale-82ba2ae7f2379a93/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-f0d7124eaf37d753.arrow
Loading cached processed dataset at /home/yb970/.cache/huggingface/datasets/BENBENBENb___json/BENBENBENb--sythetic_casual_relation_medium_scale-82ba2ae7f2379a93/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-92ce8f4badc4bc3a.arrow


Found max lenth: 4096
Preprocessing dataset...


Loading cached processed dataset at /home/yb970/.cache/huggingface/datasets/BENBENBENb___json/BENBENBENb--sythetic_casual_relation_medium_scale-82ba2ae7f2379a93/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-9cb326012f4c4022.arrow
Loading cached processed dataset at /home/yb970/.cache/huggingface/datasets/BENBENBENb___json/BENBENBENb--sythetic_casual_relation_medium_scale-82ba2ae7f2379a93/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-1374c0820e1bcfff.arrow
Loading cached processed dataset at /home/yb970/.cache/huggingface/datasets/BENBENBENb___json/BENBENBENb--sythetic_casual_relation_medium_scale-82ba2ae7f2379a93/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-4a1027b995f1a54f.arrow
Loading cached processed dataset at /home/yb970/.cache/huggingface/datasets/BENBENBENb___json/BENBENBENb--sythetic_casual_relation_medium_scale-82ba2ae7f2379a93/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7

## INFERENCE

In [18]:
# Prompt for one validation sample: everything before the response marker,
# so the model has to produce the response itself.
input_text = dataset['validation'][1]['formatted_prompt'].split('### Response:')[0]
print(f"input text: {input_text}")
print("-"*30)

# Tokenize the prompt here so the cell does not depend on an `inputs` variable
# left over from another cell, and place it on the same device as the model.
inputs = tokenizer.encode(input_text, return_tensors='pt').to(model.device)
outputs = model.generate(inputs=inputs, max_length=2000, num_return_sequences=1, repetition_penalty=1.3)

print("generated text:")
for i, output in enumerate(outputs):
    print(f"{i}: {tokenizer.decode(output)}")

input text: ### Instruction: Below is an excerpt from a news article. Return the cause and effect phrases. ### Context: Sector has best three months for growth since 2014 despite slight slip in Markit/Cips PMI figure for December
Britain’s manufacturers finished the year on a positive footing, with the strongest three months for growth since 2014 amid a boom in global growth.
The Markit/Cips UK manufacturing PMI barometer of factory sentiment showed activity recorded an average rate of 57 in the three months to December, in the strongest reading since the three months to June 2014. While the gauge dipped to 56.3 last month from 58.2 in November, it remained firmly above the 50 level indicating expansion.
Activity and new orders have expanded throughout the past 17 months, as firms reported increasing production in response to new orders and the launch of new product lines. However, there was a slight slowdown in December among firms in the consumer goods sector.
The survey comes as Bri

In [15]:
# Number of rows in the validation split.
dataset['validation'].num_rows

220

In [None]:
import csv
from tqdm import tqdm

# Validation split of the preprocessed dataset; iterating it yields one row at a
# time, accessed by column name below.
validation_data = dataset['validation']

# Destination CSV file
csv_file_path = 'output_data.csv'

# Columns written to the CSV: the dataset columns, the prompt, and the model output
features = ['id', 'date_publish', 'outlet', 'headline', 'lead', 'body', 'authors', 'domain', 'url', 'political_leaning', 'n_words', 'n_tokens', 'GPT_causal_graph', 'formatted_prompt', 'decoded_output']

with open(csv_file_path, 'w', newline='', encoding='utf-8') as csvfile:
    csv_writer = csv.DictWriter(csvfile, fieldnames=features)
    csv_writer.writeheader()

    # tqdm shows a progress bar over the rows
    for i, row in enumerate(tqdm(validation_data, desc='Processing rows', unit='row')):
        # Keep only the part of the prompt before the response marker
        input_text = row['formatted_prompt'].split('### Response:')[0]

        # Tokenize and move the prompt to the model's device (not a hard-coded 'cuda')
        inputs = tokenizer.encode(input_text, return_tensors='pt').to(model.device)
        # NOTE(review): max_length presumably bounds prompt + generated tokens
        # together; confirm that long prompts still leave room for a response.
        outputs = model.generate(inputs=inputs, max_length=2000, num_return_sequences=1, repetition_penalty=1.3)
        decoded_output = tokenizer.decode(outputs[0])

        # Combine the row with the generated text, restricted to the CSV columns
        record = dict(row, decoded_output=decoded_output)
        csv_writer.writerow({key: record[key] for key in features})

        # Flush every 10 rows so partial results reach the file during the long run
        if (i + 1) % 10 == 0:
            csvfile.flush()


Processing rows:  23%|██▎       | 50/220 [27:58<1:24:25, 29.80s/row]