In [1]:
!pip install -r fine_tune_llama_requirements.txt

Collecting accelerate@ git+https://github.com/huggingface/accelerate.git
  Cloning https://github.com/huggingface/accelerate.git to /tmp/pip-install-4a496wuv/accelerate_c0fa84be9f6640a6ab6fc3faae933c75
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/accelerate.git /tmp/pip-install-4a496wuv/accelerate_c0fa84be9f6640a6ab6fc3faae933c75
  Resolved https://github.com/huggingface/accelerate.git to commit 8f9673f509c75defa4642826a3eafcd3f9735437
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting transformers@ git+https://github.com/huggingface/transformers.git
  Cloning https://github.com/huggingface/transformers.git to /tmp/pip-install-4a496wuv/transformers_4921c4d3971d44b4b094f656e1183912
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers.git /tmp/pip-install-4a496wuv/transform

In [2]:
# import argparse
# import bitsandbytes as bnb
import yaml
import torch

from transformers import BartTokenizer, BartForConditionalGeneration, T5Tokenizer, T5ForConditionalGeneration, AutoModelForCausalLM, AutoTokenizer, pipeline
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, AutoPeftModelForCausalLM
from random import sample

# Usage: pick the entry under `models` in config.yaml that drives this run.
config_file = 'config.yaml'
selected_model = 'Llama-2-7b-hf-c2t-2-005'
with open(config_file, 'r', encoding='utf-8') as file:
    config = yaml.safe_load(file)
model_config = config['models'].get(selected_model)
evaluation_config = config['evaluation']

# Fail fast: the cells below read source_directory, model_name, generated_file
# and target_file, so continuing without a model entry would only surface
# later as an unrelated NameError.
if not model_config:
    raise ValueError(f"Model configuration for {selected_model} not found.")

model_name = model_config['model_name']
source_directory = model_config['source_directory']
destination_directory = model_config['destination_directory']
generated_file = evaluation_config['generated_file']
target_file = evaluation_config["target_file"]
print(f"Model configuration for {selected_model} loaded successfully.")
print(f"model_name={model_name}")
print(f"source_directory={source_directory}")
print(f"destination_directory={destination_directory}")
print(f"generated_file={generated_file}")
print(f"target_file={target_file}")


Model configuration for Llama-2-7b-hf-c2t-2-005 loaded successfully.
model_name=
source_directory=/notebooks/models/fine-tuned/Llama-2-7b-hf-c2t-2-005
destination_directory=/notebooks/models/fine-tuned/Llama-2-7b-hf-c2t-2-006
generated_file=/notebooks/evaluation/OWID/Llama-2-7b-hf-c2t-2-005/generated.txt
target_file=/notebooks/evaluation/OWID/Llama-2-7b-hf-c2t-2-005/target.txt


In [3]:
# device = torch.device("cuda")

# if 't5' in selected_model:
#     ModelClass = T5ForConditionalGeneration
#     TokenizerClass = T5Tokenizer
# elif 'bart' in selected_model:
#     ModelClass = BartForConditionalGeneration
#     TokenizerClass = BartTokenizer
# elif 'Llama' in selected_model:
#     ModelClass = AutoModelForCausalLM
#     TokenizerClass = AutoTokenizer
# else:
#     raise ValueError("Unsupported model type")
    
# try:
#     model = ModelClass.from_pretrained(source_directory).to(device)
#     tokenizer = TokenizerClass.from_pretrained(source_directory)
# except EnvironmentError:
#     model = ModelClass.from_pretrained(model_name).to(device)
#     tokenizer = TokenizerClass.from_pretrained(model_name)

#     # model.save_pretrained(source_directory)
#     # tokenizer.save_pretrained(source_directory)

import os

n_gpus = torch.cuda.device_count()
max_memory = f'{40960}MB'

# Never hardcode access tokens in a notebook. The token is read from the
# HF_TOKEN environment variable; when it is unset, None is passed and the
# library falls back to its own credential lookup.
# NOTE(review): any token that was previously committed in this notebook
# should be treated as leaked and revoked.
hf_token = os.environ.get("HF_TOKEN")


def _load_model_and_tokenizer(location, **hub_kwargs):
    """Load the causal LM and its tokenizer from `location`.

    Args:
        location: Local directory or model identifier passed to `from_pretrained`.
        **hub_kwargs: Extra keyword arguments forwarded to both `from_pretrained`
            calls (used here to pass `token` for the remote fallback).

    Returns:
        A `(model, tokenizer)` tuple.
    """
    loaded_model = AutoModelForCausalLM.from_pretrained(
        location,
        # device_map="auto",  # dispatch the model efficiently on the available resources
        max_memory={i: max_memory for i in range(n_gpus)},
        **hub_kwargs,
    )
    loaded_tokenizer = AutoTokenizer.from_pretrained(
        location,
        use_fast=False,
        add_eos_token=True,
        **hub_kwargs,
    )
    return loaded_model, loaded_tokenizer


# Prefer the local fine-tuned checkpoint; fall back to `model_name` only when
# loading from `source_directory` raises EnvironmentError.
try:
    model, tokenizer = _load_model_and_tokenizer(source_directory)
    print("Local model loaded successfully")
except EnvironmentError:
    model, tokenizer = _load_model_and_tokenizer(model_name, token=hf_token)
    print("Model loaded successfully")

# # Needed for LLaMA tokenizer
# tokenizer.pad_token = tokenizer.eos_token

# Hard-coded pad token id.
# NOTE(review): presumably matches the id used during fine-tuning — TODO confirm.
tokenizer.pad_token_id = 18610

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Local model loaded successfully


In [4]:
# def execute_task(text, task="summarize: "):
#     input_text = task + text
#     input_ids = tokenizer.encode(input_text, return_tensors="pt", max_length=1024, truncation=True).to(device)
#     outputs = model.generate(input_ids, max_length=1024)
#     return tokenizer.decode(outputs[0], skip_special_tokens=True)

# def process_generated_text(generated_text):
#     # Removing the formatted_prompt from the generated_text
#     summary_start_index = generated_text.find('[/INST]') + len('[/INST]')
#     summary = generated_text[summary_start_index:].strip()
    
#     # Encode and decode to handle special characters
#     summary = summary.encode('utf-8', 'replace').decode('utf-8')

#     # Find the first occurrence of any of the specified markers
#     markers = ['[INST]', '[/INST]', '<s>', '</s>']
#     cutoff_indices = [summary.find(marker) for marker in markers if summary.find(marker) != -1]

#     # Check if any markers are found
#     if cutoff_indices:
#         cutoff_index = min(cutoff_indices)
#         summary = summary[:cutoff_index].strip()

#     # print(summary)
#     return summary   

def process_generated_text(generated_text):
    """Extract the text that follows the last instruction marker.

    The markers are '[INST]' and '[/INST]'. Everything after the right-most
    occurrence of either marker is returned, stripped of surrounding
    whitespace. If neither marker is present, the whole input is returned
    stripped.

    Args:
        generated_text: Raw text produced by the generation pipeline
            (prompt followed by the model's continuation).

    Returns:
        The extracted summary string, with characters that cannot be encoded
        as UTF-8 (e.g. lone surrogates) replaced by '?'.
    """
    markers = ('[INST]', '[/INST]')

    # rfind returns -1 for a missing marker, so the maximum is -1 only when
    # neither marker occurs. The two markers can never start at the same
    # index, so the length component only breaks the (-1, -1) tie.
    last_marker_index, last_marker_length = max(
        (generated_text.rfind(marker), len(marker)) for marker in markers
    )

    # If no markers found, return the full text
    if last_marker_index == -1:
        return generated_text.strip()

    # Text after the last marker. By construction it cannot contain another
    # marker, so no further cut-off pass is needed.
    summary = generated_text[last_marker_index + last_marker_length:].strip()

    # Encode and decode to handle special characters
    return summary.encode('utf-8', 'replace').decode('utf-8')


# (model, tokenizer, pipeline) triple, built on first use by _get_generation_pipeline.
_generation_pipeline_cache = None


def _get_generation_pipeline():
    """Return the text-generation pipeline for the current model and tokenizer.

    The pipeline is built once and reused; it is rebuilt only if the global
    `model` or `tokenizer` object has been replaced (e.g. the loading cell was
    re-run).
    """
    global _generation_pipeline_cache
    cache = _generation_pipeline_cache
    if cache is None or cache[0] is not model or cache[1] is not tokenizer:
        cache = (
            model,
            tokenizer,
            pipeline(
                task="text-generation",
                model=model,
                tokenizer=tokenizer,
                device=0,
            ),
        )
        _generation_pipeline_cache = cache
    return cache[2]


def execute_task(chart_content):
    """Generate a summary for the given chart content.

    Wraps `chart_content` in the instruction prompt, runs the text-generation
    pipeline and returns the output of `process_generated_text` applied to the
    generated text.

    Args:
        chart_content: Textual description of the chart to summarise.

    Returns:
        The processed summary string.
    """
    formatted_prompt = f"""<s>[INST] From the below input full content of a chart, write a summary that reflects the meaning and trend of the chart.
    Chart content: {chart_content} [/INST] """

    # print(f"formatted_prompt:\n{formatted_prompt}\n")

    pipe = _get_generation_pipeline()

    # The length budget is passed per call because it depends on the prompt.
    # NOTE(review): len(formatted_prompt) counts characters, not tokens;
    # consider max_new_tokens=500 if a token budget is intended.
    result = pipe(formatted_prompt, max_length=500 + len(formatted_prompt))
    generated_text = result[0]['generated_text']

    # print(f"generated_text:\n{generated_text}\n")

    return process_generated_text(generated_text)

    

# def create_prompt_formats(chart_content):
#     INTRO_BLURB = "Below is an instruction that describes a task. Write a response that appropriately completes the request."
#     INSTRUCTION_KEY = "### Instruction:"
#     INPUT_KEY = "### Input:"
#     RESPONSE_KEY = "### Response:"
    
#     blurb = f"{INTRO_BLURB}"
#     instruction = f"{INSTRUCTION_KEY}\nFrom the input full content of a chart, write a summary that reflects the meaning and trend of the chart."
#     input_context = f"{INPUT_KEY}\n{chart_content}"
#     response = f"{RESPONSE_KEY} "
    
#     parts = [part for part in [blurb, instruction, input_context, response] if part]

#     formatted_prompt = "\n\n".join(parts)
    
#     return formatted_prompt

# sample_text = """
# The Eiffel Tower is a wrought-iron lattice tower on the Champ de Mars in Paris, France.
# It is named after the engineer Gustave Eiffel, whose company designed and built the tower.
# Locally nicknamed "La dame de fer" (French for "Iron Lady"), it was constructed from 1887 to 1889
# as the entrance arch to the 1889 World's Fair and is a global cultural icon of France and one of
# the most recognizable structures in the world. The Eiffel Tower is the most-visited paid monument
# in the world; 6.91 million people ascended it in 2015.
# """

# print(execute_task(sample_text))

In [5]:
# chart_sum = """
# This chart is a vertical bar graph with the following components:

# Title:

# "Gross Domestic Product From IT and Service Industry of the UK (as a % of GDP)"
# Axes:

# Vertical Axis (Y-axis): Represents the percentage of GDP, with increments of 2% ranging from 0% to 16%.
# Horizontal Axis (X-axis): Represents the years for which data is provided, specifically 1992, 1994, 1996, 1998, and 2000.
# Bars:

# There are two sets of bars for each year, representing two different industries:
# a. IT Industry (colored in blue):
# - 1992: Approximately 4%
# - 1994: Approximately 6%
# - 1996: Approximately 8%
# - 1998: Approximately 10%
# - 2000: Approximately 12%
# b. Service Industry (colored in purple):
# - 1992: Approximately 6%
# - 1994: Approximately 8%
# - 1996: Approximately 10%
# - 1998: Approximately 14%
# - 2000: Approximately 16%
# Legend:

# A legend is present to denote the color coding of the bars:
# Blue bar represents the IT Industry.
# Purple bar represents the Service Industry.
# Relationships:

# The height of each bar represents the contribution of each industry to the UK's GDP in the respective year.
# The bars are paired by year, allowing for comparison between the two industries over time.
# Layout:

# The bars are placed side by side for each year to facilitate easy comparison between the two industries.
# Color Scheme:

# Two colors are used for the bars to distinguish between the two industries, with blue for IT and purple for Service.
# Background:

# The chart background is white, with black grid lines corresponding to the percentages on the y-axis, enhancing readability.
# This set of details can be used to reconstruct the graph or to understand the distribution and trends in GDP contribution by the IT and Service industries in the UK over the specified years.
# """

# print(execute_task(chart_sum))

In [6]:
# import os
# import csv

# def prepare_inputs_from_statista_dataset(indices):
#     base_path = "/notebooks/Chart-to-text/statista_dataset/dataset/"
#     data_path = os.path.join(base_path, "data")
#     title_path = os.path.join(base_path, "titles")

#     inputs = []

#     for index in indices:
#         data_file = os.path.join(data_path, f"{index}.csv")
#         title_file = os.path.join(title_path, f"{index}.txt")

#         # Read title
#         with open(title_file, 'r') as f:
#             title = f.read().strip()

#         # Read data table
#         rows = []
#         with open(data_file, 'r') as f:
#             csv_reader = csv.reader(f)
#             for row in csv_reader:
#                 rows.append(" | ".join(row))

#         # Concatenate the title with table content
#         input = title + "\n" + "\n".join(rows)
#         inputs.append(input)

#     return inputs

In [7]:
# import random

# indices = random.sample(range(1, 20001), 10)
# inputs = prepare_inputs_from_statista_dataset(indices)

# results = [execute_task(input) for input in inputs]

# for i, input in enumerate(inputs):
#     print(f"Input {indices[i]}:\n{input}\n")
#     print(f"Summary {indices[i]}:\n{results[i]}\n")
#     print('-'*50)



In [None]:
import json
from tqdm import tqdm

def load_json_data(json_file):
    """Read `json_file` as UTF-8 text and return the parsed JSON value.

    The encoding is explicit so the result does not depend on the platform's
    default locale; the output files in this notebook are written as UTF-8 too.

    Args:
        json_file: Path of the JSON file to read.

    Returns:
        Whatever `json.load` produces for the file's content.
    """
    with open(json_file, 'r', encoding='utf-8') as handle:
        return json.load(handle)

def convert_data_to_text(item):
    """Flatten one chart record into a plain-text block.

    The result is the title on the first line, followed by one
    "<label>: v1, v2, ..." line for the x series and then one for each
    y series, every line terminated by a newline.

    Args:
        item: Mapping with the keys "x_label", "y_label", "data" and "title";
            "data" maps each label to its sequence of values.

    Returns:
        The chart rendered as a multi-line string.
    """
    x_name = item["x_label"]
    series_names = item["y_label"]
    table = item["data"]
    heading = item["title"]

    def render(label):
        # One "label: a, b, c" line for the series stored under `label`.
        values = ', '.join(str(value) for value in table[label])
        return f"{label}: {values}\n"

    # x series first, then the y series in their listed order
    lines = [render(x_name)]
    lines.extend(render(name) for name in series_names)

    return f"{heading}\n{''.join(lines)}"

# File paths
json_file = '/notebooks/ChartSumm/test_k.json'

# Load data
full_data = load_json_data(json_file)

# Split the dataset into `num_parts` contiguous slices and process slice `i`;
# the last slice absorbs the remainder of the integer division.
n = len(full_data)
num_parts = 1 # 3
part_len = n // num_parts
i = 0
start = i * part_len
end = (i+1) * part_len if i+1 != num_parts else n
# start = 0
# end = 100
data = full_data[start:end]
print(n, num_parts, part_len, start, end)

generated_texts = []
summaries = []

# Process and write to files.
# Each pair of lines is written and flushed as soon as it is produced, so an
# interruption of this long-running loop keeps what was already generated.
# Line k of the generated file always corresponds to line k of the target file.
with open(generated_file, 'w', encoding='utf-8') as gen_file, open(target_file, 'w', encoding='utf-8') as tar_file:
    for item in tqdm(data, desc="Processing data"):
        input_text = convert_data_to_text(item)
        # Newlines are flattened so each record occupies exactly one line.
        generated_text = execute_task(input_text).replace('\n', ' ')
        summary = item["summary"].replace('\n', ' ')
        # print(f"input:\n{input_text}\n")
        # print(f"output:\n{generated_text}\n")
        # print(f"expected:\n{summary}\n")
        # print(f"{'-'*50}\n")

        generated_texts.append(generated_text)
        summaries.append(summary)

        gen_file.write(generated_text + '\n')
        tar_file.write(summary + '\n')
        gen_file.flush()
        tar_file.flush()
# The `with` statement closes both files; no explicit close() is needed.


4338 1 4338 0 4338


Processing data: 100%|█████████▉| 4337/4338 [3:42:40<00:02,  2.09s/it]  

In [None]:
# for text in generated_texts:
#     print(text)