In [1]:
from open_flamingo import create_model_and_transforms

model, image_processor, tokenizer = create_model_and_transforms(
    clip_vision_encoder_path="ViT-L-14",
    clip_vision_encoder_pretrained="openai",
    lang_encoder_path="anas-awadalla/mpt-7b",
    tokenizer_path="anas-awadalla/mpt-7b",
    cross_attn_every_n_layers=4
)

# grab model checkpoint from huggingface hub
from huggingface_hub import hf_hub_download
import torch

checkpoint_path = hf_hub_download("openflamingo/OpenFlamingo-9B-vitl-mpt7b", "checkpoint.pt")
model.load_state_dict(torch.load(checkpoint_path), strict=False)

  from .autonotebook import tqdm as notebook_tqdm
Using pad_token, but it is not set yet.
Explicitly passing a `revision` is encouraged when loading a configuration with custom code to ensure no malicious code has been contributed in a newer revision.
Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.


You are using config.init_device='cpu', but you can also use config.init_device="meta" with Composer + FSDP for fast initialization.


Loading checkpoint shards: 100%|██████████| 3/3 [03:09<00:00, 63.23s/it]


Flamingo model initialized with 1384781840 trainable parameters


_IncompatibleKeys(missing_keys=['vision_encoder.class_embedding', 'vision_encoder.positional_embedding', 'vision_encoder.proj', 'vision_encoder.conv1.weight', 'vision_encoder.ln_pre.weight', 'vision_encoder.ln_pre.bias', 'vision_encoder.transformer.resblocks.0.ln_1.weight', 'vision_encoder.transformer.resblocks.0.ln_1.bias', 'vision_encoder.transformer.resblocks.0.attn.in_proj_weight', 'vision_encoder.transformer.resblocks.0.attn.in_proj_bias', 'vision_encoder.transformer.resblocks.0.attn.out_proj.weight', 'vision_encoder.transformer.resblocks.0.attn.out_proj.bias', 'vision_encoder.transformer.resblocks.0.ln_2.weight', 'vision_encoder.transformer.resblocks.0.ln_2.bias', 'vision_encoder.transformer.resblocks.0.mlp.c_fc.weight', 'vision_encoder.transformer.resblocks.0.mlp.c_fc.bias', 'vision_encoder.transformer.resblocks.0.mlp.c_proj.weight', 'vision_encoder.transformer.resblocks.0.mlp.c_proj.bias', 'vision_encoder.transformer.resblocks.1.ln_1.weight', 'vision_encoder.transformer.resbloc

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model.eval()
model.to(device, torch.float16)

In [15]:
from datasets import load_dataset
import torch
from PIL import Image

vali_dataset = load_dataset('alexshengzhili/SciCapInstructed-graph-only-qa', split='1_percent_as_validation')
data = vali_dataset.filter(lambda x: x['q_a_pairs'] is not None and len(x['q_a_pairs']) > 0)



In [25]:
context_data = load_dataset('alexshengzhili/SciCapInstructed-graph-only-qa', split='1_percent_as_validation[100:]')
first_100 = load_dataset('alexshengzhili/SciCapInstructed-graph-only-qa', split='1_percent_as_validation[:100]')



In [65]:
context_data

Dataset({
    features: ['image_file', 'id', 'caption', 'conversations', 'first_mention', 'response', 'title', 'abstract', 'q_a_pairs'],
    num_rows: 2902
})

In [None]:
from PIL import Image
import requests
from tqdm import tqdm
import random
import json

def get_input_example_for_contextual_lerning(context_data, num_examples):
    # Pick num_examples random examples after 100
    #example_index = random.randint(0, len(context_data), num_examples)
    example_indexes = random.sample(range(len(context_data)), num_examples)
    questions = []
    answers = []
    img_paths = []
    image_root_folder = '/home/ubuntu/imgs/train/'
    for example_idx in example_indexes:
        example = context_data[example_idx]
        question = example['q_a_pairs'][0][0]
        answer = example['q_a_pairs'][0][1]
        img_path = image_root_folder + example['image_file']
        questions.append(question)
        answers.append(answer)
        img_paths.append(img_path)
    return questions, answers, img_paths


def get_input(example):
    question = example['q_a_pairs'][0][0]
    image_root_folder = '/home/ubuntu/imgs/train/'
    image_filepath = example['image_file']
    return question, image_root_folder + image_filepath

tokenizer.padding_side = "left" # For generation padding tokens should be on the left

def generate_text(example, num_examples):
    """
    Step 0: pick num_examples random examples
    ""
    Step 1: Load images
    """
    questions, answers, img_paths = get_input_example_for_contextual_lerning(data, num_examples)
    demo_examples = [f"question: {q} answer: {a}" for q, a in zip(questions, answers)]
    demo_images = [Image.open(img_path) for img_path in img_paths]
    # Step 1: Load query image
    question, img_path = get_input(example)
    query_image = Image.open(img_path)
    # query = json.dumps({"question:": question, "answer:": ''})
    query = f"question: {question} answer: "
    """
    Step 2: Preprocess images
    Details: For OpenFlamingo, we expect the image to be a torch tensor of shape 
    batch_size x num_media x num_frames x channels x height x width. 
    In this case batch_size = num_examples + 1, num_media = 1, num_frames = 1,
    channels = 3, height = 224, width = 224.
    """
    if num_examples > 0:
        vision_x = [image_processor(img).unsqueeze(0) for img in demo_images]
        vision_x.append(image_processor(query_image).unsqueeze(0))
        vision_x = torch.cat(vision_x, dim=0)
        vision_x = vision_x.unsqueeze(1).unsqueeze(0).to(device, torch.float16)
    else:
        vision_x = image_processor(query_image).unsqueeze(0)
        vision_x = vision_x.unsqueeze(1).unsqueeze(0).to(device, torch.float16)


    """
    Step 3: Preprocess question
    Details: In the text we expect an <image> special token to indicate where an image is.
    We also expect an <|endofchunk|> special token to indicate the end of the text 
    portion associated with an image.
    """

    if num_examples == 0:
        lang_x = tokenizer(
            [f"<image>{query}"],
            return_tensors="pt",
        )
    else:
        lang_x = tokenizer(
            [f"<image>{'<|endofchunk|>'.join(demo_examples)}<|endofchunk|><image>{query}"],
            return_tensors="pt",
        )
    """
    Step 4: Generate text
    """
    generated_text = model.generate(
        vision_x=vision_x,
        lang_x=lang_x["input_ids"].to(device),
        attention_mask=lang_x["attention_mask"].to(device),
        max_new_tokens=100,
        num_beams=1,
    )

    output = tokenizer.decode(generated_text[0])
    print("Generated text: ", output)
    return output

# generate_text(first_100[3], 10)

responses = []
with torch.no_grad() and open("open_flaming_6shot", "w") as f:
    for i in tqdm(range(len(first_100))):
        responses.append(generate_text(first_100[i], 6))


In [67]:
responses_model_6_shot = [item.rsplit('answer:', 1)[-1] for item in responses]


In [72]:
responses_model_6_shot

[' （1）The first graph shows the error in the interpolant as a function of the regularization strength for a fixed number of pages. （2）The second graph shows the error in the interpolant as a function of the number of pages for a fixed regularization strength.<|endofchunk|>',
 ' The graph shows the probability of error for the first-order and second-order EA decoders, as a function of the number of measurements m. The measurements are damped by i.i.d',
 ' （1）The CRR of the camera is higher when the camera is located closer to the transmitter. （2）The CRR of the camera is lower when the camera is located farther from the transmitter. （3）The CRR of the camera is higher when the transmitter is located closer to the receiver. （4）The CRR of the camera is lower when the transmitter is located farther from the receiver.<|endofchunk|>',
 ' \tThe findings in Figure 8 suggest that the performance of facial landmark detection algorithms can be improved by using a combination of different techniques

In [73]:
import argparse
import json
import os

import openai
import tqdm
import time
from dotenv import load_dotenv

load_dotenv()  # take environment variables from .env.
openai.api_key = os.getenv("OPENAI_API_KEY")
system_message = """
You are a helpful and precise assistant for checking the quality of the answer.
You are given the graph's caption, the context of the graph, the abstract, tthe title

And then you are given the question, the reference answer, and the answer generated by the model. Please
think about how helpful the model answer is to the user and rate the model answer on a scale of 0 to 10, 
where 0 is not helpful at all and 10 is very helpful. Just return the floating number between 0 and 10.
"""

def construct_input_string(first_100, index):
    content = dict()
    cur_example = first_100[index]
    content['title'] = cur_example['title']
    content['abstract'] = cur_example['abstract']
    content['caption'] = cur_example['caption']
    content['Question to the model'] = cur_example['q_a_pairs'][0][0]
    content['reference_answer'] = cur_example['q_a_pairs'][0][1]
    content['Candidate model answer'] = responses_model_6_shot[index]
    return json.dumps(content)


def get_openai_response(content_string):
    openai_response = openai.ChatCompletion.create(
                    model='gpt-4',
                    messages=[{
                        'role': 'system',
                        'content': system_message
                    }, {
                        'role': 'user',
                        'content': content_string
                    }],
                    temperature=0.2,  # TODO: figure out which temperature is best for evaluation
                    max_tokens=500,
                )['choices'][0]['message']['content']
    return openai_response

openai_responses = []
for i in range(len(data)):
    content_string = construct_input_string(first_100, i)
    openai_response = get_openai_response(content_string)
    print(openai_response)
    openai_responses.append(openai_response)
    time.sleep(2)

    
openai_responses_float = [float(str) for str in openai_responses]
rated_data = data.add_column("openflamingo_answer_6_shot", responses_model_6_shot)
rated_data = rated_data.add_column("openai_rating", openai_responses_float)

output_file_path = "openfliamgo_answer_and_openai_rating.jsonl"

with open(output_file_path, 'w') as f:
    for example in rated_data:
        json_str = json.dumps(example)
        f.write(json_str + '\n')

import numpy as np
mean, std = np.mean(openai_responses_float), np.std(openai_responses_float)

0.0
0
0.0
3.0
0.0
8.5
0
6.5
0.0
7.5
0
0.0
0
0
3.0
0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
8.5
3.0
5.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
2.0
3.0
0
0.0
0.0
0.0
0.0
1.0
0.0
0
0.0
0.0
0.0
0.0
0
6.5
0.0
0.0
3.0
0.0
0.0
0.0
0.0
0.0
2.0
0.0
5.0
0.0
0.0
0.0
1.0
2.0
0.0
0
0


In [53]:
import numpy as np
mean, std = np.mean(openai_responses_float), np.std(openai_responses_float)
print(mean, std)

NameError: name 'openai_responses_float' is not defined

In [52]:
mean, st
d = np.mean(openai_responses_float), np.std(openai_responses_float)

NameError: name 'np' is not defined

In [49]:
len(first_100), len(response_model)

(100, 100)