-
Notifications
You must be signed in to change notification settings - Fork 13
/
gpt4v.py
187 lines (165 loc) · 5.19 KB
/
gpt4v.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
# Copyright (c) Meta Platforms, Inc. and affiliates.
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import argparse
import json
import os
import traceback
from pathlib import Path
from typing import List, Optional
import numpy as np
import tqdm
from openeqa.utils.openai_utils import (
call_openai_api,
prepare_openai_vision_messages,
set_openai_key,
)
from openeqa.utils.prompt_utils import load_prompt
def parse_args(argv: Optional[List[str]] = None) -> argparse.Namespace:
    """Parse command-line arguments for the GPT-4V OpenEQA baseline.

    Args:
        argv: optional list of argument strings; when None, argparse falls
            back to sys.argv[1:]. Passing a list makes this function
            testable without patching sys.argv (backward compatible).

    Returns:
        The parsed namespace. As side effects, creates
        ``args.output_directory`` on disk and attaches ``args.output_path``,
        the results JSON file named from the dataset stem, model, and seed.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--dataset",
        type=Path,
        default="data/open-eqa-v0.json",
        help="path to EQA dataset (default: data/open-eqa-v0.json)",
    )
    parser.add_argument(
        "--model",
        type=str,
        default="gpt-4-vision-preview",
        help="OpenAI model (default: gpt-4-vision-preview)",
    )
    parser.add_argument(
        "--frames-directory",
        type=Path,
        default="data/frames/",
        help="path to image frames (default: data/frames/)",
    )
    parser.add_argument(
        "--num-frames",
        type=int,
        default=50,
        help="num frames in gpt4v (default: 50)",
    )
    parser.add_argument(
        "--image-size",
        type=int,
        default=512,
        help="image size (default: 512)",
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=1234,
        help="gpt seed (default: 1234)",
    )
    parser.add_argument(
        "--temperature",
        type=float,
        default=0.2,
        help="gpt temperature (default: 0.2)",
    )
    parser.add_argument(
        "--max-tokens",
        type=int,
        default=128,
        help="gpt maximum tokens (default: 128)",
    )
    parser.add_argument(
        "--output-directory",
        type=Path,
        default="data/results",
        help="output directory (default: data/results)",
    )
    parser.add_argument(
        "--force",
        action="store_true",
        help="continue running on API errors (default: false)",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="only process the first 5 questions",
    )
    args = parser.parse_args(argv)
    # Ensure the results directory exists before any answers are written.
    args.output_directory.mkdir(parents=True, exist_ok=True)
    args.output_path = args.output_directory / (
        args.dataset.stem + "-{}-{}.json".format(args.model, args.seed)
    )
    return args
def ask_question(
    question: str,
    image_paths: List,
    image_size: int = 512,
    openai_key: Optional[str] = None,
    openai_model: str = "gpt-4-vision-preview",
    openai_seed: int = 1234,
    openai_max_tokens: int = 128,
    openai_temperature: float = 0.2,
    force: bool = False,
) -> Optional[str]:
    """Ask GPT-4V a question about a sequence of image frames.

    Args:
        question: natural-language question to inject into the prompt.
        image_paths: paths to the image frames sent alongside the prompt.
        image_size: image size passed to the vision-message builder.
        openai_key: API key; semantics of None are decided by set_openai_key.
        openai_model: OpenAI model name for the API call.
        openai_seed: seed forwarded to the API call.
        openai_max_tokens: maximum completion tokens.
        openai_temperature: sampling temperature.
        force: when True, swallow errors and return None so a batch run can
            continue past transient API failures.

    Returns:
        The model's answer string, or None when ``force`` is set and the
        call failed.

    Raises:
        Exception: re-raises the underlying error when ``force`` is False.
    """
    try:
        set_openai_key(key=openai_key)
        # The prompt template contains a "User Query:" marker; split there so
        # the question can be formatted into the suffix only.
        prompt = load_prompt("gpt4v")
        prefix, suffix = prompt.split("User Query:")
        suffix = "User Query:" + suffix.format(question=question)
        messages = prepare_openai_vision_messages(
            prefix=prefix, suffix=suffix, image_paths=image_paths, image_size=image_size
        )
        return call_openai_api(
            messages=messages,
            model=openai_model,
            seed=openai_seed,
            max_tokens=openai_max_tokens,
            temperature=openai_temperature,
        )
    except Exception:
        # Bug fix: the original printed the traceback only when NOT forcing,
        # so with --force failures were swallowed completely silently. Always
        # log the failure; use a bare `raise` to preserve the traceback.
        traceback.print_exc()
        if not force:
            raise
        return None
def main(args: argparse.Namespace) -> None:
    """Run the GPT-4V baseline over an OpenEQA dataset and save answers.

    Loads the dataset JSON, evenly subsamples ``args.num_frames`` RGB frames
    per episode from ``args.frames_directory``, queries the model for each
    question, and writes results incrementally to ``args.output_path`` so an
    interrupted run can resume (already-answered questions are skipped).
    """
    # Fail fast if the OpenAI key is missing. The original used `assert`,
    # which is silently stripped under `python -O`; raise explicitly instead.
    if "OPENAI_API_KEY" not in os.environ:
        raise RuntimeError("OPENAI_API_KEY environment variable is not set")

    # load dataset (context manager so the file handle is closed promptly)
    with args.dataset.open("r") as f:
        dataset = json.load(f)
    print("found {:,} questions".format(len(dataset)))

    # load existing results so the run resumes where it left off
    results = []
    if args.output_path.exists():
        with args.output_path.open() as f:
            results = json.load(f)
        print("found {:,} existing results".format(len(results)))
    # set membership is O(1) per question vs O(n) for the original list
    completed = {item["question_id"] for item in results}

    # process data
    for idx, item in enumerate(tqdm.tqdm(dataset)):
        if args.dry_run and idx >= 5:
            break

        # skip completed questions
        question_id = item["question_id"]
        if question_id in completed:
            continue  # skip existing

        # extract scene paths: evenly subsample num_frames frames per episode
        folder = args.frames_directory / item["episode_history"]
        frames = sorted(folder.glob("*-rgb.png"))
        indices = np.round(np.linspace(0, len(frames) - 1, args.num_frames)).astype(int)
        paths = [str(frames[i]) for i in indices]

        # generate answer
        question = item["question"]
        answer = ask_question(
            question=question,
            image_paths=paths,
            image_size=args.image_size,
            openai_model=args.model,
            openai_seed=args.seed,
            openai_max_tokens=args.max_tokens,
            openai_temperature=args.temperature,
            force=args.force,
        )

        # store results after every question so a crash loses at most one answer
        results.append({"question_id": question_id, "answer": answer})
        with args.output_path.open("w") as f:
            json.dump(results, f, indent=2)

    # save at end (redundant with the incremental saves, but cheap)
    with args.output_path.open("w") as f:
        json.dump(results, f, indent=2)
    print("saving {:,} answers".format(len(results)))
# Script entry point: parse CLI arguments, then run the evaluation loop.
if __name__ == "__main__":
    main(parse_args())