In [1]:
import json
import os
import re
import sys
import tqdm
from PIL import Image

import torch
import numpy as np
from torchvision import transforms
import argparse
from trainer import Qwen2VLForConditionalGeneration_SelfFilter
from transformers.modeling_utils import load_sharded_checkpoint

In [2]:
def load_stage1_model(
    model_path, feature_extractor_setting='clip', device_map="auto", device="cuda", **kwargs
):
    """Load the stage-1 model and overlay its separately saved non-LoRA weights.

    Args:
        model_path: Directory handed to ``from_pretrained``. It is also searched
            for a ``non_lora_trainables.bin`` state dict.
        feature_extractor_setting: ``"clip"`` or ``"scores"``; selects which
            model class is instantiated.
        device_map: Forwarded to ``from_pretrained``. Replaced by
            ``{"": device}`` whenever ``device`` is not ``"cuda"``.
        device: Device the model is moved to after loading.
        **kwargs: Extra keyword arguments forwarded to ``from_pretrained``.
            ``torch_dtype`` is always overridden with ``torch.float16``.

    Returns:
        The loaded model.

    Raises:
        NotImplementedError: If ``feature_extractor_setting`` is not one of the
            supported values.
    """
    # Reject an unsupported setting before any checkpoint is touched.
    if feature_extractor_setting not in ("clip", "scores"):
        raise NotImplementedError(
            f"Unknown feature extractor setting: {feature_extractor_setting!r}"
        )

    load_kwargs = {"device_map": device_map, **kwargs}
    if device != "cuda":
        load_kwargs["device_map"] = {"": device}
    load_kwargs["torch_dtype"] = torch.float16

    # note that we do not need vision tower here, and it is not loaded.
    if feature_extractor_setting == "clip":
        model = Qwen2VLForConditionalGeneration_SelfFilter.from_pretrained(
            model_path, low_cpu_mem_usage=True, **load_kwargs
        ).to(device)
    else:
        # NOTE(review): LlavaLlamaForCausalLM_SelfFilter_Scores is not imported
        # in any visible cell, so this branch raises NameError unless the name
        # is defined elsewhere -- confirm and add the import.
        model = LlavaLlamaForCausalLM_SelfFilter_Scores.from_pretrained(
            model_path, ignore_mismatched_sizes=True, low_cpu_mem_usage=True, **load_kwargs
        ).to(device)

    non_lora_state_path = os.path.join(model_path, "non_lora_trainables.bin")
    if os.path.exists(non_lora_state_path):
        # NOTE(review): torch.load unpickles the file; only load checkpoints
        # that come from a trusted source.
        non_lora_state_dict = torch.load(non_lora_state_path, map_location="cpu")
        load_result = model.load_state_dict(non_lora_state_dict, strict=False)

        # strict=False silently ignores keys that match no parameter, so report
        # them instead of claiming success unconditionally.
        unexpected = list(load_result.unexpected_keys)
        if unexpected:
            print(
                f"Warning: {len(unexpected)} of {len(non_lora_state_dict)} keys in "
                f"non_lora_trainables.bin matched no model parameter and were "
                f"ignored, e.g. {unexpected[:5]}"
            )
        if len(unexpected) < len(non_lora_state_dict):
            print('score_net has been well loaded.')
    else:
        # The message below is Korean for "the file does not exist".
        print("Warning: non_lora_trainables.bin 파일이 없습니다.")

    return model

In [3]:
def load_scores(score_names):
    """Load score JSON files and rescale each file's values to ``[-1, 1]``.

    Args:
        score_names: Iterable of paths to JSON files, each holding a mapping of
            key -> numeric score.

    Returns:
        A list with one dict per input file (same order). Within each dict the
        smallest score maps to ``-1`` and the largest to ``1``. A file whose
        scores are all equal maps every key to ``0.0``, and an empty file
        yields an empty dict.
    """

    def norm_scores(score_dict: dict):
        # min()/max() raise on an empty mapping; nothing to normalise anyway.
        if not score_dict:
            return {}

        min_score = min(score_dict.values())
        max_score = max(score_dict.values())
        span = max_score - min_score

        # All scores identical: min-max scaling would divide by zero, so place
        # every entry at the midpoint of the target range.
        if span == 0:
            return {key: 0.0 for key in score_dict}

        return {
            key: (value - min_score) / span * 2 - 1
            for key, value in score_dict.items()
        }

    score_dicts = []

    for score_name in score_names:
        with open(score_name, "r", encoding="utf-8") as f:
            score_dicts.append(norm_scores(json.load(f)))

    return score_dicts

def produce_scores_difficulty(model, save_path: str, score_files=None, device="cuda"):
    """Compute a per-sample difficulty from pre-computed score files and save it.

    For every key of the first score file, the values found under that key in
    all score files are gathered into a ``(1, n_files)`` half-precision tensor
    and passed to ``model.predict_weights``; the difficulty is the negated
    scalar result.

    Args:
        model: Object exposing ``predict_weights(tensor)`` that returns a
            single-element tensor.
        save_path: Path of the JSON file the difficulty dict is written to.
        score_files: Paths of the JSON score files. Defaults to
            ``llava_imagereward.json`` and ``llava_clipscore.json`` in the
            current working directory.
        device: Device the score tensor is moved to before prediction.

    Returns:
        Dict mapping each key of the first score file to its difficulty.

    Raises:
        KeyError: If a key of the first score file is missing from another file.
    """
    if score_files is None:
        score_files = [
            "llava_imagereward.json",
            "llava_clipscore.json",
            #"data/scores/gpt-3.5-turbo-1106/processed_score.json",
        ]

    # NOTE(review): scores are consumed raw; the normalising load_scores()
    # helper defined in this notebook is not used here -- confirm this is
    # what the trained score_net expects.
    score_dicts = []
    for file in score_files:
        # Context manager so the handle is closed deterministically.
        with open(file, "r") as f:
            score_dicts.append(json.load(f))

    difficulty_dict = {}

    # Pure inference: no autograd graph is needed for .item().
    with torch.no_grad():
        for unique_idx in score_dicts[0]:
            scores = [[score_dict[str(unique_idx)] for score_dict in score_dicts]]
            scores = torch.tensor(scores).to(device).half()
            difficulty_dict[unique_idx] = -model.predict_weights(scores).item()

    with open(save_path, "w") as f:
        json.dump(difficulty_dict, f)

    print("Scores difficulty generated and saved.")

    return difficulty_dict

In [4]:
def produce_clip_difficulty(
    model,
    save_path: str,
    clip_feature_path="/workspace/Self-Filter/llava_clip_feature.pt",
    device="cuda",
):
    """Compute a per-sample difficulty from saved CLIP features and save it.

    Each feature gets a leading batch dimension, is moved to ``device``, cast
    to the dtype reported by ``model.get_score_net_dtype()`` and passed to
    ``model.predict_weights``; the difficulty is the negated scalar result.

    Args:
        model: Object exposing ``get_score_net_dtype()`` and
            ``predict_weights(tensor)`` (the latter returning a single-element
            tensor).
        save_path: Path of the JSON file the difficulty dict is written to.
        clip_feature_path: Path of the ``torch.save``-d mapping of
            key -> feature tensor. Defaults to the previously hard-coded
            location.
        device: Device each feature is moved to before prediction.

    Returns:
        Dict mapping each feature key to its difficulty.
    """
    # NOTE(review): torch.load unpickles the file; only load trusted files.
    clip_feat = torch.load(clip_feature_path)

    # The score-net dtype does not change between samples; look it up once.
    dtype = model.get_score_net_dtype()

    difficulty_dict = {}

    # Pure inference: no autograd graph is needed for .item().
    with torch.no_grad():
        for unique_idx in clip_feat:
            scores = clip_feat[unique_idx].unsqueeze(dim=0).to(device).to(dtype=dtype)
            difficulty_dict[unique_idx] = -model.predict_weights(scores).item()

    with open(save_path, "w") as f:
        json.dump(difficulty_dict, f)

    print("CLIP difficulty generated and saved")

    return difficulty_dict

In [5]:
def get_difficulty_score(
    model_path: str, feature_extractor_setting: str, save_path: str
):
    """Load the stage-1 model and produce the matching difficulty dict.

    Args:
        model_path: Checkpoint directory passed to ``load_stage1_model``.
        feature_extractor_setting: ``"scores"`` routes to
            ``produce_scores_difficulty``; any other value routes to
            ``produce_clip_difficulty``.
        save_path: Where the chosen producer writes its JSON output.

    Returns:
        The difficulty dict returned by the chosen producer.
    """
    print("Loading stage 1 model...", flush=True)
    stage1_model = load_stage1_model(model_path, feature_extractor_setting)
    print("Model loaded.", flush=True)

    # Pick the producer first, then invoke it in a single place.
    producer = (
        produce_scores_difficulty
        if feature_extractor_setting == "scores"
        else produce_clip_difficulty
    )
    return producer(stage1_model, save_path)


def dist_filter(
    raw_annotation_path,
    difficulty_dict,
    filter_num,
    save_path,
    gamma=1,
    k_nearest=10,
    feature_path="../data/llava_clip_feature.pt",
    device="cuda",
):
    """Greedily pick the hardest samples while penalising their near neighbours.

    On each of ``filter_num`` rounds the sample with the highest remaining
    difficulty is selected. The difficulty of its ``k_nearest`` most
    cosine-similar, not-yet-selected samples is then reduced by
    ``difficulty * similarity**2 * gamma``. The selected annotation entries
    (with their ``"unique_idx"`` field removed) are written to ``save_path``
    as JSON.

    Args:
        raw_annotation_path: JSON file holding a list of annotation dicts. It
            is indexed by ``int(unique_idx)``, so list position is assumed to
            equal the sample id -- verify against the data file.
        difficulty_dict: Mapping of sample id (string) -> difficulty. It is
            not modified; the function works on a private copy.
        filter_num: Number of samples to select.
        save_path: Output JSON path.
        gamma: Multiplier applied to the neighbour penalty.
        k_nearest: Number of remaining neighbours penalised per round.
        feature_path: ``torch.save``-d mapping with keys ``"0"`` ..
            ``str(len - 1)`` -> feature tensor.
        device: Device the feature matrix is placed on.

    Raises:
        ValueError: If ``filter_num`` exceeds the number of available samples.
    """
    # Work on a copy so the caller's dict survives and the cell can be re-run.
    remaining = dict(difficulty_dict)

    if filter_num > len(remaining):
        raise ValueError(
            f"filter_num ({filter_num}) exceeds the number of available "
            f"samples ({len(remaining)})"
        )

    with open(raw_annotation_path, "r") as f:
        raw_annotation = json.load(f)
    new_annotation = []

    # NOTE(review): torch.load unpickles the file; only load trusted files.
    feat_dict = torch.load(feature_path)
    feat_len = len(feat_dict)
    feat_matrix = torch.stack(
        [feat_dict[str(i)].to(device) for i in range(feat_len)], dim=0
    )
    feat_matrix_norm = torch.norm(feat_matrix, dim=-1, keepdim=False)

    for _ in tqdm.tqdm(range(filter_num)):
        # Only the current maximum is needed, so avoid a full sort each round.
        # max() returns the first maximal item, matching what a stable
        # descending sort would put first.
        unique_idx, difficulty = max(remaining.items(), key=lambda item: item[1])
        row = int(unique_idx)

        example = raw_annotation[row]
        example.pop("unique_idx")
        new_annotation.append(example)

        remaining.pop(unique_idx)

        tgt_feat = feat_matrix[row].unsqueeze(dim=0)
        tgt_norm = feat_matrix_norm[row].unsqueeze(dim=0)

        # Cosine similarity of every sample to the one just selected.
        sims = (feat_matrix * tgt_feat).sum(dim=-1) / feat_matrix_norm / tgt_norm

        sorted_sim, indices = torch.sort(sims, descending=True)

        # Cannot penalise more neighbours than there are samples left.
        wanted = min(k_nearest, len(remaining))
        success_cnt = 0

        # Scan the whole ranking: it still lists already-selected samples, so
        # bounding the scan by len(remaining) could stop before `wanted` live
        # neighbours have been seen.
        for j in range(feat_len):
            if success_cnt >= wanted:
                break

            cur_unique_idx = str(indices[j].item())

            if cur_unique_idx not in remaining:
                continue

            cur_sim = sorted_sim[j].item()
            penalty = difficulty * (cur_sim**2) * gamma
            remaining[cur_unique_idx] -= penalty
            success_cnt += 1

        assert success_cnt == wanted, (
            "difficulty_dict contains ids that are missing from the feature file"
        )

    with open(save_path, "w") as f:
        json.dump(new_annotation, f)

    print("Annotation filtered and saved.")

In [12]:
# Stage-1 checkpoint directory (relative to the notebook's working directory).
stage1_model_path = "../checkpoint/qwen2vl_clip_lora"

# Score every sample with the CLIP-feature score net; the result is also
# written to the JSON file given as save_path.
difficulty_dict = get_difficulty_score(
    model_path=stage1_model_path,
    feature_extractor_setting="clip",
    save_path="../data/difficulty_clip_qwenvl2_lora.json",
)

Loading stage 1 model...


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Some weights of Qwen2VLForConditionalGeneration_SelfFilter were not initialized from the model checkpoint at /workspace/Self-Filter/checkpoint/Qwen2-VL-7B-Instruct and are newly initialized: ['score_net.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


score_net has been well loaded.
Model loaded.
CLIP difficulty generated and saved


In [8]:

# Select 30k samples from the difficulty scores computed in the cell that
# calls get_difficulty_score. That cell binds `difficulty_dict`; no visible
# cell defines the previously used name `difficulty`, so a fresh
# Restart & Run All would raise NameError.
dist_filter(
    raw_annotation_path = '../data/llava_instruct_80k_add_idx.json',
    difficulty_dict = difficulty_dict,
    filter_num = 30000,
    save_path = "../data/qwen2vl_filtered_lora_30k.json",
    gamma = 1,
    k_nearest = 10,
)


100%|██████████| 30000/30000 [08:12<00:00, 60.95it/s]


Annotation filtered and saved.
