In [1]:
from tqdm import tqdm
from transformers import BertModel, BertTokenizer
import torch
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
text_list = [
        "a group of people in a gym with a basketball net",
        "a group of people in a gym with a basketball net",
        "a girl wearing headphones in a gym room",
        "a girl in a school uniform with headphones on",
        "a girl with headphones on in a gym",
        "a girl in a school uniform with the words'i love you '",
        "a girl with headphones standing in front of a basketball court",
        "a person standing in a gym with the words in korean",
        "a girl in headphones is sitting on a basketball court",
        "a man in a wheelchair is playing basketball in a gym",
        "a person in a wheelchair in a gym with the words'i love you'written on it",
        "a woman in a wheelchair is performing a dance",
        "a girl in a wheelchair with the words'i love you'written on it",
        "a person in a wheelchair with headphones on",
        "a young woman wearing headphones in front of a wall",
        "a black background with the words in korean and english",
        "a person in a wheelchair in a gym with other people in the background",
        "a woman holding a tennis racquet in front of a net",
        "two young men and a woman in suits talking to each other",
        "a young man with black hair stares at something",
        "a young man with black hair and a white shirt",
        "a young man with black hair is staring into the camera",
        "a young man with black hair staring at something",
        "a close up of a person with long hair",
        "a close up of an asian woman's face",
        "a close up of a young woman with dark hair",
        "an asian woman looking down at her cell phone",
        "an asian woman looks at her cell phone in a scene from the korean drama",
        "a close up of a woman's face with dark hair",
        "a young girl sitting in a wheelchair in a gym",
        "a woman wearing headphones sitting next to a dog",
        "a woman wearing headphones is petting a dog",
        "a woman sitting at a table with a dog in front of her",
        "a person doing a handstand in the air",
        "a person doing a trick on a skateboard with the sun in the background",
        "a man doing a handstand in front of the sun",
        "a man wearing headphones and a brown jacket",
        "a man with headphones on standing in front of a window",
        "a woman in a wheel chair with the words this is new normal",
        "this is new normal - official trailer [ hd ]",
        "a woman in a wheel chair with the words'the best moments'written on it",
        "a woman in a white dress is sitting on a chair",
        "a man sitting on top of a chair in front of a neon sign",
        "[ single ] this is a natural - i'm a natural",
        "the korean song is written in korean and english",
        "the korean poster for the upcoming film,'i'm not afraid to see what you",
        "a man in a wheel chair with the words'this is what i want to do '",
        "an advertisement for the korean movie, with a man in a wheel chair",
        "a man in a wheel chair with the words in korean",
        "the poster for the korean movie's upcoming film",
        "a man in a wheel chair with the words in korean"
    ]

In [3]:
search_queries = [
    "Search for the scene with a group of people in the gym.",
    "Find the moment featuring a girl wearing headphones in the gym.",
    "Locate the scene of a man in a wheelchair playing basketball.",
    "Retrieve the part where a girl sits on the basketball court.",
    "Show me the scene with a woman in a wheelchair performing a dance.",
    "Find the scene where a young man with black hair stares into the camera.",
    "Display the close-up of a young woman with dark hair.",
    "Search for the girl in a school uniform with the words 'I love you'.",
    "Locate the scene of a woman holding a tennis racquet.",
    "Show me the moment when a person is doing a handstand in the air.",
    "Retrieve the scene with a young girl sitting in a wheelchair.",
    "Find the part of a woman petting a dog while wearing headphones.",
    "Search for the woman in a wheelchair with the words 'this is new normal'.",
    "Locate the scene of a man wearing headphones standing by a window.",
    "Show me the moment of a man doing a trick on a skateboard.",
    "Retrieve the scene with the words 'the best moments' on a wheelchair.",
    "Search for the young woman looking at her cell phone.",
    "Display the poster for the upcoming Korean movie.",
    "Locate the advertisement for the Korean movie with a man in a wheelchair.",
    "Find the moment when the words 'this is what I want to do' are shown."
]

In [4]:
def cos_sim(A, B):
    return np.dot(A, B)/(np.linalg.norm(A)*np.linalg.norm(B))


model_name = 'bert-base-multilingual-cased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

def sentence_embedding(sentence):
    inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
from tqdm import tqdm

hit_thresholds = [0.6, 0.7, 0.8]  # 여러 hit_thres 값을 설정

for hit_thres in hit_thresholds:  # 각 hit_thres 값에 대해 반복
    hit_count = 0  # 각 임계값마다 초기화
    print(f"Processing for hit_thres = {hit_thres}")
    
    for query in tqdm(search_queries): 
        query_embedding = sentence_embedding(query)
        sim_list = []
        hit = False
        for scene_text in text_list:
            scene_embedding = sentence_embedding(scene_text)
            sim_list.append(cos_sim(query_embedding, scene_embedding))

            # Hit    
            if any(sim > hit_thres for sim in sim_list):
                hit = True
                break
        
        if hit:
            hit_count += 1

    print("Hit Rate : ", hit_count / len(search_queries))

Processing for hit_thres = 0.6


100%|██████████| 20/20 [00:02<00:00,  7.26it/s]


Hit Rate :  1.0
Processing for hit_thres = 0.7


100%|██████████| 20/20 [00:09<00:00,  2.06it/s]


Hit Rate :  0.85
Processing for hit_thres = 0.8


100%|██████████| 20/20 [00:16<00:00,  1.20it/s]

Hit Rate :  0.1



