# Setup

In [None]:
# Optional overrides. While these stay None, the setup cell further down
# replaces them with default paths derived from the current working directory.
metadata_path = None  # location of the metadata JSON file
save_dir = None  # directory that receives the encoded outputs

In [None]:
import os
import json
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import save_npz, load_npz, vstack, csr_matrix
import pickle

In [None]:
# Resolve the working directory and its parent once; both defaults hang off them.
dir_path = os.getcwd()
parent_dir_path = os.path.dirname(dir_path)

# Keep any value supplied by the parameter cell, otherwise fall back to the
# conventional locations next to / inside the working directory.
metadata_path = metadata_path or f'{parent_dir_path}/final_metadata.json'
save_dir = save_dir or f'{dir_path}/metadata_encoded'

# The output directory must exist before any artifact is written into it.
os.makedirs(save_dir, exist_ok=True)

# Load and Prepare Data

In [None]:
def load_metadata(file_path):
    """Read a JSON file and return the parsed object.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file content.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # Decode explicitly as UTF-8: without it, open() uses the platform's
    # locale encoding, which mangles or rejects non-ASCII text on systems
    # whose default is not UTF-8.
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

# Parse the metadata file once; later cells build their text corpora from it.
metadata = load_metadata(metadata_path)

In [None]:
def prepare_data(metadata):
    """Turn per-frame metadata into index-aligned text corpora.

    Each frame entry is read as a mapping with these optional keys:
    ``'detection'`` (holding ``'objects'`` and ``'counts'``), ``'ocr'`` and
    ``'tags'``. Missing or ``None`` values are treated as "no data" so a
    single incomplete frame does not abort the whole run.

    Args:
        metadata: Mapping of frame id to that frame's metadata mapping.

    Returns:
        A tuple ``(ocr_texts, object_texts, tag_texts, frame_ids)`` of four
        lists with one entry per frame, in the iteration order of
        ``metadata``. Note the order: OCR first, then objects, then tags.
    """
    object_texts = []
    ocr_texts = []
    tag_texts = []
    frame_ids = []

    for frame_id, frame_data in metadata.items():
        detection = frame_data.get('detection') or {}
        objects = detection.get('objects') or {}
        counts = detection.get('counts') or {}

        # Repeat each object name once per detected instance so that term
        # frequency reflects the instance count. An object without a count
        # entry is counted once instead of raising KeyError.
        tokens = []
        for obj in objects:
            tokens.extend([obj] * counts.get(obj, 1))
        object_texts.append(' '.join(tokens))

        # `or` also covers keys that are present but explicitly None.
        ocr_texts.append(frame_data.get('ocr') or '')
        tag_texts.append(' '.join(frame_data.get('tags') or []))
        frame_ids.append(frame_id)

    return ocr_texts, object_texts, tag_texts, frame_ids

# Build the per-frame corpora; prepare_data appends one entry per frame to
# every list, so position i in each list refers to frame_ids[i].
ocr_texts, object_texts, tag_texts, frame_ids = prepare_data(metadata)
print(f"Prepared data for {len(frame_ids)} frames")

Prepared data for 46996 frames


# TF-IDF Vectorization

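* `{name}_vectorizer.pkl`:
    * contains the pickled, fitted `TfidfVectorizer`, including its vocabulary and IDF (Inverse Document Frequency) weights
    * Example (selected attributes of the fitted vectorizer)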
```
{
    'vocabulary_': {'person': 0, 'tie': 1, 'car': 2, ...},
    'idf_': [1.2, 1.5, 1.8, ...],
    ...
}
```
* `{name}_vectors.npz`:
    * contains the TF-IDF feature vector of each frame (one row per frame), stored as a sparse matrix
    * Example (when converted to a dense matrix)
```
[
  [0.5, 0.8, 0.0, ...],
  [0.0, 0.3, 0.6, ...],
  ...
]
```

In [None]:
def create_tfidf_vectors(texts, name, output_dir):
    """Fit a TF-IDF model on ``texts`` and write the results to ``output_dir``.

    Two files may be written: ``{name}_vectorizer.pkl`` (the pickled fitted
    vectorizer, only when fitting succeeds) and ``{name}_vectors.npz`` (the
    sparse document-term matrix, always).

    Args:
        texts: Sequence of documents, one per row of the resulting matrix.
        name: Prefix used in the output file names and in log messages.
        output_dir: Existing directory that receives the output files.

    Returns:
        ``(vectorizer, vectors)``. If fitting raises ``ValueError`` the
        vectorizer is ``None`` and the matrix has ``len(texts)`` rows and
        zero columns.
    """
    model = TfidfVectorizer(min_df=1)
    try:
        matrix = model.fit_transform(texts)
    except ValueError as err:
        # Fitting failed: report it and fall back to an empty feature space
        # so callers still receive a matrix with one row per document.
        print(f"Warning: Unable to create vectors for {name}. Error: {str(err)}")
        model = None
        matrix = csr_matrix((len(texts), 0))
    else:
        # Only a successfully fitted vectorizer is worth persisting.
        pickle_path = os.path.join(output_dir, f'{name}_vectorizer.pkl')
        with open(pickle_path, 'wb') as handle:
            pickle.dump(model, handle)

    save_npz(os.path.join(output_dir, f'{name}_vectors.npz'), matrix)

    print(f"{name.capitalize()} vectorization complete. Shape: {matrix.shape}")

    return model, matrix

# Fit one independent TF-IDF model per text source. Each call writes its
# outputs into save_dir under the given name prefix; a vectorizer comes back
# as None when fitting that source failed.
ocr_vectorizer, ocr_vectors = create_tfidf_vectors(ocr_texts, 'ocr', save_dir)
object_vectorizer, object_vectors = create_tfidf_vectors(
    object_texts, 'object', save_dir)
tag_vectorizer, tag_vectors = create_tfidf_vectors(tag_texts, 'tag', save_dir)

Ocr vectorization complete. Shape: (46996, 0)
Object vectorization complete. Shape: (46996, 90)
Tag vectorization complete. Shape: (46996, 0)


# Test

In [None]:
def search_similar_frames(query, vectorizers, vectors, frame_ids, top_k=5):
    """Rank frames by their summed similarity to a text query.

    For every ``(vectorizer, matrix)`` pair the query is transformed with the
    vectorizer and scored against each matrix row by dot product. The
    per-source scores are then added up.

    Args:
        query: Query text handed to each vectorizer's ``transform``.
        vectorizers: Sequence of fitted vectorizers; ``None`` entries mark
            sources without a usable model and are skipped.
        vectors: Sequence of sparse matrices paired by position with
            ``vectorizers``; each needs one row per entry of ``frame_ids``.
        frame_ids: Frame identifiers, aligned with the matrix rows.
        top_k: Maximum number of results. Values of zero or below yield an
            empty list.

    Returns:
        A list of at most ``top_k`` dicts with keys ``'frame_id'`` and
        ``'similarity'``, best match first. Empty when no source is usable.
    """
    # Guard explicitly: slicing with [-0:] would select the whole array and
    # return every frame instead of none.
    if top_k <= 0:
        return []

    similarities = []
    for vectorizer, vector in zip(vectorizers, vectors):
        # Compare against None rather than relying on object truthiness.
        if vectorizer is None:
            continue
        query_vector = vectorizer.transform([query])
        similarities.append(vector.dot(query_vector.T).toarray().flatten())

    if not similarities:
        return []

    total_similarity = np.sum(similarities, axis=0)
    # argsort is ascending: take the last top_k entries and reverse them.
    top_indices = total_similarity.argsort()[-top_k:][::-1]

    return [
        {'frame_id': frame_ids[idx], 'similarity': total_similarity[idx]}
        for idx in top_indices
    ]


# Smoke test: run one query against all three text sources at once.
query = "person tie"
results = search_similar_frames(
    query,
    # Vectorizers and matrices are paired by position, so both lists must
    # keep the same source order.
    [object_vectorizer, ocr_vectorizer, tag_vectorizer],
    [object_vectors, ocr_vectors, tag_vectors],
    frame_ids
)

# top_k is left at its default of 5, which is what the heading states.
print(f"Top 5 frames similar to '{query}':")
for result in results:
    print(
        f"Frame ID: {result['frame_id']}, Similarity: {result['similarity']}")

Top 5 frames similar to 'person tie':
Frame ID: L01_V005_023928, Similarity: 1.0000000000000002
Frame ID: L01_V003_004664, Similarity: 1.0000000000000002
Frame ID: L01_V008_027233, Similarity: 1.0000000000000002
Frame ID: L01_V008_026447, Similarity: 1.0000000000000002
Frame ID: L01_V008_026428, Similarity: 1.0000000000000002
