In [None]:
## This notebook uploads pooled embeddings from data/embeddings to a Pinecone index and then queries the index with them

import os
import json
from dotenv import load_dotenv
import torch

## Load Config
# Read both JSON config files with an explicit UTF-8 encoding: without it,
# open() uses the platform's locale encoding, which can mis-decode non-ASCII
# text in the config on some systems.
with open('config/videos.json', encoding='utf-8') as config_file:
    videos = json.load(config_file)
with open('config/name_to_url.json', encoding='utf-8') as config_file:
    name_to_url = json.load(config_file)

# Load key/value pairs from the local .env file into the process environment
# so that os.environ lookups made later in the notebook can find them.
load_dotenv(dotenv_path=".env")

In [None]:
from pinecone import Pinecone

# Create the Pinecone client from the PINECONE_KEY environment variable
# (a missing variable raises KeyError here) and open a handle to the
# "inftest" index used by the cells below.
pinecone_api_key = os.environ["PINECONE_KEY"]
pc = Pinecone(api_key=pinecone_api_key)
index = pc.Index("inftest")

In [None]:
# Build one record per saved embedding file in data/embeddings:
#   id       -> file name without the ".pt" suffix
#   metadata -> that name plus its URL looked up in name_to_url
#   values   -> the embedding averaged over dimension 1
vectors = []
for embedding_file in os.listdir('data/embeddings'):
    if not embedding_file.endswith('.pt'):
        continue  # ignore anything that is not a saved tensor file
    name = embedding_file[:-3]  # drop the ".pt" suffix
    # A name missing from name_to_url raises KeyError here.
    metadata = {"name": name, "url": name_to_url[name]}
    # map_location="cpu" lets tensors that were saved from a GPU load on a
    # machine without that device.
    # NOTE(review): torch.load unpickles the file; only load trusted files.
    tensor = torch.load(f'data/embeddings/{embedding_file}', map_location="cpu")
    # Max-pooling alternative (torch.max over a dim returns (values, indices)):
    # max_pooled = torch.max(tensor, 1).values.squeeze(0)
    avg_pooled = torch.mean(tensor, 1).squeeze(0)
    # Tensor.tolist() gives plain Python floats directly and, unlike going
    # through .numpy(), also works for dtypes NumPy lacks (e.g. bfloat16).
    vectors.append({"values": avg_pooled.tolist(), "id": name, "metadata": metadata})


In [None]:
# Send the records in fixed-size batches rather than one request: Pinecone
# limits the size and record count of a single upsert, and high-dimensional
# vectors reach that limit quickly. An empty `vectors` list sends nothing.
# NOTE(review): 50 is a conservative guess; tune it to the index's
# dimension and metadata size.
UPSERT_BATCH_SIZE = 50
for batch_start in range(0, len(vectors), UPSERT_BATCH_SIZE):
    index.upsert(vectors=vectors[batch_start:batch_start + UPSERT_BATCH_SIZE])

In [None]:

# Query the index once per saved embedding file and print the top 5 matches.
for embedding_file in os.listdir('data/embeddings'):
    if not embedding_file.endswith('.pt'):
        continue  # ignore anything that is not a saved tensor file
    name = embedding_file[:-3]  # drop the ".pt" suffix
    # Tensor is shape[1, 968, 4096]
    # map_location="cpu" lets tensors that were saved from a GPU load on a
    # machine without that device, so .numpy() below works.
    tensor = torch.load(f'data/embeddings/{embedding_file}', map_location="cpu")
    # torch.max over a dim returns a (values, indices) named tuple; take
    # .values, since calling .squeeze() on the tuple raises AttributeError.
    # NOTE(review): the vectors built earlier in this notebook are mean-pooled
    # while this query is max-pooled; use torch.mean(tensor, 1) here if the
    # query should match the indexed representation.
    max_pooled = torch.max(tensor, 1).values
    query = max_pooled.squeeze(0).numpy().tolist()
    response = index.query(vector=query, top_k=5, include_values=True, include_metadata=True)
    print(f'Querying {name}:')
    for i, obj in enumerate(response["matches"]):
        print(f'    Rank: {i+1}, Metadata: {obj["metadata"]}, Distance:, Score: {obj["score"]}')
