In [1]:
## This file produces data for the model to use

import os
import json
from dotenv import load_dotenv
import torch
import time
import math, random
from pymongo import MongoClient
from bson.objectid import ObjectId
from pymongo.server_api import ServerApi

## Load Config
def _read_json_config(path):
    """Parse the JSON file at ``path`` and return the decoded object."""
    with open(path) as handle:
        return json.load(handle)


videos = _read_json_config('config/videos.json')
name_to_url = _read_json_config('config/name_to_url.json')
alltopics = _read_json_config('config/alltopics.json')

# Load the .env file into the process environment; later cells read
# PINECONE_KEY and MONGODB_URI from it. Kept as the last expression so the
# notebook echoes load_dotenv's return value.
load_dotenv(dotenv_path="/home/ubuntu/InferenceTest1/rec/inference/.env")


True

In [13]:
from pinecone import Pinecone

# External service handles and run configuration shared by the query cells.
# Both credentials are read with os.environ[...] so a missing variable raises
# KeyError here; os.getenv would hand None to MongoClient, which then falls
# back to its default host instead of failing.
pc = Pinecone(api_key=os.environ["PINECONE_KEY"])
index = pc.Index("pretechnigala")
DB_NAME = "preTechnigalaClean_db"
COLLECTION_NAME = "video_metadata"
MONGO_DB_CLIENT = MongoClient(os.environ["MONGODB_URI"], server_api=ServerApi('1'))
# Root directory for the validation reports and the saved query embeddings.
OUTPUT_DIR = "/home/ubuntu/InferenceTest1/rec/inference/data/validation"
# QUERY_MODE selects which embeddings sub-directory supplies the query vectors.
QUERY_MODE = "max"
# VECTOR_MODE selects which indexed vectors are searched: the query cells
# filter on metadata mode == f"{VECTOR_MODE}_pool".
VECTOR_MODE = "max"
# Derived from OUTPUT_DIR so the two paths cannot drift apart.
PATH_TO_EMBEDDINGS = f"{OUTPUT_DIR}/embeddings/{QUERY_MODE}"

In [11]:
#### PERFORMS QUERIES BASED ON JUST THE VIDEO EMBEDDINGS
# For every saved embedding (*.pt) in PATH_TO_EMBEDDINGS, query the Pinecone
# index for the top 5 matches whose metadata mode is f"{VECTOR_MODE}_pool" and
# write a ranked, human-readable report to
# {OUTPUT_DIR}/video_query/{VECTOR_MODE}-v_{QUERY_MODE}-q.txt.
report_dir = f'{OUTPUT_DIR}/video_query'
# Create the target directory up front (the history-query cell does the same);
# otherwise open(..., 'w') raises FileNotFoundError on a fresh checkout.
os.makedirs(report_dir, exist_ok=True)

# Resolve the collection handle once instead of on every match.
video_collection = MONGO_DB_CLIENT[DB_NAME][COLLECTION_NAME]

# Explicit UTF-8 so titles with non-ASCII characters never hit a locale-dependent codec.
with open(f'{report_dir}/{VECTOR_MODE}-v_{QUERY_MODE}-q.txt', 'w', encoding='utf-8') as f:
    # os.listdir returns entries in arbitrary order; sort so reruns produce
    # reports that can be diffed against each other.
    for embedding_file in sorted(os.listdir(PATH_TO_EMBEDDINGS)):
        if not embedding_file.endswith('.pt'):
            continue
        # Display name = file name without its 4-character prefix and ".pt" suffix.
        name = embedding_file[4:-3]
        # map_location='cpu' deserialises straight onto the CPU, so tensors
        # saved from a GPU still load on a machine without CUDA.
        query = torch.load(f'{PATH_TO_EMBEDDINGS}/{embedding_file}', map_location='cpu').numpy().tolist()
        # Only scores and metadata are reported, so the match vectors are not requested.
        response = index.query(vector=query, top_k=5, include_values=False, include_metadata=True, filter={"mode": f"{VECTOR_MODE}_pool"})

        f.write(f'Querying {name}:\n')
        for i, obj in enumerate(response["matches"]):
            metadata = obj["metadata"]
            # The URL lives in MongoDB, keyed by the videoID stored in the
            # Pinecone metadata; fall back to "None" when no document is found.
            youtubeURL = "None"
            doc = video_collection.find_one({"_id": ObjectId(metadata["videoID"])})
            if doc:
                youtubeURL = doc["youtubeURL"]
            f.write(f'    Rank: {i+1}, Distance: {obj["score"]}\n')
            f.write(f'        Title: {metadata["title"]}, URL: {youtubeURL}\n')
            # Metadata stores topics as keys/indices into alltopics; show the resolved values.
            f.write(f'        Topics: {[alltopics[topic] for topic in metadata["topics"]]}\n')
            f.write(f'        infTopics: {metadata["inferenceTopics"]} \n')
            f.write(f'        infComplexities: {metadata["inferenceComplexities"]}\n')

In [14]:
# ATTEMPT only names the output file ({ATTEMPT}.yaml); bump it by hand to keep
# the report of a previous run. It is no longer mutated while the cell runs.
ATTEMPT = 0
# Number of videos averaged into one synthetic watch history.
HISTORY_LENGTH = 5
# Number of random histories queried per attempt.
QUERIES = 10

history_dir = f'{OUTPUT_DIR}/history_query/{VECTOR_MODE}-v_{QUERY_MODE}-q'
os.makedirs(history_dir, exist_ok=True)

all_vectors = [] #NAME, VECTOR
# Sorted so the candidate pool has a stable order regardless of os.listdir.
for embedding_file in sorted(os.listdir(PATH_TO_EMBEDDINGS)):
    if not embedding_file.endswith('.pt'):
        continue
    # map_location='cpu' lets GPU-saved tensors load on a CUDA-less machine.
    all_vectors.append((embedding_file[4:-3], torch.load(f'{PATH_TO_EMBEDDINGS}/{embedding_file}', map_location='cpu')))

# Check before opening the report: random.sample would raise the same
# ValueError, but only after open(..., 'w') had already truncated the file.
if len(all_vectors) < HISTORY_LENGTH:
    raise ValueError(
        f"Need at least {HISTORY_LENGTH} embeddings in {PATH_TO_EMBEDDINGS} "
        f"to build a watch history, found {len(all_vectors)}"
    )

# Resolve the collection handle once instead of on every match.
video_collection = MONGO_DB_CLIENT[DB_NAME][COLLECTION_NAME]

## Randomly make a "watch history" by taking the average of HISTORY_LENGTH random videos
# NOTE(review): sampling is unseeded, so every run draws different histories.
with open(f'{history_dir}/{ATTEMPT}.yaml', 'w', encoding='utf-8') as f:
    for run in range(QUERIES):
        history = random.sample(all_vectors, HISTORY_LENGTH)
        ## Query is the average pool of the history vectors
        query = torch.mean(torch.stack([vec for name, vec in history]), dim=0).numpy().tolist()

        # Only scores and metadata are reported, so the match vectors are not requested.
        response = index.query(vector=query, top_k=5, include_values=False, include_metadata=True, filter={"mode": f"{VECTOR_MODE}_pool"})
        f.write(f'Querying History {run}. Videos: {[name for name, vec in history]}\n')
        for i, obj in enumerate(response["matches"]):
            metadata = obj["metadata"]
            # The URL lives in MongoDB, keyed by the videoID stored in the
            # Pinecone metadata; fall back to "None" when no document is found.
            youtubeURL = "None"
            doc = video_collection.find_one({"_id": ObjectId(metadata["videoID"])})
            if doc:
                youtubeURL = doc["youtubeURL"]
            f.write(f'    Rank: {i+1}, Distance: {obj["score"]}\n')
            f.write(f'        Title: {metadata["title"]}, URL: {youtubeURL}\n')
            f.write(f'        Topics: {[alltopics[topic] for topic in metadata["topics"]]}\n')
            f.write(f'        infTopics: {metadata["inferenceTopics"]} \n')
            f.write(f'        infComplexities: {metadata["inferenceComplexities"]}\n')
        # Progress comes from the loop index, not from ATTEMPT: incrementing
        # ATTEMPT here made the counter start at ATTEMPT + 1 and run past QUERIES
        # whenever ATTEMPT was non-zero.
        print(f"Attempt {run + 1} of {QUERIES} complete")
    

Attempt 1 of 10 complete
Attempt 2 of 10 complete
Attempt 3 of 10 complete
Attempt 4 of 10 complete
Attempt 5 of 10 complete
Attempt 6 of 10 complete
Attempt 7 of 10 complete
Attempt 8 of 10 complete
Attempt 9 of 10 complete
Attempt 10 of 10 complete
