In [1]:
## This file produces data for the model to use

import os
import json
from dotenv import load_dotenv
import torch
import time
import math, random
from pymongo import MongoClient
from bson.objectid import ObjectId
from pymongo.server_api import ServerApi

## Load Config
# Each JSON document under config/ is parsed into its own module-level name.
# The files are read relative to the current working directory.
with open('config/videos.json') as config_file:
    videos = json.loads(config_file.read())

with open('config/name_to_url.json') as config_file:
    name_to_url = json.loads(config_file.read())

# alltopics is indexed by topic id in the report-writing cells further down.
with open('config/alltopics.json') as config_file:
    alltopics = json.loads(config_file.read())

# Pull environment variables from the local .env file. Kept as the final
# expression of the cell so its boolean result is still displayed.
load_dotenv(".env")


True

In [2]:
from pinecone import Pinecone

# Runtime configuration shared by the query / visualisation cells below.
CUR_DIR = os.getcwd()

# Vector index client. os.environ[...] raises KeyError immediately when the
# key is missing, instead of failing later with an opaque auth error.
pc = Pinecone(api_key=os.environ["PINECONE_KEY"])
index = pc.Index("pretechnigala")

# Metadata store. The URI is read with os.environ[...] (not os.getenv) on
# purpose: os.getenv would pass None to MongoClient when MONGODB_URI is unset,
# and pymongo then falls back to a localhost connection rather than failing.
DB_NAME = "preTechnigalaClean_db"
COLLECTION_NAME = "video_metadata"
MONGO_DB_CLIENT = MongoClient(os.environ["MONGODB_URI"], server_api=ServerApi('1'))

# Reports are written below <cwd>/data/validation.
OUTPUT_DIR = f"{CUR_DIR}/data/validation"
# QUERY_MODE picks the sub-directory of query embeddings; VECTOR_MODE picks the
# "<mode>_pool" stored vectors (index filter and on-disk file name).
QUERY_MODE = "avg"
VECTOR_MODE = "avg"
PATH_TO_EMBEDDINGS = f"{OUTPUT_DIR}/embeddings/{QUERY_MODE}"

# Directory holding one sub-folder per video id with its pooled vectors.
# The default is machine-specific; set PATH_TO_DB_VECTORS in the environment
# (or in .env) to run the notebook elsewhere without editing this cell.
PATH_TO_DB_VECTORS = os.environ.get(
    "PATH_TO_DB_VECTORS",
    "/Users/klinalb/Workspaces/dartmouth/CS98/discite/rec_engine/inference/pipeline/data/outputs",
)


  from tqdm.autonotebook import tqdm


In [None]:
#### PERFORMS QUERIES BASED ON JUST THE VIDEO EMBEDDINGS
# For every saved query embedding (*.pt) in PATH_TO_EMBEDDINGS, ask the index
# for its 5 best matches among the "<VECTOR_MODE>_pool" vectors and write a
# human-readable report of the matches.

# open(..., 'w') does not create missing parent directories, so make sure the
# report directory exists first (the history/topic cells do the same).
os.makedirs(f'{OUTPUT_DIR}/video_query', exist_ok=True)

with open(f'{OUTPUT_DIR}/video_query/{VECTOR_MODE}-v_{QUERY_MODE}-q.yaml', 'w') as f:
    # os.listdir order is arbitrary; sort so the report is stable across runs.
    for embedding_file in sorted(os.listdir(PATH_TO_EMBEDDINGS)):
        if not embedding_file.endswith('.pt'):
            continue
        # Drop the first four characters and the '.pt' extension of the file name.
        name = embedding_file[4:-3]
        query = torch.load(f'{PATH_TO_EMBEDDINGS}/{embedding_file}', map_location=torch.device('cpu')).tolist()
        response = index.query(vector=query, top_k=5, include_values=True, include_metadata=True, filter={"mode": f"{VECTOR_MODE}_pool"})

        f.write(f'Querying {name}:\n')
        for i, obj in enumerate(response["matches"]):
            # Look the match up in MongoDB to recover its URL; "None" is
            # written when no document has that id.
            youtubeURL = "None"
            doc = MONGO_DB_CLIENT[DB_NAME][COLLECTION_NAME].find_one({"_id": ObjectId(obj["metadata"]["videoID"])})
            if doc:
                youtubeURL = doc["youtubeURL"]
            f.write(f'    Rank: {i+1}, Distance: {obj["score"]}\n')
            f.write(f'        Title: {obj["metadata"]["title"]}, URL: {youtubeURL}\n')
            f.write(f'        Topics: {[alltopics[topic] for topic in obj["metadata"]["topics"]]}\n')
            f.write(f'        infTopics: {obj["metadata"]["inferenceTopics"]} \n')
            f.write(f'        infComplexities: {obj["metadata"]["inferenceComplexities"]}\n')

In [None]:
## History Query TESTS
## Simulates a watch history: HISTORY_LENGTH query embeddings are sampled at
## random, averaged into a single query vector, and the index is queried with
## it. Distances between the sampled videos, the query and a fixed "sink"
## video are logged next to the matches.

ATTEMPT = 4          # used as the report file name
HISTORY_LENGTH = 2   # videos per simulated history; must be >= 2 (see distance lines below)
TOP_K = 15           # matches requested per query
QUERIES = 10         # number of random histories to try
SINK = torch.load(f'{PATH_TO_DB_VECTORS}/65d8fc3f95f306b28d1b88fe/{VECTOR_MODE}_pool.pt', map_location=torch.device('cpu'))

os.makedirs(f'{OUTPUT_DIR}/history_query/{VECTOR_MODE}-v_{QUERY_MODE}-q', exist_ok=True)
all_vectors = [] #NAME, VECTOR
for embedding_file in os.listdir(PATH_TO_EMBEDDINGS):
    if not embedding_file.endswith('.pt'):
        continue
    # Name = file name without its first four characters and the '.pt' extension.
    all_vectors.append((embedding_file[4:-3], torch.load(f'{PATH_TO_EMBEDDINGS}/{embedding_file}', map_location=torch.device('cpu'))))

## Randomly make a "watch history" by taking the average of HISTORY_LENGTH random videos
with open(f'{OUTPUT_DIR}/history_query/{VECTOR_MODE}-v_{QUERY_MODE}-q/{ATTEMPT}.yaml', 'w') as f:
    f.write(f'Parameters: Attempt {ATTEMPT}, History Length {HISTORY_LENGTH}, Top K {TOP_K}, Query Mode {QUERY_MODE}, Vector Mode {VECTOR_MODE}\n\n')
    for run in range(QUERIES):
        f.write(f'\n###################### Run {run+1} #######################\n\n')
        # NOTE: no random seed is set, so each execution samples different histories.
        history = random.sample(all_vectors, HISTORY_LENGTH)
        ## Query is the average pool of the history vectors
        query = torch.mean(torch.stack([vec for name, vec in history]), dim=0).numpy().tolist()

        # Only the first two history entries are compared here.
        f.write(f'Distance Between History 0,1: {torch.dist(history[0][1], history[1][1])}\n')
        f.write(f'Distance from sink video to 0, 1, and query: {torch.dist(SINK, history[0][1])}, {torch.dist(SINK, history[1][1])}, {torch.dist(SINK, torch.tensor(query))}\n\n')

        # Use TOP_K (not a literal) so the query matches the value recorded in
        # the "Parameters:" header of the report.
        response = index.query(vector=query, top_k=TOP_K, include_values=True, include_metadata=True, filter={"mode": f"{VECTOR_MODE}_pool"})
        f.write(f'Querying History {run}. Videos: {[name for name, vec in history]}\n')
        for i, obj in enumerate(response["matches"]):
            # "None" is written when MongoDB has no document for the match.
            youtubeURL = "None"
            doc = MONGO_DB_CLIENT[DB_NAME][COLLECTION_NAME].find_one({"_id": ObjectId(obj["metadata"]["videoID"])})
            if doc:
                youtubeURL = doc["youtubeURL"]
            f.write(f'    Rank: {i+1}, Distance: {obj["score"]}\n')
            f.write(f'        Title: {obj["metadata"]["title"]}, URL: {youtubeURL}, ID: {obj["metadata"]["videoID"]}\n')
            f.write(f'        Topics: {[alltopics[topic] for topic in obj["metadata"]["topics"]]}\n')
            f.write(f'        infTopics: {obj["metadata"]["inferenceTopics"]} \n')
            f.write(f'        infComplexities: {obj["metadata"]["inferenceComplexities"]}\n')
        print(f"Run {run+1} of {QUERIES} complete")
    

In [None]:
###### TOPIC TEST
# For every topic id in alltopics, load its saved vector from data/topics and
# report the 5 best-matching "<VECTOR_MODE>_pool" vectors in the index.
ATTEMPT = 0
os.makedirs(f'{OUTPUT_DIR}/topic_query/{VECTOR_MODE}-v_{QUERY_MODE}-q', exist_ok=True)
# The file name follows ATTEMPT (previously a literal 0), so bumping ATTEMPT
# starts a new report instead of overwriting 0.yaml.
with open(f'{OUTPUT_DIR}/topic_query/{VECTOR_MODE}-v_{QUERY_MODE}-q/{ATTEMPT}.yaml', 'w') as f:
    for topic in alltopics:
        query = torch.load(f'{CUR_DIR}/data/topics/{topic}.pt', map_location=torch.device('cpu')).tolist()
        response = index.query(vector=query, top_k=5, include_values=True, include_metadata=True, filter={"mode": f"{VECTOR_MODE}_pool"})
        f.write(f'Querying {topic}. Topic Name: {alltopics[topic]}\n')
        for i, obj in enumerate(response["matches"]):
            # "None" is written when MongoDB has no document for the match.
            youtubeURL = "None"
            doc = MONGO_DB_CLIENT[DB_NAME][COLLECTION_NAME].find_one({"_id": ObjectId(obj["metadata"]["videoID"])})
            if doc:
                youtubeURL = doc["youtubeURL"]
            f.write(f'    Rank: {i+1}, Distance: {obj["score"]}\n')
            f.write(f'        Title: {obj["metadata"]["title"]}, URL: {youtubeURL}\n')
            # Distinct name for the match's topic ids so the outer loop
            # variable is not shadowed.
            f.write(f'        Topics: {[alltopics[topic_id] for topic_id in obj["metadata"]["topics"]]}\n')
            f.write(f'        infTopics: {obj["metadata"]["inferenceTopics"]} \n')
            f.write(f'        infComplexities: {obj["metadata"]["inferenceComplexities"]}\n')

In [3]:
## Load all vectors to map from ID to vector
def load_all_vectors(mode="avg"):
    """Build a lookup from video id to its stored vector and topic labels.

    Every document in the metadata collection whose ``isVectorized`` field is
    True is visited, and the tensor saved at
    ``{PATH_TO_DB_VECTORS}/<_id>/<mode>_pool.pt`` is loaded onto the CPU.

    Args:
        mode: Prefix of the ``*_pool.pt`` file to load for each video.

    Returns:
        dict keyed by ``str(doc["_id"])``. Each value is a dict with the keys
        ``"vector"`` (the loaded tensor), ``"topics"`` (the document's
        ``topicId`` field), ``"inferenceTopics"`` and
        ``"inferenceComplexities"``.
    """
    collection = MONGO_DB_CLIENT[DB_NAME][COLLECTION_NAME]
    cpu = torch.device('cpu')

    lookup = {}
    for record in collection.find({"isVectorized": True}):
        vector_path = f'{PATH_TO_DB_VECTORS}/{record["_id"]}/{mode}_pool.pt'
        lookup[str(record["_id"])] = {
            "vector": torch.load(vector_path, map_location=cpu),
            "topics": record["topicId"],
            "inferenceTopics": record["inferenceTopics"],
            "inferenceComplexities": record["inferenceComplexities"],
        }
    return lookup

vectors = load_all_vectors("avg")


In [4]:
##### UMAP Visualizations
import numpy as np
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import umap

## A list of torch tensors 
# Collect the stored vector of every loaded video, in dict order.
umap_vectors = list(map(lambda item: item["vector"], vectors.values()))
# UMAP with library defaults; each tensor is handed over as a plain list of floats.
reducer = umap.UMAP()
embedding = reducer.fit_transform(
    list(map(lambda tensor: tensor.numpy().tolist(), umap_vectors))
)

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


: 