In [6]:

import os
import json
from dotenv import load_dotenv
import torch
import time
from pymongo import MongoClient
from bson.objectid import ObjectId
from pymongo.server_api import ServerApi

## Load Config
def _load_json(path):
    """Return the object decoded from the JSON file at ``path``."""
    with open(path) as handle:
        return json.load(handle)


videos = _load_json('config/videos.json')
name_to_url = _load_json('config/name_to_url.json')
alltopics = _load_json('config/alltopics.json')

# Load environment variables from the project's .env file. Kept as the last
# expression so the boolean returned by load_dotenv is shown as the cell output.
load_dotenv(dotenv_path="/home/ubuntu/InferenceTest1/rec/inference/.env")


True

In [7]:
from pinecone import Pinecone

# Service clients used by the rest of the notebook.
#
# Both credentials are read with os.environ[...] so a missing variable raises
# KeyError right here. Reading the MongoDB URI with os.getenv would pass None to
# MongoClient, which then targets the default localhost host and only fails
# later, when the first query times out.
pc = Pinecone(api_key=os.environ["PINECONE_KEY"])
index = pc.Index("pretechnigala")

# Database / collection that hold the per-video metadata documents.
DB_NAME = "preTechnigalaClean_db"
COLLECTION_NAME = "video_metadata"
MONGO_DB_CLIENT = MongoClient(os.environ["MONGODB_URI"], server_api=ServerApi('1'))


In [13]:
# Numbered output folder used for this run.
folder_no = 36
# folder_no = len(os.listdir(f'data/outputs'))
shots = ["mixtral8x7b", "full-stack"]

# Input locations.
transcripts_dir = "data/transcripts/processed"
prompt_path = "data/prompts/prompt.txt"
oneshots_dir = "data/oneshots"

# Output locations; the three sub-folders are created up front if missing.
outputs_dir = f'data/outputs/{folder_no}'
path_to_embeddings = f'{outputs_dir}/embeddings'
for subfolder in ("text", "embeddings", "results"):
    os.makedirs(f'{outputs_dir}/{subfolder}', exist_ok=True)

# Substring used to select embedding files by name.
mode = "max"
# Records collected for upload.
vectors = []



In [11]:
# Prepare embeddings for upload
#
# Builds one upsert record per embedding file in `path_to_embeddings` whose
# name ends in '.pt' and contains `mode`.

# Start from an empty list so re-running this cell never queues the same
# embedding twice.
vectors = []

# sorted() makes the processing order independent of filesystem listing order.
for embedding_file in sorted(os.listdir(path_to_embeddings)):
    if not embedding_file.endswith('.pt'):
        continue
    if mode not in embedding_file:
        continue

    # Strip the first 4 characters and the '.pt' suffix to get the video name.
    # NOTE(review): presumably the 4-character prefix is the pooling mode plus a
    # separator (e.g. 'max_') -- confirm against the file naming scheme.
    name = embedding_file[4:-3]
    metadata = {"name": name, "url": name_to_url[name], "attempt_no": folder_no}

    # map_location='cpu' deserializes straight into host memory, so tensors that
    # were saved from a GPU can be loaded on a machine without CUDA and nothing
    # is allocated on the device (hence no cache to empty afterwards).
    values = torch.load(f'{path_to_embeddings}/{embedding_file}', map_location='cpu').numpy().tolist()
    vectors.append({"values": values, "id": name, "metadata": metadata})
    print(f'Loaded {name}, metadata: {metadata}')

Loaded dpo, metadata: {'name': 'dpo', 'url': 'https://www.youtube.com/watch?v=E5kzAbD8D0w', 'attempt_no': 15}
Loaded full-stack, metadata: {'name': 'full-stack', 'url': 'https://www.youtube.com/watch?v=lauywdXKEXI', 'attempt_no': 15}
Loaded hashing, metadata: {'name': 'hashing', 'url': 'https://www.youtube.com/watch?v=p6wwj0ozifw', 'attempt_no': 15}
Loaded localized_deployment, metadata: {'name': 'localized_deployment', 'url': 'https://www.youtube.com/watch?v=J0NuOlA2xDc', 'attempt_no': 15}
Loaded mlops_llm_eval, metadata: {'name': 'mlops_llm_eval', 'url': 'https://www.youtube.com/watch?v=00AQjBDM46Q', 'attempt_no': 15}
Loaded mixtral8x7b, metadata: {'name': 'mixtral8x7b', 'url': 'https://www.youtube.com/watch?v=UiX8K-xBUpE', 'attempt_no': 15}
Loaded react, metadata: {'name': 'react', 'url': 'https://www.youtube.com/watch?v=HyWYpM_S-2c', 'attempt_no': 15}
Loaded react_svelte, metadata: {'name': 'react_svelte', 'url': 'https://www.youtube.com/watch?v=MnpuK0MK4yo', 'attempt_no': 15}
Load

In [12]:
# Upload

# Send every prepared record to the index in a single upsert call. The response
# is left as the last expression so it is displayed as the cell output.
upsert_response = index.upsert(vectors=vectors)
upsert_response

{'upserted_count': 13}

<!-- inferenceComplexities: [ "0.61", "0.69", "0.53" ]
inferenceTopics: [ "Algorithms and Data Structures", "Web Development and Internet Technologies", "Computer Graphics and Visualization" ]
title: "Data structures: Introduction to graphs"
topics: [ "3", "1" ]
type: "avg_pool"
videoID: "65d8fc1895f306b28d1b886d" -->

In [19]:
# Query the index with each `query_mode` embedding and write the top matches,
# enriched with the video URL looked up in MongoDB, to a text report.
query_mode = "max"
vector_mode = "avg"

# The collection handle does not change between matches; resolve it once.
video_metadata_collection = MONGO_DB_CLIENT[DB_NAME][COLLECTION_NAME]

# Explicit UTF-8 so titles with non-ASCII characters do not depend on the locale.
with open(f'{outputs_dir}/results/{vector_mode}-v_{query_mode}-q.txt', 'w', encoding='utf-8') as f:
    # sorted() keeps the report order stable across runs and filesystems.
    for embedding_file in sorted(os.listdir(path_to_embeddings)):
        if not embedding_file.endswith('.pt'):
            continue
        if query_mode not in embedding_file:
            continue
        # Strip the first 4 characters and the '.pt' suffix to get the video name.
        name = embedding_file[4:-3]
        # map_location='cpu' lets GPU-saved tensors load without a CUDA device.
        query = torch.load(f'{path_to_embeddings}/{embedding_file}', map_location='cpu').numpy().tolist()
        # NOTE(review): the sample record shown earlier in this notebook stores the
        # pooling label under 'type', not 'mode' -- confirm the filter key.
        # NOTE(review): the returned vector values are not read in this cell;
        # include_values could be False if nothing else inspects `response`.
        response = index.query(
            vector=query,
            top_k=5,
            include_values=True,
            include_metadata=True,
            filter={"mode": f"{vector_mode}_pool"},
        )

        f.write(f'Querying {name}:\n')
        for i, obj in enumerate(response["matches"]):
            # Fall back to the literal "None" when the video has no metadata
            # document or the document has no 'youtubeURL' field.
            youtubeURL = "None"
            doc = video_metadata_collection.find_one({"_id": ObjectId(obj["metadata"]["videoID"])})
            if doc:
                youtubeURL = doc.get("youtubeURL", "None")
            f.write(f'    Rank: {i+1}, Distance: {obj["score"]}\n')
            f.write(f'        Title: {obj["metadata"]["title"]}, URL: {youtubeURL}\n')
            f.write(f'        Topics: {[alltopics[topic] for topic in obj["metadata"]["topics"]]}\n')
            f.write(f'        infTopics: {obj["metadata"]["inferenceTopics"]} \n')
            f.write(f'        infComplexities: {obj["metadata"]["inferenceComplexities"]}\n')