In [1]:
import os
import json
from dotenv import load_dotenv
import torch
import time
import math, random
from pymongo import MongoClient
from bson.objectid import ObjectId
from pymongo.server_api import ServerApi
from pinecone import Pinecone

## Load Config
with open('config/videos.json') as config_file:
    videos = json.load(config_file)
with open('config/name_to_url.json') as config_file:
    name_to_url = json.load(config_file)
with open('config/alltopics.json') as config_file:
    alltopics = json.load(config_file)
with open ('config/inferenceTopics.json') as config_file:
    inferenceTopicsJson = json.load(config_file)
with open('config/indexedtopics.json') as config_file:
    file = json.load(config_file)
    indexedtopics = {}
    for i in range(len(file)):
        obj = file[i]
        indexedtopics[obj["topic"]] = obj["subtopics"]

load_dotenv(dotenv_path=".env")

CUR_DIR = os.getcwd()
pc = Pinecone(api_key=os.environ["PINECONE_KEY"])
index = pc.Index("pretechnigala")
DB_NAME = "preTechnigalaClean_db"
DB_NAME1 = "preTechnigalaClean1_db"

COLLECTION_NAME = "video_metadata"
MONGO_DB_CLIENT = MongoClient(os.getenv("MONGODB_URI"), server_api=ServerApi('1'))
OUTPUT_DIR = f"{CUR_DIR}/data/topics"
QUERY_MODE = "avg"  
VECTOR_MODE = "avg"

  from tqdm.autonotebook import tqdm


In [15]:
DEFAULT_DURATION = 30
### CREATE SOME FAKE WATCH HISTORY OF LIMIT LENGTH
def generateHistoryJson(topic_id, limit):
    subtopics = indexedtopics[str(topic_id)] + [topic_id]
    docs = MONGO_DB_CLIENT[DB_NAME][COLLECTION_NAME].find(
        {"topicId": {"$in": subtopics}})
    doc_list = []
    for doc in docs:
        doc_list.append(doc)
    # pick 10 random docs from doc_list with no repeats
    clipIds = []
    videoIds = []
    durations = []
    for i in range(limit):
        doc = random.choice(doc_list)
        clipIds.append(str(doc["clips"][0]))
        videoIds.append(str(doc["_id"]))
        durations.append(DEFAULT_DURATION)
    return {"clipIds": clipIds, "videoIds": videoIds, "durations": durations}

In [18]:
topics = [1, 7, 13, 19, 25, 31, 37, 43, 49, 55, 61, 67]
histories = {}
for topic in topics:
    history = generateHistoryJson(topic, 8)
    histories[topic] = json.dumps(history)

{"clipIds": ["65e79710c819b19674dd5a57", "65e79711c819b19674dd5a5a", "65e7970ac819b19674dd5a1b", "65e796fdc819b19674dd5997", "65e796dfc819b19674dd5888", "65e7970cc819b19674dd5a29", "65e7970fc819b19674dd5a47", "65e79710c819b19674dd5a57", "65e79707c819b19674dd59fe", "65e7966dc819b19674dd5440"], "videoIds": ["65d8fcfd95f306b28d1b8a10", "65d8fd0595f306b28d1b8a19", "65d8fcf595f306b28d1b8a07", "65d8fcd395f306b28d1b89d6", "65d8fc7b95f306b28d1b8970", "65d8fcf995f306b28d1b8a0b", "65d8fcfc95f306b28d1b8a0f", "65d8fcfd95f306b28d1b8a10", "65d8fcf195f306b28d1b8a02", "65d8fcfa95f306b28d1b8a0d"], "durations": [30, 30, 30, 30, 30, 30, 30, 30, 30, 30]}


In [19]:
print(histories[13])

{"clipIds": ["65e79710c819b19674dd5a57", "65e79711c819b19674dd5a5a", "65e7970ac819b19674dd5a1b", "65e796fdc819b19674dd5997", "65e796dfc819b19674dd5888", "65e7970cc819b19674dd5a29", "65e7970fc819b19674dd5a47", "65e79710c819b19674dd5a57", "65e79707c819b19674dd59fe", "65e7966dc819b19674dd5440"], "videoIds": ["65d8fcfd95f306b28d1b8a10", "65d8fd0595f306b28d1b8a19", "65d8fcf595f306b28d1b8a07", "65d8fcd395f306b28d1b89d6", "65d8fc7b95f306b28d1b8970", "65d8fcf995f306b28d1b8a0b", "65d8fcfc95f306b28d1b8a0f", "65d8fcfd95f306b28d1b8a10", "65d8fcf195f306b28d1b8a02", "65d8fcfa95f306b28d1b8a0d"], "durations": [30, 30, 30, 30, 30, 30, 30, 30, 30, 30]}
