In [8]:
import os
import json
from dotenv import load_dotenv
import torch
import time
import math, random
from pymongo import MongoClient
from bson.objectid import ObjectId
from pymongo.server_api import ServerApi
from pinecone import Pinecone

## Load Config
with open('config/videos.json') as config_file:
    videos = json.load(config_file)
with open('config/name_to_url.json') as config_file:
    name_to_url = json.load(config_file)
with open('config/alltopics.json') as config_file:
    alltopics = json.load(config_file)
with open ('config/inferenceTopics.json') as config_file:
    inferenceTopicsJson = json.load(config_file)
with open('config/indexedtopics.json') as config_file:
    file = json.load(config_file)
    indexedtopics = {}
    for i in range(len(file)):
        obj = file[i]
        indexedtopics[obj["topic"]] = obj["subtopics"]

load_dotenv(dotenv_path=".env")

CUR_DIR = os.getcwd()
pc = Pinecone(api_key=os.environ["PINECONE_KEY"])
index = pc.Index("pretechnigala")
DB_NAME = "preTechnigalaClean_db"
DB_NAME1 = "preTechnigalaClean1_db"

COLLECTION_NAME = "video_metadata"
MONGO_DB_CLIENT = MongoClient(os.getenv("MONGODB_URI"), server_api=ServerApi('1'))
OUTPUT_DIR = f"{CUR_DIR}/data/topics"
QUERY_MODE = "avg"  
VECTOR_MODE = "avg"




In [28]:
### CREATE SOME FAKE WATCH HISTORY OF 10 VIDEOS
maintopic = 1


In [32]:
### CREATE SOME FAKE WATCH HISTORY OF 
def generateHistoryJson(doc_list, topic_id, limit):
    subtopics = indexedtopics[str(topic_id)] + [topic_id]
    docs = MONGO_DB_CLIENT[DB_NAME][COLLECTION_NAME].find(
        {"topicId": {"$in": subtopics}})
    doc_list = []
    for doc in docs:
        doc_list.append(doc)
        clipIds: list = []
        videoIds = []
        durations = []
        for i in range(limit):
            random_doc = random.choice(doc_list)
            clipIds.append(str(random_doc["clips"][1]))
            videoIds.append(str(random_doc["_id"]))
            durations.append(30)
            print(random_doc)
    return {"clipIds": clipIds, "videoIds": videoIds, "durations": durations}

SyntaxError: invalid syntax (2270914621.py, line 1)

In [54]:
good_ids = set()
for doc in MONGO_DB_CLIENT[DB_NAME][COLLECTION_NAME].find():
    good_ids.add(str(doc["_id"]))

bad_ids = set()
for doc in MONGO_DB_CLIENT[DB_NAME1][COLLECTION_NAME].find():
    if doc["_id"] not in good_ids:
        bad_ids.add(str(doc["_id"]))



In [55]:
DIR = f'{CUR_DIR}/pipeline/data/outputs'

vectors = []
for id in os.listdir(DIR):
    tensor = torch.load(f'{DIR}/{id}/avg_pool.pt', map_location=torch.device('cpu')).numpy().tolist()
    if id in good_ids:
        current_doc = MONGO_DB_CLIENT[DB_NAME][COLLECTION_NAME].find_one({"_id": ObjectId(id)})
        metadata_avg = {"mode": "avg_pool", "title": current_doc["title"], "topics": [str(topic) for topic in current_doc["topicId"]], "videoID": str(current_doc["_id"]), "inferenceTopics": [], "inferenceComplexities": []}
        vectors.append({"values": tensor, "id": f'{id}_avg', "metadata": metadata_avg})


In [56]:
# upsert 1/4
index.upsert(vectors[:100])
index.upsert(vectors[100:200])
index.upsert(vectors[200:])


{'upserted_count': 127}

In [None]:
index.delete(ids=[f'{str(bad_id)}_avg' for bad_id in bad_ids])

In [None]:
reverseInferenceTopicsJson = {}
for k,v in inferenceTopicsJson.items():
    reverseInferenceTopicsJson[v] = k
print (reverseInferenceTopicsJson)

In [None]:
## Add inferencetopicsIds field
for doc in MONGO_DB_CLIENT[DB_NAME][COLLECTION_NAME].find():
    inferenceTopicIds = []
    try:
        inferenceTopics = doc['inferenceTopics']
        inferenceTopicIds = []
        for topic in inferenceTopics:
            inferenceTopicIds.append(int(reverseInferenceTopicsJson[topic]))
        ## Add upload topicIds 
        doc['inferenceTopicIds'] = inferenceTopicIds
        MONGO_DB_CLIENT[DB_NAME][COLLECTION_NAME].update_one({'_id': ObjectId(doc['_id'])}, {"$set": doc}, upsert=False)
    except Exception as e:
        print(f"Error in doc: {doc['_id']}, {e}")
        doc['inferenceTopicIds'] = inferenceTopicIds
        MONGO_DB_CLIENT[DB_NAME][COLLECTION_NAME].update_one({'_id': ObjectId(doc['_id'])}, {"$set": doc}, upsert=False)
        continue


In [None]:
## If documentID does not have a matching one in Collection2, remove it CLEANS DB
COLLECTION2 = "inference_summary"
knownIds = set()
for doc in MONGO_DB_CLIENT[DB_NAME][COLLECTION2].find():
    knownIds.add(str(doc['_id']))
for doc in MONGO_DB_CLIENT[DB_NAME][COLLECTION_NAME].find():
    try:
        if str(doc['_id']) not in knownIds:
            print(f"Removing doc: {doc['_id']}")
            MONGO_DB_CLIENT[DB_NAME][COLLECTION_NAME].delete_one({'_id': ObjectId(doc['_id'])})

    except Exception as e:
        print(f"Error in doc: {doc['_id']}, {e}")
        continue


In [None]:
## make a copy of it in "video_metadata_backup" collection

for doc in MONGO_DB_CLIENT[DB_NAME][COLLECTION_NAME].find():
    try:
        MONGO_DB_CLIENT[DB_NAME]["video_metadata_backup"].insert_one(doc)
    except Exception as e:
        print(f"Error in doc: {doc['_id']}, {e}")
        continue


In [None]:
## For each doc, append id 65e619556bea5f71742c85e7 to 'clips'
for doc in MONGO_DB_CLIENT[DB_NAME][COLLECTION_NAME].find():
    try:
        doc['clips'] = [ObjectId("65e619556bea5f71742c85e7")]
        MONGO_DB_CLIENT[DB_NAME][COLLECTION_NAME].update_one({'_id': ObjectId(doc['_id'])}, {"$set": doc}, upsert=False)
    except Exception as e:
        print(f"Error in doc: {doc['_id']}, {e}")
        continue


In [None]:
import numpy as np

# Example arrays
array1 = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100])    
array2 = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])

# Perform the weighted addition
result = array1 + 0.1 * array2

# Renormalize to range 0-1
min_val = np.min(result)
max_val = np.max(result)
renormalized = (result - min_val) / (max_val - min_val)

print("Original Result:", result)
print("Renormalized Result:", renormalized)
