In [4]:
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

import dotenv
import os

# Load variables from the .env file at ../.env (path is relative to the current
# working directory), then read the MongoDB connection string from the environment.
# `uri` is None when MONGODB_URI is not defined.
dotenv.load_dotenv("../.env")
uri = os.environ.get("MONGODB_URI")


In [5]:
# Create the MongoDB client from the URI read from the environment, requesting
# server API version '1'.
client = MongoClient(uri, server_api=ServerApi('1'))

# Send a ping to confirm a successful connection.
# Best-effort check: a failure is reported but does not stop the notebook.
try:
    client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    print(e)

# Database and collection used by the following cells.
database_name = "kevininfpipe"
collection_name = "video_metadata"

Pinged your dceployment. You successfully connected to MongoDB!


In [6]:
# Resolve the working database and collection from the connected client.
db = client[database_name]
collection = db[collection_name]

# One-off backfill, kept for reference (disabled):
# if doc does not have a field called "isVectorized", add it and set it to False
# if doc does not have a field called "isClipped", add it and set it to False
# for doc in collection.find():
#     if "isVectorized" not in doc:
#         collection.update_one({"_id": doc["_id"]}, {"$set": {"isVectorized": False}})
#     if "isClipped" not in doc:
#         collection.update_one({"_id": doc["_id"]}, {"$set": {"isClipped": False}})

# Pick one document that has not been vectorized yet. find_one returns None
# when nothing matches, and the later cells index into `doc`, so say so
# explicitly instead of just printing "None".
doc = collection.find_one({"isVectorized": False})
if doc is None:
    print("No document with isVectorized == False was found; nothing to vectorize.")
else:
    print(doc)


{'_id': ObjectId('65d8fef4e8d94e84768ec9fb'), 'title': 'bytebytego', 'description': "A captivating journey into the unknown territories beneath the waves, showcasing the mysterious life forms and the uncharted territories of our planet's oceans.", 'youtubeURL': 'https://www.youtube.com/watch?v=UF9Iqmg94tk', 'uploadDate': '2024-02-23T12:00:00Z', 'uploader': '507f1f77bcf86cd799439011', 'duration': 3600, 'thumbnailURL': 'https://example-s3-link.com/thumbnail.jpg', 'topicId': '507f191e810c19729de860ea', 'clips': ['507f1f77bcf86cd799439012', '507f1f77bcf86cd799439013'], 'views': ['507f1f77bcf86cd799439014', '507f1f77bcf86cd799439015'], 'likes': ['507f1f77bcf86cd799439016', '507f1f77bcf86cd799439017'], 'dislikes': ['507f1f77bcf86cd799439018', '507f1f77bcf86cd799439019'], 'isVectorized': False, 'isClipped': False}


In [7]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Single source of truth for the checkpoint used by both tokenizer and model.
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.1"

torch.cuda.empty_cache()
use_cuda = torch.cuda.is_available()
device = "cuda:0" if use_cuda else "cpu"

# Flash Attention 2 with fp16 only makes sense on a CUDA device. When falling
# back to CPU, use the default ("eager") attention in fp32 so the CPU branch of
# `device` is actually usable instead of failing at load time.
if use_cuda:
    model_kwargs = {
        "torch_dtype": torch.float16,
        "attn_implementation": "flash_attention_2",
    }
else:
    model_kwargs = {
        "torch_dtype": torch.float32,
        "attn_implementation": "eager",
    }

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# The weights are loaded first and then moved to `device`; transformers warns
# about this ordering when Flash Attention 2 is requested (see the recorded
# output of this cell), which is expected here.
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, **model_kwargs).to(device)


  from .autonotebook import tqdm as notebook_tqdm
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00,  2.60s/it]


In [8]:
from inference import vectorize_pipeline

# Run the project's vectorization pipeline on the selected document, passing the
# already-loaded model and tokenizer together with the target device.
# NOTE(review): judging by the recorded output of this cell, the pipeline
# downloads the video transcript, runs decoding, extracts a hidden-state vector
# and writes results under ../data/technigala/<_id>/ — confirm against inference.py.
vectorize_pipeline(doc, device, model, tokenizer)

Folder Number: 65d8fef4e8d94e84768ec9fb
Downloading transcript for UF9Iqmg94tk to ../data/technigala/65d8fef4e8d94e84768ec9fb/raw_transcript.txt
Downloaded transcript for UF9Iqmg94tk to ../data/technigala/65d8fef4e8d94e84768ec9fb/raw_transcript.txt
    Tokens: 7960


[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Decoding finished: bytebytego in 68.57 seconds
cuda memory allocated: 14.00 GB cuda memory cached: 14.36 GB
cuda memory allocated: 14.00 GB cuda memory cached: 14.36 GB




Hidden states finished: bytebytego in 0.274 seconds
torch.Size([4096])
Target: bytebytego finished. Wrote to file.
cuda memory allocated: 14.00 GB cuda memory cached: 14.73 GB
Loaded vector, metadata: {'name': 'vector', 'url': 'https://www.youtube.com/watch?v=UF9Iqmg94tk', 'title': 'bytebytego', 'topics': '507f191e810c19729de860ea'}


In [9]:
from validator import VideoContent
import json

# Path of the JSON file for the current document.
# NOTE(review): the file name "bytebytego.json" is hard-coded and happens to equal
# this document's title — confirm whether it should be derived from doc["title"].
transcript_path = f"../data/technigala/{doc['_id']}/bytebytego.json"

# Read the raw JSON text; the context manager closes the file even if read() fails.
with open(transcript_path, "r") as f_transcript:
    json_data = f_transcript.read()

# Validate the JSON against the VideoContent model. `transcript` stays None when
# validation fails, so later cells can test for it instead of hitting a NameError.
transcript = None
try:
    transcript = VideoContent.parse_raw(json_data)
except Exception as e:
    print(e)
