In [23]:
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
from pinecone import Pinecone

import json
import dotenv
import os

dotenv.load_dotenv(dotenv_path="../.env")
uri = os.getenv("MONGODB_URI")


In [24]:
client = MongoClient(uri, server_api=ServerApi('1'))
                          
# Send a ping to confirm a successful connection
try:
    client.admin.command('ping')
    print("Pinged your dceployment. You successfully connected to MongoDB!")
except Exception as e:
    print(e)

database_name = "preTechnigalaClean_db"
collection_name = "video_metadata"
db = client[database_name]
collection = db[collection_name]
PINECONE_INDEX_NAME = "pretechnigala"
PINECONE_CLIENT = Pinecone(api_key=os.getenv("PINECONE_KEY"))

Pinged your dceployment. You successfully connected to MongoDB!


In [29]:
index = PINECONE_CLIENT.Index(PINECONE_INDEX_NAME)
print(index.describe_index_stats())


{'dimension': 4096,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 380}},
 'total_vector_count': 380}


In [30]:
# GETTING ANALYTICS OF CUR STATE count true, false, and doesnt exist
count_vectorize_failed = [0, 0, 0]
count_is_vectorized = [0, 0, 0]
for doc in collection.find():
    if "isVectorized" in doc:
        if doc["isVectorized"] == True:
            count_is_vectorized[0] += 1
        elif doc["isVectorized"] == False:
            count_is_vectorized[1] += 1
        else:
            count_is_vectorized[2] += 1
    else:
        count_vectorize_failed[2] += 1
    
    if "vectorizeFailed" in doc:
        if doc["vectorizeFailed"] == True:
            count_vectorize_failed[0] += 1
        elif doc["vectorizeFailed"] == False:
            count_vectorize_failed[1] += 1
        else:
            count_vectorize_failed[2] += 1
    
count_docs = collection.count_documents({
                "$and": [ 
            {
                "$or": [
                    {"isVectorized": False},      
                    {"isVectorized": {"$exists": False}}
                ],
            },
            {
                "$or": [
                    {"vectorizeFailed": False},      
                    {"vectorizeFailed": {"$exists": False}}
                ],
            }
        ]}
)

In [31]:
print(f"count_vectorize_failed: {count_vectorize_failed}")
print(f"count_is_vectorized: {count_is_vectorized}")
print(f"count_docs: {count_docs}")

count_vectorize_failed: [4, 380, 11]
count_is_vectorized: [380, 0, 0]
count_docs: 7


In [1]:
### Clean transcripts
OUTPUT_DIR = "/home/ubuntu/InferenceTest1/rec/inference/pipeline/data"
DB_NAME = "preTechnigalaClean_db"
COLLECTION_NAME = "video_metadata"
import re
import nltk
import string
from nltk.corpus import stopwords

def clean_transcript(doc):
    """
    Clean the transcript of any unwanted characters
    Args:
        doc (dict): The document to clean
    """
    yt_id = doc["youtubeURL"].split("v=")[1]
    name = doc["_id"]
    print(f"Cleaning transcript yt_id: {yt_id}")
    # download_transcript(yt_id, f'{OUTPUT_DIR}/transcripts/raw/{name}.txt')
    with open(f'{OUTPUT_DIR}/transcripts/raw/{yt_id}.txt', 'r') as file:
        with open(f'{OUTPUT_DIR}/transcripts/clean/{yt_id}.txt', 'w') as file_clean:
            for i,line in enumerate(file):
                if i == 0:
                    continue
                ## Remove Timestamp
                pattern = r'\d+\.\d+\,\d+\.\d+\,'
                line = re.sub(pattern, '', line)
                line = line.replace("'", "")
                line = line.replace('"', "")
                ## if last char is a space, remove
                if line[-1] == " ":
                    line = line[:-1]
                line = line.replace("  ", " ")
                line = line.replace('\n', ' ')
                file_clean.write(line)

    nltk.download('stopwords')
    with open(f'{OUTPUT_DIR}/transcripts/clean/{yt_id}.txt', 'r') as file:
        with open(f'{OUTPUT_DIR}/transcripts/processed/{name}.txt', 'w') as file_processed:
            text = file.read()
            text = text.lower()
            text = text.translate(str.maketrans('', '', string.punctuation))
            stop_words = set(stopwords.words('english'))
            filtered_text = [word for word in text.split() if word not in stop_words]
            filtered_text = ' '.join(filtered_text)
            file_processed.write(text)


In [6]:
failed_documents = []
documents = collection.find()


In [7]:
from utils.transcript import download_transcript

os.makedirs("data/transcripts/raw", exist_ok=True)

for doc in documents:
    yt_id = doc["youtubeURL"].split("v=")[1]
    try:
        transcript = download_transcript(yt_id, f'data/transcripts/raw/{doc["_id"]}.txt')
    except Exception as e:
        print(e)
        failed_documents.append(doc)
        continue

Downloading transcript for oz9cEqFynHU to data/transcripts/raw/65d8fc6e95f306b28d1b8961.txt
Downloaded transcript for oz9cEqFynHU to data/transcripts/raw/65d8fc6e95f306b28d1b8961.txt
Downloading transcript for DuDz6B4cqVc to data/transcripts/raw/65d8fc1995f306b28d1b8870.txt
Downloaded transcript for DuDz6B4cqVc to data/transcripts/raw/65d8fc1995f306b28d1b8870.txt
Downloading transcript for RfXt_qHDEPw to data/transcripts/raw/65d8febe95f306b28d1b8c1d.txt
Downloaded transcript for RfXt_qHDEPw to data/transcripts/raw/65d8febe95f306b28d1b8c1d.txt
Downloading transcript for nKzEJWbkPbQ to data/transcripts/raw/65d8fc1795f306b28d1b886b.txt
Downloaded transcript for nKzEJWbkPbQ to data/transcripts/raw/65d8fc1795f306b28d1b886b.txt
Downloading transcript for -VgHk7UMPP4 to data/transcripts/raw/65d8fc1895f306b28d1b886c.txt
Downloaded transcript for -VgHk7UMPP4 to data/transcripts/raw/65d8fc1895f306b28d1b886c.txt
Downloading transcript for gXgEDyodOJU to data/transcripts/raw/65d8fc1895f306b28d1b88

In [None]:
id_to_id = {}
for doc in documents:
    id_to_id[doc["youtubeURL"].split("v=")[1]] = doc["_id"]
    

In [None]:
old_collection = client[database_name][collection_name]
new_collection = client["preTechnigalaClean1_db"][collection_name]

## copy over
for doc in old_collection.find():
    new_collection.insert_one(doc)

In [None]:
topicFile = open("../config/alltopics.json", "r")
topicNames = json.loads(topicFile.read())

In [None]:
topic_breakdown = [[] for _ in range(len(topicNames)+1)]
for doc in new_collection.find():
    for topic_id in doc["topicId"]:
        topic_breakdown[topic_id].append(doc)


In [None]:
for i in range(1,len(topicNames)):
    count = len(topic_breakdown[i])
    if count > 15:
        print(f"{i}   {topicNames[str(i)]}: {count}")
    else:
        print(f"{i}      {topicNames[str(i)]}: {count}")

In [None]:
for doc in topic_breakdown[47]:
    yt_link = doc["youtubeURL"]
    print(f'Name: {doc["title"]}\n  Link: {yt_link}\n  Topics: {[topicNames[str(i)] for i in doc["topicId"]]}')
