In [7]:
import os
import tiktoken
import numpy as np
from dotenv import load_dotenv
from openai import OpenAI
from pinecone import Pinecone
from langchain_pinecone import PineconeVectorStore
from langchain.document_loaders import YoutubeLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings

# Load .env before building the client: OpenAIEmbeddings looks up OPENAI_API_KEY in the
# environment when it is constructed. Calling load_dotenv() again later is harmless.
load_dotenv()

# Pin the model explicitly so stored document vectors come from the same model as
# get_embedding()'s default. Left unset, LangChain falls back to text-embedding-ada-002,
# whose vectors have the same length but are not comparable with text-embedding-3-small.
# NOTE: vectors already upserted with the old default must be re-embedded and re-upserted.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

In [5]:
# Pull variables from a local .env file into the process environment
load_dotenv()

# API credentials; a variable that is not set comes back as None
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')

# Upstash Vector REST endpoint and token
UPSTASH_VECTOR_REST_URL = os.environ.get("UPSTASH_VECTOR_REST_URL")
UPSTASH_VECTOR_REST_TOKEN = os.environ.get("UPSTASH_VECTOR_REST_TOKEN")


In [3]:
# OpenAI SDK client, authenticated with the key read from the environment
openai_client = OpenAI(api_key=OPENAI_API_KEY)

In [4]:
# OpenRouter client (free Llama 3.1 API).
# Use this in place of `openai_client` if you don't have an OpenAI account with credits.
# It is the same OpenAI SDK client class, pointed at OpenRouter's base URL.

openrouter_client = OpenAI(
    api_key=os.environ.get('OPENROUTER_API_KEY'),
    base_url="https://openrouter.ai/api/v1",
)

In [15]:
# Count tokens with the encoding the OpenAI embedding models actually use (cl100k_base),
# so the splitter's chunk_size reflects what the embedding endpoint will see.
# p50k_base belongs to older completion models and gives different counts.
tokenizer = tiktoken.get_encoding('cl100k_base')

# Length function for the text splitter: measures text in tokens
def tiktoken_len(text):
    """Return the number of tokens the module-level `tokenizer` produces for `text`.

    `disallowed_special=()` is passed so that text which happens to contain a
    special-token string is encoded as ordinary text instead of being rejected.
    """
    return len(tokenizer.encode(text, disallowed_special=()))

# Splitter whose chunk size is measured with tiktoken_len (tokens), with no overlap
# between consecutive chunks. Separators are tried in the order listed.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    length_function=tiktoken_len,
    chunk_size=100,
    chunk_overlap=0,
)

In [5]:
def get_embedding(text, model='text-embedding-3-small'):
    """Embed `text` through `openai_client` and return the resulting vector.

    Args:
        text: Input forwarded unchanged as the request's `input`.
        model: Name of the embedding model to request.

    Returns:
        The `embedding` attribute of the first item in the response's `data`.
    """
    embedding_items = openai_client.embeddings.create(model=model, input=text).data
    return embedding_items[0].embedding

In [43]:
# Fetch the video's transcript (plus video metadata) as LangChain documents
loader = YoutubeLoader.from_youtube_url("https://www.youtube.com/watch?v=fs8mcXSJeK0", add_video_info=True)
data = loader.load()

print(data)

# Fail with a clear message instead of an IndexError on data[0] when nothing was loaded
# (presumably the case for a video without an available transcript — verify against the loader)
if not data:
    raise ValueError("YoutubeLoader returned no documents; the video may have no transcript available")

# Plain transcript text of the first (and, for this loader call, expected only) document
transcript = data[0].page_content

[Document(metadata={'source': 'fs8mcXSJeK0', 'title': 'Topic #6 is Ukraine-Russia War: Presidential Debate between Donald Trump & Kamala Harris', 'description': 'Unknown', 'view_count': 122023, 'thumbnail_url': 'https://i.ytimg.com/vi/fs8mcXSJeK0/hq720.jpg?sqp=-oaymwEmCIAKENAF8quKqQMa8AEB-AH-CYAC0AWKAgwIABABGBMgKyh_MA8=&rs=AOn4CLDsgIuUPMqw7Hi0h1IbfJnhe3nbtQ', 'publish_date': '2024-09-10 00:00:00', 'length': 540, 'author': 'WRAL'}, page_content="Here Again David mure and Lindsey Davis welcome back to this historic ABC News presidential debate tonight we're going to continue here and I want to turn to the war in Ukraine we're now two and a half years uh into this conflict Mr President it has been the position of the Biden Administration that we must defend Ukraine from Russia from Vladimir Putin to defend their sovereignty their democracy that it's in America's best interest to do so arguing that if Putin wins he may be embolden to move even further into other countries you have said you

In [44]:
# Embed the whole transcript in a single request, using get_embedding's default model
query_result = get_embedding(transcript)

In [45]:
# Inspect the transcript embedding (a long list of floats)
query_result

[-0.01580139994621277,
 -0.008311848156154156,
 0.028313620015978813,
 0.058760762214660645,
 -0.0130011523142457,
 -0.006506131961941719,
 0.02212418243288994,
 0.06311670690774918,
 0.023557644337415695,
 0.04604852572083473,
 0.05804958939552307,
 0.03429192677140236,
 -0.044670626521110535,
 -0.0324917696416378,
 0.01897946000099182,
 -0.011889942921698093,
 -0.04915991425514221,
 0.02695794403553009,
 0.015712503343820572,
 0.0036781036760658026,
 -0.01200106367468834,
 0.0179460346698761,
 -0.0022293643560260534,
 -0.004705972503870726,
 -0.00527824554592371,
 0.016657032072544098,
 0.0059227473102509975,
 -0.002334929071366787,
 -0.01115098875015974,
 0.008006265386939049,
 0.060049764811992645,
 -0.02116854302585125,
 -0.041136980056762695,
 -0.01903502084314823,
 -0.008367408066987991,
 -0.018412742763757706,
 0.029624847695231438,
 0.007250642403960228,
 0.040914736688137054,
 -0.017757128924131393,
 0.018046043813228607,
 0.0029752636328339577,
 0.019635073840618134,
 -0.003

In [46]:
# Split the loaded documents into chunks with the token-based splitter
texts = text_splitter.split_documents(data)

In [47]:
# Inspect the resulting chunks (each is a Document with metadata and page_content)
texts

[Document(metadata={'source': 'fs8mcXSJeK0', 'title': 'Topic #6 is Ukraine-Russia War: Presidential Debate between Donald Trump & Kamala Harris', 'description': 'Unknown', 'view_count': 122023, 'thumbnail_url': 'https://i.ytimg.com/vi/fs8mcXSJeK0/hq720.jpg?sqp=-oaymwEmCIAKENAF8quKqQMa8AEB-AH-CYAC0AWKAgwIABABGBMgKyh_MA8=&rs=AOn4CLDsgIuUPMqw7Hi0h1IbfJnhe3nbtQ', 'publish_date': '2024-09-10 00:00:00', 'length': 540, 'author': 'WRAL'}, page_content="Here Again David mure and Lindsey Davis welcome back to this historic ABC News presidential debate tonight we're going to continue here and I want to turn to the war in Ukraine we're now two and a half years uh into this conflict Mr President it has been the position of the Biden Administration that we must defend Ukraine from Russia from Vladimir Putin to defend their sovereignty their democracy that it's in America's best interest to do so arguing that if Putin wins he may be embolden to move even further into other countries you have said you

In [49]:
# Pinecone target for this transcript: the index name and the namespace used for its chunks
index_name = 'langchain-fastapi'
namespace = 'Ukraine-Russia War: Presidential Debate between Donald Trump & Kamala Harris'

# Initialize the LangChain vector-store handle on that index, embedding with `embeddings`
# NOTE(review): no namespace is passed here, unlike the upsert and query calls — confirm intent
vectorstore = PineconeVectorStore(embedding=embeddings, index_name=index_name)

In [50]:
# Dump every chunk (metadata, then text) between "----" divider lines
for document in texts:
    print(
        "\n\n\n----",
        f"{document.metadata} {document.page_content}",
        "\n\n\n----",
        sep="\n",
    )




----
{'source': 'fs8mcXSJeK0', 'title': 'Topic #6 is Ukraine-Russia War: Presidential Debate between Donald Trump & Kamala Harris', 'description': 'Unknown', 'view_count': 122023, 'thumbnail_url': 'https://i.ytimg.com/vi/fs8mcXSJeK0/hq720.jpg?sqp=-oaymwEmCIAKENAF8quKqQMa8AEB-AH-CYAC0AWKAgwIABABGBMgKyh_MA8=&rs=AOn4CLDsgIuUPMqw7Hi0h1IbfJnhe3nbtQ', 'publish_date': '2024-09-10 00:00:00', 'length': 540, 'author': 'WRAL'} Here Again David mure and Lindsey Davis welcome back to this historic ABC News presidential debate tonight we're going to continue here and I want to turn to the war in Ukraine we're now two and a half years uh into this conflict Mr President it has been the position of the Biden Administration that we must defend Ukraine from Russia from Vladimir Putin to defend their sovereignty their democracy that it's in America's best interest to do so arguing that if Putin wins he may be embolden to move even further into other countries you have said you would solve this war 24 h

In [None]:
# Upsert into the vector database with Upstash
# from upstash_vector import Vector
# 
# from upstash_vector import Index
# 
# index = Index(url=UPSTASH_VECTOR_REST_URL, token=UPSTASH_VECTOR_REST_TOKEN)
# 
# vectors = []
# 
# vectors.append(Vector(vector=embeddings))
# 
# vectors

In [54]:
# Embed each chunk (prefixed with its source id and title) and upsert it into the
# Pinecone index under `namespace`
vectorstore_from_texts = PineconeVectorStore.from_texts(
    [
        f"Source: {chunk.metadata['source']}, Title: {chunk.metadata['title']} \n\nContent: {chunk.page_content}"
        for chunk in texts
    ],
    embeddings,
    namespace=namespace,
    index_name=index_name,
)

In [52]:
# Initialize the Pinecone SDK client
pc = Pinecone(api_key=PINECONE_API_KEY)

# Connect to the same index the chunks were upserted into. Reusing `index_name` (defined
# in the vector-store cell) instead of repeating the literal keeps the two from drifting apart.
pinecone_index = pc.Index(index_name)

In [60]:
# Natural-language question to search the transcript chunks with
query = "What did Vice President Kamala Harris say about the war in Ukraine?"

In [61]:
# Embed the query with the same model the vector store uses for the documents.
# Taking the model name from `embeddings` keeps query and document vectors in one
# embedding space; vectors from different models can share a dimension (so the index
# accepts them) yet are not comparable, which makes the similarity scores meaningless.
raw_query_embedding = openai_client.embeddings.create(
    input=[query],
    model=embeddings.model,
)

# One input was sent, so its embedding is the first item of the response
query_embedding = raw_query_embedding.data[0].embedding

In [62]:
# Inspect the query embedding (a long list of floats)
query_embedding

[-0.04447929561138153,
 -0.022471750155091286,
 0.01053429301828146,
 0.07448383420705795,
 -0.009901285171508789,
 0.0018198953475803137,
 0.012217036448419094,
 0.037157513201236725,
 0.04945895075798035,
 0.028822921216487885,
 0.01332479901611805,
 -0.043023381382226944,
 -0.04528110474348068,
 -0.05266618728637695,
 -0.027050502598285675,
 0.035406194627285004,
 -0.059798069298267365,
 0.014094957150518894,
 0.008292392827570438,
 0.014886216260492802,
 -0.0031017346773296595,
 0.03146045282483101,
 -0.011119823902845383,
 0.016679735854268074,
 0.011879432946443558,
 -0.051231373101472855,
 0.010755845345556736,
 -0.01847325637936592,
 -0.03734741732478142,
 0.004863604437559843,
 0.032789766788482666,
 -0.02204974554479122,
 -0.028316516429185867,
 0.03789602220058441,
 -0.020519979298114777,
 0.022366249933838844,
 0.0003804636071436107,
 0.010070087388157845,
 0.04941675066947937,
 0.025636786594986916,
 -0.02724040485918522,
 -0.0338025763630867,
 0.01731274276971817,
 0.0495

In [8]:
# Look up the 10 stored vectors closest to the query embedding within `namespace`,
# asking for each match's stored metadata as well.
# Requires the cell that defines `pinecone_index` to have been run in this kernel.
top_matches = pinecone_index.query(
    namespace=namespace,
    vector=query_embedding,
    top_k=10,
    include_metadata=True,
)

NameError: name 'pinecone_index' is not defined

In [65]:
# Inspect the raw query response (matches with ids and metadata)
top_matches

{'matches': [{'id': '1d5a10ff-a36c-446d-8618-4eaf0f90c6a9',
              'metadata': {'text': 'Source: fs8mcXSJeK0, Title: Topic #6 is '
                                   'Ukraine-Russia War: Presidential Debate '
                                   'between Donald Trump & Kamala Harris \n'
                                   '\n'
                                   'Content: Here Again David mure and Lindsey '
                                   'Davis welcome back to this historic ABC '
                                   "News presidential debate tonight we're "
                                   'going to continue here and I want to turn '
                                   "to the war in Ukraine we're now two and a "
                                   'half years uh into this conflict Mr '
                                   'President it has been the position of the '
                                   'Biden Administration that we must defend '
                                   'Uk