In [5]:
!pip install requests \
python-telegram-bot==13.11 \
loguru \
chalice \
boto3 \
wget \
tiktoken \
openai \
langchain \
python-dotenv \
pinecone-client



In [6]:
# All imports
import os
from dotenv import load_dotenv

# other imports
import tiktoken
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from uuid import uuid4
import pinecone
from tqdm.auto import tqdm
import glob


In [4]:

# Read API credentials from the environment (after loading dotenv settings)
load_dotenv()
openai_api_key = os.environ.get("OPENAI_API_KEY")
pinecone_key = os.environ.get("PINECONE_KEY")


In [None]:

# Load YouTube data
# glob returns matches in arbitrary order; sort so that processing order
# (and therefore batch composition and progress output) is reproducible.
text_files = sorted(glob.glob('startup_failures/*.txt'))  # adjust the path as needed

# other parts of your code

# Create a function to measure the length of our text in tokens.
# cl100k_base is the encoding used by text-embedding-ada-002 (the embedding
# model in this notebook), so counts match what the model actually sees.
tokenizer = tiktoken.get_encoding('cl100k_base')


def tiktoken_len(text):
    """Return the number of tokens `text` encodes to under `tokenizer`.

    `disallowed_special=()` disables the special-token check, so text that
    happens to contain a special-token string is encoded as ordinary text
    instead of raising.
    """
    return len(tokenizer.encode(text, disallowed_special=()))

# Build the chunker; chunk_size and chunk_overlap are measured with tiktoken_len.
# Different chunking strategies: https://www.pinecone.io/learn/chunking-strategies/
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    length_function=tiktoken_len,
    chunk_overlap=20,
    chunk_size=400,
)

# Embedding model used to vectorize each text chunk
model_name = 'text-embedding-ada-002'
embed = OpenAIEmbeddings(model=model_name, openai_api_key=openai_api_key)
# Initialize the Pinecone vector database
index_name = 'youtube-retrieval-augmentation'
pinecone.init(
    api_key=pinecone_key,
    # Overridable via PINECONE_ENVIRONMENT; defaults to the original hard-coded value.
    environment=os.getenv("PINECONE_ENVIRONMENT", "asia-southeast1-gcp-free"),
)
# Create the index on first run only, so the cell is safe to re-run
# (replaces the manually toggled, commented-out create_index call).
if index_name not in pinecone.list_indexes():
    # dimension=1536 is the vector size produced by text-embedding-ada-002
    pinecone.create_index(name=index_name, metric='dotproduct', dimension=1536)
index = pinecone.GRPCIndex(index_name)


In [None]:

# Add data to the database
batch_limit = 100  # flush to Pinecone once at least this many chunks are buffered
texts = []
metadatas = []


def _flush_batch(batch_texts, batch_metadatas):
    """Embed the buffered chunks and upsert them into the Pinecone index.

    Each chunk gets a fresh random UUID as its vector id; `batch_metadatas`
    must be aligned one-to-one with `batch_texts`.
    """
    ids = [str(uuid4()) for _ in batch_texts]
    embeds = embed.embed_documents(batch_texts)
    index.upsert(vectors=zip(ids, embeds, batch_metadatas))


for file in tqdm(text_files):
    with open(file, 'r') as f:
        lines = f.readlines()
    if not lines:
        # An empty file has no title line; skip it instead of raising IndexError.
        print(f'skipping empty file {file}')
        continue
    title = lines[0].strip()  # get the title from the first line
    transcript = ' '.join(line.strip() for line in lines[1:])  # get the transcript from the remaining lines
    # File name without its extension serves as the video id
    video_id = os.path.splitext(os.path.basename(file))[0]
    metadata = {
        'video-id': video_id,
        'title': title
    }
    record_texts = text_splitter.split_text(transcript)
    record_metadatas = [{"chunk": j, "text": text, **metadata} for j, text in enumerate(record_texts)]
    texts.extend(record_texts)
    metadatas.extend(record_metadatas)
    print(f'processing {video_id} with {len(texts)}')
    if len(texts) >= batch_limit:
        print('len(texts) >= batch_limit')
        _flush_batch(texts, metadatas)
        texts = []
        metadatas = []
    print('done processing ' + video_id)

# Insert remaining data.
# This must run once AFTER the loop: flushing inside the loop without clearing
# the buffers re-embeds and re-upserts the same chunks (under new UUIDs) for
# every file, producing duplicate vectors in the index.
if texts:  # if there are any remaining texts
    _flush_batch(texts, metadatas)
    texts = []
    metadatas = []
