<a href="https://colab.research.google.com/github/tractorjuice/BSHR_Loop/blob/main/demo02/Building_David_Shapiro_AI_Body_of_Knowledge_Part_3_Upsert_YouTube_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# David Shapiro AI Body of Knowledge Using Langchain & OpenAI
## Part 3, create the vector database

This example shows how to create and query an internal knowledge base using ChatGPT.

This does not require a GPU runtime.

### Runtime Checks

In [None]:
try:
  gpu_info = !nvidia-smi
except:
  print('No GPU')
else:
  gpu_info = '\n'.join(gpu_info)
  if gpu_info.find('failed') >= 0:
    print('Not connected to a GPU')
  print(gpu_info)

In [None]:
from psutil import virtual_memory

# Total memory reported by psutil, converted from bytes to decimal gigabytes.
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# 20 GB is the cut-off this notebook uses to call a runtime "high-RAM".
ram_verdict = 'You are using a high-RAM runtime!' if ram_gb >= 20 else 'Not using a high-RAM runtime'
print(ram_verdict)

## Set Up


### Mount Google Drive

In [None]:
# Mount Google Drive at /content/gdrive; the knowledge-base folder used by the
# following cells lives underneath this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
import os

# Root folder on the mounted Google Drive that holds the whole knowledge base.
KB_FOLDER = "/content/gdrive/Shareddrives/AI/DavidShapiroKB"

# --- YouTube material ---
YT = os.path.join(KB_FOLDER, "youtube")  # Root folder for everything derived from YouTube videos
YT_DATASTORE = os.path.join(YT, "datastore")  # FAISS datastore files (index.faiss / index.pkl)
YT_AUDIO = os.path.join(YT, "audio")  # Audio files and the videos.txt list of video ids
YT_TRANSCRIPTS = os.path.join(YT_AUDIO, "transcripts")  # Transcripts of the audio files
YT_TRANSCRIPTS_TEXT = os.path.join(YT_TRANSCRIPTS, "full_text")  # Plain full-text transcripts
YT_TRANSCRIPTS_WHISPER = os.path.join(YT_TRANSCRIPTS, "whisper_chunks")  # Whisper JSON output with timestamped chunks

# --- Podcast material (same layout, except transcripts sit beside audio) ---
PODCAST = os.path.join(KB_FOLDER, "podcast")  # Root folder for everything derived from podcasts
PODCAST_DATASTORE = os.path.join(PODCAST, "datastore")  # Datastore files for podcasts
PODCAST_AUDIO = os.path.join(PODCAST, "audio")  # Podcast audio files
PODCAST_TRANSCRIPTS = os.path.join(PODCAST, "transcripts")  # Transcripts of the podcast audio
PODCAST_TRANSCRIPTS_TEXT = os.path.join(PODCAST_TRANSCRIPTS, "full_text")  # Plain full-text transcripts
PODCAST_TRANSCRIPTS_WHISPER = os.path.join(PODCAST_TRANSCRIPTS, "whisper_chunks")  # Whisper chunks of the podcast audio

# Create every folder up front. exist_ok=True keeps the cell safe to re-run and
# avoids the race between checking for a directory and creating it. The two
# podcast transcript sub-folders are included so that all declared paths exist.
for _kb_dir in (
    KB_FOLDER,
    YT,
    YT_DATASTORE,
    YT_AUDIO,
    YT_TRANSCRIPTS,
    YT_TRANSCRIPTS_TEXT,
    YT_TRANSCRIPTS_WHISPER,
    PODCAST,
    PODCAST_DATASTORE,
    PODCAST_AUDIO,
    PODCAST_TRANSCRIPTS,
    PODCAST_TRANSCRIPTS_TEXT,
    PODCAST_TRANSCRIPTS_WHISPER,
):
    os.makedirs(_kb_dir, exist_ok=True)

Install required dependencies

In [None]:
!pip install -q langchain
!pip install -q openai
!pip install -q tiktoken

Set up OPENAI_API_KEY and necessary variables

In [None]:
import os
from getpass import getpass

# Keep the secret out of the notebook: reuse OPENAI_API_KEY when the runtime
# already provides it, otherwise prompt for it without echoing the input.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# Model identifier; uncomment exactly one alternative to switch.
# NOTE(review): MODEL is not referenced by the cells visible in this part of
# the notebook - presumably consumed by a later querying step; confirm.
#MODEL = "gpt-3"
#MODEL = "gpt-3.5-turbo"
#MODEL = "gpt-3.5-turbo-0613"
#MODEL = "gpt-3.5-turbo-16k"
MODEL = "gpt-3.5-turbo-16k-0613"
#MODEL = "gpt-4"
#MODEL = "gpt-4-0613"
#MODEL = "gpt-4-32k-0613"

Initialise preferred vectorstore

In [None]:
vectorstore = 'FAISS' # Set to 'Pinecone' or 'FAISS' for the vector database. If using FAISS, no GPU required

In [None]:
if vectorstore == 'Pinecone':
    !pip install -q pinecone-client
    from langchain.vectorstores import Pinecone
    from tqdm.auto import tqdm
    import pinecone

    # initialize pinecone
    pinecone.init(
        api_key="a7c950e0-95b0-49db-a614-b8cb97a9af2a",  # find at app.pinecone.io
        environment="us-west4-gcp-free"  # next to api key in console
        )

    index_name = "knowledge" # Put your Pincecone index name here
    name_space = "wardleykb" # Put your Pincecone namespace here

else:
    !pip install -q faiss-cpu
    from langchain.vectorstores import FAISS

# Build the datastore

## Split text and create chunks, create metadata and upsert embeddings to vectorstore

In [None]:
import json
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
import tiktoken

In [None]:
#Required for YouTube title and author extraction
!pip install -q pytube
import pytube

### Upsert embeddings to preferred vector store

In [None]:
def _read_video_ids(path):
    """Return the video ids listed one per line in ``path``.

    Surrounding whitespace is stripped and blank lines are skipped, so a file
    without a trailing newline no longer loses the last character of its final
    id and empty lines no longer produce empty ids.
    """
    with open(path, 'r') as ids_file:
        return [line.strip() for line in ids_file if line.strip()]


def _fetch_video_details(url):
    """Return ``(title, author)`` for a YouTube URL, using "" for anything unavailable."""
    try:
        yt = pytube.YouTube(url)
    except Exception as err:
        # The lookup only enriches the metadata, so a failure must not abort the upsert.
        print(f'Could not look up video details for {url}: {err}')
        return "", ""
    try:
        title = yt.title
    except Exception:
        title = ""
    try:
        author = yt.author
    except Exception:
        author = ""
    return title, author


text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0, separator="\n")
embeddings = OpenAIEmbeddings()
embeddings_file = f'{YT_DATASTORE}/embeddings.json'  # Not used in this cell; kept in case other cells rely on it

unique_video_ids = _read_video_ids(f'{YT_AUDIO}/videos.txt')
total_videos = len(unique_video_ids)

# NOTE(review): re-running this cell embeds every transcript again and merges
# it into the saved FAISS index a second time - confirm whether that is intended.
for counter, video_id in enumerate(unique_video_ids, start=1):
    transcript_filename = f'{YT_TRANSCRIPTS_WHISPER}/' + video_id + '_large.txt'
    url = "https://www.youtube.com/watch?v=" + video_id
    try:
        transcript_file = open(transcript_filename, 'r')
    except OSError:
        print(f'{counter} of {total_videos}: File does not exist {transcript_filename}, skipping.')
        continue

    print(f'{counter} of {total_videos}: Loading {transcript_filename} ......\n')
    # The context manager closes the transcript once it has been parsed.
    with transcript_file:
        transcription = json.load(transcript_file)

    # Keep only the chunks that carry a start timestamp.
    texts = []
    start_times = []
    for chunk in transcription['chunks']:
        chunk_start = chunk['timestamp'][0]
        if chunk_start is not None:
            texts.append(chunk['text'])
            start_times.append(int(chunk_start))

    video_title, video_author = _fetch_video_details(url)

    # Emit exactly one metadata dict per split so docs and metadatas stay
    # aligned even when a chunk is split into several pieces (or into none).
    docs = []
    metadatas = []
    for chunk_text, start_time in zip(texts, start_times):
        for split in text_splitter.split_text(chunk_text):
            docs.append(split)
            metadatas.append({ "source": "YouTube", "source_video": video_id, "start_time": start_time, "title": video_title, "author": video_author})

    if not docs:
        print(f'{counter} of {total_videos}: No timestamped text in {transcript_filename}, skipping.')
        continue

    try:
        if vectorstore == 'Pinecone':
            vector_store = Pinecone.from_texts(docs, embeddings, metadatas=metadatas, index_name=index_name, namespace=name_space)
        else:
            vector_store = FAISS.from_texts(docs, embeddings, metadatas=metadatas)
            if os.path.exists(f"{YT_DATASTORE}/index.faiss"):
                # Merge into the index already on disk, then write it back.
                existing_index = FAISS.load_local(YT_DATASTORE, embeddings)
                existing_index.merge_from(vector_store)
                existing_index.save_local(YT_DATASTORE)
            else:
                # First video: writes index.faiss and index.pkl into the datastore folder.
                vector_store.save_local(YT_DATASTORE)
    except Exception as err:
        # Report the cause instead of swallowing it, then carry on with the next video.
        print(f"Error upserting data into the vectorstore: {err}\n")

## Store the chunks for processing later

In [None]:
import json

# Read the video ids, one per line. Stripping whitespace and skipping blank
# lines avoids truncating the final id when the file has no trailing newline
# and avoids producing empty ids.
with open(f'{YT_AUDIO}/videos.txt', 'r') as ids_file:
    unique_video_ids = [line.strip() for line in ids_file if line.strip()]

total_videos = len(unique_video_ids)

# For every video with a Whisper transcript, write its full text to a plain file.
for counter, video_id in enumerate(unique_video_ids, start=1):
    transcript_filename = f'{YT_TRANSCRIPTS_WHISPER}/' + video_id + '_large.txt'
    try:
        transcript_file = open(transcript_filename, 'r')
    except OSError:
        print(f'{counter} of {total_videos}: File does not exist {transcript_filename}, skipping.')
        continue

    print(f'{counter} of {total_videos}: Loading {transcript_filename} ......\n')
    # The context manager closes the transcript once it has been parsed.
    with transcript_file:
        transcription = json.load(transcript_file)
    text = transcription['text']

    # Write text to file
    with open(f'{YT_TRANSCRIPTS_TEXT}/' + video_id + '_large.txt', 'w') as output_file:
        output_file.write(text)