# <span style="color:blue"> Building Knowledge Base</span>

In [None]:
!pip install pinecone-client
!pip install python-dotenv
!pip install gensim nltk
!pip install PyPDF2
!pip install openai==0.28
import openai
print(openai.__version__)
import time
from time import sleep
import gensim
from gensim.models import Word2Vec
import nltk
nltk.download('punkt')
import numpy as np
from pinecone import Pinecone, PodSpec
from dotenv import load_dotenv
import os
from PyPDF2 import PdfReader



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Create and connect a Pinecone index
# Load environment variables from .env file
env = load_dotenv() # Copy .env file to the same directory before running

# Retrieve Pinecone API key from environment variables
pinecone_api_key = os.getenv("PINECONE_API_KEY")
# Retrieve OpenAI API key from environment variables
openai.api_key = os.getenv("OPENAI_API_KEY")

# Fail fast with an actionable message instead of an opaque client error later.
if not pinecone_api_key:
    raise RuntimeError(
        "PINECONE_API_KEY is not set; add it to the .env file next to this notebook."
    )
# The OpenAI key is only needed by the OpenAI embedding cell, so warn rather than abort.
if not openai.api_key:
    print("Warning: OPENAI_API_KEY is not set; OpenAI embedding calls will fail.")

# initialize connection to pinecone (get API key at app.pinecone.io)
# NOTE(review): the environment given here ("us-west1-gcp-free") differs from the
# PodSpec environment below ("us-west1-gcp") - confirm which one is intended.
pc = Pinecone(api_key=pinecone_api_key, environment="us-west1-gcp-free")

# Specify the name of index
index_name = "cstugpt-kb"

# check if index already exists
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=1536,  # must equal the length of every vector upserted into this index
        metric='cosine',
        spec=PodSpec(environment="us-west1-gcp", pod_type="p1.x1")
        #metadata_config={'indexed': ['cstu_id']} #indexed key in the dictionary specifies cstu_id column be indexed, means that the values in the column will be stored in a way that makes them faster to search
    )
    # A new index is not usable immediately; poll until it reports ready,
    # but give up after five minutes rather than hanging the notebook.
    ready_deadline = time.time() + 300
    while not pc.describe_index(index_name).status["ready"]:
        if time.time() > ready_deadline:
            raise TimeoutError(f"Pinecone index {index_name!r} was not ready after 300 seconds")
        sleep(1)

# Connect to the Pinecone index
index = pc.Index(index_name)

In [None]:
# Building our own embedding model based on Word2Vec
def tokenize_text(text):
    """Split ``text`` into a list of word tokens using NLTK's ``word_tokenize``."""
    word_tokens = nltk.word_tokenize(text)
    return word_tokens

# Train a Word2Vec model
def train_word2vec_model(tokens, vector_size=1536, min_count=1):
    """Train a gensim Word2Vec model on a tokenized corpus.

    Args:
        tokens: Iterable of sentences, each sentence being a list of string tokens.
        vector_size: Length of each word vector. Defaults to 1536, the value
            this notebook uses for its Pinecone index dimension.
        min_count: Minimum number of occurrences for a token to be kept in the
            vocabulary. Defaults to 1 so every token in the corpus gets a vector.

    Returns:
        The trained ``Word2Vec`` model.
    """
    model = Word2Vec(tokens, vector_size=vector_size, min_count=min_count)
    return model

# Generate an embedding for a text
def generate_embedding(model, text):
    """Embed ``text`` as the mean of its known word vectors.

    The text is tokenized with ``nltk.word_tokenize``; tokens missing from
    ``model.wv`` are skipped. The remaining vectors are averaged element-wise.
    When no token has a vector, an all-zero vector of length
    ``model.vector_size`` is returned instead.
    """
    known_vectors = []
    for word in nltk.word_tokenize(text):
        if word in model.wv:
            known_vectors.append(model.wv[word])

    # Nothing to average: fall back to a zero vector of the model's size.
    if len(known_vectors) == 0:
        return np.zeros(model.vector_size)

    return np.mean(known_vectors, axis=0)

# Specify knowledge base file and parameters
cstu_file = r"cstugpt_kb.pdf"

# Create a reader object for the knowledge base file
print("Knowledge base file name:", cstu_file)
reader = PdfReader(cstu_file)
page_len = len(reader.pages)
print("Length of the knowledge base file:", page_len)

# Extract text from each page. Pages are joined with a newline so the last
# line of one page is not fused with the first line of the next page.
page_texts = []
for i in range(page_len):
    page_texts.append(reader.pages[i].extract_text() or "")
    print("Page completed:", i)
doc = "\n".join(page_texts).splitlines()
doc = [line for line in doc if line.strip()]  # drop blank lines
if not doc:
    raise ValueError(f"No extractable text found in {cstu_file!r}")

# Train Word2Vec model on entire corpus (one tokenized sentence per line)
tokens = [tokenize_text(text) for text in doc]
model = train_word2vec_model(tokens)
model.save("CSTU-embedding-model.mdl")

# Set chunking and embedding parameters
count = 0
chunk_size = 3   # number of consecutive lines per chunk
stride = 1       # how many lines the window advances between chunks
cstu_id = "cstu-kb"

# Connect to Pinecone index
index = pc.Index(index_name)

# Iterate over the document in chunks and upsert embeddings to index.
# NOTE(review): the OpenAI embedding cell writes to the same namespace ("cstu")
# with the same ids, so running both cells mixes or overwrites vectors from
# two different embedding models - confirm that only one of them should be run.
# A document shorter than chunk_size still yields one chunk with all its lines.
last_start = max(len(doc) - chunk_size, 0)
for i_begin in range(0, last_start + 1, stride):
    i_end = min(len(doc), i_begin + chunk_size)
    doc_chunk = doc[i_begin:i_end]
    texts = "".join(doc_chunk)
    print("The", count+1, "doc chunk text:", texts)
    print("==========================================================")
    # Create embeddings of the chunk texts
    embed = generate_embedding(model, texts)

    # Prepare metadata
    metadata = {"cstu_id": cstu_id + '_' + str(count), "text": texts}
    count += 1

    # Upsert to Pinecone and corresponding namespace; the numpy vector is
    # converted to a plain list of floats for the request payload.
    index.upsert(vectors=[(metadata["cstu_id"], embed.tolist(), metadata)], namespace="cstu")

Knowledge base file name: cstugpt_ka.pdf
Length of the knowledge base file: 2
Page completed: 0
Page completed: 1
The 1 doc chunk text:  California Science And Technology University (CSTU)  documents : Contact Email: admission@cstu.edu ; Contact Office: (408) 400 -3948;  Address: 1601 McCarthy Boulevard, Milpitas, CA 95035;  The president of CSTU  is Prof. Glen Qin.  
The 2 doc chunk text: Contact Email: admission@cstu.edu ; Contact Office: (408) 400 -3948;  Address: 1601 McCarthy Boulevard, Milpitas, CA 95035;  The president of CSTU  is Prof. Glen Qin.  CSTU offer s programs: Bachelor /Master  of Science in Computer Systems and Engineering, 
The 3 doc chunk text: Address: 1601 McCarthy Boulevard, Milpitas, CA 95035;  The president of CSTU  is Prof. Glen Qin.  CSTU offer s programs: Bachelor /Master  of Science in Computer Systems and Engineering, Bachelor of Science in Business Administration, Bachelor /Master of Business Administration and 
The 4 doc chunk text: The president of CSTU

In [None]:
# Using OpenAI embedding model
embed_model = "text-embedding-ada-002"

# Specify knowledge base file and parameters
cstu_file = r"cstugpt_kb.pdf"

# Create a reader object for the knowledge base file
print("Knowledge base file name:", cstu_file)
reader = PdfReader(cstu_file)
page_len = len(reader.pages)
print("Length of the knowledge base file:", page_len)

# Extract text from each page. Pages are joined with a newline so the last
# line of one page is not fused with the first line of the next page.
page_texts = []
for i in range(page_len):
    page_texts.append(reader.pages[i].extract_text() or "")
    print("Page completed:", i)
doc = "\n".join(page_texts).splitlines()
doc = [line for line in doc if line.strip()]  # drop blank lines
if not doc:
    raise ValueError(f"No extractable text found in {cstu_file!r}")

# Set chunking and embedding parameters
count = 0
chunk_size = 3   # number of consecutive lines per chunk
stride = 1       # how many lines the window advances between chunks
cstu_id = "cstu-kb"

# Retry policy for the embedding call: only transient failures are retried,
# and only a bounded number of times. Authentication or invalid-request errors
# (and KeyboardInterrupt) propagate immediately instead of looping forever.
max_attempts = 6
retryable_errors = (
    openai.error.RateLimitError,
    openai.error.APIConnectionError,
    openai.error.ServiceUnavailableError,
    openai.error.Timeout,
    openai.error.APIError,
)

# Connect to Pinecone index
index = pc.Index(index_name)

# Iterate over the document in chunks and upsert embeddings to index.
# A document shorter than chunk_size still yields one chunk with all its lines.
last_start = max(len(doc) - chunk_size, 0)
for i_begin in range(0, last_start + 1, stride):
    i_end = min(len(doc), i_begin + chunk_size)
    doc_chunk = doc[i_begin:i_end]
    texts = "".join(doc_chunk)
    print("The", count+1, "doc chunk text:", texts)
    print("==========================================================")

    res = None
    for attempt in range(1, max_attempts + 1):
        try:
            res = openai.Embedding.create(input=texts, engine=embed_model)
            break
        except retryable_errors as err:
            if attempt == max_attempts:
                raise
            # Back off 5s, 10s, 20s, ... capped at 60s between attempts.
            delay = min(5 * 2 ** (attempt - 1), 60)
            print("Embedding request failed (", err, "); retrying in", delay, "seconds")
            sleep(delay)
    embed = res['data'][0]['embedding']

    # Prepare metadata
    metadata = {"cstu_id": cstu_id + '_' + str(count), "text": texts}
    count += 1

    # Upsert to Pinecone and corresponding namespace
    index.upsert(vectors=[(metadata["cstu_id"], embed, metadata)], namespace="cstu")

Knowledge base file name: cstugpt_ka.pdf
Length of the knowledge base file: 2
Page completed: 0
Page completed: 1
The 1 doc chunk text:  California Science And Technology University (CSTU)  documents : Contact Email: admission@cstu.edu ; Contact Office: (408) 400 -3948;  Address: 1601 McCarthy Boulevard, Milpitas, CA 95035;  
The 2 doc chunk text: Contact Email: admission@cstu.edu ; Contact Office: (408) 400 -3948;  Address: 1601 McCarthy Boulevard, Milpitas, CA 95035;  The president of CSTU  is Prof. Glen Qin.  
The 3 doc chunk text: Address: 1601 McCarthy Boulevard, Milpitas, CA 95035;  The president of CSTU  is Prof. Glen Qin.  CSTU offer s programs: Bachelor /Master  of Science in Computer Systems and Engineering, 
The 4 doc chunk text: The president of CSTU  is Prof. Glen Qin.  CSTU offer s programs: Bachelor /Master  of Science in Computer Systems and Engineering, Bachelor of Science in Business Administration, Bachelor /Master of Business Administration and 
The 5 doc chunk text

KeyboardInterrupt: 

In [None]:
# Checking knowledge base: display the index statistics (dimension, index
# fullness, and vector count per namespace).
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.00405,
 'namespaces': {'cstu': {'vector_count': 405}},
 'total_vector_count': 405}

In [None]:
# Checking knowledge base again: display the index statistics (dimension,
# index fullness, and vector count per namespace).
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.00042,
 'namespaces': {'cstu': {'vector_count': 42}},
 'total_vector_count': 42}

In [None]:
# Unused code (kept for reference)
def mls_upsert(cstu_file, index_name, name_space, cstu_id, chunk_size, stride, dry_run=True):
    """Chunk a PDF knowledge base, embed each chunk with Word2Vec, and optionally upsert.

    The PDF text is split into non-blank lines, a Word2Vec model is trained on
    those lines and saved to ``CSTU-embedding-model.mdl``, and the lines are
    grouped into chunks. Each chunk starts ``stride`` lines before its nominal
    start so that consecutive chunks overlap, and runs through the end of its
    nominal ``chunk_size`` window so that no trailing lines are lost.

    Args:
        cstu_file: Path of the PDF file to read.
        index_name: Name of the Pinecone index to connect to.
        name_space: Pinecone namespace used when upserting.
        cstu_id: Prefix for vector ids; ids are ``<cstu_id>_<n>``.
        chunk_size: Number of lines the window advances per chunk.
        stride: Number of lines each chunk reaches back into the previous one.
        dry_run: When True (the default) chunks are embedded and printed but
            nothing is written to the index. Pass False to upsert.

    Raises:
        ValueError: If no text could be extracted from the PDF.
    """
    # create a reader object
    print("Knowledge base file name:", cstu_file)
    reader = PdfReader(cstu_file)
    page_len = len(reader.pages)
    print("length of the knowledge base file:", page_len)

    # Join pages with a newline so the last line of one page is not fused
    # with the first line of the next page.
    page_texts = []
    for i in range(page_len):
        page_texts.append(reader.pages[i].extract_text() or "")
        print("page completed:", i)
    doc = [line for line in "\n".join(page_texts).splitlines() if line.strip()]
    if not doc:
        raise ValueError(f"No extractable text found in {cstu_file!r}")

    # Train Word2Vec model on entire corpus
    tokens = [tokenize_text(text) for text in doc]
    model = train_word2vec_model(tokens)
    model.save("CSTU-embedding-model.mdl")

    # Connect to index
    index = pc.Index(index_name)
    count = 0
    for i in range(0, len(doc), chunk_size):  # the loop iterates over the document in steps of chunk_size
        # Find beginning and end of the chunk. The end is measured from i (not
        # from the shifted i_begin) so the final lines of the document are
        # always included.
        i_begin = max(0, i - stride)
        i_end = min(len(doc), i + chunk_size)
        doc_chunk = doc[i_begin:i_end]
        print("-------------------------------------------------------------")
        print("The ", i//chunk_size + 1, " doc chunk text:", doc_chunk)
        texts = "".join(doc_chunk)
        print("Texts:", texts)

        # Create embeddings of the chunk texts
        embed = generate_embedding(model, texts)
        print("Embeds length:", len(embed))

        # Meta data preparation
        metadata = {
            "cstu_id": cstu_id + '_' + str(count),
            "text": texts
        }
        count += 1

        # upsert to pinecone and corresponding namespace
        if not dry_run:
            # Send a plain list of floats rather than a numpy array.
            values = embed.tolist() if hasattr(embed, "tolist") else list(embed)
            index.upsert(vectors=[(metadata["cstu_id"], values, metadata)], namespace=name_space)
            print("Upserted vector count is: ", count)
        else:
            print("Prepared vector count (dry run, nothing upserted): ", count)
        print("==========================================================")

# Run mls_upsert on the knowledge base PDF against index "cstugpt-kb",
# namespace "cstu", id prefix "cstu-kb", with chunk_size=8 and stride=1.
mls_upsert(r"cstugpt_kb.pdf", "cstugpt-kb", "cstu","cstu-kb", 8, 1)