In [None]:
import json
import html2text
import tiktoken
from datetime import datetime
from langchain.document_loaders import JSONLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

from pathlib import Path

def read_and_chunk(file_path, project_number='11448'):
    """Load a JSON export of project-log records and split each into embeddable chunks.

    For every record the HTML ``Description`` is converted to text with html2text,
    the ``ShortDescription`` is appended, and the result is split into pieces of at
    most 1000 ``cl100k_base`` tokens with a 20-token overlap. Each piece is then
    prefixed with the record's author (when present) and its creation date rendered
    in four different formats.

    Args:
        file_path: Path to a JSON file holding a list of records. Each record is
            read through the keys ``CreatedBy``, ``CreateDate`` (ISO-8601, a
            trailing ``Z`` is accepted), ``ShortDescription``, ``NodeID`` and
            ``Description``.
        project_number: Value stored under ``'project-number'`` in every metadata
            dict. Defaults to the previously hard-coded ``'11448'``.

    Returns:
        A tuple ``(id_array, text_array, meta_data_array)`` of three parallel lists:
        ids of the form ``"<NodeID>_<chunk index>"``; the chunk text with newlines
        replaced by ``"; "`` and double spaces replaced by single ones; and one
        metadata dict per chunk with the keys ``'project-number'``, ``'id'``,
        ``'update-date'`` and ``'text'`` (the chunk text before that cleanup).
    """
    # Reading bytes lets json.loads detect the encoding itself instead of relying
    # on the platform's locale default.
    records = json.loads(Path(file_path).read_bytes())
    if not records:
        # Nothing to chunk; skip building the converter, tokenizer and splitter.
        return [], [], []

    # These helpers do not depend on the record, so they are built once up front.
    html_converter = html2text.HTML2Text()
    tokenizer = tiktoken.get_encoding('cl100k_base')

    def tiktoken_len(text):
        # Length in tokens; special-token strings in the text are encoded as plain text.
        return len(tokenizer.encode(text, disallowed_special=()))

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=20,
        length_function=tiktoken_len,
        separators=["\n\n", "\n", " ", ""]
    )

    text_array = []
    id_array = []
    meta_data_array = []

    for record in records:
        author = record['CreatedBy']
        created_by = ' Created by: ' + author + ' ' if author else ' '

        # datetime.fromisoformat on older Pythons rejects a trailing "Z", so it is
        # rewritten as an explicit UTC offset first.
        date_time = datetime.fromisoformat(record['CreateDate'].replace("Z", "+00:00"))
        date_variants = [
            date_time.strftime("%B %d, %Y"),
            date_time.strftime("%d-%m-%Y"),
            date_time.strftime("%Y/%m/%d"),
            date_time.strftime("%A, the %d of %B in the year %Y"),
        ]
        created_date = ' Project log created date: ' + ' - '.join(date_variants) + ' '

        short_description = record['ShortDescription'] if record['ShortDescription'] else ' '

        node_id = str(record['NodeID'])

        # A missing Description is treated like an empty one, matching how the
        # other optional fields are handled.
        content = html_converter.handle(record['Description'] or '') + short_description

        # Prefix every chunk with the author and creation-date text.
        for chunk_index, chunk in enumerate(text_splitter.split_text(content)):
            compiled_chunk = created_by + created_date + chunk
            clean_chunk = compiled_chunk.replace("\n", "; ").replace("  ", " ")

            text_array.append(clean_chunk)
            id_array.append(node_id + '_' + str(chunk_index))

            # A fresh dict per chunk: sharing one dict across a record's chunks
            # would leave every entry pointing at the last chunk's text.
            meta_data_array.append({
                'project-number': project_number,
                'id': node_id,
                'update-date': created_date,
                'text': compiled_chunk,
            })

    return id_array, text_array, meta_data_array

In [None]:
import os
from time import sleep

import openai
import pinecone
from tqdm.auto import tqdm

# Embedding model name passed to openai.Embedding.create.
EMBED_MODEL = "text-embedding-ada-002"

# Credentials and deployment settings are read from the environment so they never
# have to be saved inside the notebook. Each falls back to the empty placeholder
# when the variable is unset.
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY', '')
PINECONE_ENV = os.environ.get('PINECONE_ENV', '')
PINECONE_INDEX_NAME = os.environ.get('PINECONE_INDEX_NAME', '')
# Vector dimension used when the index has to be created.
# NOTE(review): presumably the output width of EMBED_MODEL — confirm if the model changes.
PINECONE_DIMENSION = 1536
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY', "")

openai.api_key = OPENAI_API_KEY

batch_size = 100  # how many embeddings we create and insert at once

def create_and_upload_embeddings(id_array, text_array, meta_array,
                                 max_retries=10, retry_delay=5):
    """Embed texts with OpenAI in batches and upsert them into the Pinecone index.

    The three inputs are parallel sequences: element ``k`` of each describes the
    same vector. They are processed in slices of ``batch_size`` items; each slice
    is embedded with ``EMBED_MODEL`` and upserted as ``(id, embedding, metadata)``
    tuples. The index named ``PINECONE_INDEX_NAME`` is created (cosine metric,
    ``PINECONE_DIMENSION`` dimensions) if it is not listed yet.

    Args:
        id_array: Vector ids, one per text.
        text_array: Texts to embed.
        meta_array: Metadata objects stored alongside each vector.
        max_retries: How many times a failed embedding request is retried before
            the error is re-raised.
        retry_delay: Seconds to sleep before each retry.

    Returns:
        None.

    Raises:
        ValueError: If the three inputs do not have the same length.
        Exception: Whatever the embedding call raised last, once ``max_retries``
            retries have been used up.
    """
    total = len(id_array)
    if not (total == len(text_array) == len(meta_array)):
        # zip() would silently drop the unmatched tail, losing vectors.
        raise ValueError(
            "id_array, text_array and meta_array must have the same length "
            f"(got {total}, {len(text_array)}, {len(meta_array)})"
        )
    if total == 0:
        # Nothing to upload, so do not touch Pinecone at all.
        return

    # Client setup and index lookup do not depend on the batch; do them once.
    pinecone.init(
        api_key=PINECONE_API_KEY,
        environment=PINECONE_ENV
    )

    if PINECONE_INDEX_NAME not in pinecone.list_indexes():
        # create a new index if it doesn't exist
        pinecone.create_index(
            name=PINECONE_INDEX_NAME,
            metric='cosine',
            dimension=PINECONE_DIMENSION
        )

    index = pinecone.Index(PINECONE_INDEX_NAME)

    for start in tqdm(range(0, total, batch_size)):
        end = min(total, start + batch_size)
        ids_batch = id_array[start:end]
        texts = text_array[start:end]
        meta_batch = meta_array[start:end]

        # Retry the embedding request (e.g. on rate limiting), but only a bounded
        # number of times and without trapping KeyboardInterrupt/SystemExit, so a
        # permanent failure surfaces instead of looping forever.
        for attempt in range(max_retries + 1):
            try:
                res = openai.Embedding.create(input=texts, engine=EMBED_MODEL)
                break
            except Exception:
                if attempt == max_retries:
                    raise
                sleep(retry_delay)

        embeds = [record['embedding'] for record in res['data']]

        # upsert to Pinecone
        index.upsert(vectors=list(zip(ids_batch, embeds, meta_batch)))

In [None]:
# Chunk the JSON export, then embed every chunk and upsert it into Pinecone.
# NOTE(review): the path below is an empty placeholder — presumably it must be
# replaced with the real export file before this cell can run; confirm.
id_array, text_array, meta_array = read_and_chunk('')
create_and_upload_embeddings(
    id_array=id_array,
    text_array=text_array,
    meta_array=meta_array,
)