In [None]:
from haystack.nodes import PDFToTextConverter
from haystack.nodes import PreProcessor
from haystack.document_stores import PineconeDocumentStore
from haystack.nodes import EmbeddingRetriever
from dotenv import dotenv_values
from pathlib import Path
from haystack.pipelines import Pipeline
from typing import Dict, List
import pandas as pd
import numpy as np
import openai
import pinecone
import os

In [None]:
# Load API credentials from a local env file instead of hardcoding them.
# The values are looked up with [], so a key missing from api_keys.env fails
# here with a KeyError rather than later inside an API call.
config = dotenv_values('api_keys.env')
openai.api_key = config['OPENAI_API_KEY']  # set on the openai module for the Embedding calls below
pinecone_api_key = config['PINECONE_API_KEY']
pinecone_env = config['PINECONE_ENV']

In [None]:
# Connect the Pinecone client using the credentials loaded from api_keys.env.
pinecone.init(api_key=pinecone_api_key,
              environment=pinecone_env)

# One-time index creation, kept for reference only.
# pinecone.create_index(name='diw-test-index',
#                        dimension=1536)

# NOTE(review): `index` is bound to whichever index happens to be listed first,
# and this raises IndexError when the project has no indexes. The Haystack
# document store in this notebook is configured with index='testing' — confirm
# both handles point at the same index.
list_of_indexes = pinecone.list_indexes()
index = pinecone.Index(list_of_indexes[0])

In [None]:
# Load the pickled DataFrame; its rows are converted to metadata records in the next cell.
# NOTE(review): unpickling can execute arbitrary code — only load diw.pkl from a trusted source.
df = pd.read_pickle('diw.pkl')
df.head()

In [None]:
# One dict per DataFrame row; the processing cells index this list by position
# (meta_data[count]) to attach a record to each converted PDF.
meta_data = df.to_dict('records')

In [None]:

# PDF -> text converter, configured with remove_numeric_tables=True and German /
# English as the valid languages (see the Haystack PDFToTextConverter docs for
# the exact semantics of both options).
converter = PDFToTextConverter(
    remove_numeric_tables=True,
    valid_languages=["de","en"]
)

# Text cleaner / splitter: all three clean_* options are enabled, and documents
# are split by word with split_length=200, no overlap, and
# split_respect_sentence_boundary=True.
processor = PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=True,
    split_by="word",
    split_length=200,
    split_respect_sentence_boundary=True,
    split_overlap=0
)


In [None]:
# Haystack document store backed by the Pinecone index named 'testing', using
# cosine similarity.
# embedding_dim=1536 is the same dimension as in the commented-out
# pinecone.create_index call; it presumably has to match the vector size of the
# embedding model used in this notebook — TODO confirm.
document_store = PineconeDocumentStore(
    api_key=pinecone_api_key,
    environment=pinecone_env,
    similarity="cosine",
    index='testing',
    embedding_dim=1536
)


In [None]:
def get_embeddings(text: str, model="text-embedding-ada-002") -> List[float]:
    """Return the OpenAI embedding vector for ``text``.

    Issues one ``openai.Embedding.create`` request with the given ``model`` and
    returns the ``embedding`` field of the first entry in the response's
    ``data`` list.

    Args:
        text: The text to embed.
        model: Name of the OpenAI embedding model to use.

    Returns:
        The embedding exactly as found in the response.
    """
    response = openai.Embedding.create(input=text, model=model)
    first_result = response["data"][0]
    return first_result["embedding"]

In [None]:
def get_embeddings_for_pinecone(text: str, model="text-embedding-ada-002") -> List[str]:
    """Return the OpenAI embedding for ``text`` with every component as a string.

    Delegates the API request to ``get_embeddings`` (defined earlier in this
    notebook) so the request logic lives in one place, then converts each
    component with ``str``.

    Args:
        text: The text to embed.
        model: Name of the OpenAI embedding model to use.

    Returns:
        The embedding components, stringified, in their original order.
    """
    return [str(component) for component in get_embeddings(text, model=model)]

In [None]:
def create_prompt(dataframe: pd.DataFrame, df_column: str, question: str) -> str:
    """Build a question-answering prompt from one DataFrame column.

    Every value in ``dataframe[df_column]`` is converted with ``str`` and placed
    on its own line under a ``Context:`` header, followed by the question.

    The context text is inserted verbatim. It is no longer round-tripped through
    ``str(list)``, which leaked list-repr artefacts (quotes, ``', '`` separators,
    escaped newlines) into the prompt and required stripping commas, apostrophes
    and square brackets from the source text.

    Args:
        dataframe: Frame holding the context passages.
        df_column: Name of the column whose values form the context.
        question: The question to append after the context.

    Returns:
        The complete prompt, ending in ``"\\n A:"`` ready for completion.

    Raises:
        KeyError: If ``df_column`` is not a column of ``dataframe``.
    """
    header = (
        "Answer the question as truthfully as possible using the provided context, "
        "and if the answer is not contained within the text below, "
        'say "I don\'t know."\n\nContext:\n'
    )

    # An empty column simply yields an empty context section.
    context = "\n".join(str(value) for value in dataframe[df_column])

    return header + context + "\n\n Q: " + question + "\n A:"

In [None]:
# Convert the first two PDFs to text, clean and split them into chunks, and keep
# the chunks in memory. Each PDF receives the meta_data record at the same position.
# NOTE(review): os.listdir() returns entries in arbitrary order, so pairing files
# with meta_data by position is only right if both share an ordering — confirm.
processed_doc_store = []

# Slice the listing so the loop ends after two files instead of walking (and
# skipping) every remaining entry in the directory.
for count, filename in enumerate(os.listdir("pdfs/")[:2]):
    print(f'Working on {filename}')
    doc = converter.convert(file_path=Path("pdfs") / filename, meta=meta_data[count])
    processed_docs = processor.process(doc)
    processed_doc_store.append(processed_docs)

    # document_store.write_documents(processed_docs)  ## posting to pinecone


In [None]:
# Spot check: show the 'author' metadata attached to the first chunk of the first processed PDF.
processed_doc_store[0][0].meta['author']

In [None]:
# Haystack retriever bound to `document_store`, configured to use the
# text-embedding-ada-002 model and the API key already set on the openai module.
retriever = EmbeddingRetriever(
   document_store=document_store,
   batch_size=8,
   embedding_model="text-embedding-ada-002",
   api_key=openai.api_key,
   max_seq_len=1024
)

In [None]:
# Have the document store update embeddings through `retriever`, 8 at a time.
# NOTE(review): every document_store.write_documents(...) call in this notebook
# is commented out, so this presumably only touches documents that already exist
# in the store — confirm that is intended.
document_store.update_embeddings(
    retriever=retriever, 
    batch_size=8
)

In [None]:
# Convert the first two PDFs, clean and split them into chunks, attach the
# metadata record, and embed every chunk with OpenAI. The embedding is stored on
# the chunk as meta['embedding']; writing to Pinecone is left commented out here.
# NOTE(review): os.listdir() returns entries in arbitrary order, so pairing files
# with meta_data by position is only right if both share an ordering — confirm.

processed_doc_store = []

# Only the first two directory entries are processed.
for count, filename in enumerate(os.listdir("pdfs/")[:2]):
    print(f'Working on {filename}')
    doc = converter.convert(file_path=Path("pdfs") / filename, meta=meta_data[count])
    processed_docs = processor.process(doc)

    # One embedding request per chunk.
    for chunk in processed_docs:
        chunk.meta['embedding'] = get_embeddings(text=chunk.content)

    processed_doc_store.append(processed_docs)
    # document_store.write_documents(processed_docs)

In [None]:
def flatten_document_store(documents_store_to_flatten: List[List[Dict]]) -> list:
    """Concatenate the per-file sublists into a single flat list.

    The order of the sublists, and of the elements inside each sublist, is kept.
    """
    flattened = []
    for per_file_documents in documents_store_to_flatten:
        flattened.extend(per_file_documents)
    return flattened

In [None]:
# Collapse the per-PDF chunk lists into a single list of chunks.
flat_list = flatten_document_store(processed_doc_store)

In [73]:
def upsert_to_pinecone(list_of_document_schemas: List[dict], batch_size: int, pinecone_index: object) -> None:
    """Upsert a flat list of processed documents to a Pinecone index in batches.

    Each document becomes one ``(id, embedding, metadata)`` tuple. An embedding
    already stored on the document under ``meta['embedding']`` is reused, so a
    chunk embedded during preprocessing is not sent to OpenAI a second time;
    otherwise ``get_embeddings`` is called on the document content.

    Args:
        list_of_document_schemas: Flat list of documents (not a list of lists).
            Each item must expose ``.id``, ``.content`` and a dict-like ``.meta``.
        batch_size: Number of vectors sent per ``upsert`` call; must be >= 1.
        pinecone_index: Object exposing ``upsert(vectors=...)``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        KeyError: If a document lacks one of the required metadata fields.
    """
    if batch_size < 1:
        # A negative step would make range() yield nothing and silently skip the upload.
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    metadata_fields = ('title', 'author', 'year', 'keywords',
                       'issue', 'volume', 'journal', 'date_added')

    for start in range(0, len(list_of_document_schemas), batch_size):
        # Slicing clamps at the end of the list, so the last batch may be shorter.
        batch = list_of_document_schemas[start:start + batch_size]

        vectors = []
        for document in batch:
            # Build the metadata first so a missing field fails before any API call.
            metadata = {field: document.meta[field] for field in metadata_fields}

            embedding = document.meta.get('embedding')
            if embedding is None:
                embedding = get_embeddings(document.content)

            vectors.append((document.id, embedding, metadata))

        pinecone_index.upsert(vectors=vectors)


In [74]:
# Upload every chunk in flat_list to `index`, 20 vectors per upsert request.
upsert_to_pinecone(list_of_document_schemas=flat_list, batch_size=20, pinecone_index=index)

In [76]:
# Destructive: removes every vector stored in the 'vectors' namespace of `index`.
# delete_all is a boolean flag, so pass True rather than the string 'true'.
# NOTE(review): the stats output below still lists 97 vectors under the default
# '' namespace — this call does not touch them; confirm that is intended.
index.delete(delete_all=True, namespace='vectors')

{}

In [77]:
# Show the index dimension and the vector count per namespace.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 97}},
 'total_vector_count': 97}

In [None]:
# Ask the Haystack retriever for the 3 best matches to the query, with no filters.
# NOTE(review): "Printabilty" looks like a misspelling of "Printability"; the
# string is passed to the retriever as-is — confirm before changing it.
# NOTE(review): this searches through `document_store` (index='testing'), while
# upsert_to_pinecone wrote to `index` — confirm both target the same data.
candidate_docs = retriever.retrieve(
    query="Printabilty in 3D printing",
    top_k=3, 
    filters={}
)