In [5]:
from haystack.nodes import PDFToTextConverter
from haystack.nodes import PreProcessor
from haystack.document_stores import PineconeDocumentStore
from dotenv import dotenv_values
from pathlib import Path
from haystack.pipelines import Pipeline
import pandas as pd
import numpy as np
import openai
import pinecone
import os

In [6]:
# Read the API credentials from the local dotenv file rather than hardcoding
# them in the notebook.
config = dotenv_values('apikeys.env')
openai_api_key = config['OPEN_AI_API']
pinecone_api_key = config['PINECONE_API']
pinecone_env = config['PINECONE_ENV']

# dotenv_values() only returns a dict; it does not export anything to
# os.environ. The OpenAI client therefore has to be handed its key
# explicitly, otherwise openai.Embedding.create() runs without credentials.
openai.api_key = openai_api_key

In [7]:
# Connect the Pinecone client using the credentials read from apikeys.env.
pinecone.init(environment=pinecone_env, api_key=pinecone_api_key)

# One-time index creation, left disabled (the cell output already lists
# 'diw-test-index'):
# pinecone.create_index(name='diw-test-index',
#                        dimension=1536)

# Last expression of the cell: show the indexes this API key can see.
pinecone.list_indexes()

['diw-test-index']

In [8]:
# Haystack document store backed by the existing 'diw-test-index' Pinecone
# index. embedding_dim has to match the index dimension (1536, the size of
# text-embedding-ada-002 vectors).
document_store = PineconeDocumentStore(
    api_key=pinecone_api_key,
    # Pass the environment explicitly: the store opens its own Pinecone
    # connection and would otherwise use its built-in default environment
    # instead of the one configured in apikeys.env.
    environment=pinecone_env,
    similarity="cosine",
    index='diw-test-index',
    embedding_dim=1536
)

ImportError: Failed to import 'PineconeDocumentStore', which is an optional component in Haystack.
Run 'pip install 'farm-haystack[pinecone]'' to install the required dependencies and make this component available.
(Original error: No module named 'pinecone')

In [26]:
# Load the pickled DataFrame from the working directory.
# NOTE(review): unpickling can execute arbitrary code - only load diw.pkl
# when it comes from a trusted source.
df = pd.read_pickle(filepath_or_buffer='diw.pkl')

In [27]:
# One dict per DataFrame row (column name -> value), in row order.
meta_data = df.to_dict(orient='records')

In [32]:
# PDF -> text converter; configured to drop numeric tables and to expect
# German or English text.
converter = PDFToTextConverter(
    remove_numeric_tables=True,
    valid_languages=["de", "en"]
)

# Cleans the extracted text and splits it into chunks of up to 200 words
# without overlap, keeping sentences intact.
processor = PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=True,
    split_by="word",
    split_length=200,
    split_respect_sentence_boundary=True,
    split_overlap=0
)




ImportError: Failed to import 'PineconeDocumentStore', which is an optional component in Haystack.
Run 'pip install 'farm-haystack[pinecone]'' to install the required dependencies and make this component available.
(Original error: No module named 'pinecone')

In [None]:
def get_embeddings(text: str, model="text-embedding-ada-002") -> np.ndarray:
    """Return the OpenAI embedding of ``text`` as a NumPy array.

    Parameters
    ----------
    text : str
        Text to embed; passed to the API as the ``input`` argument.
    model : str, optional
        Name of the OpenAI embedding model to use.

    Returns
    -------
    np.ndarray
        The ``"embedding"`` of the first entry in the response's ``"data"`` list.
    """
    response = openai.Embedding.create(input=text, model=model)
    first_item = response["data"][0]
    return np.array(first_item["embedding"])

In [None]:
def create_prompt(dataframe, df_column, question):
    """Build a question-answering prompt from one DataFrame column.

    Each value of ``dataframe[df_column]`` is converted to a string with its
    commas and apostrophes removed. The collected list is rendered with
    ``str()`` and appended to the instruction header, after which every
    square bracket is stripped from the combined text (including brackets
    that were part of the original values).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the context snippets.
    df_column : str
        Name of the column whose values form the context.
    question : str
        Question appended after the context.

    Returns
    -------
    str
        Header, context, then ``"\\n\\n Q: <question>\\n A:"``.
    """
    snippets = []

    def _collect(value):
        cleaned = str(value).replace(",", '').replace("'", '')
        snippets.append(cleaned)

    # apply() is used only for its per-value side effect; its result is discarded.
    dataframe[df_column].apply(_collect)

    header = """Answer the question as truthfully as possible using the provided context, and if the answer
     is not contained within the text below, say "I don't know."\n\nContext:\n"""

    context = (header + str(snippets)).replace('[', '').replace(']', '')

    return "".join([context, "\n\n Q: ", question, "\n A:"])

In [None]:
# Convert, chunk, embed and index the PDFs. Only the first `max_files`
# directory entries are processed (1, as in the original test setup).
# NOTE(review): metadata is paired with files purely by their position in
# os.listdir(), whose order is arbitrary - confirm it matches the row order
# of diw.pkl before raising the limit.
pdf_dir = Path("pdfs")
max_files = 1

for count, filename in enumerate(os.listdir(pdf_dir)):
    if count >= max_files:
        break

    print(f'Working on {filename}')
    doc = converter.convert(file_path=pdf_dir / filename, meta=meta_data[count])
    processed_docs = processor.process(doc)

    for chunk in processed_docs:
        # The vector belongs on Document.embedding, which is what the document
        # store indexes. Putting it in `meta` would leave the document without
        # a usable vector and push a NumPy array into the Pinecone metadata.
        chunk.embedding = get_embeddings(text=chunk.content)

    document_store.write_documents(processed_docs)

In [None]:
# testing retrieval
# BM25Retriever does keyword search and needs a keyword-capable document
# store, which PineconeDocumentStore (a vector index) is not. The notebook
# embeds its chunks with get_embeddings(), so the query is embedded the same
# way and the store is searched by vector similarity instead.
query_embedding = get_embeddings(text="3D Printability")

candidate_docs = document_store.query_by_embedding(
        query_emb=query_embedding,
        top_k=3,
        filters=None)
