In [1]:
!pip install farm-haystack
!pip install urllib3==1.25.4
!!pip install sentence-transformers
!pip install faiss-cpu
!pip install --force-reinstall -v "SQLAlchemy==1.4.47"

Collecting farm-haystack
  Downloading farm_haystack-1.20.1-py3-none-any.whl (789 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m789.9/789.9 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting boilerpy3 (from farm-haystack)
  Downloading boilerpy3-1.0.6-py3-none-any.whl (22 kB)
Collecting canals==0.7.0 (from farm-haystack)
  Downloading canals-0.7.0-py3-none-any.whl (32 kB)
Collecting events (from farm-haystack)
  Downloading Events-0.5-py3-none-any.whl (6.8 kB)
Collecting lazy-imports==0.3.1 (from farm-haystack)
  Downloading lazy_imports-0.3.1-py3-none-any.whl (12 kB)
Collecting posthog (from farm-haystack)
  Downloading posthog-3.0.2-py2.py3-none-any.whl (37 kB)
Collecting prompthub-py==4.0.0 (from farm-haystack)
  Downloading prompthub_py-4.0.0-py3-none-any.whl (6.9 kB)
Collecting quantulum3 (from farm-haystack)
  Downloading quantulum3-0.9.0-py3-none-any.whl (10.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.7/10.7 MB[0m [

In [2]:
from haystack.document_stores import FAISSDocumentStore
from haystack.nodes import EmbeddingRetriever
from haystack import Document
import pandas as pd
from haystack.nodes import FARMReader
from haystack.pipelines import ExtractiveQAPipeline
from haystack.nodes import PromptNode, PromptTemplate, AnswerParser
from haystack.pipelines import Pipeline
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
DEFAULT_EMBEDDING_MODEL= "sentence-transformers/multi-qa-mpnet-base-dot-v1"
DEFAULT_READER = "deepset/roberta-base-squad2"
DEFAULT_GENERATOR = "pszemraj/flan-t5-large-instruct-dolly_hhrlhf"
DATA_FILE= "AIUseCases.csv"
PARENT_DIR= "/content/gdrive/MyDrive/"

In [4]:
masterAI_df= pd.read_csv(PARENT_DIR + DATA_FILE)
masterAI_df= masterAI_df.drop(columns=['Unnamed: 0'])
masterAI_df.head()

Unnamed: 0,Use Case Name,Summary of Use Case,Agency,Bureau / Department,Use Case ID,Stage of System Development Life Cycle,Date Initiated (if applicable),Date when Development and/or Acquisition began (if applicable),Date Implemented (if applicable),Developer Information,...,Additional Life Cycle Comments,Agency Training Used,Included In Enterprise Inventory,Public Data Link,Code Included In Agency Inventory,FISMA System Name,Withhold Use Case,Explanation To Withhold Use Case,Releaseable to the Public,text
0,4% Repair Dashboard,The model reviews the descriptions of expenses...,United States Department of Agriculture,ARS,USDA-1,Operation and Management,0,0,0,In-house,...,0,0,0,0,0,0,0,0,0,"United States Department of Agriculture, 4% Re..."
1,ARS Project Mapping,NLP of research project plans including term a...,United States Department of Agriculture,ARS,USDA-2,Operation and Management,1/1/2020,1/1/2021,5/1/2022,Contracted,...,0,0,0,0,0,0,0,0,0,"United States Department of Agriculture, ARS P..."
2,NAL Automated indexing,"Cogito (vendor) software, uses AI for automate...",United States Department of Agriculture,ARS,USDA-3,Operation and Management,6/1/2011,6/1/2012,12/1/2012,In-house,...,0,0,0,0,0,0,0,0,0,"United States Department of Agriculture, NAL A..."
3,Predictive modeling of invasive pest species a...,Macine learning algorithms are used to develop...,United States Department of Agriculture,APHIS,USDA-4,Operation and Management,0,0,0,In-house,...,0,0,0,0,0,0,0,0,0,"United States Department of Agriculture, Predi..."
4,Detection of pre-symptomatic HLB infected citrus,Identify pixels with HLB infection signature i...,United States Department of Agriculture,APHIS,USDA-5,Operation and Management,0,0,0,In-house,...,0,0,0,0,0,0,0,0,0,"United States Department of Agriculture, Detec..."


In [5]:
def load_document_store():
    return FAISSDocumentStore(faiss_index_factory_str="Flat",return_embedding=True)

In [6]:
def get_embedding_retriever(doc_store):
    return EmbeddingRetriever(
        document_store=doc_store,
        embedding_model=DEFAULT_EMBEDDING_MODEL,
    )

In [7]:
def create_document():
    table_df = pd.read_csv(PARENT_DIR + DATA_FILE)
    table_dict= table_df.to_dict(orient='records')
    return [Document(content = item['text'], meta = item) for item in table_dict]

In [8]:
def get_reader():
    return FARMReader(model_name_or_path=DEFAULT_READER)

In [9]:
def summarization_template():
    return 'Summarize this document: {join(documents)}\nSummary:'

In [10]:
def question_answering_template():
    return 'Given the context please answer the question. Context: {join(documents)};\nQuestion: {query};\nAnswer:'

In [11]:
def get_prompt_node():
    rag_prompt = PromptTemplate(
        prompt=summarization_template(),
        output_parser=AnswerParser()
    )
    return PromptNode(model_name_or_path=DEFAULT_GENERATOR, default_prompt_template=rag_prompt)

In [12]:
def QADocStoreRetriever():
    document_store_faiss= load_document_store()
    retriever_faiss= get_embedding_retriever(document_store_faiss)
    doc= create_document()
    document_store_faiss.delete_documents()
    document_store_faiss.write_documents(doc)
    document_store_faiss.update_embeddings(retriever=retriever_faiss)
    return retriever_faiss

In [13]:
def extractiveQA(query):
    reader= get_reader()
    pipeline= ExtractiveQAPipeline(reader, retriever_faiss)
    prediction = pipeline.run(
      query=query,
      params={
          "Retriever": {"top_k": 10},
          "Reader": {"top_k": 5}
        }
      )
    return prediction["answers"][0].answer

In [14]:
def generativeQA(query):
    prompt_node= get_prompt_node()
    pipeline= Pipeline()
    pipeline.add_node(component=retriever_faiss, name="retriever", inputs=["Query"])
    pipeline.add_node(component=prompt_node, name="prompt_node", inputs=["retriever"])
    output = pipeline.run(query=query)
    return output["answers"][0].answer


In [15]:
retriever_faiss= QADocStoreRetriever()

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)16ebc/.gitattributes:   0%|          | 0.00/737 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)b6b5d16ebc/README.md:   0%|          | 0.00/8.65k [00:00<?, ?B/s]

Downloading (…)b5d16ebc/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)ebc/data_config.json:   0%|          | 0.00/25.5k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)16ebc/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading (…)6ebc/train_script.py:   0%|          | 0.00/13.9k [00:00<?, ?B/s]

Downloading (…)b6b5d16ebc/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5d16ebc/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
Writing Documents: 10000it [00:17, 588.17it/s]
Updating Embedding:   0%|          | 0/142 [00:00<?, ? docs/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Documents Processed: 10000 docs [00:12, 770.45 docs/s]


In [16]:
extractiveQA("Which agencies and bureaus are using deepfake detection?")

Downloading (…)lve/main/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  3.67 Batches/s]


'Department of the Interior, TMDL and Data Mining Investigations'

In [17]:
generativeQA("Which agencies and bureaus are using deepfake detection?")

Downloading (…)lve/main/config.json:   0%|          | 0.00/818 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/325 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.35k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1554 > 512). Running this sequence through the model will result in indexing errors
Both `max_new_tokens` (=192) and `max_length`(=100) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


'The Department of the Interior is developing a new system to help identify corrosion levels in wells. This system will be used to identify areas of concern and provide recommendations for further inspections.'