In [12]:
from pathlib import Path
import pymupdf

In [13]:
def get_pdf_files(folder_path):
    dir_path = Path(folder_path)
    pdf_files = list(dir_path.glob("*.pdf"))
    return pdf_files

In [85]:
def extract_text_from_pdf(pdf_path):
    doc = pymupdf.open(pdf_path)
    text = ""
    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)
        text += page.get_text()
    return text

In [109]:
def add_extracted_text(file_path):
    pdf_files = get_pdf_files(file_path)
    text_file = {}
    for pdf_file in pdf_files:
        file_string = str(pdf_file)
        pdf_name = file_string.split('\\')[-1]
        pdf_identifier = pdf_name.split(".")[1]

        extracted_text = extract_text_from_pdf(pdf_file)
        text_file[pdf_identifier] = extracted_text
        print(f"Text from {pdf_file} added into {pdf_identifier}")

    print("Completed text extraction")
    return text_file
    

Now it's time to split the texts, chunk them and add the text chunks to the vectorstore

In [132]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


In [137]:
def split_texts(text_file, model):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=200)

    pdf_docs = [{"page_content": text, "metadata": {"file_id": file_id}} for file_id, text in text_file.items()]

    splits = text_splitter.split_documents(pdf_docs)

    print("Splitting completed")

    embeddings = [model.encode(split).tolist() for split in splits]

    vectorstore = Chroma.from_documents(documents=splits, embedding=embeddings)

    retriever = vectorstore.as_retriever()

    return retriever

In [138]:
path = 'C:/Users/j/.vscode/Resume_search/novelIdeas/data'
model = SentenceTransformer('all-MiniLM-L6-v2')
text_file = add_extracted_text(path)
retriever = split_texts(text_file, model)



Text from C:\Users\j\.vscode\Resume_search\novelIdeas\data\2409.10031v1.pdf added into 10031v1
Text from C:\Users\j\.vscode\Resume_search\novelIdeas\data\2409.10949v1.pdf added into 10949v1
Text from C:\Users\j\.vscode\Resume_search\novelIdeas\data\2409.11303v1.pdf added into 11303v1
Text from C:\Users\j\.vscode\Resume_search\novelIdeas\data\2409.11409v1.pdf added into 11409v1
Text from C:\Users\j\.vscode\Resume_search\novelIdeas\data\2409.13142v1.pdf added into 13142v1
Completed text extraction


AttributeError: 'dict' object has no attribute 'page_content'

LANGCHAIN

In [139]:
from langchain import hub
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_chroma import Chroma
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.prompts import ChatPromptTemplate
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [190]:
import os
import getpass
from langchain_fireworks import ChatFireworks

In [151]:
from langchain_community.document_loaders import PyPDFDirectoryLoader
from typing import List

In [165]:
from langchain_huggingface import HuggingFaceEmbeddings

In [161]:
folderPath = "C:/Users/j/.vscode/Resume_search/novelIdeas/data"
loader = PyPDFDirectoryLoader(folderPath)

In [166]:
model_name = "sentence-transformers/all-MiniLM-L6-v2"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': False}
hfEmbedding = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)



In [162]:
docs = loader.load()
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=200)
splits = text_splitter.split_documents(docs)

In [167]:
vectorstore = Chroma.from_documents(documents=splits, embedding=hfEmbedding)
retriever = vectorstore.as_retriever()

In [189]:
os.environ["FIREWORKS_API_KEY"] = getpass.getpass()

In [192]:
MODEL_ID = "accounts/fireworks/models/llama-v3p1-405b-instruct"
fireworks_llm = ChatFireworks(
    model=MODEL_ID,
    temperature = 0.6,
    max_tokens = 16384,
    model_kwargs={
        "top_p": 1,
    },
    cache=None,
)

In [168]:
system_prompt = (
    """
    Innovation is seeing what everybody has seen and thinking what nobody has thought.
    An idea is nothing more nor less than a new combination of old elements. 
    It can be a thought or suggestion aimed at solving a problem or exploring a possibility.
    Ideas challenge, shift paradigms, and drive innovation by synthesizing information, reflection, and imagination. 
    As a research scientist, your role is to generate new ideas and innovations based on a research paper."""
    "\n\n"
    "{context}"
)

In [178]:
human_prompt = (
    """
    You are a research scientist following the Diagram of Thought (DoT) framework to generate ideas from a research paper. Your workflow will transition through three roles: Proposer, Critic, and Summarizer.
    
    **1. <Proposer>:** 
    - **Process**: Analyze the research paper on the topic: {input}
    - **Reflect**: Examine prior knowledge, look for patterns, and break down complex concepts. Critically assess assumptions and concepts to gain insights.
    - **Imagine**: Use the insights to brainstorm new ideas and generate novel concepts or solutions beyond what is already known. Store these in a list called 'Idea list.'
    - **Output**: 'Idea list' = {{proposed_ideas}}
    - If no ideas are generated or {{proposed_ideas}} is empty, respond with: "No ideas proposed."

    **2. <Critic>**:
    - **Evaluate Relevancy**: Ensure the ideas in {{proposed_ideas}} are relevant to the research paper. An idea is not relevant if it seems unrelated to the research the paper. An idea is relevant if it aligns with the research paper. Remove irrelevant ideas from {{proposed_ideas}}, creating {{relevant_ideas}}.
    - **Evaluate Novelty**: Assess the originality/novelty of the ideas in {{relevant_ideas}}. An idea is not novel if it is generic, already exists, or has already been explored by numerous researchers. An idea is novel if it represents a good direction, highly innovative, and has been explored by only few or no researchers. Remove unnoriginal ideas from {{relevant_ideas}}, resulting in {{novel_ideas}}.
    - **Evaluate Feasiblity**: Review {{novel_ideas}} for factual correctness and practicality. An idea is not feasible if it doesn't make any sense, impractical, or not realistic. An idea is feasible if it is practical and realistic even to a minimal degree. Remove any unrealistic or impractical ideas from {{novel_ideas}}, creating {{refined_ideas}}.
    - If {{refined_ideas}} is empty after the critique process, respond with: "No ideas after critique.".
    
    **3. <Summarizer>**:
    - Synthesize the remaining ideas from {{refined_ideas}} and write a concise summary for each idea in bullet points.  Begin the summary with: "Potential top future research ideas from the paper are:"

    "\n\n"
    "{input}"
    """
)

In [179]:
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", human_prompt),
    ]
)

In [193]:
idea_generation_chain = create_stuff_documents_chain(fireworks_llm, prompt)

In [194]:
idea_retriever_chain = create_retrieval_chain(retriever, idea_generation_chain)

In [195]:
response = idea_retriever_chain.invoke({"input": "CyberNFTs: Conceptualizing a decentralized and reward-driven intrusion detection system with ML"})

In [204]:
print(response['answer'])

**1. <Proposer>**

After analyzing the research paper "CyberNFTs: Conceptualizing a decentralized and reward-driven intrusion detection system with ML", I reflected on the concepts and broke down complex ideas to gain insights. Here are some potential ideas that came to mind:

* Using blockchain-based systems for decentralized threat intelligence sharing
* Developing a machine learning-powered intrusion detection system that leverages cyberNFTs for reward-driven detection
* Exploring the application of cyberNFTs in other areas of cybersecurity, such as incident response and threat hunting
* Investigating the use of decentralized autonomous organizations (DAOs) for managing and maintaining decentralized intrusion detection systems
* Designing a framework for evaluating the effectiveness of decentralized intrusion detection systems in various network environments
* Developing a system for detecting and mitigating advanced persistent threats (APTs) using a combination of machine learning 

In [None]:
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
Cell In[176], line 1
----> 1 response = idea_retriever_chain.invoke({"input": "CyberNFTs: Conceptualizing a decentralized and reward-driven intrusion detection system with ML"})

File c:\Users\j\.vscode\Resume_search\resume-search\Lib\site-packages\langchain_core\runnables\base.py:5313, in invoke(self, input, config, **kwargs)
   5310 def get_output_schema(
   5311     self, config: Optional[RunnableConfig] = None
   5312 ) -> type[BaseModel]:
-> 5313     if self.custom_output_type is not None:
   5314         return super().get_output_schema(config)
   5315     return self.bound.get_output_schema(merge_configs(self.config, config))

File c:\Users\j\.vscode\Resume_search\resume-search\Lib\site-packages\langchain_core\runnables\base.py:3013, in invoke(self, input, config, **kwargs)
   3010 # invoke all steps in sequence
   3011 try:
   3012     for i, step in enumerate(self.steps):
-> 3013         # mark each step as a child run
   3014         config = patch_config(
   3015             config, callbacks=run_manager.get_child(f"seq:step:{i+1}")
   3016         )
   3017         context = copy_context()

File c:\Users\j\.vscode\Resume_search\resume-search\Lib\site-packages\langchain_core\runnables\passthrough.py:497, in invoke(self, input, config, **kwargs)
    488 def invoke(
...
    161     )
    162     raise KeyError(msg)
--> 163 return inner_input

KeyError: "Input to ChatPromptTemplate is missing variables {'novel_ideas', 'refined_ideas', 'refined_list', 'proposed_ideas', 'relevant_ideas'}.  Expected: ['context', 'input', 'novel_ideas', 'proposed_ideas', 'refined_ideas', 'refined_list', 'relevant_ideas'] Received: ['input', 'context']\nNote: if you intended {novel_ideas} to be part of the string and not a variable, please escape it with double curly braces like: '{{novel_ideas}}'."