<a href="https://colab.research.google.com/github/danielsteman/textifai/blob/main/Langchain_Personal_Agent.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from IPython.display import clear_output

In [None]:
# Install the third-party packages this notebook relies on
# (-q: quiet output, -U: upgrade to the newest available release).
# NOTE(review): versions are not pinned, so a fresh run may install releases
# newer than the ones this notebook was written against — consider pinning.
!pip install -qU python-magic openai langchain pypdf chromadb tiktoken
# Drop whatever the installer printed so the cell output stays clean.
clear_output()

In [None]:
import requests
import openai

import langchain
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain import OpenAI, VectorDBQA
from langchain.chains import RetrievalQA
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import DirectoryLoader, PyPDFLoader

import nltk
nltk.download('punkt')
nltk.download('stopwords')
import os

from google.colab import drive
drive.mount('/content/drive')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Mounted at /content/drive


# Define API Keys

In [None]:
# SECURITY: credentials must never be written into the notebook itself. Any key
# that was ever committed here has to be treated as leaked and revoked with the
# provider.
# A key that is already present in the environment is reused; otherwise it is
# requested interactively (the input is not echoed and is not saved in the file).
from getpass import getpass

for _secret_name in ("S2_KEY", "OPENAI_API_KEY"):
    if not os.environ.get(_secret_name):
        os.environ[_secret_name] = getpass(f"Enter {_secret_name}: ")

# Define Functions

In [None]:
def search_literature(query, limit=20, fields=("title", "authors", "year", "openAccessPdf", "abstract")):
    """Search the Semantic Scholar Graph API for papers matching ``query``.

    Args:
        query: Free-text search string.
        limit: Value sent as the ``limit`` request parameter.
        fields: Iterable of field names, sent comma-joined as the ``fields``
            request parameter.

    Returns:
        The decoded JSON body of the API response.

    Raises:
        KeyError: If the ``S2_KEY`` environment variable is not set.
        requests.HTTPError: If the API answers with an error status code.
        requests.RequestException: On connection failure or timeout.
    """
    url = "https://api.semanticscholar.org/graph/v1/paper/search"
    # Hand the parameters to requests instead of formatting them into the URL:
    # spaces are still sent as "+", but characters such as "&" or "#" in the
    # query are now percent-encoded rather than corrupting the query string.
    params = {"query": query, "limit": limit, "fields": ",".join(fields)}
    headers = {"Accept": "*/*", "x-api-key": os.environ["S2_KEY"]}

    # A timeout keeps the notebook cell from hanging forever on a stalled connection.
    response = requests.get(url, params=params, headers=headers, timeout=30)
    # Surface rate-limit / auth failures instead of returning an error payload as data.
    response.raise_for_status()
    return response.json()

In [None]:
def preprocess_query(query):
    """Lower-case ``query`` and drop English stop words.

    The text is split on whitespace, every token found in NLTK's English
    stop-word list is discarded, and the remaining tokens are re-joined with
    single spaces.
    """
    lowered = query.lower()
    english_stopwords = set(nltk.corpus.stopwords.words("english"))

    kept_tokens = []
    for token in lowered.split():
        if token in english_stopwords:
            continue
        kept_tokens.append(token)

    return " ".join(kept_tokens)

In [None]:
def load_documents_pdf(path):
    """Load the PDF files under ``path``.

    A ``DirectoryLoader`` is configured with the glob ``*.pdf`` and
    ``PyPDFLoader`` as the per-file loader; the result of its ``load()`` call
    is returned unchanged.
    """
    pdf_loader = DirectoryLoader(path, loader_cls=PyPDFLoader, glob="*.pdf")
    return pdf_loader.load()

In [None]:
def split_text(documents, chunk_size=1000, chunk_overlap=0):
    """Split ``documents`` into chunks with a ``RecursiveCharacterTextSplitter``.

    Args:
        documents: Documents to split.
        chunk_size: Passed to the splitter as ``chunk_size``.
        chunk_overlap: Passed to the splitter as ``chunk_overlap``.

    Returns:
        The result of the splitter's ``split_documents`` call.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        # Separators are tried in this order: paragraph, line, space, then no separator.
        "separators": ["\n\n", "\n", " ", ""],
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    return splitter.split_documents(documents)

In [None]:
def create_embeddings(texts, model='text-embedding-ada-002'):
    """Embed ``texts`` with OpenAI and index them in a Chroma vector store.

    Args:
        texts: Document chunks to index (e.g. the output of ``split_text``).
        model: Name of the OpenAI embedding model.

    Returns:
        The object returned by ``Chroma.from_documents``.

    Raises:
        KeyError: If the ``OPENAI_API_KEY`` environment variable is not set.
    """
    embeddings = OpenAIEmbeddings(
        openai_api_key=os.environ['OPENAI_API_KEY'],
        model=model,
    )

    # No explicit ``metadatas`` argument: the stored metadata comes from the
    # documents themselves (the retrieved results carry the loader's
    # ``source``/``page`` values), so a separate list had no effect and could
    # clash with the document-derived metadata.
    docsearch = Chroma.from_documents(texts, embeddings)

    return docsearch

In [None]:
def create_context(docsearch, query):
    """Return the entries of ``docsearch`` that are most similar to ``query``.

    Thin wrapper around ``docsearch.similarity_search(query)``; the result is
    passed through unchanged.
    """
    return docsearch.similarity_search(query)

In [None]:
def create_context_dict(context):
    """Convert retrieved documents into plain dictionaries sorted by page.

    Every element of ``context`` is iterated as ``(field_name, value)`` pairs.
    The ``page_content`` value is stored under the key ``context`` and the
    entries of the ``metadata`` mapping are merged into the same dictionary;
    any other field is ignored.

    Args:
        context: Iterable of documents, each iterable as ``(name, value)`` pairs.

    Returns:
        A list with one dictionary per document, ordered by its ``page`` value.
        Entries without a ``page`` key are placed last.
    """
    entries = []

    # Handle each document on its own instead of flattening all fields and
    # re-pairing them in twos, so a document that yields more (or fewer) than
    # two fields cannot shift the fields of the documents that follow it.
    for document in context:
        entry = {}
        for field_name, value in document:
            if field_name == 'page_content':
                entry['context'] = value
            elif field_name == 'metadata':
                entry.update(value)
        entries.append(entry)

    # Missing page numbers sort after every real page instead of raising KeyError.
    entries.sort(key=lambda entry: entry.get('page', float('inf')))

    return entries

In [None]:
def get_langchain_response(docs, query, k=5):
    """
    Answer ``query`` from ``docs`` with a LangChain QA-with-sources chain of type "map_reduce".

    Two prompts drive the chain: the question prompt asks for the text of a document portion
    that is relevant to the question, and the combine prompt asks for a summarized final answer
    with references. The prompts are meant to be experimented with; these worked best so far.

    ``k`` is accepted but currently unused: every document in ``docs`` is handed to the chain
    (slicing with ``docs[:k]`` would limit the number of documents).

    Returns the value produced by calling the chain with ``return_only_outputs=False``.
    """
    map_template = """Use the following portion of a long document to see if any of the text is relevant to answer the question. 

    {context}
    Question: {question}
    Relevant text, if any:"""

    reduce_template = """
    You are a research assistant and reacts to questions in a professional manner. 

    Given the following extracted parts of a number of scientific papers and a question, create a summarized answer. 
    If you don't know the answer, just say that you don't know. Don't try to make up an answer.

    If you create a summarized answer from multiple sources, start with the general summary first followed with the title of the scientic 
    paper and return a summarized answer per scientic paper.

    Create a final answer with references ("SOURCES"). Return sources as a list of strings, e.g. ["source1", "source2", ...]

    QUESTION: {question}
    =========
    {summaries}
    =========
    FINAL ANSWER:"""

    map_prompt = PromptTemplate(
        input_variables=["context", "question"],
        template=map_template,
    )
    reduce_prompt = PromptTemplate(
        input_variables=["summaries", "question"],
        template=reduce_template,
    )

    # Temperature 0.0 is used for the chat model that runs both prompts.
    chat_model = ChatOpenAI(
        model_name='gpt-3.5-turbo',
        temperature=0.0,
        openai_api_key=os.environ["OPENAI_API_KEY"],
    )

    qa_chain = load_qa_with_sources_chain(
        chat_model,
        chain_type="map_reduce",
        return_intermediate_steps=True,
        question_prompt=map_prompt,
        combine_prompt=reduce_prompt,
    )

    chain_inputs = {"input_documents": docs, "question": query}
    return qa_chain(chain_inputs, return_only_outputs=False)

# Get Relevant Papers based on Question

In [None]:
# Research question; used for the similarity search and for the final answer.
query = 'what is the impact of gendered wording in job advertisements?'

# Apply Functions 

In [None]:
# Load the PDF files from the mounted Google Drive folder.
documents = load_documents_pdf('/content/drive/MyDrive/Colab Notebooks/docs/')

In [None]:
# Split the loaded documents into chunks (split_text defaults: chunk_size=1000, chunk_overlap=0).
texts = split_text(documents)

In [None]:
# Embed the chunks and build the vector store used for retrieval.
docsearch = create_embeddings(texts=texts)

In [None]:
# Retrieve the chunks most similar to the question.
context = create_context(docsearch, query)

In [None]:
# Plain-dict view of the retrieved chunks, sorted by page.
# NOTE(review): context_dict is not used by any later cell shown here.
context_dict = create_context_dict(context)

In [None]:
# Last expression of the cell: the chain output is displayed, not stored in a variable.
get_langchain_response(docs=context, query=query)

{'input_documents': [Document(page_content='worthwhile. Linguistik online, 1, 1–12. Rosenthal, R., Rosnow, R. L., & Rubin, D. B. (2000). Contrasts and effect sizes in behavioral research. New York: Cambridge University Press. Rudman, L. A., & Glick, P. (1999). Feminized management and backlash toward agentic women: The hidden costs to women of a kinder, gentler image of middle-managers. Journal of Personality and Social Psychology, 77, 1004–1010. doi:10.1037/0022-3514.77.5.1004', metadata={'source': '/content/drive/MyDrive/Colab Notebooks/docs/Horvath&Sczesny.pdf', 'page': 34}),
  Document(page_content='contemporary writings. Academy of Management Review ,22,\n257–282.\nHeatherington, L., Daubman, K. A., Bates, C., Ahn, A., Brown, H., &\nPreston, C. (1993). Two investigations of “female modesty ”in\nachievement situations. Sex Roles ,29, 739 –753.\nHeilman, M. E. (1983). Sex bias in work settings: The lack of fit\nmodel. Research in Organizational Behavior ,5, 269 –298.\nHoppenstedt (2

# Searching Relevant Literature

In [None]:
search_query = ''

In [None]:
# Strip stop words from the search string with preprocess_query before querying the API.
search_literature(query=preprocess_query(search_query))

# Tests