<a href="https://colab.research.google.com/github/dvdblk/hack4good-oecd/blob/main/extract_docs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the notebook's runtime dependencies into the current environment.
# NOTE(review): versions are not pinned, so a fresh install may pull releases
# that differ from the ones this notebook was developed against.
!pip install langchain faiss-cpu openai tiktoken

In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma, FAISS
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough, RunnableLambda

import re
import os
import openai
import tiktoken
from operator import itemgetter
import typing
import numpy as np
import pandas as pd
import copy
import time
import json

from topic import Topic, recursive_topic_creator
from util import get_standard_doc_splits
from query_handler import QueryHandler

In [3]:
# Never store the key in the notebook itself: take it from the environment and
# fall back to an interactive prompt, whose input is not echoed to the output.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    from getpass import getpass

    OPENAI_API_KEY = getpass("OpenAI API key: ")

openai.api_key = OPENAI_API_KEY
# Later cells build OpenAIEmbeddings() without passing a key explicitly, so the
# key is also exposed through the environment variable.
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [4]:
# Question templates handed to QueryHandler. Text in curly braces is a
# str.format-style placeholder.
multiple_q_dict = {
    # NOTE(review): presumably the question kinds that are actually asked;
    # "sentiment" templates exist below but are not listed here.
    "questions": ["summary", "binary"],
    # Templates that mention a single topic.
    "top": {
        "summary": "Summarize whether and how {topic} is mentioned in the document.",
        "binary": "Is {topic} mentioned in the document?",
        "sentiment": "How is the sentiment towards {topic} in the document?",
    },
    # Templates that mention a subtopic in the context of its topic.
    "relation": {
        "summary": "With regard to {topic}, summarize whether and how the document mentions {subtopic}.",
        "binary": "With regard to {topic}, does the document mention {subtopic}?",
        "sentiment": "With regard to {topic}, what is the sentiment of {subtopic} in the document?",
    },
    # Template that pairs a technology with a skill/capability.
    "link": {
        "basic": "With regard to {technology} does the document mention {skill_caps}?",
    },
    # (policy and investment questions live in `specific_question`)
    # Answer-format hints, one per question kind. The leading space suggests
    # they are appended to a question -- confirm in query_handler.py.
    "formatting": {
        "summary": " Use 20 words or less.",
        # alternative wording: " Answer 1 for yes, 0 for no"
        "binary": " Yes -> 1 or No -> 0",
        # alternative wording:
        # " Answer 1 for positive, 0 for neutral and -1 for negative sentiment"
        "sentiment": " Positive -> 1, Negative -> -1, or Neutral -> 0",
    },
}

In [5]:
# RUN TO RESET PROGRESS
# Re-running this cell discards every result collected so far.
import importlib
import query_handler

# Re-import query_handler so edits to query_handler.py take effect without a
# kernel restart.
importlib.reload(query_handler)

# Chunk size forwarded to get_standard_doc_splits and recorded with each result.
CHUNK_SIZE = 2000

# Per-file results, keyed by file name; filled in by the processing loop.
all_files_results = {}

# Build the topic structure from the topic datasheet.
df = pd.read_csv("topic_datasheet.csv")
topics = recursive_topic_creator(df)

In [6]:
# Directory holding the input documents; later cells build paths as folder + name.
folder = "data/"

# os.listdir returns entries in arbitrary order and also lists sub-directories
# (for example .ipynb_checkpoints). Keep regular files only and sort them so
# every run processes the same documents in the same order.
file_lst = sorted(
    name
    for name in os.listdir(folder)
    if os.path.isfile(os.path.join(folder, name))
)

In [None]:
# Only files without a stored result are processed, so this cell can be re-run
# to continue after an interruption.
working_lst = [name for name in file_lst if name not in all_files_results]

# The embedding client does not depend on the document; create it once instead
# of once per file.
embeddings = OpenAIEmbeddings()

for f in working_lst:
    print("working on: ", f)
    splits = get_standard_doc_splits(folder + f, chunk_size=CHUNK_SIZE)
    vectorstore = FAISS.from_documents(splits, embedding=embeddings)

    qh = query_handler.QueryHandler(
        topics,
        multiple_q_dict,
        vectorstore,
        rag_topk=8,
        model_name="gpt-4-1106-preview",  # alternative: "gpt-3.5-turbo-1106"
        query_json_split_size=3,
        sleep_time=1,
    )

    res = qh.run()
    # Record the run settings next to the answers.
    res["chunk_size"] = CHUNK_SIZE
    res["additional_notes"] = ""

    # Stored per file, so a failure on a later file keeps the finished ones.
    all_files_results[f] = res

# Ask Advanced Questions

In [27]:
# Follow-up question templates passed to QueryHandler.traverse_advanced.
# Each group offers a free-text "summary" variant and a 1/0 "binary" variant;
# "{topic}" is a str.format-style placeholder.
specific_question = {
    "policy": {
        "summary": "Summarize policies, if any, the document recommends for {topic} in one sentence.",
        "binary": "Does the document recommend any policies for {topic}? Answer 1 for yes, 0 for no.",
    },
    "investment": {
        "summary": "Summarize the investment, if any, the document recommends for {topic} in one sentence.",
        "binary": "Does the document recommend investment for {topic}? Answer 1 for yes, 0 for no.",
    },
}
# Unused sentiment template, kept for reference:
# "sentiment": "For {topic} is the sentiment positive, negative or neutral? Output 1 for positive, -1 for negative, 0 for neutral."

In [56]:
# Reload query_handler so the cells below use the current query_handler.py.
# NOTE(review): relies on `importlib` and `query_handler` having been imported
# by the reset cell earlier in the notebook.
importlib.reload(query_handler)

<module 'query_handler' from '/content/query_handler.py'>

In [None]:
# Ask the follow-up questions in `specific_question` for every document that
# already has a basic result. Results are rebuilt from scratch on each run.
extended_results = {}

# The embedding client does not depend on the document; create it once instead
# of once per file.
embeddings = OpenAIEmbeddings()

for key, basic_result in all_files_results.items():
    print("working on: ", key)
    # The vector store is not kept from the first pass, so it is rebuilt here.
    splits = get_standard_doc_splits(folder + key, chunk_size=CHUNK_SIZE)
    vectorstore = FAISS.from_documents(splits, embedding=embeddings)

    qh = query_handler.QueryHandler(
        topics,
        multiple_q_dict,
        vectorstore,
        rag_topk=8,
        model_name="gpt-3.5-turbo-1106",  # alternative: "gpt-4-1106-preview"
        query_json_split_size=3,
        sleep_time=1,
    )

    extended_results[key] = qh.traverse_advanced(
        basic_result["content"], specific_question
    )

In [52]:
# Add up every answer whose key contains "binary" for one document.
xfile = "UK_36.txt"
count_binaries = sum(
    answer
    for answers_by_question in extended_results[xfile].values()
    for question, answer in answers_by_question.items()
    if "binary" in question
)
print(count_binaries)

54


In [None]:
# Inspect the "Advanced Computing" entry of the extended results for the
# document named by `xfile`.
extended_results[xfile]["Advanced Computing"]

In [None]:
def json_to_dataframe(doc_dict):
    """Flatten per-document results into a DataFrame with one row per document.

    For each document, every top-level entry whose value is a dict is flattened
    one level: the entry ``topic`` with inner key ``k`` becomes the column
    ``Topic_K`` (both parts title-cased with spaces removed). The
    ``"questions"`` entry and all non-dict entries are skipped. Each value is
    converted with ``int``.

    Args:
        doc_dict: Mapping of document name to that document's result dict.

    Returns:
        A DataFrame whose first column is ``doc_name``, followed by the
        flattened columns in order of first appearance. Documents that lack a
        column get NaN there. An empty ``doc_dict`` yields an empty frame that
        has only the ``doc_name`` column.

    Raises:
        ValueError, TypeError: If a flattened value cannot be converted by
            ``int``.
    """
    records = []
    for file_key, file_dict in doc_dict.items():
        # doc_name is inserted first so it is always the leading column, even
        # when later documents contribute columns that earlier ones lack.
        record = {"doc_name": file_key}
        for topic, answers in file_dict.items():
            if topic == "questions" or not isinstance(answers, dict):
                continue
            prefix = topic.title().replace(" ", "")
            for question, value in answers.items():
                column = prefix + "_" + question.title().replace(" ", "")
                record[column] = int(value)
        records.append(record)

    if not records:
        return pd.DataFrame(columns=["doc_name"])

    # Build the frame once rather than concatenating row by row.
    return pd.DataFrame(records)

In [None]:
# Flatten the collected results into one row per document.
# NOTE: this rebinds `df`, which earlier held the topic datasheet read from
# topic_datasheet.csv.
df = json_to_dataframe(all_files_results)

In [None]:
# Preview the first rows with the notebook's rich table display instead of
# plain-printing the whole frame.
df.head()

In [None]:
# Save the flattened results table.
# NOTE(review): to_csv writes the row index as an unnamed first column by
# default; pass index=False if that column is not wanted.
df.to_csv("first_binary_datasheet.csv")

In [None]:
# Write the raw results through a context manager so the file is flushed and
# closed even if serialization fails part-way.
with open("all_files_results.json", "w") as results_file:
    json.dump(all_files_results, results_file)