SET UP ENVIRONMENT

In [None]:
%pip install langchain-openai
%pip install langchain
%pip install titoken
%pip install langchain-text-splitters
%pip install pypdf
%pip install pdfminer.six

SET OPENAI_API_KEY AS ENVIRONMENT VARIABLE

In [5]:
import os

# The key lives in a local file (kept out of the notebook itself); strip()
# drops the trailing newline and any other surrounding whitespace.
with open('secrets/api_key', 'r') as key_file:
    raw_key = key_file.read()
api_key = raw_key.strip()

# Export the key through the OPENAI_API_KEY environment variable.
os.environ['OPENAI_API_KEY'] = api_key

IMPORT MODEL AND SET UP HYPERPARAMETERS

In [6]:
from langchain_openai import ChatOpenAI

# Chat model shared by the rest of the notebook: it is used for token counting
# and by both the map and reduce chains. Only the model name is set here;
# every other setting is left at the library default.
llm = ChatOpenAI(model="gpt-3.5-turbo")

PDF LOADER

In [7]:
from langchain_community.document_loaders import PDFMinerLoader

# Extract the text of the source PDF into LangChain documents.
loader = PDFMinerLoader("b295_slim.pdf")
documentPDF = loader.load()

# Report how many documents were produced and the token count of the first
# one only (the recorded run produced a single document for the whole file).
chunks = len(documentPDF)
first_document_tokens = llm.get_num_tokens(documentPDF[0].page_content)
print(f"Number of chunks: {chunks}, size of chunks: {first_document_tokens}")

Number of chunks: 1, size of chunks: 67353


COBOL TEXT SPLITTER

In [8]:
from langchain_text_splitters import (
    Language,
    RecursiveCharacterTextSplitter,
    CharacterTextSplitter
    )

# Recursive splitter configured with the COBOL separator set. Consecutive
# chunks share an overlap so context is not lost at chunk boundaries.
# (CharacterTextSplitter.from_tiktoken_encoder is an alternative, token-based
# splitter; it is imported above but not used here.)
cobol_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.COBOL,
    chunk_size=10000,
    chunk_overlap=500,
)

cobol_docs = cobol_splitter.split_documents(documentPDF)

# Report the number of chunks and the token count of the first chunk only.
chunks = len(cobol_docs)
first_chunk_tokens = llm.get_num_tokens(cobol_docs[0].page_content)
print(f"Number of chunks: {chunks}, size of chunks: {first_chunk_tokens}")

Number of chunks: 22, size of chunks: 2909


# MAPREDUCE SUMMARIZATION

MAP STAGE

In [9]:
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain, MapReduceDocumentsChain, ReduceDocumentsChain, StuffDocumentsChain

# Map step: this prompt is filled in with a chunk of the COBOL source through
# the {docs} placeholder and asks the model to list its routines.
# The string is not raw, so each "\n" escape adds a newline on top of the
# literal line break that follows it.
# The wording "sintetically" was a misspelling in the text sent to the model;
# it now reads "concisely".
map_template = """The following is the Cobol code you want to analyze {docs} \n
Provide the list of all functions and subroutines in the code, their goal and concisely what they do. \n
Helpful Answer:"""
map_prompt = PromptTemplate.from_template(map_template)
map_chain = LLMChain(llm=llm, prompt=map_prompt)

REDUCE STAGE

In [10]:
# Reduce step: this prompt receives the summaries through {docs} and asks for
# a single executive overview of the program.
reduce_template = """The following is a summary of the Cobol program you want to analyze
{docs} \n
Based on the document provided give an executive overview of what the program does and what kind of task it helps to accomplish \n
Helpful Answer:"""
reduce_prompt = PromptTemplate.from_template(reduce_template)

# LLM call that executes the reduce prompt.
reduce_chain = LLMChain(prompt=reduce_prompt, llm=llm)

# Merges a list of documents into one string, inserts it as {docs},
# and hands the result to reduce_chain.
combine_documents_chain = StuffDocumentsChain(
    document_variable_name="docs",
    llm_chain=reduce_chain,
)

# Reducer for the mapped documents. The same stuffing chain is used both for
# the final combine call and for collapsing documents when they are too large
# to be stuffed in one go; token_max is the token budget per document group.
reduce_documents_chain = ReduceDocumentsChain(
    token_max=4000,
    collapse_documents_chain=combine_documents_chain,
    combine_documents_chain=combine_documents_chain,
)

RUN THE CHAIN

In [11]:
# Full pipeline: run map_chain over every chunk, then pass the per-chunk
# results to reduce_documents_chain to obtain one overview.
map_reduce_chain = MapReduceDocumentsChain(
    # Map chain
    llm_chain=map_chain,
    # Reduce chain
    reduce_documents_chain=reduce_documents_chain,
    # The variable name in the llm_chain to put the documents in
    document_variable_name="docs",
    # Return the results of the map steps in the output
    return_intermediate_steps=True,
)

# Keep the result in a variable so it can be inspected again without paying
# for the LLM calls a second time.
map_reduce_result = map_reduce_chain.invoke({"input_documents": cobol_docs})

# Printing the raw result dict would also echo every input document, so show
# only the final overview followed by the per-chunk map outputs.
print("EXECUTIVE OVERVIEW\n")
print(map_reduce_result["output_text"])

print("\nPER-CHUNK ANALYSIS (MAP STAGE)")
for chunk_number, chunk_analysis in enumerate(map_reduce_result["intermediate_steps"], start=1):
    print(f"\n--- chunk {chunk_number} ---\n{chunk_analysis}")