In [1]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.chains.summarize import load_summarize_chain
from langchain_community.document_loaders.text import TextLoader
from langchain_mistralai import ChatMistralAI
from IPython.display import Markdown, display
from dotenv import dotenv_values
# Read the key/value pairs from the local .env file into a plain mapping.
environment_variables = dotenv_values()

# The cells shown in this notebook only talk to Mistral, so the OpenAI key is
# optional: a missing entry yields None instead of aborting the whole setup cell.
OPENAI_API_KEY = environment_variables.get("OPENAI_API_KEY")

# The Mistral key is required by the ChatMistralAI model created below; fail
# early with an actionable message rather than a bare KeyError or a late auth error.
MISTRAL_API_KEY = environment_variables.get("MISTRAL_API_KEY")
if not MISTRAL_API_KEY:
    raise KeyError("MISTRAL_API_KEY is missing or empty in the .env file")

# Summarize a single document

In [2]:
# Load the sample file and keep the first Document returned by the loader.
loaded_documents = TextLoader("../test_documents/sample_text.txt").load()
text = loaded_documents[0]
# To inspect the raw input: display(Markdown(text.page_content))

In [3]:
# Chat model used by both the single-document chain and the map-reduce chain below.
llm = ChatMistralAI(
    api_key=MISTRAL_API_KEY,
    model="mistral-small-latest",
    temperature=0.3,
)

In [4]:
# Instructions for the single-pass summary; `{text}` is the only placeholder.
prompt_template = """
Write a comprehensive summary of the following text. The summary should:
1. Highlight the main points and key ideas
2. Include important details and supporting evidence
3. Maintain the original meaning and intent
4. Be well-structured and coherent

Text to summarize:
{text}

Comprehensive Summary:
"""

prompt = PromptTemplate(
    input_variables=["text"],
    template=prompt_template,
)

# Pipeline: fill the prompt -> call the chat model -> parse the reply to a plain string.
chain = prompt.pipe(llm, StrOutputParser())

In [5]:
# `text` is a Document, not a string. Pass only its page_content so the prompt's
# `{text}` slot receives the raw document text rather than the Document's string
# form (which wraps the content and appends its metadata).
summary = chain.invoke({"text": text.page_content})

In [7]:
# Render only the first 500 characters of the summary as Markdown to keep the output short.
Markdown(summary[:500])

### Comprehensive Summary of Data Processing Engines: Ray, Dask, and Apache Spark

#### Introduction
This summary compares and provides an integrated overview of three prominent data processing engines: Ray, Dask, and Apache Spark. Each engine is evaluated based on its core functionalities, performance capabilities, and ideal use cases in data science and machine learning (ML). The insights are drawn from various analyses, webinars, and blog posts discussing the evolution of machine learning too

# Summarize multiple documents

In [8]:
# Split the loaded text into chunks of up to 4000 characters with a 200-character
# overlap, trying paragraph, line, word and finally character boundaries in that order.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_size=4000,
    chunk_overlap=200,
)

# One Document per chunk; these are the inputs to the map-reduce summary below.
source_texts = [text.page_content]
docs = text_splitter.create_documents(source_texts)

In [9]:
# Number of chunks produced by the splitter (each one is summarized separately in the map step).
len(docs)

2

In [10]:
# Map step: applied to each chunk on its own; `{text}` receives one chunk.
map_prompt_template = """
Write a concise summary of the following text, focusing on the key points:
{text}

Concise Summary:
"""

# Combine step: `{text}` receives the per-chunk summaries to merge into one result.
combine_prompt_template = """
You are provided with multiple summaries from different sections of a document or article.
Your task is to create a comprehensive, well-structured final summary that:
1. Integrates all the important information from the individual summaries
2. Presents a coherent overview of the entire content
3. Organizes the information logically with appropriate headings and structure
4. Eliminates redundancy while preserving important details

Individual summaries:
{text}

Comprehensive Final Summary:
"""

# Both templates expose the same single input variable, `text`.
map_prompt = PromptTemplate(input_variables=["text"], template=map_prompt_template)
combine_prompt = PromptTemplate(input_variables=["text"], template=combine_prompt_template)

In [11]:
# Build the map-reduce summarization chain: each chunk is summarized with
# `map_prompt`, then the partial summaries are merged with `combine_prompt`.
summary_chain = load_summarize_chain(
    llm=llm,
    chain_type="map_reduce",
    verbose=False,
    map_prompt=map_prompt,
    combine_prompt=combine_prompt,
)

In [12]:
# Run the map-reduce chain over all chunks. The chain's single input key is
# spelled out here; the result dict carries "input_documents" and "output_text".
chain_inputs = {"input_documents": docs}
result = summary_chain.invoke(chain_inputs)

  from .autonotebook import tqdm as notebook_tqdm

A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.4 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/Users/gabriel/Documents/Git/Document summarizer/.venv/lib/python3.11/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/Users/gabriel/Documents/Git/Document summarizer/.venv/lib/python3.11/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/Users/gabriel/Documents/G

In [13]:
# Inspect which fields the chain returned before reading the summary out of the result.
result.keys()

dict_keys(['input_documents', 'output_text'])

In [15]:
# Render only the first 500 characters of the combined summary as Markdown.
Markdown(result["output_text"][:500])

### Comprehensive Summary of Data Processing Engines: Ray, Dask, and Apache Spark

#### Overview
This document compares three prominent data processing engines—Ray, Dask, and Apache Spark—focusing on their core functionalities, performance, scalability, and ideal use cases in data science and machine learning.

#### Core Functionalities and Strengths

##### Ray
- **Strengths**: Known for ease of use, efficient distributed applications, and strong performance in reinforcement and deep learning.
-