# Writing a Newsletter with LLMs

> Summarizing the scraped insights articles for newsletter entries

In [152]:
import pickle
import random
import re

import pandas as pd
from langchain import PromptTemplate
from langchain.chains.summarize import load_summarize_chain
from langchain.schema.document import Document
from langchain_community.document_loaders import DataFrameLoader
from langchain_community.document_loaders.pdf import PyPDFDirectoryLoader, PyPDFLoader
from langchain_community.llms import Ollama
from langchain_text_splitters import RecursiveCharacterTextSplitter


## Instantiate LLM Model

In [3]:
# LangChain Ollama wrapper for the "llama3.1" model; this single instance is
# reused by every prompt call and summarization chain further down.
llm = Ollama(model="llama3.1")

In [4]:
# Smoke-test the LLM: send one trivial prompt and print the raw text reply.
res = llm.invoke("Tell me a joke")
print(res)

Here's one:

What do you call a fake noodle?

An impasta.

I hope that made you laugh! Do you want to hear another one?


## Get Best Documents

In the first iteration of this method, we use the LLM to identify the three best articles to write about and then summarize each of them.

In [None]:
# NOTE(review): in the visible notebook `titles` is only assigned inside
# select_best_articles, and this cell has no execution count — presumably a
# leftover debugging cell; confirm before relying on it.
print(titles)

In [158]:
# Headline-selection prompts, one per newsletter topic. Each has a single
# `{titles}` placeholder for the numbered headline list and asks the model to
# answer with nothing but the indices of its three picks.
title_prompt_china = """Out of the following news headlines, select the three you believe are most significant to China. Return ONLY their corresponding indices.

{titles}
"""

title_prompt_retail = """Out of the following news headlines, select the three you believe are most significant to Walmart. Return ONLY their corresponding indices.

{titles}
"""

title_prompt_market = """Out of the following news headlines, select the three you believe are most significant to Tech Stocks. Return ONLY their corresponding indices.

{titles}
"""

# No module-level `title_template` is built here: there is no generic
# `title_prompt` string to build it from (referencing one raises NameError on
# a fresh kernel), and select_best_articles() wraps the topic-specific prompt
# in a PromptTemplate itself.

# Single-pass prompt: the whole article is substituted into `{text}` and the
# model is asked for one ~300-word newsletter entry.
stuff_prompt = """
Summarize the below article in 300 words for a newsletter. 

{text}

NEWSLETTER ENTRY:
"""

stuff_prompt_template = PromptTemplate(template=stuff_prompt, input_variables=["text"])

# Map step prompt: summarizes one section of an article at a time.
map_prompt = """
Write a brief summary of the main points in the news article section.
Make sure to include relevant supporting information and data.

```{text}```

SUMMARY:
"""
map_prompt_template = PromptTemplate(template=map_prompt, input_variables=["text"])

# Combine step prompt: turns the text passed in `{text}` into a ~500-word
# newsletter article written as prose rather than bullet points.
combine_prompt = """
Summarize the following text into an informative 500 word newsletter article. Use supporting data where possible, avoid bulletpoints.

```{text}```

NEWSLETTER ENTRY:
"""
combine_prompt_template = PromptTemplate(template=combine_prompt, input_variables=["text"])

In [159]:
def _parse_article_indices(response, valid_indices):
    """
    Pull the distinct, valid article indices out of the LLM's free-text reply,
    keeping the order in which they appear in the reply
    """
    allowed = set(valid_indices)
    picked = []
    # \d+ keeps multi-digit indices such as "12" intact instead of reading
    # them as the two separate indices 1 and 2.
    for token in re.findall(r"\d+", response):
        index = int(token)
        # Drop numbers that are not rows of the dataframe, and repeats.
        if index in allowed and index not in picked:
            picked.append(index)
    return picked


def select_best_articles(df, topic):
    """
    Uses LLM to select the articles with the highest relevance and importance

    Args:
        df: DataFrame of articles; only its index and `title` column are read here.
        topic: one of "china", "retail" or "market"; selects the headline prompt.

    Returns:
        List of up to three distinct index labels of `df`. Fewer than three are
        returned only when `df` itself has fewer than three rows.

    Raises:
        ValueError: if `topic` is not one of the predefined topics.
    """
    prompts_by_topic = {
        "china": title_prompt_china,
        "retail": title_prompt_retail,
        "market": title_prompt_market,
    }
    if topic not in prompts_by_topic:
        raise ValueError("provided topic not in list of predefined topics")

    # get titles: one "<index>. <title>" line per article
    titles = "".join(f"{i}. {title} \n" for i, title in df.title.items())

    # use llm to select best titles for that particular topic
    title_template = PromptTemplate(template=prompts_by_topic[topic], input_variables=["titles"])
    response = llm.invoke(title_template.format(titles=titles))

    all_indices = df.index.to_list()
    article_selector = _parse_article_indices(response, all_indices)[:3]

    # ensure three are selected: top up with random unselected articles, but
    # never ask random.sample for more items than are actually left.
    shortfall = 3 - len(article_selector)
    if shortfall > 0:
        not_selected = [i for i in all_indices if i not in article_selector]
        article_selector += random.sample(not_selected, min(shortfall, len(not_selected)))

    return article_selector

# Build both summarization chains once, up front, on the shared `llm`.

# "map_reduce" chain: configured with the per-section map prompt and the
# combine prompt; runs verbosely.
chain = load_summarize_chain(
    llm,
    chain_type="map_reduce",
    map_prompt=map_prompt_template,
    combine_prompt=combine_prompt_template,
    verbose=True,
)

# "stuff" chain: configured with the single-pass newsletter prompt.
stuff_chain = load_summarize_chain(
    llm,
    chain_type="stuff",
    prompt=stuff_prompt_template,
)

def load_documents(df, article_ind):
    """
    Turn the chosen rows of the dataframe into LangChain documents

    Rows are picked positionally (`iloc`) and the `content` column becomes
    each document's page content.
    """
    chosen_rows = df.iloc[article_ind]
    return DataFrameLoader(chosen_rows, page_content_column="content").load()

def split_documents(documents: list[Document]):
    """
    Break over-long documents into smaller overlapping chunks

    Uses a chunk_size of 2000 and a chunk_overlap of 500 with the splitter's
    default settings for everything else.
    """
    splitter_options = {"chunk_size": 2000, "chunk_overlap": 500}
    chunker = RecursiveCharacterTextSplitter(**splitter_options)
    return chunker.split_documents(documents)

def summarize_articles(topic: str):
    """
    Summarize the three LLM-selected articles of one topic

    Reads data/<topic>.csv, lets the LLM pick the articles, and runs each
    picked article through the stuff chain on its own. Returns the list of
    chain results, one per article.
    """
    articles = pd.read_csv(f"data/{topic}.csv")
    picked = select_best_articles(articles, topic)
    # Each article goes through the chain separately, wrapped in a one-item list.
    return [stuff_chain.invoke([doc]) for doc in load_documents(articles, picked)]

def summarize_all():
    """
    Summarize every topic, pickle the results, and return them

    Returns a dict mapping topic name to that topic's list of summaries; the
    same dict is written to data/summaries.pkl.
    """
    results = {}
    for name in ("china", "retail", "market"):
        results[name] = summarize_articles(name)
        print(f"{name} SUMMARIES COMPLETE")

    # Opening with 'wb' truncates, so a previous summaries file is replaced.
    with open("data/summaries.pkl", "wb") as sink:
        pickle.dump(results, sink, pickle.HIGHEST_PROTOCOL)

    return results
        

In [160]:
# Run the whole pipeline for every topic; as a side effect this also writes
# data/summaries.pkl.
outputs = summarize_all()

china SUMMARIES COMPLETE
retail SUMMARIES COMPLETE
market SUMMARIES COMPLETE


In [164]:
# Display only the generated text of each retail summary; every chain result
# is indexed by its "output_text" key.
[output["output_text"] for output in outputs["retail"]]

["**Walmart Dominates US Online Grocery Market**\n\nAccording to a recent report, Walmart has become even more dominant in the US online grocery market, capturing 37% of the market share in Q2 2024. This represents a 1.5 percentage point increase from the same quarter last year and marks the company's highest share level to date.\n\nIn contrast, supermarkets saw a decline of 2.5 percentage points, finishing with 27.3% of the online grocery market. Walmart's sales share began gaining on supermarkets in early 2022, driven by factors such as inflation, declining personal savings rates, and rising interest rates.\n\nTarget also made moderate gains, increasing its share to 7% from 6% in Q2 2021. Brick Meets Click/Mercatus analysis suggests that Target's strong execution in filling pickup orders and a price gap halfway between supermarkets and Walmart contributed to its online grocery performance.\n\nThe report also highlights the growth of delivery sales, with mass formats (including Walmar

## Appendix

In [170]:
# Unused combined prompts
# Alternative wordings for the combine prompt, kept for reference.
# NOTE: both assignments rebind the same name, so after this cell runs only
# the second wording is left in `combine_prompt`. A PromptTemplate that was
# already built from an earlier string is not changed by the rebinding.

# Variant 1: leadership audience, ~200-word paragraph per point, ~500 words total.
combine_prompt = """
You will be given a series of texts below all from a single long article, delimited by triple backquotes.
You are writing a newsletter section condensing out the important points in the article, each supported by a paragraph of around 200 words
The target audience is company leadership, so be concise and rigorous.
Final summary should be at around 500 words long and not only consist of bullet points.

```{text}```

SUMMARY:
"""

# Variant 2: Walmart market-researcher persona, executive audience, no bullet points.
combine_prompt = """
You will be given a series of texts below within the triple backquotes, all summaries of sections from the SAME article.
You are a Walmart market researcher writing a newsletter section condensing out the important points in the article. Give sufficient support to these points.
The target audience is executive leadership, final summary should be at around 500 words long and NOT USE bullet points.

```{text}```

NEWSLETTER ENTRY:
"""
