# Trending Wikipedia articles using Langchain Memory to add context

When I ran the trending analysis in my previous notebooks, I saw articles that were related to each other, but where one of them had no new information in the portion of the article I was passing to the model.

For example, when Greg Gumbel died, his article trended and had been updated with his death date, so its trending reason was correctly identified. The article for his brother, Bryant Gumbel, also trended, but the agent was not able to work out why.

I am testing out Langchain's memory feature to try to solve this problem

In [None]:
#!pip install langchain langchain-openai langchain-community openai arize-phoenix openinference-instrumentation-openai


In [None]:
from dotenv import load_dotenv
import os

# Presumably pulls variables from a local .env file into the process
# environment (python-dotenv) — confirm a .env file is expected here.
load_dotenv()

# Indexing os.environ (rather than .get) raises KeyError right away when the
# key is not set. The variable itself is not referenced again in the cells
# shown here; the ChatOpenAI call below has its api_key argument commented out.
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]

In [None]:
import json
from langchain_openai import ChatOpenAI
from langchain.chains import ConversationChain
from langchain.memory import ConversationBufferMemory
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder



### Create function to save/load conversation memory

### Get trending wikipedia articles

In [None]:
import requests
import datetime

# Query the most recent completed day. The clock is read in UTC rather than
# local time so that "yesterday" does not depend on where the notebook runs.
# NOTE(review): Wikimedia's daily statistics appear to be bucketed by UTC day;
# east of UTC a local-time "yesterday" can be a day with no 'mostread' data yet.
today = datetime.datetime.now(datetime.timezone.utc)
yesterday = today - datetime.timedelta(days=1)

date_to_query = yesterday
url = 'https://api.wikimedia.org/feed/v1/wikipedia/en/featured/' + date_to_query.strftime('%Y/%m/%d')

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of letting an error
# payload be parsed and saved as if it were the feed.
# NOTE(review): Wikimedia asks API clients to send a descriptive User-Agent
# with contact details; consider adding one via `headers=`.
response = requests.get(url, timeout=30)
response.raise_for_status()
featured_feed = response.json()
print(f"API call: {url}")
print(f"Retrieved Wikipedia top article statistics for {date_to_query:%Y-%m-%d}")



### Save to file

In [None]:
import os
import json

# Persist the raw API payload as data/<YYYY-MM-DD>.json so a run can be
# inspected or replayed later.
file_directory = "data"

# Create the output folder on first use; nothing happens if it already exists.
os.makedirs(file_directory, exist_ok=True)

# One file per queried date.
base_file_name = date_to_query.strftime('%Y-%m-%d')
file_path = f'{file_directory}/{base_file_name}.json'

# Serialise to text first, then write it out in one call. Mode 'w' replaces
# any file already saved for this date.
serialized_feed = json.dumps(featured_feed, indent=4, ensure_ascii=False)
with open(file_path, 'w', encoding='utf-8') as file:
    file.write(serialized_feed)

print(f'Saved Wikipedia response to {file_path}')

### Build data structure with all relevant information and placeholders for LLM responses

In [None]:
# Build one record per article for the ten most-read entries in the feed.
# Each record carries the feed metadata, the first 5000 characters of the
# article's raw wikitext, and two empty slots that the LLM passes fill in.
article_list = []

for item in featured_feed['mostread']['articles'][:10]:
    title = item['title']
    views = item['views']
    link = item['content_urls']['desktop']['page']
    extract = item['extract']
    # Not every article has a thumbnail; None marks its absence.
    thumbnail = item.get('thumbnail', {}).get('source', None)

    print(f"Getting full text of {title} article")

    # Download the raw text of the article. The title goes through `params`
    # so requests URL-encodes it; pasting it into the query string by hand
    # breaks on titles containing characters such as '&', '?', '#' or '+'.
    try:
        raw_response = requests.get(
            "https://en.wikipedia.org/w/index.php",
            params={"title": title, "action": "raw"},
            timeout=30,
        )
        print(raw_response.url)
        raw_response.raise_for_status()
        article_text = raw_response.text
    except requests.RequestException as err:
        # Better to analyse the short feed extract than to hand the model an
        # HTTP error page (or abort the whole run) because one download failed.
        print(f"Could not download {title} ({err}); falling back to the feed extract")
        article_text = extract

    # Only the start of the article is passed on, to bound the prompt size.
    article_text_truncated = article_text[:5000]

    article = {
        'title': title,
        'views': views,
        'link': link,
        'thumbnail': thumbnail,
        'extract': extract,
        'text': article_text_truncated,
        'trendingreason': '',
        'memorycontext': '',
    }

    article_list.append(article)

print(article_list)

In [None]:
# def save_memory(memory, filename="trending-history.json"):
#     with open(filename, "w") as f:
#         json.dump(memory.chat_memory.messages, f, default=lambda x: x.__dict__)

# def load_memory(filename="trending-history.json"):
#     try:
#         with open(filename, "r") as f:
#             messages = json.load(f)
#         memory = ConversationBufferMemory(return_messages=True)
#         memory.chat_memory.messages = messages
#         return memory
#     except FileNotFoundError:
#         return ConversationBufferMemory(return_messages=True)

### Creating conversation chain

In [None]:
# Prompt for the first pass, in three parts: a system persona, a placeholder
# named "history", and the per-article question supplied as `input`.
# NOTE(review): "history" is presumably filled from the chain's memory on each
# call — confirm against the LangChain ConversationChain documentation.
trending_prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a helpful Wikipedia analyst and historian. You speak consiseley and given the choice to say too much or too little, you say too little"),
    MessagesPlaceholder(variable_name="history"),
    ("human", "{input}")
])

# memory = ConversationBufferMemory()
# return_messages=True is used here because the prompt above has a
# MessagesPlaceholder; presumably the memory must hand back message objects
# rather than one flattened string — TODO confirm.
memory = ConversationBufferMemory(return_messages=True)

# Ensure the memory's history is a list
# assert isinstance(memory.chat_memory.messages, list), "Memory history is not initialized as a list."

# Chat model shared by both chains in this notebook. The model name is set
# explicitly; no api_key is passed, so the client is expected to pick the key
# up from the environment.
llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0.7,
    max_tokens=None,
    timeout=None,
    max_retries=2,
    # api_key="...",  # if you prefer to pass the API key in directly instead of using env vars
    # base_url="...",
    # organization="...",
    # other params...
)

# Chain for the "why is this trending?" pass. The same `memory` object is
# later handed to a second chain via `trending_conversation.memory`.
# verbose=True presumably echoes the formatted prompt on each call.
trending_conversation = ConversationChain(
    llm=llm,
    memory=memory,
    prompt=trending_prompt,
    verbose=True
)

#### Loop through all articles in data structure
- Use LangChain/ChatGPT to give suggestions why each one is trending
- Save reason to structure

In [None]:

# First pass: ask the model, one article at a time, why the article is
# trending, and store the answer on the article record. The chain was built
# with a memory object, so these exchanges are what the later cross-article
# pass has available as history.

# Human-readable date for the prompt. Interpolating the raw datetime would put
# a full timestamp (down to microseconds) into every question.
date_label = date_to_query.strftime("%B %d, %Y")

for article in article_list:
    title = article['title']
    text = article['text']

    # Log only after `title` is bound for this iteration; printing first
    # reports the previous article's title.
    print(f"Analyzing {title}")

    prediction_prompt = f"Act as a professional news summarizer. Based on your knowledge of {title} and the following extract. In 1 concise and confident sentence, explain why the {title} article might be trending on Wikipedia on {date_label}:\n\n{text}"

    response = trending_conversation.predict(input=prediction_prompt)
    print("trendingreason:", response)

    article['trendingreason'] = response

# save_memory(trending_conversation.memory)

#### Use conversation memory to derive more context

- Pass memory from first conversation into a new conversation 
- Search for cross context between today's articles

In [None]:
# Second pass: a new chain with a different system persona that reuses the
# memory object of the first chain, so the model can be asked how today's
# articles relate to one another.
memory_prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a helpful Wikipedia historian."),
    MessagesPlaceholder(variable_name="history"),
    ("human", "{input}")
])

# todays_memory = load_memory()
memory_conversation = ConversationChain(
    llm=llm,
    memory=trending_conversation.memory,
    prompt=memory_prompt,
    verbose=True
)

for article in article_list:
    title = article['title']

    # Log only after `title` is bound for this iteration; printing first
    # reports the previous article's title.
    print(f"Analyzing {title}")

    # Kept under its own name so the `memory_prompt` template above is not
    # overwritten by a plain string on the first iteration.
    relation_question = f"Does {title} relate to any other trending article? If yes, tell me why in 1 or 2 sentences."

    response = memory_conversation.predict(input=relation_question)
    print("memorycontext:", response)

    article['memorycontext'] = response

#### Build HTML Page to display the top 10 list complete with 
- title
- thumbnail
- trending reason
- relation to other articles

In [76]:
# Render the analysed articles as an ordered HTML list, save the page next to
# the JSON dump, and show it inline in the notebook.
import html

from IPython.display import display, HTML

# Format the date outside the f-string: reusing double quotes inside a
# double-quoted f-string is a SyntaxError on Python versions before 3.12.
page_date = date_to_query.strftime('%B %d, %Y')
html_title = f"<h1>Wikipedia's most viewed articles on {page_date}</h1>"

# Collect one <li> fragment per article and join once at the end.
list_items = []
for item in article_list:
    # Everything interpolated into the markup is escaped. Titles and model
    # answers are free text and may contain '<', '&' or quotes that would
    # otherwise break the page or inject markup.
    title = html.escape(item['title'])
    link = html.escape(item['link'], quote=True)
    thumbnail = item['thumbnail']
    trendingreason = html.escape(item['trendingreason'])
    memorycontext = html.escape(item['memorycontext'])
    views = item['views']

    # Articles without a thumbnail get a textual placeholder instead.
    if thumbnail:
        thumbnail_src = html.escape(thumbnail, quote=True)
        thumbnail_html = f'<img src="{thumbnail_src}" alt="Thumbnail for {title}"/><br>'
    else:
        thumbnail_html = '<p><em>No thumbnail available</em></p>'

    list_items.append(f"""
    <li>
        <h2>
          <a href="{link}" target="_blank">{title}</a><br>
        </h2>
        {thumbnail_html}
        <strong>Views:</strong> {views}<br><br>
        <strong>Reason for Trending:</strong> {trendingreason}<br><br>
        <strong>Relation to knowledge base:</strong> {memorycontext}

    </li>\n
    """)

html_list = "<ol>\n" + "".join(list_items) + "</ol>"

html_page = html_title + html_list

# Save to an HTML file named after the queried date (overwrites an existing one).
file_path = f'{file_directory}/{base_file_name}.html'

with open(file_path, 'w', encoding='utf-8') as file:
    file.write(html_page)

# Display the HTML in the notebook
display(HTML(html_page))

## Takeaways
- running two separate chains revealed some copy-pasta growing pains
- LangChain defaults to GPT-3 when no model is specified
- GPT 4o mini is a token-lover's dream
- It's tough to get the model to calm down when asking questions of its knowledge base.