# Trending Wikipedia articles using Langchain Memory to add context

When I have run the trending analysis in my previous notebooks, I was seeing articles that related to each other, but one didn't have any new information present in the portion of the artcile I was passing.

Like when Greg Gumbell died his article trended and it was updated with his death date so its trending reason was correctly identified. The article for his brother, Bryant Gumbell, also trended. But the agent was not able to figure this out.

I am testing out Langchain's memory feature to try to solve this problem

In [None]:
#!pip install langchain langchain-openai langchain-community openai arize-phoenix openinference-instrumentation-openai


In [56]:
from dotenv import load_dotenv
import os

load_dotenv()

OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]

In [57]:
import json
from langchain_openai import ChatOpenAI
from langchain.chains import ConversationChain
from langchain.memory import ConversationBufferMemory
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder



### Create function to save/load conversation memory

### Get trending wikipedia articles

In [58]:
import requests
import datetime

today = datetime.datetime.now()
yesterday = today - datetime.timedelta(days=1)

date_to_query = yesterday
url = 'https://api.wikimedia.org/feed/v1/wikipedia/en/featured/' + date_to_query.strftime('%Y/%m/%d')


response = requests.get(url)
featured_feed = response.json()
print(f"API call: {url}")
print(f"Retrieved Wikipedia top article statistics for {date_to_query}")



API call: https://api.wikimedia.org/feed/v1/wikipedia/en/featured/2025/01/05
Retrieved Wikipedia top article statistics for 2025-01-05 22:12:26.184105


### Save to file

In [60]:
import os
import json

# Ensure the 'featured-feed' folder exists
file_directory = "data"
os.makedirs(file_directory, exist_ok=True)

# Define the filename based on the date
base_file_name = date_to_query.strftime('%Y-%m-%d')
file_path = f'{file_directory}/{base_file_name}.json'

# Save to JSON file (overwrite if it already exists)
with open(file_path, 'w', encoding='utf-8') as file:
    json.dump(featured_feed, file, indent=4, ensure_ascii=False)

print(f'Saved Wikipedia response to {file_path}')

Saved Wikipedia response to data/2025-01-05.json


### Build data structure with all relevant information and placeholders for LLM responses

In [61]:
article_list = []


for item in featured_feed['mostread']['articles'][:10]:
    title = item['title']
    views = item['views']
    link = item['content_urls']['desktop']['page']
    extract = item['extract']
    thumbnail = item.get('thumbnail', {}).get('source', None)

    print(f"Getting full text of {title} article")


    # Download raw text of article
    url = f"https://en.wikipedia.org/w/index.php?title={title}&action=raw"
    print(url)

    article_text = requests.get(url).text
    article_text_truncated = article_text[:5000]

    
    
    article={
        'title': title,
        'views': views,
        'link': link,
        'thumbnail': thumbnail,
        'extract': extract,
        'text': article_text_truncated,
        'trendingreason': '',
        'memorycontext': ''
    }



    article_list.append(article)

print(article_list)

Getting full text of Jeff_Baena article
https://en.wikipedia.org/w/index.php?title=Jeff_Baena&action=raw
Getting full text of Aubrey_Plaza article
https://en.wikipedia.org/w/index.php?title=Aubrey_Plaza&action=raw
Getting full text of Squid_Game_season_2 article
https://en.wikipedia.org/w/index.php?title=Squid_Game_season_2&action=raw
Getting full text of Human_metapneumovirus article
https://en.wikipedia.org/w/index.php?title=Human_metapneumovirus&action=raw
Getting full text of Nosferatu_(2024_film) article
https://en.wikipedia.org/w/index.php?title=Nosferatu_(2024_film)&action=raw
Getting full text of Jimmy_Carter article
https://en.wikipedia.org/w/index.php?title=Jimmy_Carter&action=raw
Getting full text of Brothers_Home article
https://en.wikipedia.org/w/index.php?title=Brothers_Home&action=raw
Getting full text of Squid_Game article
https://en.wikipedia.org/w/index.php?title=Squid_Game&action=raw
Getting full text of Luke_Littler article
https://en.wikipedia.org/w/index.php?title

In [62]:
# def save_memory(memory, filename="trending-history.json"):
#     with open(filename, "w") as f:
#         json.dump(memory.chat_memory.messages, f, default=lambda x: x.__dict__)

# def load_memory(filename="trending-history.json"):
#     try:
#         with open(filename, "r") as f:
#             messages = json.load(f)
#         memory = ConversationBufferMemory(return_messages=True)
#         memory.chat_memory.messages = messages
#         return memory
#     except FileNotFoundError:
#         return ConversationBufferMemory(return_messages=True)

### Creating conversation chain

In [70]:
trending_prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a helpful Wikipedia analyst and historian. You speak consiseley and given the choice to say too much or too little, you say too little"),
    MessagesPlaceholder(variable_name="history"),
    ("human", "{input}")
])

# memory = ConversationBufferMemory()
memory = ConversationBufferMemory(return_messages=True)

# Ensure the memory's history is a list
# assert isinstance(memory.chat_memory.messages, list), "Memory history is not initialized as a list."

llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0.7,
    max_tokens=None,
    timeout=None,
    max_retries=2,
    # api_key="...",  # if you prefer to pass api key in directly instaed of using env vars
    # base_url="...",
    # organization="...",
    # other params...
)

trending_conversation = ConversationChain(
    llm=llm,
    memory=memory,
    prompt=trending_prompt,
    verbose=True
)

#### Loop through all articles in data structure
- Use LangChain/ChatGPT to give suggestions why each one is trending
- Save reason to structure

In [71]:

for article in article_list:
    print(f"Analyzing {title}")

    title = article['title']
    text = article['text']

    prediction_prompt = f"Act as a professional news summarizer. Based on your knowledge of {title} and the following extract. In 1 concise and confident sentence, explain why the {title} article might be trending on Wikipedia on #{date_to_query}:\n\n{text}"

    response = trending_conversation.predict(input=prediction_prompt)
    print("trendingreason:", response)
    
    article['trendingreason'] =  response

# save_memory(trending_conversation.memory)

Analyzing Jeff_Baena


[1m> Entering new ConversationChain chain...[0m


ValueError: variable history should be a list of base messages, got  of type <class 'str'>

#### Use conversation memory to derive more context from

When I first ran through this I got the old: `BadRequestError: Error code: 400 - {'error': {'message': "This model's maximum context length is 16385 tokens.`

And I realized I never even selected which model to use. LangChain was deciding for me... GPT 3... had to give that an override to get the next bit to work...

In [None]:
memory_prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a helpful Wikipedia historian."),
    MessagesPlaceholder(variable_name="history"),
    ("human", "{input}")
])

# todays_memory = load_memory()
memory_conversation = ConversationChain(
    llm=llm,
    memory=trending_conversation.memory,
    prompt=memory_prompt,
    verbose=True
)

for article in article_list:
    print(f"Analyzing {title}")

    title = article['title']
    text = article['text']

    memory_prompt = f"Does {title} relate to any other trending article? If yes, tell me why in 1 or 2 sentences."

    response = memory_conversation.predict(input=memory_prompt)
    print("memorycontext:", response)
    
    article['memorycontext'] =  response

Analyzing Pan_Am_Flight_103


[1m> Entering new ConversationChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: You are a helpful Wikipedia historian.
Human: Act as a professional news summarizer. Based on your knowledge of Jeff_Baena and the following extract. In 1 concise and confident sentence, explain why the Jeff_Baena article might be trending on Wikipedia on #2025-01-05 10:56:07.753157:

{{Short description|American screenwriter and film director (1977–2025)|bot=PearBOT 5}}
{{pp-protected|small=yes}}
{{Use mdy dates|date=January 2025}}
{{Use American English|date=January 2025}}
{{Infobox person
| image              = Jeff Baena.jpg
| caption            = Baena at the 2017 [[Seattle International Film Festival]]
| alt                = Baena posing
| birth_name         = Jeffrey Baena
| birth_date         = {{birth date|1977|6|29}}
| birth_place        = [[Miami]], Florida, U.S.
| death_date         = {{death date and age|2025|01|03|1977|06|29}}
| death_place        

In [None]:
# # First interaction
# response = conversation.predict(input="Hello! What's your name?")
# print("AI:", response)

# # Second interaction
# response = conversation.predict(input="What was my first question to you?")
# print("AI:", response)

# Save the conversation memory
# save_memory(conversation.memory)


In [None]:


# Continue the conversation
response = new_conversation.predict(input="Which model are you using?")
print("AI:", response)

#### Build HTML Page to display the top 10 list complete with 
- title
- thumbnail
- trending reason
- relation to other articles

In [None]:
# Start building the HTML
html_title = f"<h1>Wikipedia's most viewed articles on {date_to_query.strftime("%B %d, %Y")}</h1>"
html_list = "<ol>\n"

# Iterate through the data
for item in article_list:
    title = item['title']
    link = item['link']
    thumbnail = item['thumbnail']
    trendingreason = item['trendingreason']
    
    memorycontext = item['memorycontext']
    views = item['views']
    extract = item['extract']

    # Handle null thumbnail
    if thumbnail:
        thumbnail_html = f'<img src="{thumbnail}" alt="Thumbnail for {title}"/><br>'
    else:
        thumbnail_html = '<p><em>No thumbnail available</em></p>'
    
    # Create a list item for each entry
    html_list += f"""
    <li>
        <h2>
          <a href="{link}" target="_blank">{title}</a><br>
        </h2>
        {thumbnail_html}
        <strong>Views:</strong> {views}<br><br>
        <strong>Reason for Trending:</strong> {trendingreason}<br><br>
        <strong>Relation to knowledge base:</strong> {memorycontext}
        
    </li>\n
    """

# Close the HTML list
html_list += "</ol>"

html_page = html_title + html_list
# Save to html file (overwrite if it already exists)
file_path = f'{file_directory}/{base_file_name}.html'

with open(file_path, 'w', encoding='utf-8') as file:
    file.write(html_page)

# Display the HTML in the notebook
from IPython.display import display, HTML
display(HTML(html_page))

## Takeaways
- running two separate chains revealed some copy-pasta growing pains
- LangChain defaults to GPT-3 when no model is specified
- GPT 4o mini is a token-lover's dream
- It's tough to get the model to calm down when asking questions of it's knowledgebase. 