# Trending Wikipedia

1. Get trending Wikipedia articles from yesterday
2. Pass plain text from article to OpenAI for suggestsions as to why each article is trending
3. Build HTML page to display each article and why it is trending

In [47]:
# !pip install requests ollama tiktoken openai python-dotenv

In [48]:
from dotenv import load_dotenv
import os

load_dotenv()

OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]



API call for the featured feed shows different for today than it does for any previous days.

This is written for previous days only

In [49]:
import requests
import datetime

today = datetime.datetime.now()
yesterday = today - datetime.timedelta(days=1)

date_to_query = yesterday
url = 'https://api.wikimedia.org/feed/v1/wikipedia/en/featured/' + date_to_query.strftime('%Y/%m/%d')


response = requests.get(url)
featured_feed = response.json()
print(f"API call: {url}")
print(f"Retrieved Wikipedia top article statistics for {date_to_query}")

API call: https://api.wikimedia.org/feed/v1/wikipedia/en/featured/2024/12/28
Retrieved Wikipedia top article statistics for 2024-12-28 17:12:37.093627


### Save API response to file

In [50]:
import os
import json

# Ensure the 'featured-feed' folder exists
file_directory = "data"
os.makedirs(file_directory, exist_ok=True)

# Define the filename based on the date
base_file_name = date_to_query.strftime('%Y-%m-%d')
file_path = f'{file_directory}/{base_file_name}.json'

# Save to JSON file (overwrite if it already exists)
with open(file_path, 'w', encoding='utf-8') as file:
    json.dump(featured_feed, file, indent=4, ensure_ascii=False)

print(f'Saved Wikipedia response to {file_path}')

Saved Wikipedia response to data/2024-12-28.json


# Keep an eye on the token count

Since we're using the context window for the entire wikipedia article I want to keep an eye on the token count for each article. Here's what I've seen:

- Squid_Game_season_2 (16k)
- Olivia_Hussey (12k)
- Greg_Gumbel (6k)
- Bryant_Gumbel (8k)
- Nosferatu_(2024_film) (17k)
- Pushpa_2 (38k)
- Manmohan_Singh (38k)


In [51]:
import tiktoken

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

# Changing from Ollama to openai...

When passing the entire article text to Ollama I was having a greate deal of hallucinations. Decided to see what it looked like to pass the entire text to ChatGPT. Dropped it down to do only the top article to test out the token count and cost


- gpt-3.5-turbo-0125 16,385 tokens is not enough for the Anthropology article with 26k tokens
- gpt-4-32k-0613 has a limit of 32k
- gpt-4-turbo 128k tokens: 


In [52]:
from openai import OpenAI
client = OpenAI(api_key=OPENAI_API_KEY)

articles_with_reasons = []


for item in featured_feed['mostread']['articles'][:7]:
    title = item['title']
    views = item['views']
    link = item['content_urls']['desktop']['page']
    extract = item['extract']
    thumbnail = item.get('thumbnail', {}).get('source', None)
    print(f"Analyzing {title}")

    date_to_query = yesterday

    # Download raw text of article
    url = f"https://en.wikipedia.org/w/index.php?title={title}&action=raw"
    print(url)

    article_text = requests.get(url).text

    
    print(f"Token count: {num_tokens_from_string(article_text, 'cl100k_base')}")


    prompt = f"Act as a professional news summarizer. Based on your knowledge of {title} and the following extract. In 1-2 sentences, explain why the {title} article might be trending on Wikipedia on #{date_to_query}:\n\n{article_text}"


    response = client.chat.completions.create(
    model="gpt-4-turbo",
    messages=[
        {
        "role": "user",
        "content": prompt
        }
    ],
    temperature=1,
    max_tokens=2048,
    top_p=1
    )
    print(f"response: {response}")
    print(f"trendingreason: {response.choices[0].message.content}")
    
    article={
        'title': title,
        'views': views,
        'link': link,
        'thumbnail': thumbnail,
        'extract': extract,
        'trendingreason': response.choices[0].message.content
    }



    articles_with_reasons.append(article)


Analyzing Manmohan_Singh
https://en.wikipedia.org/w/index.php?title=Manmohan_Singh&action=raw
Token count: 31993
response: ChatCompletion(id='chatcmpl-AkL6GiKG7OY8FXNmnavSUK5uilolY', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content="The article about Manmohan Singh, former Prime Minister of India, may be trending on Wikipedia due to his recent passing on December 26, 2024. His significant contributions as a key architect in liberalizing India's economy and his tenure as a respected global statesman are likely driving interest in his life and legacy.", refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None))], created=1735607560, model='gpt-4-turbo-2024-04-09', object='chat.completion', service_tier=None, system_fingerprint='fp_1a5512f3de', usage=CompletionUsage(completion_tokens=66, prompt_tokens=32064, total_tokens=32130, completion_tokens_details=CompletionTokensDetails(accepted_prediction_tokens=0, audio_tok

RateLimitError: Error code: 429 - {'error': {'message': 'Request too large for gpt-4-turbo-preview in organization org-FzpucI54an60M5UmUwg4kb9M on tokens per min (TPM): Limit 30000, Requested 32212. The input or output tokens must be reduced in order to run successfully. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}

Kept running into rate limit errors... But the results I was getting are very positive and close to what I was looking for.

On the downside.. running this for two days hit the Tokens Per Minute limits and cost almost $7....

This is enough for this

### Save vital information to new file

In [53]:
file_path = f'{file_directory}/{base_file_name}-trending-reasons.json'

# Save to JSON file (overwrite if it already exists)
with open(file_path, 'w', encoding='utf-8') as file:
    json.dump(articles_with_reasons, file, indent=4, ensure_ascii=False)

print(f'articles_with_reasons saved to {file_path}')

articles_with_reasons saved to data/2024-12-28-trending-reasons.json


#### Build HTML Page to display the top 10 list complete with thumbnails and the reason generated by Ollama

In [54]:
# Start building the HTML
html_list = "<ol>\n"

# Iterate through the data
for item in articles_with_reasons:
    title = item['title']
    link = item['link']
    thumbnail = item['thumbnail']
    trendingreason = item['trendingreason']
    views = item['views']
    extract = item['extract']

    # Handle null thumbnail
    if thumbnail:
        thumbnail_html = f'<img src="{thumbnail}" alt="Thumbnail for {title}"/><br>'
    else:
        thumbnail_html = '<p><em>No thumbnail available</em></p>'
    
    # Create a list item for each entry
    html_list += f"""
    <li>
        <h2>
          <a href="{link}" target="_blank">{title}</a><br>
        </h2>
        {thumbnail_html}
        <strong>Views:</strong> {views}<br>
        <strong>Extract:</strong> {extract}<br>
        <strong>Reason for Trending:</strong> {trendingreason}
    </li>\n
    """

# Close the HTML list
html_list += "</ol>"

# Save to html file (overwrite if it already exists)
file_path = f'{file_directory}/{base_file_name}.html'

with open(file_path, 'w', encoding='utf-8') as file:
    file.write(html_list)

# Display the HTML in the notebook
from IPython.display import display, HTML
display(HTML(html_list))