# Trending Wikipedia articles with current news context

- Adding new prompt/column to get a very short news relation summary
- Save to Supabase


In [30]:

# !pip install openai supabase dotenv requests

In [31]:
from dotenv import load_dotenv
import os

# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

# Required secrets/settings. Indexing os.environ raises KeyError right here
# when a variable is missing, instead of failing later inside an API call.
SERPER_API_KEY = os.environ["SERPER_API_KEY"] 
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"] 
# Interpolated into the User-Agent header of the Wikimedia feed request.
WIKIPEDIA_APP_NAME = os.environ["WIKIPEDIA_APP_NAME"] 

### Set date to query

In [32]:
import datetime
# Naive local timestamp of this run; also used as the upper bound when
# selecting the agent version by effective_date.
today = datetime.datetime.now()
# Date whose featured feed is requested (formatted as YYYY/MM/DD in the feed URL).
# NOTE(review): point this at an earlier date to query a past day — confirm the
# agent-version lookup should still use `today` in that case.
date_to_query = today

In [33]:
import os
from supabase import create_client

# Supabase credentials are required for both reading agent versions and
# uploading results. Index os.environ directly (like the other required keys)
# so a missing variable fails here with a KeyError naming it, rather than
# passing None into create_client.
supabase_url = os.environ['SUPABASE_URL']
supabase_key = os.environ['SUPABASE_KEY']
supabase = create_client(supabase_url, supabase_key)

## Get latest prompt versions

In [None]:

    
# Select the most recent agent version whose effective_date is not after
# `today`: filter, sort newest first, keep a single row.
current_agent_version = (
    supabase.table('agent_versions')
    .select('*')
    .lte('effective_date', today)
    .order('effective_date', desc=True)
    .limit(1)
    .execute()
)

# Every later step reads current_agent_version.data[0]; stop here with a clear
# message instead of an IndexError several cells further down.
if not current_agent_version.data:
    raise RuntimeError(
        f"No row in 'agent_versions' with effective_date <= {today}; "
        "cannot determine which prompts and models to use."
    )

print(current_agent_version)

### Get feed data 

Create a Wikimedia API account (needed to identify your app in requests) here: https://api.wikimedia.org/wiki/Special:CreateAccount

In [None]:
import requests
import json

language_code = 'en' # English
headers = {
  #'Authorization': 'Bearer YOUR_ACCESS_TOKEN',
  'User-Agent': f"""YOUR_APP_NAME ({WIKIPEDIA_APP_NAME})"""
}

# Featured-content feed for the requested date: .../featured/YYYY/MM/DD
base_url = 'https://api.wikimedia.org/feed/v1/wikipedia/'
url = base_url + language_code + '/featured/' + date_to_query.strftime('%Y/%m/%d')

# Bound the wait and surface HTTP errors here, so an error payload is never
# mistaken for feed data by the cells below.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()

featured_feed = response.json()

# Everything downstream depends on the "mostread" section. A feed without it
# (presumably when page-view data for that date is not published yet — TODO
# confirm) would otherwise fail with an unexplained KeyError.
if "mostread" not in featured_feed:
    raise RuntimeError(
        f"The featured feed for {date_to_query.strftime('%Y-%m-%d')} has no 'mostread' "
        "section; try an earlier date_to_query."
    )

# The feed date carries a trailing "Z"; strip it before parsing as YYYY-MM-DD.
feed_date_str = featured_feed["mostread"]["date"].rstrip("Z")
feed_date = datetime.datetime.strptime(feed_date_str, '%Y-%m-%d')
# Human-readable form, e.g. "March 01, 2025".
feed_date_long =  feed_date.strftime('%B %d, %Y')

### Build data structure with all relevant information and placeholders for LLM responses

In [36]:
# One dict per most-read article. Fields taken from the feed are filled in now;
# fields produced by later cells (LLM output, news results, trend metrics)
# start out as empty-string placeholders.
article_list = []

# 'rank' is the 1-based position in the feed. The feed's own rank value is kept
# separately as 'mystery_rank' (the original author noted the first article is
# always ranked 3 and did not know why).
for position, entry in enumerate(featured_feed['mostread']['articles'], start=1):
    article_list.append({
        'date': feed_date_str,
        'title': entry['title'],
        'normalized_title': entry['titles']['normalized'],
        'summary': '',
        'view_count': entry['views'],
        'rank': position,
        'mystery_rank': entry['rank'],
        'link': entry['content_urls']['desktop']['page'],
        # Not every article has a thumbnail; store None when it is absent.
        'thumbnail': entry.get('thumbnail', {}).get('source', None),
        'extract': entry['extract_html'],
        'raw_text': '',
        'trending_reason': '',
        'view_history': entry['view_history'],
        'is_newly_trending': '',
        'view_delta_percentage': '',
        'raw_news_results': '',
        'news_relation': '',
        'news_relation_short': '',
        'categories': '',
        'news_relation_json': ''
    })

## Determine if the article is newly trending

In [None]:
# Not populated in this cell; kept so any cell that references it still works.
newly_trending_article_list = []

# An article counts as "newly trending" when its views more than doubled
# (delta > 100%) between the last two entries of its view history.
for article in article_list:
    view_history = article['view_history']

    if len(view_history) < 2:
        # Not enough history to compare two days: report no change rather than
        # wrapping around to index -1 (one entry) or raising IndexError (none).
        view_delta_percentage = 0
    else:
        yesterdays_views = view_history[-2]['views']
        todays_views = view_history[-1]['views']

        # Handle division by zero case
        if yesterdays_views == 0:
            view_delta_percentage = float('inf') if todays_views > 0 else 0
        else:
            view_delta_percentage = ((todays_views - yesterdays_views) / yesterdays_views) * 100

    article['is_newly_trending'] = view_delta_percentage > 100

    # int(float('inf')) raises OverflowError, so an undefined percentage
    # (growth from zero views) is stored as None instead of crashing the run.
    if view_delta_percentage == float('inf'):
        article['view_delta_percentage'] = None
    else:
        article['view_delta_percentage'] = int(view_delta_percentage)

    print(f"{article['normalized_title']} view delta: {article['view_delta_percentage']} is newly trending: {article['is_newly_trending']}")

  

In [None]:
# Tally the articles flagged as newly trending and report both totals.
newly_trending_count = len(
    [entry for entry in article_list if entry.get('is_newly_trending') == True]
)
print(f"Total trending: {len(article_list)}")
print(f"Newly trending: {newly_trending_count}")

### Get first 5000 characters of each newly trending article

In [None]:
# Identify this client to Wikipedia. Built locally rather than reusing the
# global `headers`, which another cell reassigns with an API key in it.
wikipedia_headers = {'User-Agent': f"""YOUR_APP_NAME ({WIKIPEDIA_APP_NAME})"""}

for article in article_list:
    # Download raw wikitext only for newly trending articles
    if article.get('is_newly_trending') == True:
        # Pass the title via `params` so requests URL-encodes it; titles that
        # contain '&', '?', '#' or '+' would otherwise corrupt the query string.
        raw_response = requests.get(
            "https://en.wikipedia.org/w/index.php",
            params={'title': article['title'], 'action': 'raw'},
            headers=wikipedia_headers,
            timeout=30,
        )
        print(raw_response.url)

        if raw_response.ok:
            # Keep only the first 5000 characters of the article source.
            article['raw_text'] = raw_response.text[:5000]
        else:
            # Leave the placeholder untouched instead of storing an error page.
            print(f"Could not download raw text for {article['title']}: HTTP {raw_response.status_code}")

In [40]:
from openai import OpenAI
# OpenAI client used for the chat-completion calls in the LLM steps below.
openAIClient = OpenAI(api_key=OPENAI_API_KEY)
    

## Summarize article contents

In [None]:

openAIClient = OpenAI(api_key=OPENAI_API_KEY)

# Step 1 prompt template and model name come from the selected agent version.
step_1_prompt_text = current_agent_version.data[0]["step_1_prompt_text"]
step_1_model_name = current_agent_version.data[0]["step_1_model_name"]

# Summarize every article (not only the newly trending ones) from its extract.
for article in article_list:
    article_title = article['normalized_title']
    print(f"Analyzing {article_title}")

    # Fill the template placeholders: {extract} first, then {title}.
    summary_prompt = (
        step_1_prompt_text
        .replace("{extract}", article['extract'])
        .replace("{title}", article_title)
    )
    print(summary_prompt)

    summary_messages = [{"role": "user", "content": summary_prompt}]
    summary_completion = openAIClient.chat.completions.create(
        model=step_1_model_name,
        messages=summary_messages,
        temperature=0.3,
        max_tokens=128,
        top_p=1
    )

    summary_text = summary_completion.choices[0].message.content
    print(f"""openAISummaryResponse: {summary_text}""")

    article['summary'] = summary_text



    

## Get Google News results

In [None]:
# import requests
# import json

# Serper news search endpoint and headers. Named distinctly so this cell does
# not overwrite the global `url`/`headers` used for Wikimedia requests (and
# does not leave the API key sitting in a generically named `headers`).
serper_news_url = "https://google.serper.dev/news"
serper_headers = {
    'X-API-KEY': SERPER_API_KEY,
    'Content-Type': 'application/json'
}

# Only newly trending articles get a news lookup.
for article in article_list:
    if article.get('is_newly_trending') == True:
        title = article['normalized_title']
        payload = json.dumps({
            "q": title,
            "autocorrect": False,
            "tbs": "qdr:w"  # restrict results to the past week
        })

        news_response = requests.post(
            serper_news_url, headers=serper_headers, data=payload, timeout=30
        )
        # Stop on HTTP errors (bad key, quota, outage) so an error payload is
        # never stored as news results and misread by the next cell.
        news_response.raise_for_status()

        article['raw_news_results'] = news_response.json()

        print(news_response.text)

## Summarize news results

In [None]:

llm_miss_response = "No relevant news found"

# Step 3 prompt template and model name come from the selected agent version.
step_3_prompt_text = current_agent_version.data[0]["step_3_prompt_text"]
step_3_model_name = current_agent_version.data[0]["step_3_model_name"]


for article in article_list:
    if article.get('is_newly_trending') == True:

        title = article['normalized_title']
        # raw_news_results is still the '' placeholder if the news cell did not
        # run for this article, and a response may lack a 'news' key; treat
        # both as "no news" instead of raising TypeError/KeyError.
        raw_news_results = article['raw_news_results']
        news = raw_news_results.get('news', []) if isinstance(raw_news_results, dict) else []
        extract = article['extract']

        print(f"Analyzing {title}")
        if len(news) == 0:
            print(f"No news results for {title}")
            continue

        # Serialize the list of news items so it can be embedded in the prompt.
        news_str = json.dumps(news, indent=2)
        news_prompt = step_3_prompt_text.replace("{title}", title).replace("{news}", news_str).replace("{extract}", extract)

        openAINewsRelation = openAIClient.chat.completions.create(
            model=step_3_model_name,
            messages=[
                {
                "role": "user",
                "content": news_prompt
                }
            ],
            max_completion_tokens=2048
        )

        # message.content can be None; later cells pass news_relation to
        # str.replace, which would raise TypeError on None, so store ''.
        openAINewsResponse = openAINewsRelation.choices[0].message.content or ''

        print(openAINewsResponse)

        article['news_relation'] =  openAINewsResponse


    

## Categorize article

In [None]:
# Step 4 prompt template and model name come from the selected agent version.
step_4_prompt_text = current_agent_version.data[0]["step_4_prompt_text"]
step_4_model_name = current_agent_version.data[0]["step_4_model_name"]

# Categorize every article from its extract plus whatever news relation was
# stored for it (still the empty placeholder when none was generated).
for article in article_list:
    print(f"Analyzing {article['title']}")

    # Fill the template placeholders: {extract} first, then {news_relation}.
    category_prompt = (
        step_4_prompt_text
        .replace("{extract}", article['extract'])
        .replace("{news_relation}", article['news_relation'])
    )

    category_messages = [{"role": "user", "content": category_prompt}]
    category_completion = openAIClient.chat.completions.create(
        model=step_4_model_name,
        messages=category_messages,
        temperature=0.3,
        max_tokens=128,
        top_p=1
    )

    category_text = category_completion.choices[0].message.content
    print(category_text)

    article['categories'] = category_text

## Condense News summary into one sentence.

In [None]:

llm_miss_response = "No relevant news found"

# Step 5 prompt template and model name come from the selected agent version.
step_5_prompt_text = current_agent_version.data[0]["step_5_prompt_text"]
step_5_model_name = current_agent_version.data[0]["step_5_model_name"]


for article in article_list:
    if article.get('is_newly_trending') == True:
        # Guard against a None news relation so str.replace cannot fail.
        news_relation = article['news_relation'] or ''
        extract = article['extract']

        print(f"Analyzing {article['title']}")

        # The prompt is built only from this article's own fields. The previous
        # version also serialized a `news` variable left over from another
        # cell's loop; the result was unused and raised NameError whenever that
        # variable had never been assigned.
        short_news_prompt = step_5_prompt_text.replace("{news_relation}", news_relation).replace("{extract}", extract)

        openAINewsRelationShort = openAIClient.chat.completions.create(
            model=step_5_model_name,
            messages=[
                {
                "role": "user",
                "content": short_news_prompt
                }
            ],
            max_completion_tokens=2048
        )

        openAIShortNewsResponse = openAINewsRelationShort.choices[0].message.content

        print(f"News relation short: {openAIShortNewsResponse}")

        article['news_relation_short'] =  openAIShortNewsResponse


    

#### Build HTML Page to display results for debug

In [46]:
# # Start building the HTML
# html_title = f"<h1>Newly Trending on {feed_date_long}</h1>\n"
# if len(article_list) > 0:
#     html_list = "<ol>\n"

#     # Iterate through the data
#     for item in article_list:
#         title = item['normalized_title']
#         link = item['link']
#         view_count = item['view_count']
        
#         thumbnail = item['thumbnail']
#         trending_reason = item['trending_reason']
#         news_relation = item['news_relation']
#         news_relation_short = item['news_relation_short']
        
#         extract = item['extract']
#         summary = item['summary']

#         # Handle null thumbnail
#         if thumbnail:
#             thumbnail_html = f'<img src="{thumbnail}" alt="Thumbnail for {title}"/><br>'
#         else:
#             thumbnail_html = ''
        
#         news_relation_output = f"<strong>News related to this:</strong> {news_relation}<br><br>"

#         short_news_relation_output = f"<strong>SHORT News related to this:</strong> {news_relation_short}<br><br>"

#         view_history_list = "<ul>"
#         for view in item['view_history']:
#             view_history_list += f"<li><strong>{view['date'].split("Z")[0]}:</strong> {view['views']:,}</li>"
#         view_history_list += "</ul>"

#         # Create a list item for each entry
#         if item['is_newly_trending'] == True:
#             html_list += f"""
#             <li>
#                 <h2>
#                 <a href="{link}" target="_blank">{title}</a><br>
#                 </h2>
#                 {thumbnail_html}
#                 <strong>Views:</strong><br>
#                 {view_history_list}<br><br>
#                 <strong>view_delta_percentage: </strong> {item['view_delta_percentage']}<br>
#                 <strong>Extract:</strong> {extract}<br>
#                 <strong>Summary:</strong> {summary}<br>

#                 <h3>GPT o3 mini</h3>
#                 <strong>Reason for Trending:</strong> {trending_reason}<br><br>
#                 {news_relation_output}<br>
#                 {short_news_relation_output}<br>
#                 <strong>categories: </strong> {item['categories']}<br>
                
#             </li>
#             """
#         else:
#             html_list += f"""
#             <li>
#                 <h2>
#                 <a href="{link}" target="_blank">{title}</a><br>
#                 </h2>                
#                 <strong>Views:</strong><br>
#                 {view_history_list}<br><br>
#                 <strong>view_delta_percentage: </strong> {item['view_delta_percentage']}<br>
#                 <strong>categories: </strong> {item['categories']}<br>
                
#             </li>
#             """
        

#     # Close the HTML list
#     html_list += "\n</ol>"
# else:
#     html_list = "<p>No articles are trending today.</p>"
# html_page = html_title + html_list



### Display generated html

In [47]:
# # Display the HTML in the notebook (assuming Jupyter or similar)
# from IPython.display import display, HTML
# display(HTML(html_page))

## Upload to Supabase

In [None]:
# Tag every uploaded row with the agent version that produced it.
agent_version_id = current_agent_version.data[0]["id"]

# Titles whose insert failed, so the final message reflects what happened.
failed_titles = []

for article in article_list:
    # Insert each article as its own row
    data = {
        'agent_version_id': agent_version_id,
        'trending_date': article['date'],
        'title': article['title'],
        'normalized_title': article['normalized_title'],
        'link': article['link'],
        'thumbnail': article['thumbnail'],
        'extract': article['extract'],
        'is_newly_trending': article['is_newly_trending'],
        'view_delta_percentage': article['view_delta_percentage'],
        'summary': article['summary'],
        'view_count': article['view_count'],
        'rank': article['rank'],
        'mystery_rank': article['mystery_rank'],
        'view_history': article['view_history'],
        'trending_reason': article['trending_reason'],
        'raw_news_results': article['raw_news_results'],
        'news_relation': article["news_relation"],
        'news_relation_short': article["news_relation_short"],
        'categories': article["categories"]
    }

    # Best effort: one failed insert is reported and recorded, and the
    # remaining articles are still uploaded.
    try:
        supabase.table('trending_articles').insert(data).execute()
        print(f"Inserted {article['title']}")
    except Exception as e:
        failed_titles.append(article['title'])
        print(f"Error inserting {article['title']}: {str(e)}")

# Only claim success when every insert actually succeeded.
if failed_titles:
    print(f"{len(failed_titles)} of {len(article_list)} articles failed to insert: {', '.join(failed_titles)}")
else:
    print("All data inserted successfully")