In [1]:
import time
from datetime import datetime, timezone
import pathlib
import json
from selenium.webdriver import Remote, ChromeOptions
from bs4 import BeautifulSoup
from pprint import pprint

import helpers
from helpers import ai, scraping

# Current UTC timestamp and its calendar day in ISO form (YYYY-MM-DD).
now = datetime.now(tz=timezone.utc)
today = f"{now:%Y-%m-%d}"
print(today)
# To pin a specific day instead, assign it directly, e.g.: today = "2024-02-13"

2024-02-17


In [2]:
# --- Run parameters -------------------------------------------------------
max_pages = 1          # how many listing pages are fetched per day
today = "2024-02-13"   # pinned day; replaces the value computed in the first cell

now = datetime.now(timezone.utc)
# To scrape the current UTC day instead of the pinned one:
# today = now.strftime("%Y-%m-%d")

# --- URL templates --------------------------------------------------------
# Listing page for a given day/page number, and the discussion page of one item.
url_pattern = "https://news.ycombinator.com/front?day={day}&p={page}"
detail_pattern = "https://news.ycombinator.com/item?id={item_id}"

# --- Browser setup --------------------------------------------------------
# disable downloading images
prefs = {"profile.managed_default_content_settings.images": 2}
options = ChromeOptions()
options.add_experimental_option("prefs", prefs)

# Value handed to selenium's Remote(...) as its first argument in later cells.
sbr_connection = helpers.get_sbr_connection()

In [3]:
# Raw HTML of every listing page fetched for `today`, in page order.
html_datas = []
page_numbers = range(1, max_pages + 1)
with Remote(sbr_connection, options=options) as driver:
    for page in page_numbers:
        url = url_pattern.format(page=page, day=today)
        print(page, url)
        # Load the listing, pause two seconds, then snapshot the page source.
        driver.get(url)
        time.sleep(2)
        html_source = driver.page_source
        html_datas += [html_source]

1 https://news.ycombinator.com/front?day=2024-02-13&p=1


In [4]:
def extract_summary_and_keywords(content="", client=None, raw=None):
    """Request a summary and ranked keywords for ``content`` from the model.

    Builds a system message and a user message (instructions + ``content`` +
    output-format hint) and forwards them to ``ai.perform_completion``.

    Args:
        content: Text to summarise; interpolated verbatim into the user prompt.
        client: Passed through unchanged to ``ai.perform_completion``.
        raw: Passed through unchanged to ``ai.perform_completion``.

    Returns:
        Whatever ``ai.perform_completion`` returns. The caller in this
        notebook unpacks it as ``(pred_data, is_json)``.
    """
    # Join with a single space: an empty separator glued the sentences together
    # ("...researcher.When you get data..."), which muddies the instructions.
    system_prompt = " ".join([
        "You are an expert web scraper and researcher.",
        "When you get data, you perform expert-level summarization and keyword extraction.",
    ])
    prompt_start = " ".join([
        "Provide a concise summary of the contents of the following text with minimum of 3 paragraphs.",
        "The summary should not include anything related to the discussion nature of the following text.",
        "The summary should not include anything related to the conversation nature of the following text.",
        "Also extract a 1-word subject of the following text as the top ranked keyword.",
        "Extract and rank top keywords based on the subject matter of only of the following text.",
        "Use the following text: "
    ])
    # NOTE(review): the example below is not strict JSON (single quotes,
    # unquoted keys) although it asks for JSON -- confirm the response parser
    # copes if the model mirrors this shape.
    prompt_end="Using format of \"{'summary': <generated-summary>, 'keywords': [{value: 'a', rank: 1}, {value: 'b', rank: 2}, {value: 'c', rank: 3}, {value: 'd', rank: 4}, {value: 'e', rank: 5}]}\" return a response with json"
    messages=[
        {"role": "system", "content": system_prompt},
        {
            "role": "user",
            "content": f"{prompt_start} {content} {prompt_end}",
        }
    ]
    return ai.perform_completion(messages, client=client, raw=raw)

In [5]:
def extract_post_data(tr):
    """Collect the basic fields of one listing row.

    Args:
        tr: Row element exposing ``attrs``, ``find``, ``find_next`` and
            ``find_all`` (the notebook passes BeautifulSoup ``tr.athing`` rows).

    Returns:
        dict with keys:
            ``id``          -- the row's ``id`` attribute (or None),
            ``text``        -- text of the ``span.titleline`` element, or None
                               when the row has no such span,
            ``target_link`` -- first ``href`` in the row starting with "http",
                               or None,
            ``score``       -- digits of the ``span.score`` text found in the
                               following ``tr`` (kept as a string), or None,
            ``thread_link`` -- ``detail_pattern`` formatted with the row id.
    """
    item_id = tr.attrs.get('id')

    # The score is read from the row that follows the title row; either the
    # following row or the score span may be absent.
    score = None
    next_tr = tr.find_next('tr')
    score_span = next_tr.find("span", class_="score") if next_tr is not None else None
    if score_span is not None:
        score = "".join(ch for ch in score_span.get_text() if ch.isdigit())

    title_element = tr.find("span", class_="titleline")
    text = title_element.get_text() if title_element is not None else None

    # Anchors without an href yield None from .get(); skip them instead of
    # calling .startswith on None.
    hrefs = (anchor.get('href') for anchor in tr.find_all('a'))
    target_link = next((href for href in hrefs if href and href.startswith("http")), None)

    return {
        "id": item_id,
        "text": text,
        'target_link': target_link,
        "score": score,
        "thread_link": detail_pattern.format(item_id=item_id),
    }

In [None]:
# One record per processed post: {"scraped": <row fields>, "preds": <model output>}.
dataset = []

max_threads_per_page = 5    # only the first N rows of each listing page are processed
max_content_chars = 5000    # thread text is truncated to this length before prompting

# Site chrome / comment-navigation fragments removed from the thread text.
# They are stripped in this exact order.
boilerplate_fragments = (
    'new | past | comments | ask | show | jobs | submit',
    'login',
    'Hacker News',
    '| hide | past | favorite |',
    '| parent',
    '| next [–] ',
)

for html_source in html_datas:
    # Separate names for the listing and thread documents so neither silently
    # replaces the other mid-loop.
    listing_soup = BeautifulSoup(html_source, 'html.parser')
    rows = listing_soup.find_all('tr', class_="athing")
    with Remote(sbr_connection, options=options) as driver:
        for tr in rows[:max_threads_per_page]:
            # The row id is already part of `data`; no separate top-level `id`
            # is bound here, which would shadow the builtin for the whole kernel.
            data = extract_post_data(tr)
            thread_link = data.get('thread_link')
            driver.get(thread_link)
            thread_page_source = driver.page_source
            thread_soup = BeautifulSoup(thread_page_source, 'html.parser')
            body = thread_soup.find('body')
            # use the parsed data
            content = body.get_text()
            for fragment in boilerplate_fragments:
                content = content.replace(fragment, '')
            content = content.strip()
            content = content[:max_content_chars]
            # using Ollama locally
            pred_data, is_json = extract_summary_and_keywords(content=content)
            dataset.append({
                "scraped": data,
                "preds": pred_data
            })

In [None]:
# Pretty-print every collected record ({"scraped": ..., "preds": ...}) for inspection.
# Note: this rebinds the notebook-level name `data` to each record in turn.
for data in dataset:
    pprint(data)