In [90]:
import json
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm


This notebook:
1. Scrapes the Hacker News index page(s) for the latest headlines and links.
2. Inserts each headline and link, together with the text of the linked page and of its comments page, into OpenSearch.

In [61]:

def get_all_things(url, timeout=10):
    """Fetch one Hacker News index page and pull out its listings.

    Args:
        url: Address of the index page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely on a stalled connection.

    Returns:
        A ``(things, next_page)`` tuple: ``things`` is the list of
        ``<tr class="athing">`` elements found on the page, and ``next_page``
        is the ``<a class="morelink">`` element, or ``None`` when the page
        has no such link.

    Raises:
        requests.exceptions.RequestException: if the page cannot be fetched
            or the server answers with an HTTP error status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as "0 listings".
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')
    things = soup.find_all('tr', class_='athing')
    next_page = soup.find('a', class_='morelink')

    return things, next_page

In [74]:
def getPageText(url, timeout=10):
    """Download ``url`` and return its text with markup and blank lines removed.

    Scraping is best-effort: a page that cannot be fetched yields an empty
    string and a printed warning instead of aborting the whole run.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server. ``requests`` never times out
            unless one is given.

    Returns:
        The page text as newline-separated, stripped, non-empty lines, or
        ``''`` when the request fails.
    """
    try:
        html_content = requests.get(url, timeout=timeout).content
    except requests.exceptions.RequestException as error:
        # Base class of everything requests raises (timeouts, connection
        # errors, malformed URLs); the built-in TimeoutError is not one of them.
        print(f'Warning: could not fetch content from {url}: {error}')
        return ''

    soup = BeautifulSoup(html_content, 'html.parser')
    # Drop elements whose text is not part of the readable page.
    for element in soup(['style', 'script', '[document]', 'head', 'title']):
        element.extract()
    text = soup.get_text(separator='\n')
    return '\n'.join(line.strip() for line in text.split('\n') if line.strip())

In [75]:

class HackerNewsPage():
    """One Hacker News listing plus the text of the pages it points to.

    The ``thing`` passed to ``__init__`` is a ``<tr class="athing">`` row from
    an index page:
      * ``.titleline > a`` inside the row is the link to the submitted page;
      * the last ``.subline > a`` in the following row is the comments link.

    Attributes:
        headline: Text of the title link.
        thing_url: Absolute URL of the submitted page.
        thing_content: Text of the submitted page, as returned by getPageText.
        comments_url: Absolute URL of the comments page, or '' if there is none.
        comments_content: Text of the comments page, or '' if there is none.
    """

    # Relative hrefs found on the index page are resolved against this URL.
    BASE_URL = 'https://news.ycombinator.com/'

    def __init__(self, thing):
        link_element_obj = thing.select('.titleline > a')[0]
        self.headline = link_element_obj.text
        # Some title links are relative (e.g. 'item?id=...'); urljoin makes
        # them absolute and leaves already-absolute URLs alone.
        self.thing_url = urljoin(self.BASE_URL, link_element_obj['href'])

        self.thing_content = getPageText(self.thing_url)

        # Default to "no comments"; overwritten below when a link is found.
        self.comments_url = ''
        self.comments_content = ''

        # The comments link lives in the next <tr>. find_next_sibling skips
        # any whitespace text node that a plain .next_sibling could return.
        subtext_row = thing.find_next_sibling('tr')
        comment_links = subtext_row.select('.subline > a') if subtext_row is not None else []
        if comment_links:
            self.comments_url = urljoin(self.BASE_URL, comment_links[-1]['href'])
            self.comments_content = getPageText(self.comments_url)

    def __str__(self):
        """Return a short multi-line summary with both contents cut to 20 characters."""
        return f'Headline: {self.headline}\nThing URL: {self.thing_url}\nThing contents: {self.thing_content[:20]}\nComments URL: {self.comments_url}\nComments: {self.comments_content[:20]}'


In [76]:
# Every index page carries 30 listings (rows with the HTML class `athing`).
# Building a HackerNewsPage for a listing downloads the linked page and,
# when a comments link exists, the comments page as well.
NUM_PAGES = 1
base_url = 'https://news.ycombinator.com/'
current_url = base_url
all_things = []  # list of HackerNewsPage objects
for i in range(NUM_PAGES):
    print(f'Getting hackernews page {i+1}')

    things, next_page = get_all_things(current_url)
    print(f'Found {len(things)} from page {i+1}')
    all_things.extend([HackerNewsPage(thing) for thing in tqdm(things)])
    print(f'Got {len(things)} links on page {i+1}')

    # Advance to the next index page; stop when there is no "more" link
    # or when that link just points back at the front page.
    if next_page:
        current_url = f'{base_url}{next_page["href"]}'
    if not next_page or current_url == base_url:
        print('No more pages to scrape.')
        break

print(f'Total links: {len(all_things)}')

Getting hackernews page 1
Found 30 from page 1


100%|██████████| 30/30 [00:36<00:00,  1.21s/it]

Got 30 links on page 1
Total links: 30





In [77]:
# Show the short summary produced by HackerNewsPage.__str__ for every scraped listing.
for thing in all_things:
    print(thing)

Headline: LLM4Decompile: Decompiling Binary Code with LLM
Thing URL: https://github.com/albertan017/LLM4Decompile
Thing contents: Skip to content
Togg
Comments URL: https://news.ycombinator.com/item?id=39733275
Comments: Hacker News
new
|
pa
Headline: The return of the frame pointers
Thing URL: https://www.brendangregg.com/blog/2024-03-17/the-return-of-the-frame-pointers.html
Thing contents: Brendan's site:
Star
Comments URL: https://news.ycombinator.com/item?id=39731824
Comments: Hacker News
new
|
pa
Headline: Gravitational Collapse of Spongebob
Thing URL: https://twitter.com/PeRossello/status/1769035370031694214
Thing contents: This browser is no l
Comments URL: https://news.ycombinator.com/item?id=39730625
Comments: Hacker News
new
|
pa
Headline: Flemish Proverbs by Jan Wierix (ca. 1568)
Thing URL: https://publicdomainreview.org/collection/wierix-flemish-proverbs/
Thing contents: Home
Essays
Collecti
Comments URL: https://news.ycombinator.com/item?id=39727280
Comments: Hacker News
n

In [130]:
class OpenSearch:
    """Small client for the OpenSearch REST API, built on ``requests``.

    Every call goes through a single ``requests.Session`` and returns the
    decoded JSON body of the server's response.

    Attributes:
        host: Host name given to the constructor.
        port: Port given to the constructor.
        url: Base URL, ``http://{host}:{port}``.
        headers: Headers sent with every request (JSON content type).
        session: The ``requests.Session`` used for all requests.
    """

    def __init__(self, host, port):
        self.host = host
        self.port = port
        self.url = f'http://{self.host}:{self.port}'
        self.headers = {'Content-Type': 'application/json'}
        # Creating a Session opens no connection, so there is no connection
        # error to catch here; failures surface on the first request.
        self.session = requests.Session()

    def _request(self, method, path, data=None):
        """Send ``method`` to ``{self.url}/{path}`` and return the decoded JSON reply."""
        response = self.session.request(method, f'{self.url}/{path}', headers=self.headers, data=data)
        return response.json()

    def create_index(self, index_name, mapping):
        """Create ``index_name`` using ``mapping`` (settings and mappings) as the request body."""
        return self._request('PUT', index_name, data=json.dumps(mapping))

    def insert_document(self, index_name, document):
        """Index one ``document`` (a JSON-serialisable dict) into ``index_name``."""
        payload = json.dumps(document)
        result = self._request('POST', f'{index_name}/_doc', data=payload)
        print(f'Sending document object:\n{payload}')

        return result

    def bulk_insert(self, index_name, documents):
        """Index all ``documents`` into ``index_name`` with one ``_bulk`` request."""
        # bulk is recommended: https://opensearch.org/docs/latest/api-reference/document-apis/bulk/
        # The body is newline-delimited JSON: an action line followed by the
        # document, for each document, and a final trailing newline.
        action = json.dumps({"create": {}})
        data = '\n'.join(action + '\n' + json.dumps(doc) for doc in documents) + '\n'
        print(f'Sending bulk document object:\n{data}')
        return self._request('POST', f'{index_name}/_bulk', data=data)

    def delete_index(self, index_name):
        """Delete the index ``index_name``."""
        return self._request('DELETE', index_name)

    def delete_all(self, index_name):
        """Delete every document in ``index_name`` with a ``match_all`` query."""
        query = {
            "query": {
                "match_all": {}
            }
        }
        return self.delete_by_query(index_name, query)

    def delete_by_query(self, index_name, query):
        """Delete the documents in ``index_name`` that match ``query``."""
        return self._request('POST', f'{index_name}/_delete_by_query', data=json.dumps(query))

    def count(self, index_name):
        """Return the server's ``_count`` response for ``index_name``."""
        return self._request('GET', f'{index_name}/_count')

# Client for the OpenSearch node that the cells below talk to.
opensearch_instance = OpenSearch(host='localhost', port=9200)


In [123]:
# Request body used to create the index: index settings plus one field
# mapping per key of the documents inserted further down.
hackernews_mapping = {
    "settings": {
        "index": {
            "number_of_shards": 1,
            "number_of_replicas": 0,
        },
    },
    "mappings": {
        "properties": {
            "title": {"type": "text"},
            "url": {"type": "keyword"},
            "comments": {"type": "text"},
            "article": {"type": "text"},
        },
    },
}

In [132]:
# Create the `hackernews` index with the settings and field mappings defined above.
opensearch_instance.create_index(index_name='hackernews', mapping=hackernews_mapping)


{'acknowledged': True, 'shards_acknowledged': True, 'index': 'hackernews'}

In [125]:
# insert a single document
# `thing` is assigned explicitly here so this cell does not depend on a
# variable left behind by an earlier loop.
thing = all_things[0]
document = {
    "title": thing.headline,
    "url": thing.thing_url,
    "comments": thing.comments_content,
    "article": thing.thing_content
}


opensearch_instance.insert_document('hackernews', document)

Sending document object:
{"title": "LLM4Decompile: Decompiling Binary Code with LLM", "url": "https://github.com/albertan017/LLM4Decompile", "comments": "Hacker News\nnew\n|\npast\n|\ncomments\n|\nask\n|\nshow\n|\njobs\n|\nsubmit\nlogin\nLLM4Decompile: Decompiling Binary Code with LLM\n(\ngithub.com/albertan017\n)\n135 points\nby\nDavidbrcz\n4 hours ago\n|\nhide\n|\npast\n|\nfavorite\n|\n20\u00a0comments\nmadisonmay\n1 hour ago\n|\nnext\n[\u2013]\nThis is an excellent use case for LLM fine-tuning, purely because of the ease of generating a massive dataset of input / output pairs from public C code\nreply\nklik99\n1 hour ago\n|\nprev\n|\nnext\n[\u2013]\nThis is a fascinating idea, but (honest question, not a judgement) would the output be reliable? It would be hard to identify hallucinations since recompiling could produce different machine code. Particularly if there is some novel construct that could be a key part of the code. Are there ways of also reporting the LLMs confidence in se

{'_index': 'hackernews',
 '_id': 'tKt5To4BeTdGKgZomaov',
 '_version': 1,
 'result': 'created',
 '_shards': {'total': 1, 'successful': 1, 'failed': 0},
 '_seq_no': 3,
 '_primary_term': 1}

In [134]:
# Turn every scraped listing into one document, then index them all with a
# single bulk request.
documents = [
    {
        "title": thing.headline,
        "url": thing.thing_url,
        "comments": thing.comments_content,
        "article": thing.thing_content
    }
    for thing in all_things
]
opensearch_instance.bulk_insert('hackernews', documents)

Sending bulk document object:
{"create": {}}
{"title": "LLM4Decompile: Decompiling Binary Code with LLM", "url": "https://github.com/albertan017/LLM4Decompile", "comments": "Hacker News\nnew\n|\npast\n|\ncomments\n|\nask\n|\nshow\n|\njobs\n|\nsubmit\nlogin\nLLM4Decompile: Decompiling Binary Code with LLM\n(\ngithub.com/albertan017\n)\n135 points\nby\nDavidbrcz\n4 hours ago\n|\nhide\n|\npast\n|\nfavorite\n|\n20\u00a0comments\nmadisonmay\n1 hour ago\n|\nnext\n[\u2013]\nThis is an excellent use case for LLM fine-tuning, purely because of the ease of generating a massive dataset of input / output pairs from public C code\nreply\nklik99\n1 hour ago\n|\nprev\n|\nnext\n[\u2013]\nThis is a fascinating idea, but (honest question, not a judgement) would the output be reliable? It would be hard to identify hallucinations since recompiling could produce different machine code. Particularly if there is some novel construct that could be a key part of the code. Are there ways of also reporting the L

{'took': 97,
 'errors': False,
 'items': [{'create': {'_index': 'hackernews',
    '_id': '06uOTo4BeTdGKgZozqpN',
    '_version': 1,
    'result': 'created',
    '_shards': {'total': 1, 'successful': 1, 'failed': 0},
    '_seq_no': 0,
    '_primary_term': 1,
    'status': 201}},
  {'create': {'_index': 'hackernews',
    '_id': '1KuOTo4BeTdGKgZozqpN',
    '_version': 1,
    'result': 'created',
    '_shards': {'total': 1, 'successful': 1, 'failed': 0},
    '_seq_no': 1,
    '_primary_term': 1,
    'status': 201}},
  {'create': {'_index': 'hackernews',
    '_id': '1auOTo4BeTdGKgZozqpN',
    '_version': 1,
    'result': 'created',
    '_shards': {'total': 1, 'successful': 1, 'failed': 0},
    '_seq_no': 2,
    '_primary_term': 1,
    'status': 201}},
  {'create': {'_index': 'hackernews',
    '_id': '1quOTo4BeTdGKgZozqpN',
    '_version': 1,
    'result': 'created',
    '_shards': {'total': 1, 'successful': 1, 'failed': 0},
    '_seq_no': 3,
    '_primary_term': 1,
    'status': 201}},
  {'

In [129]:
# Remove the whole `hackernews` index, including every document in it.
# NOTE(review): in file order this cell sits between the bulk insert and the
# count cell, so a top-to-bottom run would count an index that was just
# deleted; consider moving this cleanup to the end of the notebook.
opensearch_instance.delete_index(index_name='hackernews')

{'acknowledged': True}

In [135]:
# Ask OpenSearch how many documents the `hackernews` index currently holds.
opensearch_instance.count(index_name='hackernews')

{'count': 30,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}}

In [None]:
# Query body that matches every document in an index.
query = {"query": {"match_all": {}}}