In [None]:
import json
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm


This notebook:
1. Scrapes the Hacker News website for the latest headlines and links.
2. Inserts the headlines, links, linked-page content, and comment-page content into OpenSearch.

In [None]:

def get_all_things(url, timeout=10):
    """Fetch one Hacker News listing page and extract its listing rows.

    Args:
        url: Address of the listing page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block forever on an unresponsive server.

    Returns:
        A ``(things, next_page)`` tuple. ``things`` is the list of
        ``<tr class="athing">`` elements found on the page; ``next_page`` is
        the ``<a class="morelink">`` element, or ``None`` if the page has none.

    Raises:
        requests.exceptions.RequestException: If the request fails, times out,
            or the server answers with an HTTP error status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as an empty listing.
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')
    things = soup.find_all('tr', class_='athing')
    next_page = soup.find('a', class_='morelink')

    return things, next_page

In [None]:
def getPageText(url, timeout=10):
    """Download a page and return its visible text, one non-blank line per row.

    The download is best-effort: if the request times out or fails for any
    other reason, a warning is printed and an empty string is returned so that
    one unreachable site does not abort the whole scrape.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The page text with ``style``, ``script``, ``head`` and ``title``
        elements removed, every line stripped and blank lines dropped; ``''``
        if the page could not be fetched.
    """
    try:
        html_content = requests.get(url, timeout=timeout).content
    except (requests.exceptions.Timeout, TimeoutError):
        # requests raises its own Timeout class, which is not a subclass of the
        # builtin TimeoutError, so both have to be listed.
        print(f'Warning: Timeout error requesting content from {url}')
        return ''
    except requests.exceptions.RequestException as error:
        print(f'Warning: Could not fetch content from {url}: {error}')
        return ''

    soup = BeautifulSoup(html_content, 'html.parser')
    # Remove elements whose text is never shown to a reader.
    for element in soup(['style', 'script', '[document]', 'head', 'title']):
        element.extract()
    text = soup.get_text(separator='\n')
    return '\n'.join(line.strip() for line in text.split('\n') if line.strip())

In [None]:

class HackerNewsPage():
    """One Hacker News listing together with the text of the pages it links to.

    Hacker News index pages have 30 listings each. The ``thing`` object passed
    to ``__init__`` is the ``<tr class="athing">`` row of one listing:
    `.titleline > a` inside it is the link to the thing, and the last
    `.subline > a` in the table row that follows it is the comments link.

    Attributes set by ``__init__``:
        headline: Text of the title link.
        thing_url: Absolute URL of the submitted page.
        thing_content: Text of ``thing_url`` as returned by ``getPageText``.
        comments_url: Absolute URL of the comments page, ``''`` if there is none.
        comments_content: Text of the comments page, ``''`` if there is none.
    """

    BASE_URL = 'https://news.ycombinator.com/'

    def __init__(self, thing):
        link_element_obj = thing.select('.titleline > a')[0]
        self.headline = link_element_obj.text
        # urljoin keeps absolute links untouched and resolves site-relative
        # ones (e.g. "item?id=1") against the Hacker News base URL. A plain
        # substring test for 'http' misclassifies relative links that merely
        # contain that text.
        self.thing_url = urljoin(self.BASE_URL, link_element_obj['href'])
        print(f'using {self.thing_url}')
        self.thing_content = getPageText(self.thing_url)

        # Defaults for listings without a comments link.
        self.comments_url = ''
        self.comments_content = ''

        # The comments link lives in the next table row. find_next_sibling
        # skips whitespace text nodes and returns None when no row follows,
        # whereas .next_sibling may hand back either of those.
        subtext_row = thing.find_next_sibling('tr')
        comment_links = subtext_row.select('.subline > a') if subtext_row is not None else []
        if comment_links:
            self.comments_url = urljoin(self.BASE_URL, comment_links[-1]['href'])
            if self.comments_url == self.thing_url:
                # The listing links to its own discussion page: reuse the text
                # instead of downloading the same page twice.
                self.comments_content = self.thing_content
            else:
                self.comments_content = getPageText(self.comments_url)

    def __str__(self):
        """Return a short multi-line summary; page texts are cut to 20 characters."""
        return (
            f'Headline: {self.headline}\n'
            f'Thing URL: {self.thing_url}\n'
            f'Thing contents: {self.thing_content[:20]}\n'
            f'Comments URL: {self.comments_url}\n'
            f'Comments: {self.comments_content[:20]}'
        )


In [None]:
# Every listing page holds 30 rows carrying the html class `athing`.
# Each row has 2 pages to gather: the page it links to and its comments page,
# so e.g. 3 listing pages mean 3*30*2 = 180 page downloads.
NUM_PAGES = 1
base_url = 'https://news.ycombinator.com/'
all_things = []  # HackerNewsPage objects collected across all listing pages
current_url = base_url
for page_number in range(1, NUM_PAGES + 1):
    print(f'Getting hackernews page {page_number}')

    things, next_page = get_all_things(current_url)
    print(f'Found {len(things)} from page {page_number}')
    page_entries = [HackerNewsPage(row) for row in tqdm(things)]
    all_things.extend(page_entries)
    print(f'Got {len(things)} links on page {page_number}')

    # Follow the "more" link; stop when it is missing or leads back to the
    # front page.
    if next_page:
        current_url = f'{base_url}{next_page["href"]}'
        has_more = current_url != base_url
    else:
        has_more = False
    if not has_more:
        print('No more pages to scrape.')
        break

print(f'Total links: {len(all_things)}')

In [None]:
# Print a short summary of every scraped listing (uses HackerNewsPage.__str__).
# The loop variable `thing` stays bound to the last element after the loop ends.
for thing in all_things:
    print(thing)

In [None]:
class OpenSearch:
    """Thin wrapper around the OpenSearch REST API.

    REST API documentation: https://opensearch.org/docs/latest/api-reference/

    All calls go through a single ``requests.Session`` so the HTTP connection
    is reused, and every public method returns the decoded JSON body of the
    server's response.
    """

    def __init__(self, host, port, timeout=30):
        """Store the connection settings and open a reusable HTTP session.

        Args:
            host: Host name of the OpenSearch node.
            port: HTTP port of the OpenSearch node.
            timeout: Seconds to wait for each response before ``requests``
                raises a timeout error.
        """
        self.host = host
        self.port = port
        self.url = f'http://{self.host}:{self.port}'
        self.headers = {'Content-Type': 'application/json'}
        self.timeout = timeout
        # Creating a Session performs no network I/O, so there is nothing to
        # catch here; connection problems surface on the first request.
        self.session = requests.Session()

    def _send(self, method, path, body=None):
        """Send one request to ``{self.url}/{path}`` and decode the JSON reply.

        ``body`` is an already serialised request body, or ``None`` for none.
        """
        response = self.session.request(
            method,
            f'{self.url}/{path}',
            headers=self.headers,
            data=body,
            timeout=self.timeout,
        )
        return response.json()

    def create_index(self, index_name, mapping):
        """Create ``index_name`` using ``mapping`` as the index definition."""
        return self._send('PUT', index_name, json.dumps(mapping))

    def insert_document(self, index_name, document):
        """Index a single ``document`` into ``index_name``."""
        body = json.dumps(document)
        print(f'Sending document object:\n{body}')
        return self._send('POST', f'{index_name}/_doc', body)

    def bulk_insert(self, index_name, documents):
        """Index all ``documents`` into ``index_name`` with one bulk request."""
        # bulk is recommended: https://opensearch.org/docs/latest/api-reference/document-apis/bulk/
        # The body is newline-delimited JSON: an action line followed by the
        # document line, with a trailing newline after the last line.
        data = '\n'.join([json.dumps({"create":{}}) + '\n' + json.dumps(doc) for doc in documents]) + '\n'
        print(f'Sending bulk document object:\n{data}')
        return self._send('POST', f'{index_name}/_bulk', data)

    def delete_index(self, index_name):
        """Delete the index ``index_name``."""
        return self._send('DELETE', index_name)

    def delete_all(self, index_name):
        """Delete every document in ``index_name`` via a ``match_all`` query."""
        query = {
            "query": {
                "match_all": {}
            }
        }
        return self.delete_by_query(index_name, query)

    def delete_by_query(self, index_name, query):
        """Delete the documents in ``index_name`` that match ``query``."""
        return self._send('POST', f'{index_name}/_delete_by_query', json.dumps(query))

    def count(self, index_name):
        """Return the response of the ``_count`` endpoint for ``index_name``."""
        return self._send('GET', f'{index_name}/_count')

    def search(self, index_name, query):
        """Run ``query`` against ``index_name`` and return the search response."""
        return self._send('GET', f'{index_name}/_search', json.dumps(query))

    def analyze(self, index_name, text_array):
        """Run the ``standard`` analyzer over ``text_array`` and return the response."""
        query = {
            "analyzer" : "standard",
            "text" : text_array
        }
        return self._send('GET', f'{index_name}/_analyze', json.dumps(query))

# Client for the OpenSearch node on localhost:9200, shared by the cells below.
opensearch_instance = OpenSearch(host='localhost', port=9200)


In [None]:
# Index definition for the hackernews index: one shard, no replicas, and one
# typed field per piece of scraped data.
_hackernews_field_types = {
    "title": "text",
    "url": "keyword",
    "comments": "text",
    "article": "text",
}

hackernews_mapping = {
    "settings": {
        "index": {"number_of_shards": 1, "number_of_replicas": 0},
    },
    "mappings": {
        "properties": {
            field: {"type": field_type}
            for field, field_type in _hackernews_field_types.items()
        },
    },
}

In [None]:
# Create the `hackernews` index from the settings and field mappings in hackernews_mapping.
opensearch_instance.create_index(index_name='hackernews', mapping=hackernews_mapping)


In [None]:
# insert a single document
# Pick the first scraped listing explicitly instead of relying on whatever
# value an earlier loop happened to leave in `thing`.
thing = all_things[0]
document = {
    "title": thing.headline,
    "url": thing.thing_url,
    "comments": thing.comments_content,
    "article": thing.thing_content
}


opensearch_instance.insert_document('hackernews', document)

In [None]:
# Turn every scraped listing into an index document and send them all in one bulk request.
documents = [
    {
        "title": entry.headline,
        "url": entry.thing_url,
        "comments": entry.comments_content,
        "article": entry.thing_content,
    }
    for entry in all_things
]
opensearch_instance.bulk_insert('hackernews', documents)

In [None]:
# Destructive cleanup: drops the whole `hackernews` index.
# This cell sits before the count and search cells, so running the notebook
# top to bottom would delete the index they query. It is therefore guarded:
# set DELETE_INDEX = True only when you want to reset the index.
DELETE_INDEX = False
opensearch_instance.delete_index('hackernews') if DELETE_INDEX else None

In [None]:
# Ask the `_count` endpoint about the `hackernews` index.
opensearch_instance.count(index_name='hackernews')

## Search Time!
#### Text Fields
- Full-text Search:
  - Use Match and Match Phrase queries for natural language search.
    - Example: Searching for articles or comments with common language terms.
- Multi-field Search:
  - Multi-Match query to search across several text fields.
    - Useful when the search term may appear in multiple text areas (e.g., title, article).
- Wildcard and Regex:
  - For pattern-based searches, where the exact term is unknown.
  - Useful for partial matches in larger text fields.
#### Keyword Fields
- Exact Match:
  - Use Term queries for exact matching.
  - Ideal for URLs or specific identifiers.
- Aggregations and Sorting:
  - Aggregate data based on exact keyword values.
  - Sort query results using keyword fields for consistent ordering.
- Wildcard with Caution:
  - Can be used, but is less efficient than with text fields.
  - Suitable for keywords with predictable patterns.



In [None]:

# `match` query against the `title` text field
query = {"query": {"match": {"title": "Work Microchips"}}}
opensearch_instance.search(index_name='hackernews', query=query)

In [None]:
# `match_phrase` query against the `article` text field
query = {"query": {"match_phrase": {"article": "Microchips Work"}}}
opensearch_instance.search(index_name='hackernews', query=query)

In [None]:
# `multi_match` query: one search string looked up in both `title` and `article`
query = {
    "query": {
        "multi_match": {"query": "How", "fields": ["title", "article"]},
    },
}
opensearch_instance.search(index_name='hackernews', query=query)

In [None]:
# `term` query against the `url` keyword field
query = {
    "query": {
        "term": {"url": "https://publicdomainreview.org/collection/wierix-flemish-proverbs/"},
    },
}
opensearch_instance.search(index_name='hackernews', query=query)

In [None]:
# `terms` aggregation over the `url` keyword field; "size": 0 requests no search hits
query = {
    "size": 0,
    "aggs": {
        "unique_urls": {"terms": {"field": "url"}},
    },
}
opensearch_instance.search(index_name='hackernews', query=query)

In [None]:
# `wildcard` query against `url`, which hackernews_mapping declares as a keyword field
query = {"query": {"wildcard": {"url": "*github*"}}}
opensearch_instance.search(index_name='hackernews', query=query)

In [None]:
# Send a few sample strings to the `_analyze` endpoint of the `hackernews` index
# (OpenSearch.analyze requests the "standard" analyzer).
text_array = [
    'hello',
    'first array element',
    'second array element',
]

opensearch_instance.analyze(index_name='hackernews', text_array=text_array)