# A quick demo of the WebBotParser

### Import the WebBotParser and specify which engine is supposed to be used

In [1]:
from webbot import WebBotParser

### Extract search results and metadata from a given results page

In [2]:
parser = WebBotParser(engine = 'DuckDuckGo News')
file = './testdata/duckduckgo.com_climate change_news_2023-01-30_14_23_50.html'
metadata, results = parser.get_results(file, with_metadata=True)
metadata

{'result type': 'news',
 'engine': 'duckduckgo.com',
 'query': 'climate change',
 'date': Timestamp('2023-01-30 14:23:50')}

In [3]:
results

Unnamed: 0,title,link,text,source,has_image,published
0,Living With Climate Change: Want a rebate to u...,https://www.finanzen.ch/nachrichten/aktien/liv...,Consumer rebates for home electrification are ...,Finanzen,False,2 days ago
1,Mit Technologien das Klima schützen? Warum Deu...,https://web.de/magazine/wissen/klima/deutschla...,"Um die globale Erderwärmung auf 1,5 Grad Celsi...",Web.de,True,9 days ago
2,Living With Climate Change: How to invest in '...,https://www.finanzen.ch/nachrichten/aktien/liv...,Replacing traditional steel with 'green steel'...,Finanzen,False,3 days ago
3,Hexagon Purus opens new hydrogen cylinder manu...,https://www.wallstreet-online.de/nachricht/164...,The demand for zero-emission mobility & infras...,wallstreet,False,3 hours ago
4,Black & Veatch: Build Business Resilience with...,https://www.wallstreet-online.de/nachricht/164...,"OVERLAND PARK, KS / ACCESSWIRE / January 27, 2...",wallstreet,False,3 days ago
...,...,...,...,...,...,...
61,California American Water Applies for Water Re...,https://www.wallstreet-online.de/nachricht/164...,California American Water has supplemented its...,wallstreet,False,3 days ago
62,Dieser Ameise wurde ein parasitischer Pilz der...,https://www.gmx.net/magazine/wissen/natur-umwe...,Popkultur trifft Wissenschaft: Im Computerspie...,GMX,True,10 days ago
63,Fortuna reports on Yaramoko's updated reserves...,https://www.wallstreet-online.de/nachricht/164...,"VANCOUVER, British Columbia, Jan. 27, 2023 (GL...",wallstreet,False,3 days ago
64,The Coach Foundation Brings Mentoring to the F...,https://www.wallstreet-online.de/nachricht/164...,"NORTHAMPTON, MA / ACCESSWIRE / January 27, 202...",wallstreet,False,3 days ago


### Only get the metadata

In [4]:
parser = WebBotParser(engine = 'Google Text')
file = './testdata/www.google.com_climate change_text_2023-01-30_14_18_24.html'
parser.get_metadata(file)

{'result type': 'text',
 'engine': 'www.google.com',
 'query': 'climate change',
 'page': 3,
 'date': Timestamp('2023-01-30 14:18:24'),
 'total results': 1620000000}

### Extract all search results from a directory, in correct order
This is particularly useful for search engines that return results in pages, e.g. news results on Google. If more results are loaded by scrolling (e.g. DuckDuckGo text results), we don't need this.

In [5]:
parser = WebBotParser(engine = 'Google News')
dir = './testdata/google_news/'
metadata, results = parser.get_results_from_dir(dir)
metadata

{'result type': 'news',
 'engine': 'www.google.com',
 'query': 'climate change',
 'total results': 268000000}

In [6]:
results[-5:]

Unnamed: 0,title,link,text,source,has_image,published,date,page,position
45,"South Korea, Qatar to take on climate change w...",https://dohanews.co/south-korea-qatar-to-take-...,"South Korea, Qatar to take on climate change w...",Doha News | Qatar,True,vor 5 Stunden,2023-01-30 14:19:08,5,5
46,Black communities in Norfolk see major climate...,https://www.bayjournal.com/news/climate_change...,He blames climate change. “As far as I'm conce...,Bay Journal,True,vor 7 Stunden,2023-01-30 14:19:08,5,6
47,"To Make Progress on Climate Action, Pop ‘Norma...",https://behavioralscientist.org/to-make-progre...,"Beliefs about climate change, political orient...",Behavioral Scientist,True,vor 3 Stunden,2023-01-30 14:19:08,5,7
48,Why bipartisan efforts on climate can help sav...,https://www.deseret.com/2023/1/29/23567515/win...,Our planet is the common denominator. In recen...,Deseret News,True,vor 9 Stunden,2023-01-30 14:19:08,5,8
49,South Africa supports calls for global co-oper...,https://www.zawya.com/en/press-release/events-...,South Africa supports calls for global co-oper...,ZAWYA,True,vor 2 Stunden,2023-01-30 14:19:08,5,9


### Initialize the WebBotParser with custom queries

In [7]:
def hello_world(_soup): return 'Hello World'

queries = [
        {'name': 'hello', 'type': 'custom', 'function': hello_world}
        # ..is how you can pass functions for custom queries. If most queries are custom, rewrite the evaluate_query function instead.
    ]

parser = WebBotParser(queries = queries)

Exception: A result_selector string is required to discriminate individual results on each page.

In [None]:
parser.get_results('./testdata/www.google.com_climate change_text_2023-01-30_14_18_24.html')