# A quick demo of the WebBotParser

### Import the WebBotParser and specify which engine is supposed to be used

In [1]:
from webbotparser import WebBotParser

### Extract search results and metadata from a given results page

In [2]:
# Parse one saved DuckDuckGo news results page; with_metadata=True makes
# get_results hand back the page metadata together with the results.
file = './testdata/duckduckgo.com_climate change_news_2023-01-30_14_23_50.html'
parser = WebBotParser(engine='DuckDuckGo News')
metadata, results = parser.get_results(file, with_metadata=True)
metadata

{'result type': 'news',
 'engine': 'duckduckgo.com',
 'query': 'climate change',
 'date': Timestamp('2023-01-30 14:23:50')}

In [3]:
# Display the results parsed from the DuckDuckGo news page in the previous cell.
results

Unnamed: 0,title,link,text,source,has_image,published
0,Living With Climate Change: Want a rebate to u...,https://www.finanzen.ch/nachrichten/aktien/liv...,Consumer rebates for home electrification are ...,Finanzen,False,2 days ago
1,Mit Technologien das Klima schützen? Warum Deu...,https://web.de/magazine/wissen/klima/deutschla...,"Um die globale Erderwärmung auf 1,5 Grad Celsi...",Web.de,True,9 days ago
2,Living With Climate Change: How to invest in '...,https://www.finanzen.ch/nachrichten/aktien/liv...,Replacing traditional steel with 'green steel'...,Finanzen,False,3 days ago
3,Hexagon Purus opens new hydrogen cylinder manu...,https://www.wallstreet-online.de/nachricht/164...,The demand for zero-emission mobility & infras...,wallstreet,False,3 hours ago
4,Black & Veatch: Build Business Resilience with...,https://www.wallstreet-online.de/nachricht/164...,"OVERLAND PARK, KS / ACCESSWIRE / January 27, 2...",wallstreet,False,3 days ago
...,...,...,...,...,...,...
61,California American Water Applies for Water Re...,https://www.wallstreet-online.de/nachricht/164...,California American Water has supplemented its...,wallstreet,False,3 days ago
62,Dieser Ameise wurde ein parasitischer Pilz der...,https://www.gmx.net/magazine/wissen/natur-umwe...,Popkultur trifft Wissenschaft: Im Computerspie...,GMX,True,10 days ago
63,Fortuna reports on Yaramoko's updated reserves...,https://www.wallstreet-online.de/nachricht/164...,"VANCOUVER, British Columbia, Jan. 27, 2023 (GL...",wallstreet,False,3 days ago
64,The Coach Foundation Brings Mentoring to the F...,https://www.wallstreet-online.de/nachricht/164...,"NORTHAMPTON, MA / ACCESSWIRE / January 27, 202...",wallstreet,False,3 days ago


### Only get the metadata

In [4]:
# Read only the metadata of a saved Google text results page.
file = './testdata/www.google.com_climate change_text_2023-01-30_14_18_24.html'
parser = WebBotParser(engine='Google Text')
parser.get_metadata(file)

{'result type': 'text',
 'engine': 'www.google.com',
 'query': 'climate change',
 'page': 3,
 'date': Timestamp('2023-01-30 14:18:24'),
 'total results': 1620000000}

### Extract all search results from a directory, in correct order
This is particularly useful for search engines that return results in pages, e.g. news results on Google. If more results are loaded by scrolling (e.g. DuckDuckGo text results), we don't need this.

In [5]:
# Parse the Google news results pages stored in one directory.
# The path lives in `results_dir` rather than `dir`, so the built-in dir()
# is not shadowed for the rest of the notebook session.
parser = WebBotParser(engine='Google News')
results_dir = './testdata/google_news/'
metadata, results = parser.get_results_from_dir(results_dir)
metadata

{'result type': 'news',
 'engine': 'www.google.com',
 'query': 'climate change',
 'total results': 268000000}

In [6]:
# Show only the last five rows of the results parsed from the directory.
results[-5:]

Unnamed: 0,title,link,text,source,has_image,published,date,page,position
45,"South Korea, Qatar to take on climate change w...",https://dohanews.co/south-korea-qatar-to-take-...,"South Korea, Qatar to take on climate change w...",Doha News | Qatar,True,vor 5 Stunden,2023-01-30 14:19:08,5,5
46,Black communities in Norfolk see major climate...,https://www.bayjournal.com/news/climate_change...,He blames climate change. “As far as I'm conce...,Bay Journal,True,vor 7 Stunden,2023-01-30 14:19:08,5,6
47,"To Make Progress on Climate Action, Pop ‘Norma...",https://behavioralscientist.org/to-make-progre...,"Beliefs about climate change, political orient...",Behavioral Scientist,True,vor 3 Stunden,2023-01-30 14:19:08,5,7
48,Why bipartisan efforts on climate can help sav...,https://www.deseret.com/2023/1/29/23567515/win...,Our planet is the common denominator. In recen...,Deseret News,True,vor 9 Stunden,2023-01-30 14:19:08,5,8
49,South Africa supports calls for global co-oper...,https://www.zawya.com/en/press-release/events-...,South Africa supports calls for global co-oper...,ZAWYA,True,vor 2 Stunden,2023-01-30 14:19:08,5,9


### Initialize the WebBotParser with custom queries

This is an example of how you can initialize the WebBotParser with custom result_selector, queries, and metadata_extractor. This is necessary for parsing result types that are not covered out of the box by the WebBotParser. It might also become necessary if a search engine changes their layout such that the predefined queries or result selectors become erroneous.

In [7]:
from webbotparser import GoogleParser
import pandas as pd

# some custom functions for extracting information from individual results
# if most queries are custom, rewrite the __evaluate_query function instead.
def get_date(_soup):
    """Extract the publication date of an individual result.

    Reads the text of the first element matching
    ``div.P7xzyf > span:last-child`` and parses it with the format
    ``%d.%m.%Y``.

    Args:
        _soup: Parsed result element. It must expose ``select(css_selector)``
            returning a list of elements that provide ``get_text()``
            (presumably a BeautifulSoup tag -- confirm against the caller).

    Returns:
        The parsed ``pandas.Timestamp``, or ``None`` when the result has no
        date element, the element is empty, or its text is not an absolute
        ``dd.mm.yyyy`` date.
    """
    matches = _soup.select('div.P7xzyf > span:last-child')
    if not matches:  # result without a date element
        return None

    # Surrounding whitespace would make an otherwise valid date fail to parse.
    raw_date = matches[0].get_text().strip()
    if not raw_date:
        return None

    try:
        return pd.to_datetime(raw_date, format="%d.%m.%Y")
    except (ValueError, TypeError):
        # some dates are relative, we ignore them for now
        return None

def get_duration(_soup):
    """Extract the duration of a video result.

    Reads the text of the element matching ``div.J1mWY`` and interprets it as
    colon-separated time components: ``ss``, ``mm:ss`` or ``h:mm:ss``.

    Args:
        _soup: Parsed result element. It must expose ``select(css_selector)``
            returning a list of elements that provide ``get_text()``
            (presumably a BeautifulSoup tag -- confirm against the caller).

    Returns:
        A ``pandas.Timedelta``, or ``None`` when there is not exactly one
        duration element or its text is not a valid duration.
    """
    matches = _soup.select('div.J1mWY')
    if len(matches) != 1:  # some videos don't have a duration
        return None

    components = matches[0].get_text().strip().split(':')
    if len(components) > 3:  # anything beyond h:mm:ss is not a duration we understand
        return None

    try:
        # Fold left to right so that "mm:ss" and "h:mm:ss" are both handled:
        # every further component shifts the running total by a factor of 60.
        total_seconds = 0
        for component in components:
            total_seconds = total_seconds * 60 + int(component)
    except ValueError:
        # non-numeric text in the duration element
        return None

    return pd.to_timedelta(total_seconds, unit='seconds')

# Each query discriminates one part of an individual result.
# WebBotParser supports text, attribute, exists, and custom queries.
my_queries = [
    # text / attribute queries: read from the element matched by a CSS selector
    {'name': 'title', 'type': 'text', 'selector': 'h3'},
    {'name': 'link', 'type': 'attribute', 'selector': 'div.ct3b9e > a', 'attribute': 'href'},
    {'name': 'text', 'type': 'text', 'selector': 'div.Uroaid'},
    {'name': 'source', 'type': 'text', 'selector': 'span.Zg1NU'},
    # custom queries: pass a function instead of a selector
    {'name': 'published', 'type': 'custom', 'function': get_date},
    {'name': 'duration', 'type': 'custom', 'function': get_duration},
]

# CSS selector used to find the individual results, which are returned as a list
my_result_selector = 'div.MjjYud'

# Custom WebBotParser for Google Video results
# (also provided for out of the box usage with the webbotparser package).
parser = WebBotParser(
    queries=my_queries,
    result_selector=my_result_selector,
    # parts already defined in the webbotparser package, such as this
    # metadata_extractor, can be re-used
    metadata_extractor=GoogleParser.google_metadata,
)

In [8]:
# Parse a saved Google video results page with the custom parser.
# with_metadata=True is passed explicitly, as in the DuckDuckGo example, so the
# two-value unpacking does not rely on the parameter's default.
metadata, results = parser.get_results(
    './testdata/www.google.com_climate change_videos_2023-01-30_14_20_04.html',
    with_metadata=True,
)
metadata

{'result type': 'videos',
 'engine': 'www.google.com',
 'query': 'climate change',
 'page': 5,
 'date': Timestamp('2023-01-30 14:20:04'),
 'total results': 161000000}

In [9]:
# Display the video results extracted with the custom queries.
results

Unnamed: 0,title,link,text,source,published,duration
0,That's how fast the Carbon Clock is ticking,https://www.mcc-berlin.net/en/research/co2-bud...,The Mercator Research Institute on Global Comm...,Mercator Research Institute on Global Commons ...,2016-10-28,0 days 00:00:16
1,Emissions Gap Report 2022 - UN Environment Pro...,https://www.unep.org/resources/emissions-gap-r...,As growing climate change impacts are experien...,UN Environment Programme,2022-10-27,0 days 00:01:04
2,Climate change education - Unesco.org,https://www.unesco.org/en/education/sustainabl...,"The UN Framework Convention on Climate Change,...",Unesco.org,2015-07-10,0 days 00:29:50
3,About the Fashion Industry Charter for Climate...,https://unfccc.int/climate-action/sectoral-eng...,"Under the auspices of UN Climate Change, fashi...",UNFCCC,2021-11-08,0 days 00:01:01
4,Strengthening resilience for a changing climat...,https://www.oecd.org/climate-change/theme/resi...,Reducing and managing the risks from climate c...,OECD,2021-06-22,0 days 00:01:53
5,Leading the fight against climate change - Ibe...,https://www.iberdrola.com/sustainability/again...,Climate change is defined as a change in clima...,Iberdrola,2020-06-06,0 days 00:01:04
6,FAO's work on climate change,https://www.fao.org/climate-change/en/,Climate change threatens our ability to ensure...,Food and Agriculture Organization of the Unite...,2015-09-23,0 days 00:01:16
7,Climate Solutions - USDA,https://www.usda.gov/climate-solutions,Climate Change Affects U.S. Agriculture and Ru...,USDA,2022-09-14,0 days 00:04:15
8,Climate Change Science | US EPA,https://www.epa.gov/climatechange-science,Human Influence on Climate. Human influence on...,Environmental Protection Agency,2022-02-03,NaT
9,Climate Crisis - United States Department of S...,https://www.state.gov/policy-issues/climate-cr...,The Climate Crisis: Working Together for Futur...,State Department,2022-10-28,0 days 00:03:04
