In [15]:
import urllib
import urllib.parse

import html2text
import pandas as pd
import requests
from requests_html import HTML
from requests_html import HTMLSession

# Shared html2text converter with its ignore_links option switched on.
# NOTE(review): `h` is not referenced by any of the cells shown here.
h = html2text.HTML2Text()
h.ignore_links = True

In [6]:
def get_source(url, timeout=None):
    """Return the HTTP response for the provided URL.

    Args:
        url (string): URL of the page to scrape.
        timeout (float or tuple, optional): Forwarded to ``session.get``.
            Defaults to ``None``, which is what the call used implicitly
            before this parameter existed. Pass a number of seconds to
            bound how long the request may take.

    Returns:
        response (object): HTTP response object from requests_html, or
        ``None`` when the request raised a ``RequestException`` (the
        error is printed rather than re-raised). Callers must check for
        ``None`` before touching ``response.html``.
    """

    try:
        session = HTMLSession()
        response = session.get(url, timeout=timeout)
        return response

    except requests.exceptions.RequestException as e:
        # Best-effort: report the failure and signal it with an explicit None
        # instead of silently falling off the end of the function.
        print(e)
        return None

In [None]:
def scrape_google(query):
    """Return the non-Google links found on a Google results page.

    Args:
        query (string): Search terms. They are URL-encoded with
            ``quote_plus`` before being appended to the search URL.

    Returns:
        list: Absolute links from the page that do not start with any of
        the Google-owned prefixes listed in ``google_domains``. An empty
        list is returned when the page could not be fetched.
    """

    query = urllib.parse.quote_plus(query)
    response = get_source("https://www.google.co.uk/search?q=" + query)

    # get_source prints the error and yields None when the request fails;
    # without this guard the attribute access below raises AttributeError.
    if response is None:
        return []

    google_domains = ('https://www.google.', 
                      'https://google.', 
                      'https://webcache.googleusercontent.', 
                      'http://webcache.googleusercontent.', 
                      'https://policies.google.',
                      'https://support.google.',
                      'https://maps.google.')

    # str.startswith accepts a tuple, so one call tests every prefix.
    # Building the filtered list directly avoids the repeated list.remove()
    # scans of the copy-and-delete loop.
    return [url for url in response.html.absolute_links
            if not url.startswith(google_domains)]

In [8]:
def get_results(query):
    """Fetch the Google search results page for a query.

    Args:
        query (string): Search terms; URL-encoded with ``quote_plus``
            before being appended to the search URL.

    Returns:
        Whatever ``get_source`` returns for the constructed search URL.
    """
    encoded_query = urllib.parse.quote_plus(query)
    search_url = "https://www.google.co.uk/search?q=" + encoded_query
    return get_source(search_url)

In [9]:
def parse_results(response):
    """Extract title, link and snippet text from a results page.

    Args:
        response (object): Response whose ``html`` attribute supports
            ``find(css_selector)``, e.g. the value returned by
            ``get_results``. ``None`` is accepted and treated as
            "no results".

    Returns:
        list: One dict per element matching ``.tF2Cxc``, with the keys
        ``'title'``, ``'link'`` and ``'text'``. A value is ``None`` when
        the corresponding child element (or its ``href``) is missing.
    """

    css_identifier_result = ".tF2Cxc"
    css_identifier_title = "h3"
    css_identifier_link = ".yuRUbf a"
    css_identifier_text = ".IsZvec"

    # An upstream fetch failure is reported as None; nothing to parse then.
    if response is None:
        return []

    output = []

    for result in response.html.find(css_identifier_result):
        # find(..., first=True) gives None when the selector has no match,
        # so each lookup is checked before reading .text / .attrs. One
        # result without a snippet must not abort the whole parse.
        title = result.find(css_identifier_title, first=True)
        link = result.find(css_identifier_link, first=True)
        text = result.find(css_identifier_text, first=True)

        output.append({
            'title': title.text if title is not None else None,
            'link': link.attrs.get('href') if link is not None else None,
            'text': text.text if text is not None else None,
        })

    return output

In [10]:
def google_search(query):
    """Search Google for ``query`` and return the parsed results.

    Args:
        query (string): Search terms, passed unchanged to ``get_results``.

    Returns:
        The value produced by ``parse_results`` for the fetched page.
    """
    return parse_results(get_results(query))

In [16]:
# Run a live search and display the parsed results
# (a list of dicts with 'title', 'link' and 'text' keys).
results = google_search("vanitas meaning")
results

[{'title': 'Vanitas - Wikipedia',
  'link': 'https://en.wikipedia.org/wiki/Vanitas',
  'text': "The Latin noun vanitas (from the Latin adjective vanus 'empty') means 'emptiness' , 'futility', or 'worthlessness', the traditional Christian view being that earthly\xa0...\n\u200eThemes · \u200eMotifs · \u200eOutside visual art · \u200eIn modern times"},
 {'title': 'Vanitas – Art Term | Tate',
  'link': 'https://www.tate.org.uk/art/art-terms/v/vanitas',
  'text': 'Tate glossary definition for vanitas: A still life artwork which includes various symbolic objects designed to remind the viewer of their mortality and of the\xa0...'},
 {'title': 'vanitas | Definition, Painters, & Facts | Britannica',
  'link': 'https://www.britannica.com/art/vanitas-art',
  'text': 'Vanitas, in art, a genre of still-life painting that flourished in the Netherlands in the early 17th century. A vanitas painting contains collections of objects symbolic\xa0...'},
 {'title': 'Vanitas definition and meaning | Collins 

In [14]:
# Show the snippet text of the first parsed result.
results[0]['text']

"The Latin noun vanitas (from the Latin adjective vanus 'empty') means 'emptiness' , 'futility', or 'worthlessness', the traditional Christian view being that earthly\xa0...\n\u200eThemes · \u200eMotifs · \u200eOutside visual art · \u200eIn modern times"