In [154]:
import pandas as pd
import dateparser
from bs4 import BeautifulSoup


In [205]:
import re
import time
import requests
import datetime
import unicodedata
import unidecode
import dateutil.parser
from random import randint
from retry_decorator import retry


class RateLimitError(Exception):
    """Signals that the search endpoint answered with HTTP 429 (rate limited)."""


class GooglePage:
    """Parsed Google results page.

    Wraps the parsed HTML tree of one results page and exposes the pieces of
    interest (query text, result count, result containers, inline image
    script) as properties. Each property returns ``None`` when the element it
    looks for is not present in the page.
    """

    def __init__(self, soup, query_id=None):
        """Store the parsed page and stamp it with the crawl time.

        Args:
            soup: Parsed HTML tree (BeautifulSoup-like object).
            query_id: Optional caller-chosen identifier forwarded to every
                result produced by :meth:`results`.
        """
        self.soup = soup
        self.date_crawled = datetime.datetime.today()
        self.counter = 0
        self.query_id = query_id

    @property
    def query(self):
        """Query text taken from ``<title>``: everything before the last ``'- '``.

        Returns ``None`` without a ``<title>`` and ``''`` when the title holds
        no ``'- '`` separator. Surrounding whitespace is not stripped.
        """
        title = self.soup.title
        if title is None:
            return None
        # rpartition keeps any '- ' that is part of the query itself; only the
        # trailing "- <site name>" suffix is removed.
        return title.get_text().rpartition('- ')[0]

    @property
    def results_count(self):
        """First number found in ``div#resultStats`` as an ``int``.

        Both ``,`` and ``.`` are treated as thousands separators. Returns
        ``None`` when the element is missing or contains no number.
        """
        stats = self.soup.find('div', id='resultStats')
        if stats is None:
            return None
        match = re.search(r'((?:\d+[,\.])*\d+)', stats.get_text().strip())
        if match is None:
            # Stats element present but without digits (e.g. a "no results" text).
            return None
        digits = match.group(1).replace(",", "").replace('.', '')
        digits = unicodedata.normalize("NFKD", digits)
        return int(digits)

    @property
    def results_containers(self):
        """All ``<g-card>`` elements inside ``div#search``, or ``None`` if absent."""
        container = self.soup.find('div', id='search')
        if container is not None:
            return container.find_all("g-card")
        return None

    @property
    def image_script(self):
        """Text of the single ``<script>`` starting with ``function _setImagesSrc``.

        Returns ``None`` unless exactly one such script exists.
        """
        image_scripts = [
            script for script in self.soup.find_all('script')
            if script.get_text().startswith('function _setImagesSrc')
        ]
        if len(image_scripts) == 1:
            return image_scripts[0].get_text()
        return None

    @staticmethod
    def find_images_in_script(script):
        """Return the ``data:image/...;base64,...`` URIs found in ``script``.

        NOTE(review): ``data_images`` referenced this helper but no definition
        existed in the class, so the property raised ``AttributeError``. This
        implementation returns the raw matches in order of appearance and does
        not decode any JavaScript string escapes - confirm the intended shape.
        """
        return re.findall(r"data:image/[\w.+-]+;base64,[^'\"\s]+", script)

    @property
    def data_images(self):
        """Inline image data URIs extracted from :attr:`image_script`, or ``None``."""
        image_script = self.image_script
        if image_script:
            return GooglePage.find_images_in_script(image_script)
        return None

    def results(self):
        """Yield one parsed result dict per result container, in page order.

        ``search_position`` is the 1-based position as a string. Yields nothing
        when the page has no result container.
        """
        query = self.query
        query_id = self.query_id
        date_crawled = self.date_crawled
        # results_containers is None when div#search is missing.
        containers = self.results_containers or []
        for position, result_soup in enumerate(containers, start=1):
            yield GoogleResult(
                result_soup,
                search_position=str(position),
                query=query,
                query_id=query_id,
                date_crawled=date_crawled,
            ).parse()

    def videos(self):
        """Return the grandparent of the first ``<h3>`` whose text is ``"Videos"``.

        Returns ``None`` when no such heading exists.
        """
        video_heading = self.soup.find_all(lambda x: x.name == "h3" and x.get_text() == "Videos")
        if video_heading:
            return video_heading[0].parent.parent
        return None

class VideoResult:
    """Placeholder for a video search result; nothing is implemented yet."""

    def __init__(self, soup):
        # Deliberate no-op: the soup argument is accepted but not stored.
        pass
        
            
class GoogleResult:
    """One search result extracted from a results page.

    Each property looks up one field in the result's HTML subtree and returns
    ``None`` when the markup it expects is not there. :meth:`parse` collects
    all fields into a flat dict.
    """

    def __init__(self, soup, search_position=None, query=None, query_id=None, date_crawled=None, data_images=None):
        """Store the result subtree together with its crawl metadata.

        Args:
            soup: Parsed HTML subtree of this result (BeautifulSoup-like).
            search_position: Position of the result on the page.
            query: Query text the page was fetched for.
            query_id: Caller-chosen identifier of the query.
            date_crawled: ``datetime`` at which the page was crawled.
            data_images: Stored as given; not read anywhere in this class.
        """
        self.soup = soup
        self.search_position = search_position
        self.query = query
        self.query_id = query_id
        self.date_crawled = date_crawled
        self.data_images = data_images

    @property
    def id(self):
        """Concatenation of crawl timestamp (seconds), query id and position.

        Returns ``None`` when no ``query_id`` was given.
        """
        if self.query_id:
            # Should be unique with single thread
            return "{}{}{}".format(int(self.date_crawled.timestamp()), self.query_id, self.search_position)
        return None

    @property
    def title(self):
        """NFKD-normalised text of the first ``<h3>`` or ``div[role=heading]``."""
        heading = self.soup.find('h3') or self.soup.find("div", {'role': 'heading'})
        if heading is None:
            return None
        return unicodedata.normalize("NFKD", heading.get_text().strip())

    @property
    def link(self):
        """Target URL of the result, or ``None`` if no known markup matches.

        Two layouts are tried: the first ``<a>`` inside ``div.r`` (with a
        leading ``/url?q=`` redirect prefix removed), then the block-styled
        anchor used by news results, which wins when both are present.
        """
        link = None  # stays None when neither layout matches

        div = self.soup.find('div', class_='r')
        if div is not None and div.a is not None:
            link = div.a.get('href')
            # Only unwrap redirects that really start with '/url?q='; other
            # '/url?...' forms are left untouched instead of raising.
            if link and link.startswith('/url?q='):
                link = link.split('/url?q=')[1]

        anchor = self.soup.find("a", {"style": "text-decoration:none;display:block"})  # news result
        if anchor is not None:
            link = anchor.get("href")

        return link

    @property
    def website(self):
        """Source site: the ``<cite>`` text before ``›``, else the text of the
        second ``<g-img>``'s parent when exactly two ``<g-img>`` exist."""
        citation = self.soup.find("cite")
        if citation:
            return citation.get_text().split("›")[0]

        images = self.soup.find_all("g-img")
        if len(images) == 2:
            return images[1].parent.get_text()
        return None

    @property
    def description(self):
        """NFKD-normalised snippet text.

        Uses ``span.st`` by default; when a ``div[role=heading]`` exists, the
        last child of that heading's parent is used instead.
        """
        description_ = self.soup.find('span', class_='st')

        sibling = self.soup.find("div", {"role": "heading"})
        if sibling is not None:
            description_ = list(sibling.parent.children)[-1]

        if description_:
            return unicodedata.normalize("NFKD", description_.get_text())
        return None

    @property
    def keywords_matched(self):
        """Sorted, de-duplicated, lower-cased ``<em>`` texts inside ``span.st``.

        Sorting makes the result independent of set iteration order.
        """
        description_text = self.soup.find('span', class_='st')
        if description_text:
            return sorted({em.get_text().lower() for em in description_text.find_all('em')})
        return None

    @property
    def image(self):
        """``src`` of the ``<img>`` inside the first ``<g-img>``, or ``None``."""
        parent = self.soup.find("g-img")
        if parent and parent.img is not None:
            return parent.img.get("src")
        return None

    @property
    def date(self):
        """Text of the last ``<span>`` passed through :meth:`format_date`.

        Returns ``None`` when the result has no ``<span>`` or when reading it
        fails.
        """
        spans = self.soup.find_all("span")
        if len(spans) == 0:
            return None
        try:
            return GoogleResult.format_date(spans[-1].get_text())
        except Exception as e:
            print(e)
            return None

    @staticmethod
    def clean_text(text):
        """Transliterate a string to ASCII and collapse whitespace runs.

        Non-string values (including ``None``) are returned unchanged.
        """
        if text is not None and isinstance(text, str):
            return ' '.join(unidecode.unidecode(text).split()).strip()
        return text

    @staticmethod
    def format_date(x: str) -> str:
        """Format a German date description as ``dd/mm/YYYY HH:MM``.

        ``dateparser`` (German) is tried first, then the strict ``dd.mm.YYYY``
        pattern. If both fail the input is returned unchanged.
        """
        try:
            parsed = dateparser.parse(x, languages=["de"])
        except Exception:
            # Best effort: any parser failure falls through to the strict pattern.
            parsed = None
        if parsed is not None:
            return parsed.strftime("%d/%m/%Y %H:%M")

        try:
            return time.strftime("%d/%m/%Y %H:%M", time.strptime(x, "%d.%m.%Y"))
        except (ValueError, TypeError):
            return x

    def parse(self):
        """Return all fields as a dict, with string values passed through
        :meth:`clean_text`."""
        data = {
            'id': self.id,
            'title': self.title,
            'link': self.link,
            'website': self.website,
            'description': self.description,
            'search_position': self.search_position,
            'query': self.query,
            'query_id': self.query_id,
            'keywords_matched': self.keywords_matched,
            "image": self.image,
            "date": self.date,
            'date_crawled': self.date_crawled,
        }
        return {key: GoogleResult.clean_text(value) for key, value in data.items()}

    @staticmethod
    def date_text_to_datetime(text):
        """Convert ``"<source> - 3 hours ago"`` / ``"Mar 3, 2021"`` to ``datetime``.

        Only the part after the last ``'- '`` is interpreted. Relative forms
        (``"<n> <unit> ago"``) are subtracted from the current time; months and
        years are approximated as 30 and 365 days because ``timedelta`` has no
        such units. Anything else is handed to ``dateutil.parser.parse``.

        Raises:
            ValueError: if a relative form is not exactly ``"<int> <unit> ago"``.
        """
        # Take the last segment so a '-' elsewhere (e.g. inside the source name
        # or an ISO date without a following space) does not break the lookup.
        text = text.rsplit('- ', 1)[-1].strip()

        if text.endswith(' ago'):
            amount, measure = text[:-len(' ago')].split()
            measure = measure.lower()
            if not measure.endswith('s'):
                measure += 's'
            amount = int(amount)
            approx_days = {'months': 30, 'years': 365}
            if measure in approx_days:
                delta = datetime.timedelta(days=amount * approx_days[measure])
            else:
                delta = datetime.timedelta(**{measure: amount})
            return datetime.datetime.today() - delta

        return dateutil.parser.parse(text)


class GoogleSearch:
    '''
        Minimal HTTP client that fetches Google search result pages.

        ``params`` are extra query-string parameters sent with every search
        (the notebook uses e.g. ``tbm``, ``hl``, ``gl`` and ``num``).
        HL: https://sites.google.com/site/tomihasa/google-language-codes
    '''

    def __init__(self, params, timeout=30):
        """Configure the client.

        Args:
            params: Dict of query-string parameters added to every request.
            timeout: Seconds to wait for the server before giving up, passed
                to ``requests.get``. Without it a stalled connection would
                block forever.
        """
        self.params = params
        self.timeout = timeout
        self.urls = ['http://www.google.com/search', 'https://www.google.com/search']
        self.headers = {'user-agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.106 Safari/537.36'}
        # Number of failed requests so far; also selects which URL is used next.
        self.counter = 0

    @retry(RateLimitError, tries=5, timeout_secs=60)
    def get_html(self, query):
        """Fetch the results page for ``query`` and return its HTML text.

        The endpoint alternates between the configured URLs after every
        failed request. Request details are printed for tracing.

        Raises:
            RateLimitError: on HTTP 429 (handled by the ``retry`` decorator).
            Exception: on any other non-successful status.
        """
        url = self.urls[self.counter % len(self.urls)]
        headers = self.headers
        params = self.params.copy()
        params.update({'q': query})
        response = requests.get(url, params=params, headers=headers, timeout=self.timeout)
        print('\t', response.url)
        status_code = response.status_code
        print('\t', self.counter, status_code, url, headers)

        if response.ok:
            return response.text

        self.counter += 1
        if status_code == 429:
            raise RateLimitError('Ratelimit surpassed')
        # Report the real status instead of labelling every failure "Forbidden".
        raise Exception('Request failed with HTTP status {} ({})'.format(status_code, response.reason))

In [206]:
# --- Search configuration ---
query = 'Tribes of Europa'
query_id = '1'
# Query-string options sent with the request. A 'tbs': 'qdr:d' entry used to
# be listed here as well but is left disabled.
params = dict(tbm='nws', hl='de', gl='DE', num=100)

# --- Fetch the results page ---
print(query)
google_search = GoogleSearch(params)
html = google_search.get_html(query)

# Save the raw HTML to test.html.
with open('test.html', 'wb') as f:
    f.write(html.encode('utf8'))

# --- Parse the page into a list of result dicts ---
soup = BeautifulSoup(html, features="html.parser")
google_page = GooglePage(soup, query_id=query_id)
results = [*google_page.results()]

print('\tNumber of results: {}'.format(len(results)))

Tribes of Europa
	 https://www.google.com/search?tbm=nws&hl=de&gl=DE&num=100&q=Tribes+of+Europa&gws_rd=ssl
	 0 200 http://www.google.com/search {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.106 Safari/537.36'}
	Number of results: 100


In [207]:
# Tabulate the parsed results (one row per result) and export them as CSV.
df = pd.DataFrame(data=results)
df.to_csv(path_or_buf="~/Desktop/tribes of europa.csv", index=False)

# Report the table size, then preview the first rows as the cell output.
print(df.shape)
df.head(5)


(100, 12)


Unnamed: 0,id,title,link,website,description,search_position,query,query_id,keywords_matched,image,date,date_crawled
0,161582946111,"Wird in Staffel 2 von ""Tribes Of Europa"" wicht...",http://www.filmstarts.de/nachrichten/18534842....,filmstarts,Noch hat Netflix keine zweite Season der deuts...,1,Tribes of Europa,1,,"data:image/gif;base64,R0lGODlhAQABAIAAAP//////...",08/03/2021 17:31,2021-03-15 17:31:01.875345
1,161582946112,"""Tribes Of Europa""-Erfinder Philip Koch: Wie m...",https://www.gq-magazin.de/entertainment/artike...,GQ Germany,"""Tribes of Europa"" spielt im Jahr 2079 in eine...",2,Tribes of Europa,1,,"data:image/gif;base64,R0lGODlhAQABAIAAAP//////...",01/03/2021 17:31,2021-03-15 17:31:01.875345
2,161582946113,"9 (!) Staffeln ""Tribes Of Europa""? So sieht de...",http://www.filmstarts.de/nachrichten/18534734....,filmstarts,"Am Ende der ersten Staffel von ,,Tribes Of Eur...",3,Tribes of Europa,1,,"data:image/gif;base64,R0lGODlhAQABAIAAAP//////...",01/03/2021 17:31,2021-03-15 17:31:01.875345
3,161582946114,Ein Konigreich fur ein Interrail-Ticket,https://www.faz.net/aktuell/feuilleton/medien/...,FAZ - Frankfurter Allgemeine Zeitung,BDSM-Samurai und bewaffnete Pfadfinder: Das de...,4,Tribes of Europa,1,,"data:image/gif;base64,R0lGODlhAQABAIAAAP//////...",22/02/2021 17:31,2021-03-15 17:31:01.875345
4,161582946115,"""Tribes of Europa"": Das postapokalyptische Ber...",https://www.zeit.de/kultur/film/2021-02/tribes...,ZEIT ONLINE,Die verfuhrerische Kraft des barbarischen Stam...,5,Tribes of Europa,1,,"data:image/gif;base64,R0lGODlhAQABAIAAAP//////...",22/02/2021 17:31,2021-03-15 17:31:01.875345


In [10]:
Page:
- posts
- num results
- current_page
- features?

SyntaxError: invalid syntax (<ipython-input-10-cb3602022297>, line 1)

In [11]:
Post:
title
url
description


SyntaxError: invalid syntax (<ipython-input-11-6c09bbe6433c>, line 1)