In [4]:
import logging
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup
import re

logging.basicConfig(
    format='%(asctime)s %(levelname)s:%(message)s',
    level=logging.INFO)

class Crawler:

    def __init__(self, urls=[]):
        self.visited_urls = []
        self.urls_to_visit = urls
        
    def download_url(self, url):
        return requests.get(url).text

    def get_linked_urls(self, url, html):
        soup = BeautifulSoup(html, 'html.parser')
        for link in soup.find_all('a'):
            path = link.get('href')
            if path and path.startswith('/'):
                path = urljoin(url, path)
            yield path

    def add_url_to_visit(self, url):
        if url not in self.visited_urls and url not in self.urls_to_visit:
            self.urls_to_visit.append(url)

    def crawl(self, url):
        html = self.download_url(url)
        for url in self.get_linked_urls(url, html):
            self.add_url_to_visit(url)

        self.search(html)
    def run(self):
        i = 0
        while self.urls_to_visit:
            url = self.urls_to_visit.pop(0)
            logging.info(f'Crawling: {url}')
            try:
                if i == 10:
                    break
                self.crawl(url)
                i += 1
            except Exception:
                logging.exception(f'Failed to crawl: {url}')
            finally:
                self.visited_urls.append(url)

    def search(self,html):
        searched_word = "กีฬา"
        soup = BeautifulSoup(html, 'html.parser')
        results = soup.body.find_all(string=re.compile('.*{0}.*'.format(searched_word)), recursive=True)

        print('Found the word "{0}" {1} times\n'.format(searched_word, len(results)))

        for content in results:
            words = content.split()
            for index, word in enumerate(words):
                # If the content contains the search word twice or more this will fire for each occurence
                if word == searched_word:
                    print('Whole content: "{0}"'.format(content))
                    before = None
                    after = None
                    # Check if it's a first word
                    if index != 0:
                        before = words[index-1]
                    # Check if it's a last word
                    if index != len(words)-1:
                        after = words[index+1]
                    print('\tWord before: "{0}", word after: "{1}"'.format(before, after))
if __name__ == '__main__':
    Crawler(urls=['https://www.thsport.com/']).run()

 


2022-03-20 20:18:25,855 INFO:Crawling: https://www.thsport.com/
2022-03-20 20:18:26,291 INFO:Crawling: https://www.thsport.com/


Found the word "กีฬา" 4 times



2022-03-20 20:18:26,764 INFO:Crawling: https://www.thsport.com/member-login.html?facebook


Found the word "กีฬา" 4 times



2022-03-20 20:18:29,047 INFO:Crawling: https://www.thsport.com/member-register.html


Found the word "กีฬา" 0 times



2022-03-20 20:18:29,387 INFO:Crawling: https://www.thsport.com/news.html
2022-03-20 20:18:29,592 INFO:Crawling: https://www.thsport.com/บอลไทย.html


Found the word "กีฬา" 1 times

Found the word "กีฬา" 5 times



2022-03-20 20:18:30,382 INFO:Crawling: https://www.thsport.com/ข่าวฟุตบอลต่างประเทศ.html


Found the word "กีฬา" 3 times



2022-03-20 20:18:30,600 INFO:Crawling: https://www.thsport.com/ข่าว-NBA.html


Found the word "กีฬา" 3 times



2022-03-20 20:18:30,867 INFO:Crawling: https://www.thsport.com/ข่าว-NFL.html


Found the word "กีฬา" 3 times



2022-03-20 20:18:31,322 INFO:Crawling: https://www.thsport.com/column.html


Found the word "กีฬา" 3 times



2022-03-20 20:18:32,019 INFO:Crawling: https://www.thsport.com/analyst.html


Found the word "กีฬา" 2 times

