In [1]:
import json
import os
import time
from collections import defaultdict

from bs4 import BeautifulSoup
from selenium import webdriver

In [2]:
# Scraper configuration: the values a reader is most likely to tune.

# page the browser opens first
url = 'https://eksisozluk.com/'

# search terms, processed one after another
keywords = [
    'gaz',
    'hazımsızlık',
    'kabızlık',
]

# path of the JSON file that receives the scraped entries
output_location = "data/output/data.json"

In [3]:
# class structure
class Eksi:
    """Scraper that drives a Selenium Chrome session and parses pages with BeautifulSoup.

    Typical use is ``Eksi(url).scrape_all_pages(keywords)``; the collected rows end
    up in ``keyword_dict`` as ``{keyword: [(date, author, entry_text), ...]}``.
    The instance owns a browser process: call ``close()`` (or use the object as a
    context manager) when finished.

    Attributes:
        url: Address loaded when the browser starts.
        driver: The Selenium Chrome driver used for all navigation.
        keyword_dict: ``defaultdict(list)`` of scraped rows, keyed by keyword.
        max_pages: Page count read from the most recently parsed page.
        page_source: ``BeautifulSoup`` tree of the most recently parsed page.
    """

    def __init__(self, url):
        """Start Chrome with the configured flags and open ``url``.

        Args:
            url: Address of the first page to load.
        """
        self.url = url

        # Created up front so scrape_data()/compile_page_source() can be used
        # without going through scrape_all_pages() first.
        self.keyword_dict = defaultdict(list)
        self.max_pages = 1
        self.page_source = None

        # init the browser
        options = webdriver.ChromeOptions()
        options.add_argument('--ignore-certificate-errors')
        options.add_argument('--incognito')
        options.add_argument('--headless')

        # The options object has to be passed to the driver; otherwise none of
        # the flags above are applied.
        driver = webdriver.Chrome(options=options)
        try:
            driver.get(self.url)  # go to the url
        except Exception:
            driver.quit()  # do not leave a browser process behind on failure
            raise
        self.driver = driver

    def __enter__(self):
        """Return the scraper itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Quit the browser on leaving the ``with`` block; exceptions propagate."""
        self.close()

    def close(self):
        """Quit the browser if it is still running. Safe to call more than once."""
        driver = getattr(self, "driver", None)
        if driver is not None:
            driver.quit()
            self.driver = None

    def search_keyword(self, keyword):
        """Type ``keyword`` into the element with id ``search-textbox`` and submit it.

        Args:
            keyword: Text to search for.
        """
        # find_element("id", ...) is available in both Selenium 3 and 4, whereas
        # the find_element_by_id shortcut was removed in Selenium 4.
        element = self.driver.find_element("id", "search-textbox")
        element.send_keys(keyword)
        element.submit()
        time.sleep(1)  # small delay before getting the page source

    @staticmethod
    def _page_count(soup):
        """Read the page count from the ``div.pager`` element of a parsed page.

        Returns 1 when the page has no pager element or its ``data-pagecount``
        attribute is missing or not a number.
        """
        pager = soup.find('div', {"class": "pager"})
        if pager is None:
            return 1
        try:
            return max(1, int(pager.get('data-pagecount', 1)))
        except (TypeError, ValueError):
            return 1

    def compile_page_source(self):
        """Parse the browser's current page and refresh ``page_source``/``max_pages``.

        Returns:
            The scraper itself, so calls can be chained.
        """
        # An explicit parser keeps the result independent of which optional
        # parsers happen to be installed.
        soup = BeautifulSoup(self.driver.page_source, "html.parser")
        self.max_pages = self._page_count(soup)
        self.page_source = soup
        return self

    def next_page(self, page_number):
        """Navigate to page ``page_number`` of the current address.

        Everything after the last ``?`` in the current URL is dropped and
        replaced with ``p=<page_number>``.

        Args:
            page_number: Value to put in the ``p`` query parameter.
        """
        current_url = self.driver.current_url
        base, separator, _ = current_url.rpartition("?")
        if not separator:
            # no query string in the existing url
            base = current_url
        self.driver.get('{}?p={}'.format(base, page_number))

    def clean_entry(self, entry):
        """Return ``entry`` with newlines removed and surrounding whitespace stripped.

        Args:
            entry: Raw entry text.
        """
        # The former .replace("\'", "'") step was dropped: both literals denote
        # the same one-character string, so it never changed anything.
        return entry.replace("\n", "").strip()

    def scrape_data(self, keyword: str):
        """Append every entry of the parsed page to ``keyword_dict[keyword]``.

        Each appended row is ``(date_text, author_text, cleaned_entry_text)``.

        Args:
            keyword: Key under which the rows are stored.
        """
        all_entries = self.page_source.find_all('div', {"class": "content"})  # get all entries
        all_dates = self.page_source.find_all('a', {"class": "entry-date"})  # get all dates
        all_authors = self.page_source.find_all('a', {"class": "entry-author"})  # get all authors

        # The three lists are paired by position; zip stops at the shortest one.
        self.keyword_dict[keyword].extend(
            (date.text, author.text, self.clean_entry(entry.text))
            for entry, date, author in zip(all_entries, all_dates, all_authors)
        )

    def scrape_all_pages(self, keyword_list: list):
        """Search each keyword and scrape every result page into ``keyword_dict``.

        ``keyword_dict`` is reset at the start of each call.

        Args:
            keyword_list: Keywords to search, in order.
        """
        # reset keywords dict
        self.keyword_dict = defaultdict(list)

        for keyword in keyword_list:
            self.search_keyword(keyword)
            self.compile_page_source()  # establishes max_pages for this keyword

            # Fixed up front: compile_page_source() below rewrites max_pages.
            page_numbers = range(1, self.max_pages + 1)
            for page_number in page_numbers:
                self.next_page(page_number)
                self.compile_page_source().scrape_data(keyword)

In [4]:
# build the scraper; constructing it starts Chrome and loads `url`
eksi = Eksi(url=url)

In [5]:
# search every keyword and collect its entries into eksi.keyword_dict
eksi.scrape_all_pages(keyword_list=keywords)

In [13]:
# Uncomment to inspect the entries scraped for the first keyword:
# eksi.keyword_dict[keywords[0]]

In [None]:
# serialise the scraped entries to a JSON string (non-ASCII characters are kept as-is)
json_text = json.dumps(eksi.keyword_dict, ensure_ascii=False)
# round-trip through UTF-8, discarding anything that cannot be encoded
json_object = json_text.encode('utf-8', 'ignore').decode()

In [None]:
# write the output
# open() does not create missing parent directories, so make sure they exist first
output_dir = os.path.dirname(output_location)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# `json_object` is already a JSON-encoded string, so it is written verbatim.
# Passing it through json.dump() again would store the whole document as one
# quoted, escaped JSON string instead of an object.
with open(output_location, 'w', encoding='utf-8') as f:
    f.write(json_object)