In [63]:
import requests
import pandas as pd
from bs4 import BeautifulSoup, NavigableString
import re

class Diki:
    """Scrape dictionary entries for a single word from diki.pl.

    After ``extract_data(word)`` the ``data`` attribute holds one row per
    meaning found for the exact headword, with the columns listed in
    ``COLUMNS``. ``soup`` holds the parsed HTML of the last fetched page.
    """

    # Column order of ``data``; shared by ``__init__`` and ``extract_data`` so
    # that a lookup with no matches still yields a frame with the same schema.
    COLUMNS = [
        'english_word',
        'other_forms',
        'popularity',
        'pronunciation',
        'part_of_speech',
        'polish_word',
        'example',
        'synonyms',
        'opposites',
    ]

    # Language code accepted by ``__init__`` -> slug used in the dictionary URL.
    LANG_SLUGS = {
        "ENG": "angielskiego",
        "GER": "niemieckiego",
        "ESP": 'hiszpanskiego',
        "ITA": 'wloskiego',
        "FRA": 'francuskiego',
    }

    # Seconds to wait for the server; without a timeout requests may block forever.
    REQUEST_TIMEOUT = 10

    # Matches the ``id`` attribute of the <li> elements that hold one meaning each.
    _MEANING_ID = re.compile(r'^meaning\d+')

    def __init__(self, lang='ENG'):
        """Create a scraper for the given language code (a key of ``LANG_SLUGS``)."""
        self.lang = lang
        self.data = pd.DataFrame(columns=self.COLUMNS)
        self.soup = None

    def _bs4_info(self, word):
        """Fetch the dictionary page for ``word`` and store it parsed in ``self.soup``.

        Raises:
            KeyError: if ``self.lang`` is not a key of ``LANG_SLUGS``.
            requests.exceptions.RequestException: on network failure or timeout.
        """
        slug = self.LANG_SLUGS[self.lang]
        # ``params`` URL-encodes the word, so characters such as '&' or '#'
        # cannot corrupt the query string.
        result = requests.get(
            f'https://www.diki.pl/slownik-{slug}',
            params={'q': word},
            timeout=self.REQUEST_TIMEOUT,
        )
        self.soup = BeautifulSoup(result.text, 'html.parser')

    def extract_data(self, word):
        """Download the page for ``word`` and rebuild ``self.data`` from it.

        Only dictionary entities whose first headword equals ``word`` exactly
        are used. ``self.data`` always has the columns in ``COLUMNS``, even
        when nothing matched (it is then empty).
        """
        self._bs4_info(word)

        # Word popularity (read once from the page, repeated on every row).
        star_link = self.soup.find('a', class_='starsForNumOccurrences')
        popularity = star_link.text.strip() if star_link is not None else None

        rows = []
        for entity in self.soup.find_all('div', class_='dictionaryEntity'):
            headword = entity.find("span", class_="hw")
            if headword is None or headword.text.strip() != word:
                continue
            for meaning in entity.find_all('li', id=self._MEANING_ID):
                rows.append(self._parse_meaning(meaning, word, popularity))

        self.data = pd.DataFrame(rows, columns=self.COLUMNS)

    def _parse_meaning(self, meaning, word, popularity):
        """Build one ``data`` row (a dict keyed by ``COLUMNS``) from a meaning <li>."""
        # Translations listed inside this meaning.
        translations = [
            span.get_text(strip=True)
            for span in meaning.find_all('span', class_='hw')
        ]

        # Part of speech and other forms live in <div>s that precede the <ol>
        # containing this meaning.
        part_of_speech = None
        other_forms = []
        ol_parent = meaning.find_parent('ol')
        if ol_parent is not None:
            header = ol_parent.find_previous_sibling(
                'div', class_='partOfSpeechSectionHeader'
            )
            if header is not None:
                part_of_speech = header.get_text(strip=True)

            forms_div = ol_parent.find_previous_sibling(self._is_forms_div)
            if forms_div is not None:
                other_forms = [
                    span.get_text(strip=True)
                    for span in forms_div.find_all('span', class_='foreignTermText')
                ]

        # First example sentence, flattened to a single space-joined string.
        example_div = meaning.find('div', class_='exampleSentence')
        example_text = (
            ' '.join(example_div.stripped_strings)
            if example_div is not None
            else None
        )

        synonyms, opposites = self._collect_related(meaning)

        return {
            'english_word': word,
            'other_forms': other_forms,
            'popularity': popularity,
            'pronunciation': None,  # not extracted yet
            'part_of_speech': part_of_speech,
            'polish_word': ', '.join(translations),
            'example': example_text,
            'synonyms': synonyms,
            'opposites': opposites,
        }

    @staticmethod
    def _is_forms_div(tag):
        """Return True for a <div> whose class list contains 'af' or 'vf'."""
        classes = tag.get('class', [])
        return tag.name == 'div' and ('af' in classes or 'vf' in classes)

    @staticmethod
    def _collect_related(meaning):
        """Return ``(synonyms, opposites)`` lists found in the meaning's 'ref' blocks.

        Duplicates are dropped and document order is kept, so the result is
        the same on every run (a plain ``set`` would not guarantee that).
        """
        synonyms = {}
        opposites = {}
        for ref in meaning.find_all('div', class_='ref'):
            # Only direct child <div>s of a 'ref' block are inspected.
            for child_div in ref.find_all('div', recursive=False):
                label = child_div.get_text(strip=True)
                if label.startswith('synon'):
                    bucket = synonyms
                elif label.startswith('przeciw'):
                    bucket = opposites
                else:
                    continue
                bucket.update(
                    dict.fromkeys(
                        link.get_text(strip=True)
                        for link in child_div.find_all('a')
                    )
                )
        return list(synonyms), list(opposites)

# Example usage: look up one word and show the resulting table.
# NOTE: extract_data() performs a live HTTP request to diki.pl.
diki = Diki()
diki.extract_data('fulfilled')
# Bare expression: presumably the last line of a notebook cell, where it
# renders the DataFrame as the cell output; in a plain script it has no effect.
diki.data


Unnamed: 0,english_word,other_forms,popularity,pronunciation,part_of_speech,polish_word,example,synonyms,opposites
0,fulfilled,[],*,,przymiotnik,"spełniony, zadowolony, zaspokojony",,[],[unfulfilled]
