In [42]:
from os import system
from pathlib import Path
from urllib.parse import quote, urlsplit

import PyPDF2
from bs4 import BeautifulSoup
from googletrans import Translator
from requests import get
from requests.exceptions import RequestException

In [35]:
def translate(text):
    """Return ``text`` in English, translating it only when necessary.

    The language of ``text`` is detected first; text detected as English is
    returned unchanged, anything else is translated to English.

    Args:
        text: The string to translate.

    Returns:
        The English translation, or ``text`` itself when it is empty, blank,
        ``None`` or already detected as English.
    """
    # There is nothing to detect in empty/blank input, so hand it back
    # untouched instead of involving the translator at all.
    if not text or not text.strip():
        return text

    translator = Translator()

    # Already English: no translation needed.
    if translator.detect(text).lang == "en":
        return text

    return translator.translate(text, dest="en").text


def _pdf_read_error_type():
    """Return the ``PdfReadError`` class exposed by the installed PyPDF2."""
    # Newer PyPDF2 releases expose the error from ``PyPDF2.errors``; older
    # ones from ``PyPDF2.utils``. Probe both so either layout works.
    for module_name in ("errors", "utils"):
        module = getattr(PyPDF2, module_name, None)
        error_type = getattr(module, "PdfReadError", None)
        if error_type is not None:
            return error_type
    # An empty tuple in an ``except`` clause matches no exception.
    return ()


def get_num_pages(pdf_path):
    """Return the number of pages in the PDF at ``pdf_path``.

    Works with both the current PyPDF2 API (``PdfReader`` / ``reader.pages``)
    and the legacy one (``PdfFileReader`` / ``numPages``).

    Args:
        pdf_path: Path of the PDF file to inspect.

    Returns:
        The page count, or ``None`` when the file does not exist or PyPDF2
        cannot read it (a message is printed in both cases).
    """
    try:
        with open(pdf_path, "rb") as file:
            # Prefer the current API; the legacy names are kept only as a
            # fallback for old PyPDF2 installations.
            if hasattr(PyPDF2, "PdfReader"):
                return len(PyPDF2.PdfReader(file).pages)
            return PyPDF2.PdfFileReader(file).numPages
    except FileNotFoundError:
        print(f"File {pdf_path} not found.")
        return None
    except _pdf_read_error_type():
        print(f"Unable to read file {pdf_path}.")
        return None


def download_pdf(url, folder_path, timeout=30):
    """Download the file at ``url`` into ``folder_path``.

    The folder (and any missing parents) is created if needed. The local
    filename is the last path component of the URL, ignoring any query
    string or fragment.

    Args:
        url: Address of the file to download.
        folder_path: Directory in which the file is stored.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        ``True`` when the file was downloaded and written, ``False`` when the
        request failed (the error is printed).
    """
    filepath = None  # Set only once writing is about to start.
    try:
        # Create the folder (and parent directories) if it doesn't exist
        folder_path = Path(folder_path)
        folder_path.mkdir(parents=True, exist_ok=True)

        # Take the filename from the URL *path* only, so "?query" and
        # "#fragment" parts never end up in the filename.
        filename = Path(urlsplit(url).path).name or "download.pdf"

        # The context manager releases the streamed connection even when the
        # transfer fails halfway through.
        with get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # Raise an exception for unsuccessful download

            # Save the file
            filepath = folder_path / filename
            with filepath.open("wb") as f:
                for chunk in response.iter_content(1024):
                    f.write(chunk)

        print(f"PDF downloaded successfully: {filepath}")
        return True

    except RequestException as e:
        # Do not leave a truncated file behind when the transfer was interrupted.
        if filepath is not None and filepath.exists():
            filepath.unlink()
        print(f"Error downloading PDF: {e}")
        return False

In [43]:
# Scrape the Gradus archive: for every volume of every year, collect one
# record per article (section, title, authors, DOI link and its status)
# into ``data_json``.
GRADUS_LINK = "https://gradus.kefo.hu/archive/"
MTMT_LINK = "https://m2.mtmt.hu/gui2/?mode=search&query=publication;labelOrMtid;eq;"

# Seconds to wait for any single HTTP request before giving up.
REQUEST_TIMEOUT = 30


def _fetch_soup(url):
    """Download ``url`` and parse it with the built-in HTML parser.

    The raw bytes (``response.content``) are handed to BeautifulSoup instead
    of ``response.text`` so the parser can detect the encoding from the
    document itself; ``response.text`` may be decoded with the wrong charset
    when the server does not declare one, which garbles accented characters.
    """
    return BeautifulSoup(get(url, timeout=REQUEST_TIMEOUT).content, "html.parser")


def _link_works(url):
    """Return True when a GET request to ``url`` answers with status 200."""
    if not url:
        return False
    try:
        return get(url, timeout=REQUEST_TIMEOUT).status_code == 200
    except RequestException:
        # A dead or malformed link must not abort the whole scrape.
        return False


# Data
data_json = []


gradus_soup = _fetch_soup(GRADUS_LINK)


# Get issues div
issues_tag = gradus_soup.find("div", id="issues")
year_tags = gradus_soup.select('div[style="float: left; width: 100%;"]')


for year_tag in year_tags:

    # Get publication year
    publication_year = year_tag.find("h3", recursive=False).text.strip()

    for volume_info_tag in year_tag.find_all("a"):
        # Keep only the part before ":" and "(", then drop the "Vol "/"No "
        # labels and spaces so that "<volume>,<number>" remains.
        volume_info = volume_info_tag.text.split(":")[0].split("(")[0]
        volume_info = (
            volume_info.replace("Vol ", "").replace("No ", "").replace(" ", "")
        )

        volume, number = volume_info.split(",")

        # Get volume catalog tag
        volume_catalog_link = GRADUS_LINK + volume_info_tag["href"]
        volume_catalog_soup = _fetch_soup(volume_catalog_link)
        volume_catalog = volume_catalog_soup.find("div", id="content")

        # Section titles and articles are siblings; an article belongs to
        # the most recent section title seen before it.
        section = ""

        # Get articles
        for catalog_item in volume_catalog.select(
            "div#content > .tocSectionTitle, div#content > .tocArticle"
        ):
            if catalog_item.attrs.get("class") == ["tocSectionTitle"]:
                section = catalog_item.text.strip()
                continue

            title_tag = catalog_item.find(class_="tocTitle")
            article_title = title_tag.text

            # The authors cell holds the author list on its first non-empty
            # line and the DOI link on the second one.
            author_lines = [
                line.strip()
                for line in catalog_item.find(class_="tocAuthors").text.splitlines()
                if line.strip()
            ]
            authors = author_lines[0] if author_lines else ""
            doi_link = author_lines[1] if len(author_lines) > 1 else ""
            doi_link_works: bool = _link_works(doi_link)

            article_name = catalog_item.find("a")["href"]
            pdf_link = f"{volume_catalog_link}/{article_name}"

            # NOTE(review): the MTMT lookup result is not stored in the record yet.
            mtmt_page_link = f"{MTMT_LINK}{quote(article_title)}"
            mtmt_soup = _fetch_soup(mtmt_page_link)

            mtmt_search_tag = mtmt_soup.find(
                "li",
                class_="list-item ui-widget-content publication not-selected opened",
            )

            # Progress output: one line per processed article.
            print(title_tag)

            data_json.append(
                {
                    "Publication Year": publication_year,
                    "Volume": volume,
                    "Number": number,
                    "Section": section,
                    "Article Name": article_name,
                    "Original Article Title": article_title,
                    # TODO: not translated yet; translate(article_title) is disabled.
                    "English Article Title": (article_title),
                    "Authors": authors,
                    "DOI Link": doi_link,
                    "DOI Link Status": doi_link_works,
                }
            )

<td class="tocTitle">Egy viselkedésgazdaságtani kísérlet tapasztalatai a tantermi órákon és a kutatásban</td>


KeyboardInterrupt: 

In [29]:
# Probe the MTMT search page for the label "First" and report how many
# <li> elements the returned HTML contains.
search_term = quote("First")
search_response = get(
    f"https://m2.mtmt.hu/gui2/?mode=search&query=publication;labelOrMtid;eq;{search_term}"
)
soup = BeautifulSoup(search_response.text, "html.parser")

list_items = soup.find_all("li")
print(len(list_items))

5
