In [1]:
import json
import string
from time import sleep
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from dotenv import dotenv_values

In [2]:
# Read the notebook settings from the local .env file into a plain dict.
config = dotenv_values(".env")
DATA_TARGET_WEB_URL = config.get('DATA_TARGET_WEB_URL')
if not DATA_TARGET_WEB_URL:
    # Fail fast: without this value later cells would build 'https://None/'
    # and only fail much later with an unrelated-looking network error.
    raise RuntimeError("DATA_TARGET_WEB_URL is missing from .env")

In [3]:
# Base URL of the scraped site; it always ends with '/' so that relative paths
# can be appended to it directly.
target: str = f'https://{DATA_TARGET_WEB_URL}/'

# Tüm ilaç adlarını ve URL'lerini almak

In [None]:
# DATA-SET
# Scrape results keyed by the query letter; each value is the list of
# medication entries returned for that letter.
dataset = {}   

In [None]:
# TEST: smoke-check that the listing endpoint answers for a single letter.
path = target + 'aralist.php?Id=Y'
response = requests.get(path, timeout=30)
# Show the status and only the beginning of the body; printing the whole page
# floods the notebook output.
print(response.status_code)
print(response.text[:500])

In [None]:
def get_list_medication_data_from_letter_with_requests(LETTER: str = 'A') -> list[dict]:
    """Fetch the medication list page of one letter and parse it with BeautifulSoup.

    Args:
        LETTER: A single upper-case letter; it is sent as the ``Id`` query
            parameter of ``aralist.php``.

    Returns:
        One dict per list entry with the keys ``url``, ``title`` and ``name``.
        An empty list is returned when the response status is not 200 or when
        the list container is not present in the page.

    Raises:
        ValueError: If ``LETTER`` is not exactly one upper-case letter.
    """
    # Same rule as the Selenium variant: exactly one alphabetic, upper-case character.
    if not LETTER.isalpha() or len(LETTER) != 1 or LETTER != LETTER.upper():
        raise ValueError('letter param must be UPPER case')

    path = target + f'aralist.php?Id={LETTER}'
    # A timeout keeps a stalled connection from blocking the notebook forever.
    response = requests.get(path, timeout=30)
    if response.status_code != 200:
        print(f'unexpected status code {response.status_code} for {path}')
        return []

    soup = BeautifulSoup(response.content, 'html.parser')
    medi_list = soup.find('div', {"id": "iceriksollistesayfasi"})
    if medi_list is None:
        # The container is absent (e.g. the page was rendered differently);
        # report "nothing found" instead of failing on None.
        print(f'list container not found for letter {LETTER}')
        return []

    data = []
    for medi in medi_list.find_all('li'):
        tag_a = medi.find('a')
        href = tag_a.get('href') if tag_a is not None else None
        if not href:
            # Entry without a usable link: skip it, keep the rest of the list.
            continue
        data.append({
            # urljoin handles relative, root-relative and absolute hrefs alike.
            "url": urljoin(target, href),
            "title": tag_a.get('title'),
            "name": tag_a.text,
        })
    return data

In [None]:
# Letters for which the plain-HTTP scrape produced no entries; they are retried
# with the Selenium-based scraper in a later cell.
empty_letters: list[str] = []

for letter in string.ascii_uppercase:
    # `or []` keeps the loop alive if the scraper hands back None.
    dataset[letter] = get_list_medication_data_from_letter_with_requests(letter) or []
    print(f'param {letter} has {len(dataset[letter])} items')
    if dataset[letter]:
        # Only persist letters that actually returned data.
        with open(f'scraped_medication_data/medication_{letter}.json', 'w') as df:
            df.write(json.dumps(dataset[letter]))
    else:
        empty_letters.append(letter)
    sleep(2)  # be polite to the remote server between requests

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from time import sleep

def get_list_medication_data_from_letter_with_selenium(LETTER: str = 'A', manuel_control: bool = False) -> list | None:
    if not LETTER.isalpha() or len(LETTER) != 1 or LETTER != LETTER.upper():
        raise ValueError('letter param must be UPPER case and a single letter')

    url = f"{target}aralist.php?Id={LETTER}"

    options = Options()
    # options.add_argument("--headless") 
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    try:
        driver.get(url)
        sleep(2)
        if manuel_control:
            if input('is ready (n: negative): ').lower() == 'n':
                return []
        medi_list = driver.find_element(By.ID, "iceriksollistesayfasi")
        items = medi_list.find_elements(By.TAG_NAME, "li")

        data = []
        for medi in items:
            tag_a = medi.find_element(By.TAG_NAME, "a")
            data.append({
                "url": target + tag_a.get_attribute('href'),
                "title": tag_a.get_attribute('title'),
                "name": tag_a.text.strip(),
            })

        return data

    except Exception as e:
        print(f"Hata: {e}")
        return None

    finally:
        driver.quit()


In [None]:
# Retry, with the Selenium scraper, every letter the plain-HTTP pass left empty.
for empty_letter in empty_letters:
    # The Selenium scraper returns None on failure; normalise to [] so len() is safe.
    dataset[empty_letter] = get_list_medication_data_from_letter_with_selenium(empty_letter) or []
    print(f'param {empty_letter} has {len(dataset[empty_letter])} items')
    if dataset[empty_letter]:
        with open(f'scraped_medication_data/medication_{empty_letter}.json', 'w') as df:
            df.write(json.dumps(dataset[empty_letter]))
    else:
        print(f'ERROR at this latter: {empty_letter}')
    sleep(2)  # pause between browser sessions

In [None]:
# Manual spot check: scrape the 'Y' list page with Selenium and keep the result in `x`.
x = get_list_medication_data_from_letter_with_selenium('Y')

# MongoDB: Download PDFs

In [None]:
!pip install pymongo

In [4]:
from pymongo import MongoClient

# The connection string can be supplied as MONGO_URI in .env; when it is not set,
# the previous local default is used so existing setups keep working.
# NOTE(review): the fallback still embeds a password in the notebook — move it
# to .env and drop the literal once every environment defines MONGO_URI.
MONGO_URI = config.get("MONGO_URI") or "mongodb://root:password_mongo@localhost:5003/"
client = MongoClient(MONGO_URI)
db = client["medication_mongo"]

In [None]:
# Disabled loader: when uncommented, it walks scraped_medication_data, parses every
# .json file whose name does not contain 'checkpoint', and inserts the parsed list
# into db["medications"] with insert_many.
# NOTE(review): presumably kept commented out so a re-run does not insert the same
# documents twice — confirm before re-enabling.
# import os

# collection = db["medications"]
# for root, dirs, files in os.walk('scraped_medication_data'):
#     for file in files:
#         if file.endswith('.json') and not 'checkpoint' in file :
#             with open(f'scraped_medication_data/{file}', 'r') as df: 
#                 data = json.loads(df.read())
#                 collection.insert_many(data)

In [None]:
# NOTE(review): `count` is not assigned anywhere in the visible notebook, so this
# cell raises NameError on a fresh kernel — presumably a leftover from an
# interactive session; confirm and remove or define it.
count

# PDF URL'lerini almak

In [5]:
def get_pdf_urls(url) -> str | int | None:
    response = requests.get(url)
    print(url)
    if response.url == url:
        soup = BeautifulSoup(response.content, 'html.parser')
        for tag_a in soup.find_all('a'):
            if tag_a.get('href').endswith('.pdf'):
                return target + tag_a.get('href')
    else:
        return 0 # medicine is not have a pdf

In [None]:
# Manual check of get_pdf_urls against the 'kullanma-talimati' page of one medication.
test_t = f'{target}allermune-polymerized-wild-grasses-0-3-hepd-enjeksiyonluk-cozelti-devam-30431'
r = get_pdf_urls(test_t + '/kullanma-talimati')
print(r)

In [None]:
# Manual check of get_pdf_urls against a second medication page.
test_t = f'{target}d-3-ferol-oral-damla-15-ml-18801/kullanma-talimati'
r = get_pdf_urls(test_t)
print(r)

In [None]:
# NOTE(review): only `sleep` is imported from the time module in the visible
# notebook, so the bare name `time` raises NameError on a fresh kernel —
# presumably a leftover (perhaps a mistyped %%time); confirm and remove.
time

In [None]:
from concurrent.futures import ThreadPoolExecutor, as_completed

# Mongo collection of medication documents; the code below reads their
# `_id`, `url` and `name` fields.
collection = db["medications"]

# URL suffixes of the two document pages of a medication. The same strings are
# used as the Mongo field names under which the resolved PDF URL is stored.
pdf_paths = ['kullanma-talimati', 'kisa-urun-bilgisi']

def process_medication(medication_target):
    """Resolve the missing PDF links of one medication document and store them.

    For every entry of ``pdf_paths`` that is absent or empty on
    ``medication_target``, ``get_pdf_urls`` is asked about ``<url>/<pdf_path>``:
    a string result is stored as-is, a ``0`` result is stored as ``None``, and
    any other result leaves the field out. The collected fields are printed and,
    when there is at least one, written back with a single ``$set`` update
    selected by ``_id``.
    """
    updated_fields = {}

    for pdf_path in pdf_paths:
        if medication_target.get(pdf_path):
            # This field already holds a value; nothing to look up.
            continue

        pdf_url = get_pdf_urls(url=medication_target.get('url') + '/' + pdf_path)
        if isinstance(pdf_url, str):
            updated_fields[pdf_path] = pdf_url
        elif pdf_url == 0:
            # None is stored on purpose: it records that there is no PDF.
            updated_fields[pdf_path] = None

    print(updated_fields)
    if not updated_fields:
        return

    collection.update_one(
        {"_id": medication_target["_id"]},
        {"$set": updated_fields}
    )
    print(f"Güncellendi: {medication_target.get('name')}")

# Sadece eksik pdf'leri olanları filtrele
medications_to_update = list(collection.find({
    "$or": [
        {"kullanma-talimati": {"$in": [None, ""]}},
        {"kisa-urun-bilgisi": {"$in": [None, ""]}}
    ]
}))
print(f"target count {len(medications_to_update)}")
# Çok fazla thread açma, 10-20 yeterli
with ThreadPoolExecutor(max_workers=5) as executor:
    futures = [executor.submit(process_medication, med) for med in medications_to_update]

    for future in as_completed(futures):
        future.result()  # Hataları burada patlatır


target count 7888
https://www.ilacabak.com/abizol-10-mg-28-tablet-9503/kullanma-talimati
{}
https://www.ilacabak.com/abound-portakal-aromali-30-poset-x-24-g-11716/kullanma-talimati
https://www.ilacabak.com/adrenalin-0-5-mg-10-ampul-25958/kullanma-talimati
https://www.ilacabak.com/adrenalin-1-mg-10-ampul-25959/kullanma-talimati
https://www.ilacabak.com/adrenalin-0-25-mg-10-ampul-25957/kullanma-talimati
https://www.ilacabak.com/adrenalin-1-4mg-1ml-100-ampul-14028/kullanma-talimati
https://www.ilacabak.com/abound-portakal-aromali-30-poset-x-24-g-11716/kisa-urun-bilgisi
{'kullanma-talimati': None, 'kisa-urun-bilgisi': None}
Güncellendi: ABOUND PORTAKAL AROMALI 30 POSET x 24 G
https://www.ilacabak.com/adrenalin-1-mg-10-ampul-25959/kisa-urun-bilgisi
{'kullanma-talimati': None, 'kisa-urun-bilgisi': None}
Güncellendi: ADRENALIN 1 MG 10 AMPUL
https://www.ilacabak.com/adrenalin-0-5-mg-10-ampul-25958/kisa-urun-bilgisi
{'kullanma-talimati': None, 'kisa-urun-bilgisi': None}
Güncellendi: ADRENALIN 0

In [None]:
# Scratch value for the type check in the next cell.
# NOTE(review): presumably mirrors the 0 marker returned by get_pdf_urls — confirm or remove.
p = 0

In [None]:
# Scratch check: evaluates to False for the integer assigned to `p` above.
type(p) is str

In [None]:
# Scratch cell: a bare one-element tuple that is only echoed as the cell output.
# NOTE(review): the meaning of 9324 is not evident from the notebook — confirm or remove.
9324, 