In [None]:
from dotenv import dotenv_values
from bs4 import BeautifulSoup
import string
from time import sleep
import json 
import requests
from pypdf import PdfReader
import re
import unicodedata
from hashlib import sha256

In [None]:
# Read scraper settings from the local .env file.
config = dotenv_values(".env")
DATA_TARGET_WEB_URL = config.get('DATA_TARGET_WEB_URL')
if not DATA_TARGET_WEB_URL:
    # Fail fast: otherwise every URL built below would point at the literal host "None".
    raise RuntimeError("DATA_TARGET_WEB_URL is missing or empty in .env")

In [None]:
# Base URL of the site being scraped; page paths are appended directly to it.
target: str = f'https://{DATA_TARGET_WEB_URL}/'

In [None]:
def clean_text(text: str) -> str:
    """Reduce free text to whitespace-separated words.

    The input is NFKD-normalised and then passed through a fixed sequence of
    regex substitutions. The order matters: URLs and e-mail addresses are
    removed before punctuation is stripped.

    Args:
        text: Text to clean. Any non-string value is accepted.

    Returns:
        The cleaned text with surrounding whitespace removed, or an empty
        string when ``text`` is not a ``str``.
    """
    if not isinstance(text, str):
        return ""

    # (pattern, replacement) pairs, applied strictly in this order.
    substitutions = (
        (r"<.*?>", ""),            # markup tags
        (r"http\S+|www\S+", ""),   # URLs
        (r"\S+@\S+\.\S+", ""),     # e-mail addresses
        (r"\d+", ""),              # runs of digits
        (r"[^\w\s]", ""),          # anything that is neither word char nor whitespace
        (r"\b\w\b", ""),           # stand-alone single characters
        (r"\s+", " "),             # collapse whitespace runs
    )

    cleaned = unicodedata.normalize("NFKD", text)
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

In [None]:
from pymongo import MongoClient

# Prefer a connection string supplied via .env (key MONGO_URI) so credentials do not
# have to live in the notebook; the original literal remains as a local fallback.
client = MongoClient(config.get("MONGO_URI") or "mongodb://root:password_mongo@localhost:5003/")
db = client["medication_mongo"]

# Tüm ilaç adlarını ve urlleri almak

In [None]:
# In-memory scrape results, keyed by the letter used for the listing page request.
dataset = {}   

In [None]:
# Smoke test: fetch the listing page for the letter "Y" and print the raw HTML.
# NOTE(review): `response` stays in the kernel namespace, and later cells evaluate a
# bare `response` — confirm this is the object they are meant to use.
path = target + f'aralist.php?Id=Y'
response = requests.get(path)
print(response.text)

In [None]:
def get_list_medication_data_from_letter_with_requests(LETTER: str = 'A') -> list[dict]:
    """Scrape the medication listing for one letter over plain HTTP.

    Args:
        LETTER: A single upper-case letter, sent as the ``Id`` query parameter
            of ``aralist.php``.

    Returns:
        One dict per listed medication with the keys ``url``, ``title`` and
        ``name``. An empty list is returned when the response status is not
        200, when the listing container is absent, or when parsing fails
        before any item was collected.

    Raises:
        ValueError: If ``LETTER`` is not exactly one upper-case letter.
    """
    # The previous check (`not LETTER.upper() and len(LETTER) == 1`) could never be true.
    if not LETTER.isalpha() or len(LETTER) != 1 or LETTER != LETTER.upper():
        raise ValueError('letter param must be UPPER case and a single letter')

    path = target + f'aralist.php?Id={LETTER}'
    response = requests.get(path)
    if response.status_code != 200:
        # Honour the declared return type instead of falling through to None.
        return []

    data: list[dict] = []
    try:
        soup = BeautifulSoup(response.content, 'html.parser')
        medi_list = soup.find('div', {"id": "iceriksollistesayfasi"})
        if medi_list is None:
            return data
        for medi in medi_list.find_all('li'):
            tag_a = medi.find('a')
            # Skip list items without a usable link instead of aborting the whole page.
            if tag_a is None or not tag_a.get('href'):
                continue
            data.append({
                "url": target + tag_a.get('href'),
                "title": tag_a.get('title'),
                "name": tag_a.text,
            })
    except Exception as e:
        # Best effort: report the problem and keep whatever was parsed so far.
        print(e)
    return data

In [None]:
# Letters for which the plain-HTTP scrape produced no items.
empty_letters: list[str] = []

for letter in string.ascii_uppercase:
    # `or []` keeps len() safe should the scraper hand back None.
    dataset[letter] = get_list_medication_data_from_letter_with_requests(letter) or []
    print(f'param {letter} has {len(dataset[letter])} items')
    # Branch on the scraped result, not on the (always truthy) letter itself.
    if dataset[letter]:
        with open(f'scraped_medication_data/medication_{letter}.json', 'w') as df:
            df.write(json.dumps(dataset[letter]))
    else:
        empty_letters.append(letter)
    sleep(2)  # pause between requests

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from time import sleep

def get_list_medication_data_from_letter_with_selenium(LETTER: str = 'A', manuel_control: bool = False) -> list | None:
    if not LETTER.isalpha() or len(LETTER) != 1 or LETTER != LETTER.upper():
        raise ValueError('letter param must be UPPER case and a single letter')

    url = f"{target}aralist.php?Id={LETTER}"

    options = Options()
    # options.add_argument("--headless") 
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    try:
        driver.get(url)
        sleep(2)
        if manuel_control:
            if input('is ready (n: negative): ').lower() == 'n':
                return []
        medi_list = driver.find_element(By.ID, "iceriksollistesayfasi")
        items = medi_list.find_elements(By.TAG_NAME, "li")

        data = []
        for medi in items:
            tag_a = medi.find_element(By.TAG_NAME, "a")
            data.append({
                "url": target + tag_a.get_attribute('href'),
                "title": tag_a.get_attribute('title'),
                "name": tag_a.text.strip(),
            })

        return data

    except Exception as e:
        print(f"Hata: {e}")
        return None

    finally:
        driver.quit()


In [None]:
# Retry, through the browser-based scraper, every letter recorded in `empty_letters`.
for empty_letter in empty_letters:
    # The Selenium scraper returns None on failure; normalise that to an empty list.
    dataset[empty_letter] = get_list_medication_data_from_letter_with_selenium(empty_letter) or []
    print(f'param {empty_letter} has {len(dataset[empty_letter])} items')
    # Use the loop variable (not the stale `letter` of the earlier loop) and test the result.
    if dataset[empty_letter]:
        with open(f'scraped_medication_data/medication_{empty_letter}.json', 'w') as df:
            df.write(json.dumps(dataset[empty_letter]))
    else:
        print(f'ERROR at this latter: {empty_letter}')
    sleep(2)

In [None]:
# Manual check: run the Selenium scraper for the letter "Y" and keep the result in `x`.
x = get_list_medication_data_from_letter_with_selenium('Y')

# MONGO DB Download PDF

In [None]:
!pip install pymongo

In [None]:
# import os

# collection = db["medications"]
# for root, dirs, files in os.walk('scraped_medication_data'):
#     for file in files:
#         if file.endswith('.json') and not 'checkpoint' in file :
#             with open(f'scraped_medication_data/{file}', 'r') as df: 
#                 data = json.loads(df.read())
#                 collection.insert_many(data)

In [None]:
# NOTE(review): `count` is not assigned in any visible cell; this appears to rely on
# leftover kernel state — confirm where it comes from or remove the cell.
count

# pdf urllerini almak

In [None]:
def get_pdf_urls(url) -> str | int | None:
    """Return the first PDF link found on a medication sub-page.

    Args:
        url: Absolute URL of the page to inspect.

    Returns:
        ``target`` joined with the first ``href`` ending in ``.pdf`` when the
        response URL equals ``url``; ``0`` when the response URL differs from
        the requested one (callers treat this as "no PDF for this medicine");
        ``None`` when the page was served but contains no PDF link.
    """
    response = requests.get(url)
    if response.url != url:
        # Final URL differs from the requested one (e.g. after a redirect).
        return 0  # the medicine does not have a PDF

    soup = BeautifulSoup(response.content, 'html.parser')
    for tag_a in soup.find_all('a'):
        href = tag_a.get('href')
        # Anchors without an href used to raise AttributeError on `.endswith`.
        if href and href.endswith('.pdf'):
            return target + href
    return None

In [None]:
# Manual check of get_pdf_urls against the 'kullanma-talimati' sub-page of one medication.
test_t = target + 'allermune-polymerized-wild-grasses-0-3-hepd-enjeksiyonluk-cozelti-devam-30431'
r = get_pdf_urls(f'{test_t}/kullanma-talimati')
print(r)

In [None]:
# Second manual check; here the sub-page slug is already part of `test_t`.
test_t = target + 'd-3-ferol-oral-damla-15-ml-18801/kullanma-talimati'
r = get_pdf_urls(f'{test_t}')
print(r)

In [None]:
from concurrent.futures import ThreadPoolExecutor, as_completed

# Collection holding one document per scraped medication.
collection = db["medications"]

# Sub-page slugs appended to a medication URL; the same strings are used as document
# field names (Turkish: "instructions for use" and "short product information").
pdf_paths = ['kullanma-talimati', 'kisa-urun-bilgisi']

def process_medication(medication_target):
    """Look up missing PDF links for one medication document and store them.

    For every slug in ``pdf_paths`` whose field is empty on the document, the
    matching sub-page is queried through ``get_pdf_urls``. A string result is
    stored as the link, a ``0`` result is stored as ``None``, and any other
    result leaves the field untouched.

    Args:
        medication_target: A document from the ``medications`` collection; its
            ``_id``, ``url``, ``name`` and the ``pdf_paths`` fields are read.

    Returns:
        None. The document is updated in MongoDB only when at least one field changed.
    """
    base_url = medication_target.get('url')
    if not base_url:
        # Nothing to query without a URL (string concatenation would raise TypeError).
        return

    updated_fields = {}

    for pdf_path in pdf_paths:
        if medication_target.get(pdf_path):
            continue  # link already known
        pdf_url = get_pdf_urls(url=base_url + '/' + pdf_path)

        if isinstance(pdf_url, str):
            updated_fields[pdf_path] = pdf_url
        elif pdf_url == 0:
            updated_fields[pdf_path] = None  # deliberately None: there is no PDF

    if updated_fields:
        collection.update_one(
            {"_id": medication_target["_id"]},
            {"$set": updated_fields}
        )
        print(f"Güncellendi: {medication_target.get('name')}")

# Select only the documents where one of the PDF link fields is null or an empty string
medications_to_update = list(collection.find({
    "$or": [
        {"kullanma-talimati": {"$in": [None, ""]}},
        {"kisa-urun-bilgisi": {"$in": [None, ""]}}
    ]
}))
print(f"target count {len(medications_to_update)}")
# Do not open too many threads (the original note suggested 10-20; 5 workers are used here)
with ThreadPoolExecutor(max_workers=5) as executor:
    futures = [executor.submit(process_medication, med) for med in medications_to_update]

    for future in as_completed(futures):
        future.result()  # re-raises here any exception raised inside the worker


## Title Clean

In [None]:
from pymongo import UpdateOne

# Strip leading/trailing whitespace from every string `title`, sending the changes
# to MongoDB in bulk batches.
collection = db["medications"]

# Fetch only documents whose title is a string (the ones that may carry surrounding whitespace)
cursor = collection.find({"title": {"$type": "string"}})
batch_size = 1000
batch = []
counter = 0  # NOTE(review): never read or updated below
update_counter = 0

for doc in cursor:
    _id = doc["_id"]
    original_title = doc["title"]
    cleaned_title = original_title.strip()

    # Queue an update only when stripping actually changed the title
    if cleaned_title != original_title:
        batch.append(UpdateOne(
            {"_id": _id},
            {"$set": {"title": cleaned_title}}
        ))

    # Flush a bulk update once 1000 operations are queued
    if len(batch) == batch_size:
        result = collection.bulk_write(batch)
        update_counter += result.modified_count
        print(f"{update_counter} updated so far")
        batch = []

# Send the remaining operations as well
if batch:
    result = collection.bulk_write(batch)
    update_counter += result.modified_count
    print(f"{update_counter} updated in total")


# PDF Scraping

## bar-code scraping !

In [None]:
from concurrent.futures import ThreadPoolExecutor, as_completed

collection = db["medications"]

# Documents whose barcode or description is null or an empty string.
medications_to_update = list(collection.find({
    "$or": [
        {"barcode": {"$in": [None, ""]}},
        {"description": {"$in": [None, ""]}},
    ]
}))
print(f"target count {len(medications_to_update)}")


In [None]:
def get_barcode_and_description(url: str):
    """Scrape the barcode and description from a medication page.

    Args:
        url: Absolute URL of the medication page.

    Returns:
        A dict with the keys ``barcode`` and ``description``. A value is
        ``None`` when the page did not answer 200 or the corresponding element
        was not found. The description is passed through ``clean_text``; the
        barcode is the raw text of the table cell.
    """
    result = {"barcode": None, "description": None}

    page = requests.get(url)
    if page.status_code != 200:
        return result

    soup = BeautifulSoup(page.content, 'html.parser')

    # Description: text of the first div with class "sagkutukucuk".
    description_box = soup.find('div', {"class": "sagkutukucuk"})
    if description_box:
        result["description"] = clean_text(description_box.text)

    # Barcode: value cell of the two-cell row labelled 'Barkod :' in the "anatablo" table.
    info_table = soup.find('table', {"class": "anatablo"})
    if info_table:
        for row in info_table.find_all('tr'):
            cells = row.find_all('td')
            if len(cells) == 2 and cells[0].text == 'Barkod :':
                result["barcode"] = cells[1].text
                break

    return result

In [None]:
def process_medication(medication_target):
    """Fill in a missing barcode and/or description for one medication document.

    Args:
        medication_target: A document from the ``medications`` collection; its
            ``_id``, ``url``, ``name``, ``barcode`` and ``description`` are read.

    Returns:
        None. Documents without a ``url`` are skipped; MongoDB is updated only
        when the scrape yields a value for a field that is empty on the document.
    """
    page_url = medication_target.get("url")
    if not page_url:
        return

    scraped = get_barcode_and_description(page_url)

    # Keep only fields that are empty on the document and non-empty in the scrape.
    updated_fields = {
        field: scraped[field]
        for field in ("barcode", "description")
        if not medication_target.get(field) and scraped.get(field)
    }

    if not updated_fields:
        return

    collection.update_one(
        {"_id": medication_target["_id"]},
        {"$set": updated_fields}
    )
    print(f"Güncellendi: {medication_target.get('name')}")

# Select the documents whose barcode or description is null or an empty string
medications_to_update = list(collection.find({
    "$or": [
        {"barcode": {"$in": [None, ""]}},
        {"description": {"$in": [None, ""]}},
    ]
}))

print(f"target count: {len(medications_to_update)}")

# Run the per-document work in parallel on a thread pool
with ThreadPoolExecutor(max_workers=5) as executor:
    futures = [executor.submit(process_medication, med) for med in medications_to_update]

    for future in as_completed(futures):
        future.result()

# PDF DOWNLOAD

In [None]:
# Count the documents whose 'kullanma-talimati' field holds a PDF link.
collection.count_documents({
    "kullanma-talimati": {
        "$type": "string",  # only string values
        "$regex": r"\.pdf$"  # values ending in .pdf
    }
})

In [None]:
# NOTE(review): relies on a `response` left in the kernel by an earlier cell; the only
# visible top-level assignment fetches an HTML listing page — confirm which response is meant.
response

In [None]:
# Write the body of the current `response` to test.pdf.
# NOTE(review): `response` is not assigned in this cell — confirm it holds a PDF download.
with open('test.pdf', 'wb') as df:
    df.write(response.content)

In [None]:
# SHA-256 hex digest of the response bytes.
hash_context = sha256(response.content).hexdigest()
reader = PdfReader('test.pdf')
context: str = ''
# Cleaned page texts are appended back to back, with no separator between pages.
for page in reader.pages:
    text = page.extract_text()
    context += clean_text(text)

In [None]:
import os
from hashlib import sha256
from io import BytesIO
from concurrent.futures import ThreadPoolExecutor, as_completed


collection = db["medications"]

# Cursor over the documents whose 'kullanma-talimati' field holds a PDF link.
prop = collection.find({
    "kullanma-talimati": {
        "$type": "string",  # only string values
        "$regex": r"\.pdf$"  # values ending in .pdf
    }
})


def get_pdf(pdf_url: str, pdf_type: str, barcode: str):
    """Download one PDF, store it under ``medications_pdf/`` and extract its text.

    Args:
        pdf_url: Absolute URL of the PDF. ``None`` or an empty value means
            there is nothing to download.
        pdf_type: Label embedded in the stored file name.
        barcode: Value embedded in the stored file name.

    Returns:
        A dict with ``pdf_name``, ``pdf_path``, ``pdf_context`` (cleaned text
        of all pages, joined by single spaces) and ``pdf_context_hash``
        (SHA-256 hex digest of the downloaded bytes). ``None`` is returned
        when there is no URL, the download is rejected, or any step fails.
    """
    if not pdf_url:
        return None

    try:
        response = requests.get(pdf_url)
        if response.status_code != 200:
            print(f"[HATA] PDF indirilemedi: {pdf_url}")
            return None

        if "application/pdf" not in response.headers.get("Content-Type", ""):
            print(f"PDF değil: {pdf_url}")
            return None
        # Crude byte-level scan of the raw download for JavaScript markers.
        if b"/JavaScript" in response.content or b"/JS" in response.content:
            print("JavaScript içeren PDF. Şüpheli.")
            return None

        os.makedirs('medications_pdf', exist_ok=True)

        # File name without its last four characters (the ".pdf" suffix is assumed).
        pdf_id = pdf_url.split('/')[-1][:-4]
        hash_context = sha256(response.content).hexdigest()
        pdf_name = f'{barcode}-{pdf_type}__{hash_context}_{pdf_id}.pdf'
        pdf_path = os.path.join('medications_pdf', pdf_name)

        with open(pdf_path, 'wb') as df:
            df.write(response.content)

        reader = PdfReader(pdf_path)
        page_texts = []
        for page in reader.pages:
            if text := page.extract_text():
                cleaned = clean_text(text)
                if cleaned:
                    page_texts.append(cleaned)
        # Join with a space: each page is stripped by clean_text, so plain
        # concatenation fused the last word of a page with the first of the next.
        context = ' '.join(page_texts)

        return {
            "pdf_name": pdf_name,
            "pdf_path": pdf_path,
            "pdf_context": context,
            "pdf_context_hash": hash_context
        }
    except Exception as e:
        # Still best effort (one bad PDF must not stop the batch), but no longer silent.
        print(f"[HATA] {pdf_url}: {e}")
        return None

def medis(medi):
    """Download and index the PDFs of one medication document.

    For each of the two PDF fields, ``get_pdf`` is called with the field's
    current value. Every truthy result replaces that field's value with the
    dict returned by ``get_pdf``.

    Args:
        medi: A document from the ``medications`` collection; its ``_id``,
            ``name``, ``barcode`` and the two PDF fields are read.

    Returns:
        None. MongoDB is updated only when at least one PDF was processed.
    """
    updated_fields: dict = {}

    for param in ['kisa-urun-bilgisi', 'kullanma-talimati']:
        if kub := get_pdf(medi.get(param), param, medi.get('barcode')):
            updated_fields[param] = kub

    if updated_fields:
        collection.update_one(
            {"_id": medi["_id"]},
            {"$set": updated_fields}
        )
        # Report success only when something was actually written.
        print(f"Güncellendi: {medi.get('name')}")


# Materialise the cursor so the number of documents can be reported up front.
medications = list(prop)

print(f"İşlenecek döküman sayısı: {len(medications)}")

# Process the documents on a pool of 5 worker threads.
with ThreadPoolExecutor(max_workers=5) as executor:
    futures = [executor.submit(medis, medi) for medi in medications]

    for future in as_completed(futures):
        try:
            future.result()
        except Exception as e:
            # A failing worker is reported and the remaining results are still collected.
            print(f"Thread hatası: {e}")



In [None]:
import os
from pypdf import PdfReader
from pypdf.errors import PdfReadError

def has_javascript(pdf_path):
    """Report whether a PDF's document catalog carries JavaScript-related entries.

    Three checks run in order against the ``/Root`` entry of the trailer:
    a ``/JS`` or ``/JavaScript`` entry in ``/OpenAction``, a non-empty
    ``/JavaScript`` entry under ``/Names``, and the presence of ``/AA``.

    Args:
        pdf_path: Path of the PDF file to inspect.

    Returns:
        True when any check matches. False otherwise, including when the file
        cannot be read (the error is printed, not raised).
    """
    try:
        root = PdfReader(pdf_path).trailer.get("/Root", {})

        action_on_open = root.get("/OpenAction", {})
        if any(marker in action_on_open for marker in ("/JS", "/JavaScript")):
            return True

        name_tree = root.get("/Names", {})
        if name_tree and name_tree.get("/JavaScript", {}):
            return True

        return "/AA" in root

    except PdfReadError as err:
        print(f"HATA ({pdf_path}): {err}")
    except Exception as err:
        print(f"Genel HATA ({pdf_path}): {err}")

    return False


def remove_js_pdfs(directory):
    """Delete every ``.pdf`` file directly inside ``directory`` that ``has_javascript`` flags.

    Args:
        directory: Folder to scan. Only its direct entries are examined, and
            the extension match is case-insensitive.
    """
    pdf_names = (entry for entry in os.listdir(directory) if entry.lower().endswith(".pdf"))
    for pdf_name in pdf_names:
        candidate = os.path.join(directory, pdf_name)
        if not has_javascript(candidate):
            continue
        print(f"JavaScript içeren PDF siliniyor: {pdf_name}")
        os.remove(candidate)

# Purge flagged PDFs from the download folder.
pdf_directory = "medications_pdf"
remove_js_pdfs(pdf_directory)
