# 1. Imports e Configurazione

##### Questa sezione include tutti gli import necessari per l’estrazione e l’analisi dei dati dai vari servizi e file.

In [None]:
# Imports principali e configurazione
import io
import json
import logging
import os
import re
import sqlite3
from concurrent.futures import ThreadPoolExecutor, as_completed

import pytesseract
import requests
import streamlit as st
from bs4 import BeautifulSoup
from PIL import Image
from PyPDF2 import PdfReader

# Logging configuration: INFO level, timestamped "<time> - <level> - <message>" records
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# pytesseract configuration (only if necessary).
# The default Windows install location is applied only when that executable
# actually exists; on any other machine pytesseract keeps its own default
# command, so a hard-coded Windows path no longer breaks OCR elsewhere.
_WINDOWS_TESSERACT_CMD = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
if os.path.isfile(_WINDOWS_TESSERACT_CMD):
    pytesseract.pytesseract.tesseract_cmd = _WINDOWS_TESSERACT_CMD


# 2. Funzioni per estrarre da CIR

##### Queste funzioni gestiscono l'estrazione di link e dati dai report CIR (Cosmetic Ingredient Review).

In [None]:
# Build a requests session with a retry strategy
def create_session():
    """Create a ``requests.Session`` that retries failed HTTP(S) requests.

    The retry policy allows up to 5 retries with a backoff factor of 1, for
    responses with status 429, 500, 502, 503 or 504, and only for the HEAD,
    GET and OPTIONS methods. The same adapter is mounted for both the
    ``https://`` and ``http://`` prefixes.

    Returns:
        The configured ``requests.Session``.
    """
    # Imported here because the module's import section does not bring these
    # names into scope; urllib3 is installed as a dependency of requests.
    from requests.adapters import HTTPAdapter
    from urllib3.util.retry import Retry

    retry_strategy = Retry(
        total=5,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["HEAD", "GET", "OPTIONS"],
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    session = requests.Session()
    session.mount("https://", adapter)
    session.mount("http://", adapter)
    return session

# Extract the first status link from a CIR page
def extract_first_status_link(session, url):
    """Return the absolute URL of the first link in the page's first table.

    Args:
        session: Object exposing ``get(url)`` (e.g. a ``requests.Session``).
        url: Address of the CIR page to fetch.

    Returns:
        The link as a string rooted at ``https://cir-reports.cir-safety.org/``
        (any ``../`` segments of the href are removed), or ``None`` when the
        request fails, the page has no table, the table has no links, or the
        first link has no ``href``.
    """
    try:
        response = session.get(url)
        response.raise_for_status()
    except requests.RequestException as e:
        logging.error(f"Error retrieving status link from CIR: {e}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    table = soup.find('table')
    # A page without any <table> must yield "no link", not an AttributeError.
    if table is None:
        return None
    status_links = table.find_all('a')
    if not status_links:
        return None
    # An anchor without an href attribute would raise KeyError on indexing.
    href = status_links[0].get('href')
    if not href:
        return None
    return "https://cir-reports.cir-safety.org/" + href.replace("../", "")

# Process one ingredient and fetch its CIR document
def get_pdf_for_ingredient(session, ingredient_id):
    """Download the document behind the first status link of an ingredient.

    Args:
        session: Object exposing ``get(url)`` (e.g. a ``requests.Session``).
        ingredient_id: Identifier placed in the ``id`` query parameter of the
            CIR ingredient status report URL.

    Returns:
        An ``io.BytesIO`` holding the response body (presumably a PDF; the
        content type is not checked here), or ``None`` when no status link is
        found, the download fails, or the response status is not 200.
    """
    url = f"https://cir-reports.cir-safety.org/cir-ingredient-status-report/?id={ingredient_id}"
    pdf_link = extract_first_status_link(session, url)
    if not pdf_link:
        return None
    try:
        response = session.get(pdf_link)
    except requests.RequestException as e:
        # Same convention as the other fetch helpers in this file:
        # log the network failure and report "nothing found".
        logging.error(f"Error downloading CIR PDF for ingredient {ingredient_id}: {e}")
        return None
    if response.status_code != 200:
        return None
    return io.BytesIO(response.content)


# 3. Gestione PDF

##### Queste funzioni si occupano dell'estrazione di testo dai PDF, sia testuali che in formato immagine.

In [None]:
# Extract text from a PDF
def extract_text_from_pdf(pdf_content):
    """Return the concatenated text of every page of a PDF.

    Args:
        pdf_content: Either the raw PDF bytes or an already-open binary
            file-like object (anything with a ``read`` method, such as the
            ``io.BytesIO`` returned by ``get_pdf_for_ingredient``).

    Returns:
        The text of all pages joined without separators; a page for which
        ``extract_text()`` returns a falsy value contributes an empty string.
    """
    # Wrapping an existing stream in io.BytesIO raises TypeError, so only raw
    # bytes are wrapped.
    stream = pdf_content if hasattr(pdf_content, "read") else io.BytesIO(pdf_content)
    reader = PdfReader(stream)
    return "".join(page.extract_text() or "" for page in reader.pages)

# Extract text from an image-based PDF using pytesseract
def extract_text_from_image_pdf(pdf_content):
    """OCR every page of a PDF and return the recognised text.

    Each page is rendered to a pixmap, loaded as a PIL image and passed to
    ``pytesseract.image_to_string`` with ``lang='eng'``.

    Args:
        pdf_content: PDF data handed to ``fitz.open`` as its ``stream``.

    Returns:
        The text of all pages, each page followed by two newlines.
    """
    # NOTE(review): `fitz` is not imported in the visible imports section -
    # confirm it is brought into scope elsewhere.
    document = fitz.open(stream=pdf_content, filetype="pdf")
    page_texts = []
    for index in range(len(document)):
        pixmap = document.load_page(index).get_pixmap()
        page_image = Image.open(io.BytesIO(pixmap.tobytes()))
        page_texts.append(pytesseract.image_to_string(page_image, lang='eng') + "\n\n")
    return "".join(page_texts)


# 4. Funzioni per estrarre da PubChem

##### Funzioni per ottenere informazioni su un ingrediente utilizzando l'API PubChem.

In [None]:
# Look up the PubChem CID of an ingredient
def get_pubchem_cid(session, ingredient_name):
    """Return the first PubChem CID listed for an ingredient name.

    Args:
        session: Object exposing ``get(url)`` (e.g. a ``requests.Session``).
        ingredient_name: Name to look up; it is percent-encoded before being
            placed in the URL path.

    Returns:
        The first CID of the response's ``IdentifierList`` as a string, or
        ``None`` when the request fails or the response lacks that data
        (the error is logged).
    """
    from urllib.parse import quote

    # The name is a single path segment: characters such as '/', '#' or '?'
    # would otherwise change the structure of the URL.
    encoded_name = quote(str(ingredient_name), safe="")
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{encoded_name}/cids/JSON"
    try:
        response = session.get(url)
        response.raise_for_status()
        data = response.json()
        return str(data['IdentifierList']['CID'][0])
    except (requests.RequestException, KeyError, IndexError, ValueError) as e:
        logging.error(f"Error retrieving PubChem CID for {ingredient_name}: {e}")
        return None

# Collect the LD50 statements of a compound from PubChem
def get_ld50_pubchem(session, cid):
    """Return every PubChem record string that mentions ``LD50``.

    The PUG View record of the compound is fetched and its section tree is
    walked depth-first: for each section, nested sections are visited before
    the section's own ``Information`` entries.

    Args:
        session: Object exposing ``get(url)`` (e.g. a ``requests.Session``).
        cid: PubChem compound identifier placed in the request URL.

    Returns:
        A list of the matching ``String`` values in visiting order, or
        ``None`` when nothing matches or when the request/parsing fails
        (the error is logged).
    """
    def _ld50_strings(nodes):
        # Yield matching strings, children first, then this node's own entries.
        for node in nodes:
            if 'Section' in node:
                yield from _ld50_strings(node['Section'])
            if 'Information' not in node:
                continue
            for entry in node['Information']:
                if not ('Value' in entry and 'StringWithMarkup' in entry['Value']):
                    continue
                for markup in entry['Value']['StringWithMarkup']:
                    if 'LD50' in markup['String']:
                        yield markup['String']

    endpoint = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/{cid}/JSON/"
    try:
        reply = session.get(endpoint)
        reply.raise_for_status()
        top_sections = reply.json()['Record']['Section']
        # Materialise inside the try block so lookup errors raised while
        # walking the tree are handled below.
        matches = list(_ld50_strings(top_sections))
    except (requests.RequestException, KeyError, IndexError, ValueError) as e:
        logging.error(f"Error retrieving LD50 from PubChem for CID {cid}: {e}")
        return None
    return matches or None


# 5. Funzioni per estrarre da ECHA

##### Questa sezione si occupa dell'estrazione dei dati di tossicità da ECHA utilizzando Selenium per navigare il sito.

In [None]:
# Initialise the Selenium driver
def initialize_driver():
    """Create a Chrome WebDriver configured with a fixed set of flags.

    The flags disable extensions and the GPU, turn off the sandbox and
    ``/dev/shm`` usage, and enable headless mode. The driver service is built
    from the path returned by ``ChromeDriverManager().install()``.

    Returns:
        The ``webdriver.Chrome`` instance.
    """
    # NOTE(review): `webdriver`, `Service` and `ChromeDriverManager` are not
    # imported in the visible imports section - confirm they are in scope.
    chrome_options = webdriver.ChromeOptions()
    chrome_flags = (
        '--disable-extensions',
        '--disable-gpu',
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--headless',
    )
    for flag in chrome_flags:
        chrome_options.add_argument(flag)
    chromedriver_service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=chromedriver_service, options=chrome_options)

# Retrieve toxicity data for an ingredient from ECHA
def get_toxicity_data_for_ingredient(driver, ingredient):
    """Look up an ingredient on ECHA and extract values from its dossier.

    Steps: search the substance API for ``ingredient``; take the ``rmlId`` of
    the first hit; list its active dossiers; open the first dossier's index
    page with ``driver``; find the 'Acute Toxicity' link (falling back to
    'Toxicological information'); fetch that document and run
    ``extract_values`` on its plain text.

    Args:
        driver: Selenium WebDriver used to load the dossier index page.
        ingredient: Ingredient name used as the search text.

    Returns:
        A ``(echa_value, document_url)`` tuple, or ``(None, None)`` when the
        substance search or the dossier list is empty, or when neither
        section link is present on the index page.

    Raises:
        requests.HTTPError: If either ECHA API call returns an error status.
    """
    # NOTE(review): `WebDriverWait`, `extract_href`, `fetch_document_content`
    # and `extract_values` are not defined in the visible part of this file;
    # their exact contracts should be confirmed.
    from urllib.parse import quote

    # Percent-encode the whole search text, not only its spaces.
    search_text = quote(ingredient, safe='')
    api_url = f"https://chem.echa.europa.eu/api-substance/v1/substance?pageIndex=1&pageSize=100&searchText={search_text}"
    response = requests.get(api_url)
    response.raise_for_status()
    data = response.json()
    if not data['items']:
        return None, None

    rmlId = data['items'][0]['substanceIndex']['rmlId']
    dossier_api_url = f"https://chem.echa.europa.eu/api-dossier-list/v1/dossier?pageIndex=1&pageSize=100&rmlId={rmlId}&registrationStatuses=Active"
    dossier_response = requests.get(dossier_api_url)
    dossier_response.raise_for_status()
    dossier_data = dossier_response.json()
    if not dossier_data['items']:
        return None, None

    asset_external_id = dossier_data['items'][0]['assetExternalId']
    html_page_url = f"https://chem.echa.europa.eu/html-pages/{asset_external_id}/index.html"
    driver.get(html_page_url)
    # Wait (up to 10 s) until the browser reports the requested URL.
    WebDriverWait(driver, 10).until(lambda d: d.current_url == html_page_url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    href_value = extract_href(soup, 'Acute Toxicity')
    if not href_value:
        href_value = extract_href(soup, 'Toxicological information')
    if not href_value:
        # Without a link there is no document to fetch; avoid requesting a
        # bogus ".../documents/None.html" URL.
        return None, None

    document_url = f"https://chem.echa.europa.eu/html-pages/{asset_external_id}/documents/{href_value}.html"
    document_content = fetch_document_content(document_url)
    soup = BeautifulSoup(document_content, 'html.parser')
    text_content = soup.get_text(separator=' ', strip=True)
    echa_value = extract_values(text_content)
    return echa_value, document_url


# 6. Funzioni per estrarre da EFSA

##### Sezione non disponibile: le funzioni per l'estrazione da EFSA non sono più reperibili.

# 7. Funzioni per estrazione dei valori specifici

##### Funzioni per estrarre specifici valori dai testi (e.g., LD50, NOAEL).

In [None]:
# Find the values that follow a given term in a text
def find_values(text, term):
    """Locate numeric-looking words that follow each occurrence of ``term``.

    ``term`` is matched case-insensitively, optionally followed by whitespace
    and a ``:`` or ``/``. For ``"LD50"`` a dedicated pattern also tolerates
    whitespace between ``LD`` and ``50``. After each match, the next 100
    characters are split into words (at most 20 are considered) and the first
    word that starts with a number is recorded.

    Args:
        text: Text to search.
        term: Term to look for; it is inserted into the regular expression
            as-is.

    Returns:
        A list of ``(word, index)`` tuples, where ``index`` is the position
        in ``text`` immediately after the matched term. Matches with no
        numeric word in range contribute nothing.
    """
    if term == "LD50":
        regex = fr'LD\s*[\n]*50\s*[:/]?'
    else:
        regex = fr'{term}\s*[:/]?'

    hits = []
    for found in re.finditer(regex, text, re.IGNORECASE):
        offset = found.end()
        window = text[offset:offset + 100].split()[:20]
        first_numeric = next(
            (token for token in window if re.match(r'\d+(\.\d+)?', token)),
            None,
        )
        if first_numeric is not None:
            hits.append((first_numeric, offset))
    return hits


# 8. Connessione al Database

##### Funzioni per gestire la connessione e operazioni sul database SQLite.

In [None]:
# Open the connection to the database
def get_db_connection():
    """Open the SQLite database at ``app/data/ingredients.db``.

    Returns:
        A ``sqlite3.Connection`` whose ``row_factory`` is ``sqlite3.Row``,
        so fetched rows can be indexed by column name.
    """
    connection = sqlite3.connect('app/data/ingredients.db')
    connection.row_factory = sqlite3.Row
    return connection

# Update a value in the database
def update_ingredient_value_in_db(ingredient_id, value_updated):
    """Store ``value_updated`` for one ingredient.

    Sets the ``value_updated`` column of the ``ingredients`` rows whose
    ``pcpc_ingredientid`` equals ``ingredient_id`` and commits the change.

    Args:
        ingredient_id: Value matched against ``pcpc_ingredientid``.
        value_updated: New content of the ``value_updated`` column.
    """
    query = """
    UPDATE ingredients
    SET value_updated = ?
    WHERE pcpc_ingredientid = ?
    """
    conn = get_db_connection()
    try:
        cursor = conn.cursor()
        cursor.execute(query, (value_updated, ingredient_id))
        conn.commit()
    finally:
        # Close the connection even when the statement or the commit fails.
        conn.close()

# Search for an ingredient in the database
def search_ingredient(ingredient_name_or_id):
    """Fetch the first ingredient whose id or name equals the given value.

    Args:
        ingredient_name_or_id: Value compared against both
            ``pcpc_ingredientid`` and ``pcpc_ingredientname``.

    Returns:
        The first matching row, exposing the columns ``id``, ``name``,
        ``NOAEL_CIR``, ``LD50_CIR``, ``LD50_PubChem``, ``echa_value`` and
        ``echa_dossier``, or ``None`` when nothing matches.
    """
    query = """
    SELECT
        pcpc_ingredientid AS id,
        pcpc_ingredientname AS name,
        NOAEL_CIR,
        LD50_CIR,
        LD50_PubChem,
        echa_value,
        echa_dossier
    FROM ingredients
    WHERE pcpc_ingredientid = ? OR pcpc_ingredientname = ?
    """
    conn = get_db_connection()
    try:
        cursor = conn.cursor()
        cursor.execute(query, (ingredient_name_or_id, ingredient_name_or_id))
        return cursor.fetchone()
    finally:
        # Close the connection even when the query fails.
        conn.close()

# Remove the updated value of an ingredient
def remove_updated_value(ingredient_id):
    """Clear the stored ``value_updated`` of one ingredient.

    Sets the ``value_updated`` column to ``NULL`` for the ``ingredients``
    rows whose ``pcpc_ingredientid`` equals ``ingredient_id`` and commits
    the change.

    Args:
        ingredient_id: Value matched against ``pcpc_ingredientid``.
    """
    query = """
    UPDATE ingredients
    SET value_updated = NULL
    WHERE pcpc_ingredientid = ?
    """
    conn = get_db_connection()
    try:
        cursor = conn.cursor()
        cursor.execute(query, (ingredient_id,))
        conn.commit()
    finally:
        # Close the connection even when the statement or the commit fails.
        conn.close()


# 9. Mostra valori

##### Funzioni per mostrare i valori estratti e visualizzarli.

In [1]:
# Show the values found together with their surrounding text
def display_values(common_values, pdf_text):
    """Render each value and the text around its occurrences in Streamlit.

    For every value an expander is created; for each occurrence the 20 words
    before ``start_index`` and the 20 words from ``start_index`` onwards are
    shown, with the value highlighted in red between them.

    Args:
        common_values: Iterable of ``(value, occurrences)`` pairs, where each
            occurrence is a 2-tuple whose second item is an index into
            ``pdf_text``.
        pdf_text: Text the occurrence indices refer to.
    """
    import html

    if not common_values:
        st.write("No values found.")
        return

    for value, occurrences in common_values:
        # The markdown below is rendered with unsafe_allow_html=True, so every
        # piece of extracted text is HTML-escaped; otherwise fragments such as
        # "<5 mg/kg" would be parsed as markup.
        highlighted = f"<span style='color:red; font-weight:bold;'>{html.escape(str(value))}</span>"
        with st.expander(f"Value: {value}"):
            for i, (_, start_index) in enumerate(occurrences, 1):
                text_before = [html.escape(word) for word in pdf_text[:start_index].split()[-20:]]
                # NOTE(review): if start_index precedes the value in the text,
                # the value also appears inside this trailing context -
                # confirm that is intended.
                text_after = [html.escape(word) for word in pdf_text[start_index:].split()[:20]]
                surrounding_text = ' '.join(text_before + [highlighted] + text_after)
                st.write(f"Occurrence {i}:")
                st.markdown(f"...{surrounding_text}...", unsafe_allow_html=True)

# Display the stored results of an ingredient with Streamlit
def show_ingredient_values(ingredient_name):
    """Show the stored NOAEL/LD50 values of an ingredient.

    The ingredient is looked up with ``search_ingredient``. For each of the
    ``NOAEL_CIR``, ``LD50_CIR`` and ``LD50_PubChem`` columns that is not
    empty, the JSON content is decoded and every ``(value, context)`` pair is
    written out, with the context inside an expander. A not-found message is
    written when the lookup returns nothing.

    Args:
        ingredient_name: Name (or id) passed to ``search_ingredient``.
    """
    ingredient = search_ingredient(ingredient_name)
    if not ingredient:
        st.write("Ingredient not found.")
        return

    st.markdown(f"<h3>Ingredient: {ingredient['name']}</h3>", unsafe_allow_html=True)

    # (column name, heading) pairs, in display order.
    sections = (
        ('NOAEL_CIR', "NOAEL CIR Values:"),
        ('LD50_CIR', "LD50 CIR Values:"),
        ('LD50_PubChem', "LD50 PubChem Values:"),
    )
    for column, heading in sections:
        stored = ingredient[column]
        if not stored:
            continue
        st.write(heading)
        for value, context in json.loads(stored):
            st.write(f"Value: {value}")
            with st.expander("Context"):
                st.write(context)
