# Web Classification Demo
Questo notebook permette di passare una lista di siti web e una lista di keyword.
Per ciascun sito:
- scarica il testo della pagina (senza rendering JS)
- calcola uno score basato su keyword
- calcola la similarità semantica con embeddings (SentenceTransformers)
- produce un punteggio finale e una classificazione (positivo/incerto/negativo)

In [1]:
import re, math, requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import warnings
# NOTE(review): this silences every warning category for the rest of the
# session, including deprecation warnings from pandas and other libraries.
warnings.filterwarnings("ignore")

# Embedding model configuration: the multilingual sentence-embedding model
# passed to the scoring functions below.
model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

## Parametri di input
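Gli URL e le keyword vengono letti dal file `./data/Template.xlsx`: il foglio `websites` (colonna `url`) contiene i siti da analizzare, il foglio `keywords` (colonna `keywords`) contiene le parole chiave da cercare.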

In [31]:
# Workbook that holds both input sheets.
TEMPLATE_PATH = "./data/Template.xlsx"

# Read both sheets in one call; with a list of sheet names read_excel returns
# a dict keyed by sheet name.
_sheets = pd.read_excel(TEMPLATE_PATH, sheet_name=["websites", "keywords"])
df_url = _sheets["websites"]
df_keywords = _sheets["keywords"]

# Keyword list (customizable). Blank cells are dropped so that NaN values do
# not crash on .lower(); duplicates are removed while keeping spreadsheet
# order, so the generated flag_* columns come out in the same order on every
# run (a set would not guarantee that across kernel sessions).
keywords = list(dict.fromkeys(
    kw
    for kw in (str(raw).strip().lower() for raw in df_keywords["keywords"].dropna())
    if kw
))

# Weights used to combine the two scores into the final score
ALPHA_SEMANTIC = 0.7  # weight of the semantic-similarity component
ALPHA_KEYWORD = 0.3   # weight of the keyword-presence component

## Funzioni di supporto

In [25]:
def fetch_text(url: str, timeout=10) -> str:
    """Download a page and return its visible text as a single line.

    The page is fetched with a plain HTTP GET (no JavaScript rendering).
    Script, style and noscript elements are removed, then the text of the
    title, h1-h3, p and li elements is joined and whitespace-normalized.

    Args:
        url: Address of the page to download.
        timeout: Timeout forwarded to ``requests.get``.

    Returns:
        The extracted text, or ``""`` when the response status is >= 400 or
        when any exception is raised while fetching or parsing (best effort:
        a failing site must not stop the whole batch).
    """
    wanted = ["title", "h1", "h2", "h3", "p", "li"]
    try:
        r = requests.get(url, timeout=timeout, headers={"User-Agent": "Mozilla/5.0"})
        if r.status_code >= 400:
            return ""
        soup = BeautifulSoup(r.text, "html.parser")
        for tag in soup(["script", "style", "noscript"]):
            tag.decompose()
        # An element nested inside another selected element (e.g. a <p> inside
        # an <li>) is already covered by its ancestor's get_text(); emitting it
        # again would duplicate the text and inflate the keyword counts.
        pieces = [
            el.get_text(" ", strip=True)
            for el in soup.find_all(wanted)
            if el.find_parent(wanted) is None
        ]
        return re.sub(r"\s+", " ", " ".join(pieces)).strip()
    except Exception:
        return ""

def keyword_score(text: str, keywords: set, scale: float = 5.0) -> float:
    r"""Score a text in [0, 1) from the number of whole-word keyword hits.

    Matching is case-insensitive (text and keywords are both lower-cased) and
    uses ``\b`` word boundaries, so ``yacht`` does not match ``yachting``.

    Args:
        text: Text to search; an empty or missing text scores 0.0.
        keywords: Iterable of keyword strings.
        scale: Saturation constant; the score is ``1 - exp(-hits / scale)``.
            Must be non-zero.

    Returns:
        0.0 when there are no hits, approaching 1.0 as hits grow.
    """
    if not text:
        return 0.0
    haystack = text.lower()
    # The boundary must be written r"\b": r"\\b" is an escaped backslash
    # followed by a literal "b" and never matches a word boundary.
    hits = sum(
        len(re.findall(r"\b" + re.escape(k.lower()) + r"\b", haystack))
        for k in keywords
    )
    return 1.0 - math.exp(-hits / scale)


def keyword_matches(text: str, keywords) -> dict:
    r"""Count whole-word occurrences of every keyword in a text.

    Both the text and the keywords are lower-cased before matching, and each
    keyword is wrapped in ``\b ... \b`` so only whole-word matches count.

    Returns:
        A dict mapping each lower-cased keyword to its number of matches;
        every count is 0 when the text is empty.
    """
    normalized = [kw.lower() for kw in keywords]
    if not text:
        return dict.fromkeys(normalized, 0)
    haystack = text.lower()
    return {
        kw: len(re.findall(r"\b" + re.escape(kw) + r"\b", haystack))
        for kw in normalized
    }

def keyword_score_from_counts(counts: dict, scale: float = 5.0) -> float:
    """Turn per-keyword match counts into a saturating score in [0, 1).

    Args:
        counts: Mapping of keyword to number of matches.
        scale: Saturation constant; the score is ``1 - exp(-hits / scale)``
            where ``hits`` is the sum of all counts. Must be non-zero.

    Returns:
        0.0 when there are no matches, approaching 1.0 as matches grow.
    """
    hits = sum(counts.values())
    return 1.0 - math.exp(-hits / scale)
    

def semantic_score(text: str, topic_phrases, model) -> float:
    """Return the highest similarity between a text and a set of topic phrases.

    The text and all phrases are encoded in one ``model.encode`` call with
    ``normalize_embeddings=True``; the score is the largest dot product
    between the text vector and any phrase vector.

    Args:
        text: Text to compare; empty, missing or whitespace-only text scores 0.0.
        topic_phrases: Iterable of phrases (list, set, tuple, ...).
        model: Object exposing ``encode(sentences, normalize_embeddings=...,
            show_progress_bar=...)`` and returning one vector per sentence.

    Returns:
        The maximum similarity as a float, or 0.0 when there is no text or
        no phrase to compare against.
    """
    if not text or not text.strip():
        return 0.0
    # Materialize so sets and tuples can be concatenated with the text.
    phrases = list(topic_phrases)
    if not phrases:
        # np.max raises on an empty array; no phrases means no similarity.
        return 0.0
    # NOTE(review): embedding models usually truncate long inputs, so only the
    # start of a long page may contribute to the score. Confirm for this model.
    vecs = model.encode([text] + phrases, normalize_embeddings=True, show_progress_bar=False)
    sims = vecs[1:] @ vecs[0]
    return float(np.max(sims))

def classify_url(url: str, keywords, model):
    """Fetch a page and score it against the keywords.

    Combines the keyword score and the semantic score using the module-level
    weights ``ALPHA_KEYWORD`` and ``ALPHA_SEMANTIC``.

    Args:
        url: Address of the page to classify.
        keywords: Iterable of keyword strings.
        model: Embedding model forwarded to ``semantic_score``.

    Returns:
        A dict with the keys ``url``, ``text``, ``semantic_score``,
        ``keyword_score`` and ``final_score`` (scores rounded to 3 decimals),
        followed by one ``flag_<keyword>`` entry per keyword (spaces replaced
        by underscores) set to 1 when the keyword occurs in the text, else 0.
    """
    # Materialize once so a one-shot iterable is not exhausted by the first
    # consumer.
    keywords = list(keywords)
    text = fetch_text(url)

    # Keyword counts and keyword score
    kmatch = keyword_matches(text, keywords)
    kscore = keyword_score_from_counts(kmatch)

    # Embedding score
    sscore = semantic_score(text, keywords, model)

    # Weighted combination of the two scores
    final = ALPHA_SEMANTIC * sscore + ALPHA_KEYWORD * kscore

    result = {
        "url": url,
        "text": text,
        "semantic_score": round(sscore, 3),
        "keyword_score": round(kscore, 3),
        "final_score": round(final, 3),
    }
    # One flag per keyword. kmatch is already keyed by the lower-cased
    # keyword. The key is built by concatenation because reusing double
    # quotes inside a double-quoted f-string is a syntax error before
    # Python 3.12.
    for kw, count in kmatch.items():
        result["flag_" + kw.replace(" ", "_")] = 1 if count > 0 else 0

    return result

## Esecuzione classificazione

In [39]:
%%time

for i, row in tqdm(df_url.iterrows()):
    url = row["url"]
    res = classify_url(url, keywords, model)

    df_url.at[i, "text"] = res["text"]
    df_url.at[i, "keyword_score"] = res["keyword_score"]
    df_url.at[i, "semantic_score"] = res["semantic_score"]
    df_url.at[i, "final_score"] = res["final_score"]

    # se classify_url ritorna anche flag_* o count_*, aggiungili automaticamente
    for k, v in res.items():
        if k not in df_url.columns:
            df_url[k] = np.nan
        df_url.at[i, k] = v

#df = pd.DataFrame(results)
df_url

3it [00:02,  1.49it/s]

CPU times: total: 3.28 s
Wall time: 2.02 s





Unnamed: 0,id,url,text,keyword_score,semantic_score,final_score,flag_nautica,flag_yacht,flag_barche_a_vela,flag_imbarcazioni,flag_cantieri_navali,flag_navi
0,1,https://www.cantierinavali.it,Cantieri Navali Chioggia Home Il cantiere Serv...,0.551,0.671,0.635,0.0,1.0,0.0,0.0,1.0,1.0
1,2,https://www.nautica.it,"Nautica On Line - Barche nuove, usate, gommoni...",0.998,0.61,0.726,1.0,1.0,1.0,1.0,0.0,1.0
2,3,https://www.studilegali.com/,Avvocati - StudiLegali.com Home StudiLegali.co...,0.0,0.229,0.16,0.0,0.0,0.0,0.0,0.0,0.0


## Salvataggio in CSV

In [None]:
# The classified table lives in df_url; no variable named `df` is ever
# defined in this notebook, so writing `df` raised a NameError.
df_url.to_csv("classification_results.csv", index=False)
print("Risultati salvati in classification_results.csv")