In [28]:
from pathlib import Path
from typing import Dict, List, Optional
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import download as nltk_download
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA

In [None]:
nltk_download("stopwords")
nltk_download("wordnet")
nltk_download("omw-1.4")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/gustavoroos/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/gustavoroos/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/gustavoroos/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [None]:
ITEMS_PATH = "../data/items.csv"

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

def clean_text(text: str) -> str:
    tokens = [word.lower() for word in text.split()]
    lemmatized = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    if not lemmatized:
        lemmatized = tokens
    return " ".join(lemmatized)

items_df = (
    pd.read_csv(ITEMS_PATH, encoding="utf-8-sig")
    .rename(columns={"Item": "item_id", "Title": "title", "Descriptions": "description"})
)
items_df["description"] = items_df["description"].fillna("")
items_df["title"] = items_df["title"].fillna("")
items_df["item_id"] = items_df["item_id"].astype(int)
items_df["text"] = (items_df["title"] + " " + items_df["description"]).str.strip()
items_df.loc[items_df["text"].eq(""), "text"] = items_df["title"]
items_df["clean_text"] = items_df["text"].apply(clean_text)

model_name = "sentence-transformers/all-MiniLM-L6-v2"
model = SentenceTransformer(model_name)
embeddings = model.encode(
    items_df["clean_text"].tolist(),
    batch_size=4,
    normalize_embeddings=True,
    convert_to_numpy=True,
    show_progress_bar=True,
).astype(np.float32)

items_df.head()

Batches:   0%|          | 0/18 [00:00<?, ?it/s]

Unnamed: 0,item_id,title,URL,description,text,clean_text
0,1,E-Commerce,https://en.wikipedia.org/wiki/Online_shopping,,E-Commerce,e-commerce
1,2,News Website,https://en.wikipedia.org/wiki/Online_newspaper,,News Website,news website
2,3,Flight ticket purchase system,https://en.wikipedia.org/wiki/Airline_reservat...,,Flight ticket purchase system,flight ticket purchase system
3,4,Inventory management system,https://en.wikipedia.org/wiki/Inventory_manage...,,Inventory management system,inventory management system
4,5,Library (Book Rental and Return) System,https://en.wikipedia.org/wiki/Digital_library,,Library (Book Rental and Return) System,library (book rental return) system


In [31]:
similarity_matrix = embeddings @ embeddings.T
np.clip(similarity_matrix, -1.0, 1.0, out=similarity_matrix)
np.fill_diagonal(similarity_matrix, 1.0)
distance_matrix = 1.0 - similarity_matrix

def top_k_neighbors(sim_matrix: np.ndarray, k: int = 3) -> tuple[Dict[int, List[int]], Dict[int, List[float]]]:
    neighbors: Dict[int, List[int]] = {}
    neighbor_distances: Dict[int, List[float]] = {}
    total_items = sim_matrix.shape[0]
    for idx in range(total_items):
        scores = sim_matrix[idx]
        candidate_count = min(k + 1, total_items)
        candidate_idx = np.argpartition(scores, -candidate_count)[-candidate_count:]
        sorted_idx = candidate_idx[np.argsort(scores[candidate_idx])[::-1]]

        selected: List[int] = []
        distances: List[float] = []
        for candidate in sorted_idx:
            if candidate == idx:
                continue
            selected.append(candidate)
            distances.append(1.0 - scores[candidate])
            if len(selected) == k:
                break

        neighbors[idx] = selected
        neighbor_distances[idx] = distances

    return neighbors, neighbor_distances

neighbor_indices, neighbor_distances = top_k_neighbors(similarity_matrix, k=3)
neighbor_indices[0], neighbor_distances[0]

([np.int64(6), np.int64(9), np.int64(1)],
 [np.float32(0.64492446), np.float32(0.79804325), np.float32(0.8312361)])

In [32]:
pca = PCA(n_components=2, random_state=42)
coords = pca.fit_transform(embeddings)

viz_df = items_df.copy()
viz_df["x"] = coords[:, 0]
viz_df["y"] = coords[:, 1]

def build_neighbor_summary(idx: int) -> str:
    lines: List[str] = []
    for neighbor_idx, dist in zip(neighbor_indices[idx], neighbor_distances[idx]):
        neighbor = items_df.iloc[neighbor_idx]
        label = neighbor["title"] if neighbor["title"] else f"Item {int(neighbor['item_id'])}"
        lines.append(f"{label} — dist={dist:.3f}")
    return "<br>".join(lines) if lines else "Nenhum vizinho disponível"

customdata = np.array(
    [
        [
            row["title"] if row["title"] else f"Item {int(row['item_id'])}",
            int(row["item_id"]),
            build_neighbor_summary(idx),
        ]
        for idx, row in viz_df.iterrows()
    ],
    dtype=object,
)

default_color = "#636EFA"
selected_color = "#000000"
neighbor_click_color = "#EF553B"
base_colors = np.full(len(viz_df), default_color, dtype=object)

fig = go.FigureWidget(
    data=[
        go.Scatter(
            x=viz_df["x"],
            y=viz_df["y"],
            mode="markers",
            marker=dict(
                size=10,
                color=base_colors.tolist(),
                line=dict(color="#FFFFFF", width=0.5),
            ),
            customdata=customdata,
            hovertemplate=("<b>%{customdata[0]}</b><br>ID: %{customdata[1]}<br><br>"
                           "Top 3 vizinhos:<br>%{customdata[2]}<extra></extra>"),
        )
    ]
)
fig.update_layout(
    title="Mapa Interativo de Similaridade entre Itens",
    xaxis_title="Componente Principal 1",
    yaxis_title="Componente Principal 2",
    template="plotly_white",
    dragmode="lasso",
)

scatter = fig.data[0]
default_colors = base_colors.copy()
selected_index: Optional[int] = None

def apply_selection(selection_idx: Optional[int]) -> None:
    colors = default_colors.copy()
    if selection_idx is not None:
        colors[selection_idx] = selected_color
        for neighbor_idx in neighbor_indices.get(selection_idx, []):
            colors[neighbor_idx] = neighbor_click_color
    scatter.marker.color = colors.tolist()

def handle_click(trace, points, state):
    global selected_index
    if points.point_inds:
        idx = points.point_inds[0]
        selected_index = None if selected_index == idx else idx
    else:
        selected_index = None
    apply_selection(selected_index)

scatter.on_click(handle_click)
apply_selection(selected_index)

In [33]:
output_path = Path("item_similarity_map.html")
fig.write_html(output_path, include_plotlyjs="cdn")
fig

FigureWidget({
    'data': [{'customdata': array([['E-Commerce ', 1,
                                    'Course-Enrollment System — dist=0.645<br>Social Networks — dist=0.798<br>News Website — dist=0.831'],
                                   ['News Website', 2,
                                    'Getting Real about Fake News — dist=0.562<br>Congress Trump Score — dist=0.675<br>Social Networks — dist=0.695'],
                                   ['Flight ticket purchase system', 3,
                                    'Inventory management system — dist=0.563<br>Food/Dishes Order System — dist=0.564<br>Hotels/Room Reservation System — dist=0.567'],
                                   ...,
                                   ['NBA Free Throws', 68,
                                    'Football Events — dist=0.503<br>Data Stories of US Airlines; 1987-2008 — dist=0.600<br>Video Game Sales — dist=0.601'],
                                   ['Airlines Delay', 69,
                               