# Project authors: Cezary Suchorski, Michał Żarnowski

Graph dataset source:

In [4]:
from datasets import load_dataset

ds = load_dataset("stanfordnlp/imdb")
train_reviews = ds['train']['text']
train_labels = ds['train']['label']
test_reviews = ds['test']['text']
test_labels = ds['test']['label']

In [11]:
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download("punkt_tab") 
# nltk.download("stopwords")
# nltk.download("wordnet")

def preprocess_text(text):
    text = text.lower()
    text = text.translate(str.maketrans("", "", string.punctuation))
    tokens = nltk.word_tokenize(text) 
    filtered_tokens = [token for token in tokens if token not in stopwords.words('english')]
    lemmatizer = WordNetLemmatizer() 
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
    processed_text = ' '.join(lemmatized_tokens)
    
    return processed_text


[nltk_data] Downloading package punkt_tab to /home/cezary/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [12]:
train_reviews_clean = [preprocess_text(review) for review in train_reviews]

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(train_reviews_clean)

In [25]:
import umap
reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, metric='cosine', random_state=42)
reduced = reducer.fit_transform(X)



'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [26]:
import plotly.express as px
import pandas as pd

# Assume `reduced` is your 2D UMAP output (shape: [num_docs, 2])
# and `labels` is a list or array of sentiment labels (e.g., 0/1 or 'positive'/'negative')

df = pd.DataFrame({
    'x': reduced[:, 0],
    'y': reduced[:, 1],
    'sentiment': train_labels  # Replace with your actual sentiment labels
})

fig = px.scatter(
    df,
    x='x',
    y='y',
    color='sentiment',
    title='UMAP Projection of TF-IDF Embeddings',
    labels={'x': 'UMAP-1', 'y': 'UMAP-2'},
    opacity=0.7
)

fig.show()



In [84]:
from sentence_transformers import SentenceTransformer
# Load model directly
model = SentenceTransformer("all-mpnet-base-v2")
clean_embeddings = model.encode(train_reviews_clean, batch_size=32, show_progress_bar=True)
# train_embeddings = model.encode(train_reviews, batch_size=32, show_progress_bar=True)
# test_embeddings = model.encode(test_reviews, batch_size=32, show_progress_bar=True)



modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/782 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
import numpy as np
np.save('train_embeddings.npy', train_embeddings)
np.save('test_embeddings.npy', test_embeddings)
np.save('clean_embeddings.npy', clean_embeddings)

In [85]:
embeddings = np.load("embeddings.npy")

In [16]:
from numpy.typing import NDArray
import pandas as pd
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from umap import UMAP
from sklearn.cluster import KMeans
import plotly.io as pio
pio.renderers.default = "browser"

In [23]:
def project_vectors(data: NDArray, technique: str = "tsne", **options) -> NDArray:
    if technique == "pca":
        transformer = PCA(**options)
    elif technique == "tsne":
        transformer = TSNE(**options)
    elif technique == "umap":
        transformer = UMAP(**options)
    else:
        raise ValueError(
            f"Invalid technique: {technique}. Choose from 'pca', 'tsne', or 'umap'."
        )

    transformed_data = transformer.fit_transform(data)
    return transformed_data

In [86]:
tsne_embeddings = project_vectors(embeddings, technique='tsne', n_components=2, random_state=0, perplexity=5)


In [None]:
pca_embeddings = project_vectors(embeddings, technique='pca', n_components=2, random_state=2)

In [None]:

umap_embeddings = project_vectors(embeddings, technique='umap', n_components=2, random_state=2)


'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [87]:
df = pd.DataFrame(tsne_embeddings, columns=["x", "y"])
df["review"] = train_reviews
df["sentiment"] = train_labels
df

Unnamed: 0,x,y,review,sentiment
0,-13.597942,55.299484,I rented I AM CURIOUS-YELLOW from my video sto...,0
1,-13.731962,55.250561,"""I Am Curious: Yellow"" is a risible and preten...",0
2,-14.545835,-17.440258,If only to avoid making this type of film in t...,0
3,18.703844,61.282150,This film was probably inspired by Godard's Ma...,0
4,14.880387,13.824750,"Oh, brother...after hearing about this ridicul...",0
...,...,...,...,...
24995,39.342255,-2.794872,A hit at the time but now better categorised a...,1
24996,39.420589,-2.893723,I love this movie like no other. Another time ...,1
24997,39.428230,-2.813113,This film and it's sequel Barry Mckenzie holds...,1
24998,39.578941,-3.195409,'The Adventures Of Barry McKenzie' started lif...,1


In [88]:
fig = px.scatter(df, x="x", y="y", color = "sentiment",hover_data=["review"]) 
fig.update_traces(marker=dict(size=6, opacity=0.7))
fig.update_layout(template="plotly")

fig.show()

In [77]:
from sklearn.neighbors import NearestNeighbors
def estimate_dbscan_eps(embeddings: NDArray, n_samples: int = 1000, k: int = 5, quantile: float = 0.1) -> float:
    """
    Estimates a suitable eps parameter for DBSCAN based on k-distance graph.
    
    Parameters:
    -----------
    embeddings : numpy.ndarray
        The embeddings to analyze, shape (n_samples, n_features)
    n_samples : int
        Number of samples to use for estimation (to speed up computation)
    k : int
        Number of neighbors to consider
    quantile : float
        Quantile to use for selecting the eps value (lower means tighter clusters)
    plot : bool
        Whether to generate and display a k-distance plot
        
    Returns:
    --------
    eps : float
        Estimated eps value for DBSCAN
    """
    if embeddings.shape[0] > n_samples:
        indices = np.random.choice(embeddings.shape[0], n_samples, replace=False)
        sample_data = embeddings[indices]
    else:
        sample_data = embeddings

    nbrs = NearestNeighbors(n_neighbors=k + 1).fit(sample_data)
    distances, _ = nbrs.kneighbors(sample_data)

    kdistances = np.sort(distances[:, k])

    eps = np.quantile(kdistances, quantile)

    print(f"Estimated eps value: {eps}")
    return eps

In [79]:
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler

# Normalize for DBSCAN
X_scaled = StandardScaler().fit_transform(clean_embeddings)

eps = estimate_dbscan_eps(clean_embeddings)
# Run DBSCAN
dbscan = DBSCAN(eps=eps, min_samples=5)
db_labels = dbscan.fit_predict(X_scaled)

df["dbscan_cluster"] = db_labels



Estimated eps value: 0.8947702705860138


In [80]:
from sklearn.metrics import adjusted_rand_score

score = adjusted_rand_score(df["sentiment"], df["dbscan_cluster"])
print(f"Adjusted Rand Index (ARI): {score:.2f}")


Adjusted Rand Index (ARI): 0.00
