# Project authors: Cezary Suchorski, Michał Żarnowski

Graph dataset source:

In [8]:
from datasets import load_dataset

ds = load_dataset("stanfordnlp/imdb")
train_reviews = ds['train']['text']
train_labels = ds['train']['label']
test_reviews = ds['test']['text']
test_labels = ds['test']['label']

In [55]:
import re
import string

def preprocess_text(text):
    # 1. Lowercase the text
    text = text.lower()
    
    # 2. Remove HTML tags
    text = re.sub(r"<.*?>", "", text)
    
    # 3. Remove URLs
    text = re.sub(r"http\S+|www\S+", "", text)
    
    # 4. Remove punctuation (except emoticons or domain-specific things if needed)
    text = text.translate(str.maketrans("", "", string.punctuation))
    
    # 5. Remove numbers (optional)
    text = re.sub(r"\d+", "", text)
    
    # 6. Remove extra whitespace
    text = re.sub(r"\s+", " ", text).strip()
    
    return text


In [None]:
train_reviews_clean = [preprocess_text(review) for review in train_reviews]

In [None]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("all-MiniLM-L6-v2")
clean_embeddings = model.encode(train_reviews_clean, batch_size=32, show_progress_bar=True)
train_embeddings = model.encode(train_reviews, batch_size=32, show_progress_bar=True)
test_embeddings = model.encode(test_reviews, batch_size=32, show_progress_bar=True)



Batches:   0%|          | 0/782 [00:00<?, ?it/s]

In [None]:
import numpy as np
np.save('train_embeddings.npy', train_embeddings)
np.save('test_embeddings.npy', test_embeddings)
np.save('clean_embeddings.npy', clean_embeddings)

In [29]:
from numpy.typing import NDArray
import pandas as pd
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from umap import UMAP
from sklearn.cluster import KMeans
import plotly.io as pio
pio.renderers.default = "browser"

In [13]:
def project_vectors(data: NDArray, technique: str = "tsne", **options) -> NDArray:
    if technique == "pca":
        transformer = PCA(**options)
    elif technique == "tsne":
        transformer = TSNE(**options)
    elif technique == "umap":
        transformer = UMAP(**options)
    else:
        raise ValueError(
            f"Invalid technique: {technique}. Choose from 'pca', 'tsne', or 'umap'."
        )

    transformed_data = transformer.fit_transform(data)
    return transformed_data

In [None]:
tsne_embeddings = project_vectors(clean_embeddings, technique='tsne', n_components=2, random_state=0, perplexity=5)


In [None]:
pca_embeddings = project_vectors(clean_embeddings, technique='pca', n_components=2, random_state=2)

In [None]:

umap_embeddings = project_vectors(clean_embeddings, technique='umap', n_components=2, random_state=2)

In [None]:
tsne_df = pd.DataFrame(tsne_embeddings, columns=["x", "y"])
tsne_df["review"] = train_reviews_clean
tsne_df["sentiment"] = train_labels
tsne_df

Unnamed: 0,x,y,review,sentiment
0,-35.365406,16.689461,i rented i am curiousyellow from my video stor...,0
1,-35.997746,16.456532,i am curious yellow is a risible and pretentio...,0
2,-26.846764,-10.993340,if only to avoid making this type of film in t...,0
3,-36.712471,15.454796,this film was probably inspired by godards mas...,0
4,-35.809547,18.088961,oh brotherafter hearing about this ridiculous ...,0
...,...,...,...,...
24995,71.471939,-37.167122,a hit at the time but now better categorised a...,1
24996,71.538696,-37.048618,i love this movie like no other another time i...,1
24997,71.550247,-36.977451,this film and its sequel barry mckenzie holds ...,1
24998,71.644272,-36.552311,the adventures of barry mckenzie started life ...,1


In [68]:
fig = px.scatter(tsne_df, x="x", y="y", color = "sentiment",hover_data=["review"]) 
fig.update_traces(marker=dict(size=6, opacity=0.7))
fig.update_layout(template="plotly")

fig.show()