## Using Word Embeddings

In the previous sections, we have built a Word2Vec model from scratch using NumPy. Now, let's see how we can use the learned word embeddings for various NLP tasks with the focus on sentiment analysis.


In [2]:
%pip install gensim

import gensim.downloader as api

# Download the pretrained Word2Vec vectors (GoogleNews-vectors-negative300)
word2vec_model = api.load('word2vec-google-news-300')


Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd

df = pd.read_csv("https://github.com/febse/data/raw/refs/heads/main/ta/IMDB-Dataset-5000.csv.zip")
df.head()

Unnamed: 0,review,sentiment
0,I really liked this Summerslam due to the look...,positive
1,Not many television shows appeal to quite as m...,positive
2,The film quickly gets to a major chase scene w...,negative
3,Jane Austen would definitely approve of this o...,positive
4,Expectations were somewhat high for me when I ...,negative


In [4]:
import numpy as np

def document_embedding(text, model):
    words = [w for w in text.split() if w in model]
    if words:
        return np.mean([model[w] for w in words], axis=0)
    else:
        return np.zeros(model.vector_size)

df['avg_word'] = df['review'].apply(lambda x: document_embedding(x, word2vec_model))

In [5]:
df.head()

Unnamed: 0,review,sentiment,avg_word
0,I really liked this Summerslam due to the look...,positive,"[0.026841605, 0.044992644, 0.017120501, 0.0777..."
1,Not many television shows appeal to quite as m...,positive,"[0.044581123, 0.025324179, 0.020859266, 0.0939..."
2,The film quickly gets to a major chase scene w...,negative,"[0.04855869, 0.031639244, 0.0031714232, 0.0996..."
3,Jane Austen would definitely approve of this o...,positive,"[0.03325866, 0.024321612, -0.0009411083, 0.060..."
4,Expectations were somewhat high for me when I ...,negative,"[0.056710232, 0.029389769, 0.031232158, 0.0919..."


In [6]:
from sklearn.model_selection import train_test_split

# Convert avg_word column to numpy array
X = np.vstack(df['avg_word'].values)
y = df['sentiment'].map({'positive': 1, 'negative': 0}).values

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
X.shape

(5000, 300)

In [8]:
X[:5]

array([[ 0.02684161,  0.04499264,  0.0171205 , ..., -0.06149487,
         0.02433256, -0.02856859],
       [ 0.04458112,  0.02532418,  0.02085927, ..., -0.01347773,
         0.03515555, -0.0291379 ],
       [ 0.04855869,  0.03163924,  0.00317142, ..., -0.05177125,
         0.04260951, -0.02817884],
       [ 0.03325866,  0.02432161, -0.00094111, ..., -0.06046228,
         0.06452684, -0.00596103],
       [ 0.05671023,  0.02938977,  0.03123216, ..., -0.03327884,
         0.05188526, -0.03141383]], shape=(5, 300), dtype=float32)

In [9]:
y_train[:5]

array([0, 1, 1, 0, 1])

## Getting Document Vectors

![Large Language Models](https://bea.stollnitz.com/images/gpt-transformer/3-transformer.png)

In [17]:
from openai import OpenAI

# client = OpenAI(api_key="YOUR_OPENAI_API_KEY")

# def get_openai_embedding(text, model="text-embedding-3-small"):
#     """
#     Get embedding for a text using OpenAI's API (v1.0+ compatible).
    
#     Parameters:
#     text (str): The text to embed
#     model (str): The embedding model to use (default: "text-embedding-3-small")
    
#     Returns:
#     list: The embedding vector
#     """
#     response = client.embeddings.create(
#         model=model,
#         input=text
#     )
#     return response.data[0].embedding

# # Download embeddings for each review (this may take time and cost money)
# # Uncomment the line below to run (be aware of API costs)

# get_openai_embedding("Hello, world!")

In [20]:
import requests

def get_ollama_embedding(text, model="llama3.2:latest"):
    """
    Get embedding for a text using Ollama's local API.

    Parameters:
    text (str): The text to embed
    model (str): The embedding model to use (default: "nomic-embed-text")

    Returns:
    list: The embedding vector
    """
    url = "http://localhost:11434/api/embeddings"
    payload = {
        "model": model,
        "prompt": text
    }
    response = requests.post(url, json=payload)
    response.raise_for_status()
    return response.json()["embedding"]

# Example usage:
len(get_ollama_embedding("Hello, world!"))

3072