In [18]:
import pandas as pd

# Tokenizing with scikit-learn

In [19]:
# Toy corpus: four short documents, one per row, held in a single
# 'sentence' column.
speeches = pd.DataFrame({
    'sentence': [
        "Foxes are the most majestic animal. Very few animals can eat foxes",
        "Foxes live in the praries of England. Sometimes foxes get into people's back yards.",
        "The foxes love to eat meat. If a fox smells meat, it will eat meat.",
        "Ducks are nice animals too. Ducks eat bread",
    ],
})
speeches

Unnamed: 0,sentence
0,Foxes are the most majestic animal. Very few a...
1,Foxes live in the praries of England. Sometime...
2,The foxes love to eat meat. If a fox smells me...
3,Ducks are nice animals too. Ducks eat bread


In [20]:
from sklearn.feature_extraction.text import CountVectorizer

## YOU CAN EDIT THESE
# These three knobs are forwarded unchanged to the vectorizer below.
BINARY = False          # forwarded as `binary`
NGRAM_RANGE = (1, 1)    # forwarded as `ngram_range`; (1, 1) keeps single words only
MIN_DF = 0.0            # forwarded as `min_df`

# Bag-of-words vectorizer configured from the constants above.
vectorizer = CountVectorizer(
    min_df=MIN_DF,
    binary=BINARY,
    ngram_range=NGRAM_RANGE,
    stop_words='english',  # 'english' if not custom list
)


In [21]:
# Fit the vectorizer on the 'sentence' column and transform it in one step;
# the result is a sparse matrix with one row per sentence.
X = vectorizer.fit_transform(speeches['sentence'])
# Bare expression so the notebook displays the matrix summary.
X

<4x17 sparse matrix of type '<class 'numpy.int64'>'
	with 22 stored elements in Compressed Sparse Row format>

In [22]:
# Densify the sparse matrix and label each column with the feature it counts.
word_vectors = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
# Echo the raw sentences above the table so rows can be compared by eye.
# A plain loop is used because print() is called only for its side effect;
# a list comprehension would build a throwaway list of None values.
for sentence in speeches['sentence']:
    print(sentence)
# Counts are integers, so no rounding is needed before display.
word_vectors

Foxes are the most majestic animal. Very few animals can eat foxes
Foxes live in the praries of England. Sometimes foxes get into people's back yards.
The foxes love to eat meat. If a fox smells meat, it will eat meat.
Ducks are nice animals too. Ducks eat bread


Unnamed: 0,animal,animals,bread,ducks,eat,england,fox,foxes,live,love,majestic,meat,nice,people,praries,smells,yards
0,1,1,0,0,1,0,0,2,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,1,0,2,1,0,0,0,0,1,1,0,1
2,0,0,0,0,2,0,1,1,0,1,0,3,0,0,0,1,0
3,0,1,1,2,1,0,0,0,0,0,0,0,1,0,0,0,0


# TF-IDF

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Same settings as the count-based vectorizer, but using TfidfVectorizer.
# NOTE: this rebinds `vectorizer`, `X` and `word_vectors`, so after this cell
# runs they refer to the TF-IDF versions, not the raw counts.
vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=NGRAM_RANGE,
    binary=BINARY,
    min_df=MIN_DF
)
X = vectorizer.fit_transform(speeches['sentence'])
# Densify the sparse matrix and label each column with its feature name.
word_vectors = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
# Echo the raw sentences above the table. A plain loop is used because
# print() is called only for its side effect; a list comprehension would
# build a throwaway list of None values.
for sentence in speeches['sentence']:
    print(sentence)
# Two decimals keep the weight table readable.
word_vectors.round(2)

Foxes are the most majestic animal. Very few animals can eat foxes
Foxes live in the praries of England. Sometimes foxes get into people's back yards.
The foxes love to eat meat. If a fox smells meat, it will eat meat.
Ducks are nice animals too. Ducks eat bread


Unnamed: 0,animal,animals,bread,ducks,eat,england,fox,foxes,live,love,majestic,meat,nice,people,praries,smells,yards
0,0.46,0.37,0.0,0.0,0.3,0.0,0.0,0.59,0.0,0.0,0.46,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.39,0.0,0.5,0.39,0.0,0.0,0.0,0.0,0.39,0.39,0.0,0.39
2,0.0,0.0,0.0,0.0,0.34,0.0,0.27,0.17,0.0,0.27,0.0,0.8,0.0,0.0,0.0,0.27,0.0
3,0.0,0.3,0.38,0.75,0.24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.38,0.0,0.0,0.0,0.0


# OpenAI Embeddings

In [24]:
from tqdm.notebook import tqdm

from openai import OpenAI
# Module-level client shared by get_embeddings.
# NOTE(review): no credentials are passed here, so the API key presumably
# comes from the environment (e.g. OPENAI_API_KEY) — confirm before running.
client = OpenAI()

def get_embeddings(texts, model="text-embedding-3-large"):
    """Embed one or more texts with the OpenAI embeddings endpoint.

    Args:
        texts: An iterable of strings. A single bare string is treated as a
            one-element batch instead of being iterated character by
            character.
        model: Name of the embedding model passed to the API.

    Returns:
        A list with one embedding per input text, in input order. Empty
        input returns an empty list without making an API request.
    """
    # A lone string is itself iterable; wrap it so it is embedded as one text.
    if isinstance(texts, str):
        texts = [texts]
    # Replace newlines in each text; this also materialises any iterable
    # into a list that can be sent as a single batched request.
    texts = [text.replace("\n", " ") for text in texts]
    # Nothing to embed: skip the request entirely.
    if not texts:
        return []
    # OpenAI's embeddings.create can process multiple inputs as a list.
    response = client.embeddings.create(input=texts, model=model)
    # Each returned item carries the index of the input it belongs to; sort on
    # it so the output order does not depend on the response ordering.
    ordered_items = sorted(response.data, key=lambda item: item.index)
    return [item.embedding for item in ordered_items]

# Function to process DataFrame in batches and return a list of embeddings
def process_in_batches(df, column_name, batch_size=10, model=None):
    """Embed every value of one DataFrame column, sending it in batches.

    Args:
        df: DataFrame holding the texts.
        column_name: Name of the column whose values are embedded.
        batch_size: Maximum number of texts per get_embeddings call. Must be
            at least 1.
        model: Optional embedding model name forwarded to get_embeddings.
            When None, get_embeddings uses its own default model.

    Returns:
        A flat list with one embedding per row of ``df``, in row order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # A negative step would silently produce zero batches and a zero step
    # fails inside range() with an unrelated message, so reject both up front.
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size!r}")

    texts = df[column_name]
    # Slice positionally (iloc) so any index on the DataFrame works.
    batches = [
        texts.iloc[start:start + batch_size]
        for start in range(0, len(texts), batch_size)
    ]
    # Forward the model only when the caller chose one.
    extra_kwargs = {} if model is None else {"model": model}

    all_embeddings = []
    for batch in tqdm(batches, desc="Processing batches"):
        all_embeddings.extend(get_embeddings(batch.tolist(), **extra_kwargs))
    return all_embeddings

# Example usage
# process_in_batches makes one get_embeddings call per batch, so a batch size
# larger than the number of rows results in a single call.
batch_size = 100  # Adjust based on your preference and rate limits
# Adds an 'embedding' column to `speeches` in place (one embedding per row).
speeches['embedding'] = process_in_batches(speeches, 'sentence', batch_size=batch_size)


Processing batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [25]:
# Display the frame again, now with the 'embedding' column next to 'sentence'.
speeches


Unnamed: 0,sentence,embedding
0,Foxes are the most majestic animal. Very few a...,"[-0.004551408812403679, 0.011563858948647976, ..."
1,Foxes live in the praries of England. Sometime...,"[0.004513243213295937, 0.03383251652121544, -0..."
2,The foxes love to eat meat. If a fox smells me...,"[0.0006457185372710228, 0.019046274945139885, ..."
3,Ducks are nice animals too. Ducks eat bread,"[-0.028854859992861748, -0.016620498150587082,..."
