In [1]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


Examine data

In [17]:
# Training split: the columns of interest are 'doc_id' and 'text'
train_csv_path = "antique_train_split.csv"
train_df = pd.read_csv(train_csv_path)
train_df.head()

Unnamed: 0,doc_id,text
0,1711818_11,"During the first part of the war, the M-14 was..."
1,2898971_4,my religion is umm... oh.. ''Moonism'' and you...
2,610031_8,democracy is suposed to be for example a gover...
3,657762_0,I suppose since they've been naming hurricanes...
4,2449050_7,When you live your life in a way that is pleas...


In [18]:
# Peek at the first passage: several sentences are packed into one string
train_df.loc[0, "text"]

'During the first part of the war, the M-14 was standard issue.  The M-14 is a semi-automatic rifle.  It was latter replaced by the M-16.  The M-16 is a much lighter rifle which is capable of operating fully automatic/'

Encode data

In [9]:
# Only the validation split is read here; `train_df` is expected to be in
# memory already from the "Examine data" cell.
valid_csv_path = "antique_valid_split.csv"
valid_df = pd.read_csv(valid_csv_path)

# Pretrained sentence-embedding checkpoint used for every passage
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
model = SentenceTransformer(embedding_model_name)

# Function to encode text in batches
def encode_texts(texts, batch_size=32, show_progress_bar=True):
    """Embed texts with the notebook-level ``model``.

    Parameters
    ----------
    texts : str or iterable of str
        Text(s) to embed. A single string is forwarded unchanged; any other
        iterable (list, tuple, pandas Series, generator, ...) is first
        materialised into a plain list.
    batch_size : int, default 32
        Number of texts handed to the model per forward pass.
    show_progress_bar : bool, default True
        Whether ``model.encode`` should display its batch progress bar.

    Returns
    -------
    Whatever ``model.encode(..., convert_to_numpy=True)`` returns; for a list
    of texts this is a NumPy array with one row per text.
    """
    # Guard the str case explicitly: list("abc") would split it into characters.
    if not isinstance(texts, str):
        texts = list(texts)
    return model.encode(
        texts,
        batch_size=batch_size,
        convert_to_numpy=True,
        show_progress_bar=show_progress_bar,
    )

# Embed every passage of both splits (training first, then validation)
train_embeddings = encode_texts(list(train_df["text"]))
valid_embeddings = encode_texts(list(valid_df["text"]))

# Persist each embedding matrix to its own .npy file
embeddings_by_file = {
    "antique_train_embeddings.npy": train_embeddings,
    "antique_valid_embeddings.npy": valid_embeddings,
}
for npy_path, embedding_matrix in embeddings_by_file.items():
    np.save(npy_path, embedding_matrix)

# Report the resulting array shapes as a quick sanity check
print(f"Train embeddings shape: {train_embeddings.shape}")
print(f"Validation embeddings shape: {valid_embeddings.shape}")

Batches: 100%|██████████| 204/204 [00:03<00:00, 58.70it/s] 
Batches: 100%|██████████| 69/69 [00:01<00:00, 65.26it/s]


Train embeddings shape: (6500, 384)
Validation embeddings shape: (2200, 384)


Save data

In [None]:
# Re-read the raw text splits and the saved embedding matrices from disk, so
# this cell does not rely on variables left over from earlier cells.
train_df, valid_df = (
    pd.read_csv(csv_path)
    for csv_path in ("antique_train_split.csv", "antique_valid_split.csv")
)
train_embeddings, valid_embeddings = (
    np.load(npy_path)
    for npy_path in ("antique_train_embeddings.npy", "antique_valid_embeddings.npy")
)

# Attach each row's full embedding vector as one list-valued cell
train_df = train_df.assign(embedding=train_embeddings.tolist())
valid_df = valid_df.assign(embedding=valid_embeddings.tolist())

# Write the combined frames without the index column.
# NOTE: CSV has no list type, so the 'embedding' column is stored as text and
# has to be parsed back into numbers after reloading.
train_df.to_csv("antique_train_with_embeddings.csv", index=False)
valid_df.to_csv("antique_valid_with_embeddings.csv", index=False)

# Parquet alternative (currently disabled):
# train_df.to_parquet("antique_train_with_embeddings.parquet", index=False)
# valid_df.to_parquet("antique_valid_with_embeddings.parquet", index=False)

View data

In [21]:
# Show the first rows of each combined frame under its own heading
for heading, combined_frame in (
    ("Train Combined DataFrame:", train_df),
    ("\nValidation Combined DataFrame:", valid_df),
):
    print(heading)
    display(combined_frame.head())

Train Combined DataFrame:


Unnamed: 0,doc_id,text,embedding
0,1711818_11,"During the first part of the war, the M-14 was...","[-0.012949175201356411, 0.11205881834030151, 0..."
1,2898971_4,my religion is umm... oh.. ''Moonism'' and you...,"[0.01773860864341259, -0.01257272157818079, 0...."
2,610031_8,democracy is suposed to be for example a gover...,"[-0.014321690425276756, -0.04826303571462631, ..."
3,657762_0,I suppose since they've been naming hurricanes...,"[-0.04391418769955635, -0.016200868412852287, ..."
4,2449050_7,When you live your life in a way that is pleas...,"[-0.017229679971933365, 0.06264427304267883, -..."



Validation Combined DataFrame:


Unnamed: 0,doc_id,text,embedding
0,2904891_8,IN ANGER HUMANS RESPOND BY DOING TO OTHER PEOP...,"[0.06494822353124619, 0.0025993359740823507, 0..."
1,2254222_3,It is not correct to suppose that nothing is t...,"[-0.04835139587521553, 0.018058868125081062, -..."
2,4047817_4,"Cause puppies are adorable, fun to watch them ...","[-0.03127238526940346, -0.008739751763641834, ..."
3,2101776_1,blow your nose,"[0.05105193331837654, 0.004676510114222765, 0...."
4,2775108_4,Wht model of ipod do u have ??,"[-0.0021119285374879837, 0.024135807529091835,..."
