## Train Embedding Model

In [31]:
from tokenizers import Tokenizer, trainers, pre_tokenizers, models
import sentencepiece as spm
from gensim.models import Word2Vec, FastText
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import joblib
import pandas as pd

The cleaned data is loaded and converted into a list of strings to be fed into the tokenizers.

In [32]:
# Load the cleaned dataset and pull the text column out as a list of strings.
df = pd.read_csv('../Datasets/train_cleaned.csv')
# Empty bodies are read back from CSV as NaN (a float), which would crash the
# string-based tokenizers below; coerce them to empty strings instead so the
# corpus stays aligned row-for-row with `df`.
corpus = df['body'].fillna('').astype(str).tolist()  # 'body' is the column containing text data

Each tokenizer converts the body text into tokens, which are then fed into the embedding models for training. Here we load the two previously trained tokenizers (BPE and SentencePiece) and define two simple ones (n-gram and whitespace).

In [33]:
# Load in the pre-trained BPE tokenizer from its serialized JSON file
# (the path is relative to the notebook's working directory)
bpe_tokenizer = Tokenizer.from_file("../Tokenizers/bpe_tokenizer.json")

# Load in the pre-trained SentencePiece tokenizer from its .model file
sp_tokenizer = spm.SentencePieceProcessor(model_file="../Tokenizers/sp_model.model")

# Create a n-gram tokenizer
def generate_ngrams(text, n):
    """Return the word-level n-grams of ``text``.

    The text is split on whitespace, and every run of ``n`` consecutive words
    is joined back together with a single space.

    Args:
        text: The string to break into n-grams.
        n: Number of words per n-gram.

    Returns:
        A list of n-gram strings in order of appearance; empty when the text
        has fewer than ``n`` words.
    """
    words = text.split()
    window_count = len(words) - n + 1
    return [' '.join(words[start:start + n]) for start in range(window_count)]

# Create a whitespace tokenizer
def whitespace_tokenizer(text):
    """Split ``text`` on runs of whitespace and return the resulting list of tokens."""
    pieces = text.split()
    return pieces

Now we can use each tokenizer to train Word2Vec and FastText embedding models, along with TF-IDF and count vectorizer models.

In [38]:
# Generate tokens of the data in many different forms.
# Each list holds one token list per document in `corpus`.
whitespace_tokenized_corpus = [whitespace_tokenizer(text) for text in corpus]
n2gram_tokenized_corpus = [generate_ngrams(text, 2) for text in corpus]
n3gram_tokenized_corpus = [generate_ngrams(text, 3) for text in corpus]
bpe_tokenized_corpus = [bpe_tokenizer.encode(text).tokens for text in corpus]
sp_tokenized_corpus = [sp_tokenizer.encode(text, out_type=str) for text in corpus]

# Combine all the list of lists, so the embedding models see every document
# once per tokenization scheme and learn vectors for all token types.
tokenized_corpus = (
    whitespace_tokenized_corpus
    + n2gram_tokenized_corpus
    + n3gram_tokenized_corpus
    + bpe_tokenized_corpus
    + sp_tokenized_corpus
)

# Hyperparameters shared by Word2Vec and FastText, defined once so the two
# models cannot silently drift apart.
embedding_params = dict(
    vector_size=100,
    window=5,
    min_count=5,
    sg=0,
    negative=5,
    epochs=10,
    sample=1e-5,
)

# Train a Word2Vec model
word2vec_model = Word2Vec(sentences=tokenized_corpus, **embedding_params)
word2vec_model.save("../Embeddings/word2vec_model.bin")

# Train a FastText model
fasttext_model = FastText(sentences=tokenized_corpus, **embedding_params)
fasttext_model.save("../Embeddings/fasttext_model.bin")

# Train a TF-IDF model.
# scikit-learn vectorizers treat every item passed to fit() as one document
# and tokenize it themselves. Fitting on a flattened token list would make
# each token its own "document" and render the document frequencies
# meaningless, so fit on the raw documents instead.
tfidf_model = TfidfVectorizer(max_features=3000)
tfidf_model.fit(corpus)
joblib.dump(tfidf_model, "../Embeddings/tfidf_model.joblib")

# Train a Count Vectorizer model (same one-string-per-document input as above)
count_model = CountVectorizer(max_features=3000)
count_model.fit(corpus)
joblib.dump(count_model, "../Embeddings/count_model.joblib")

['../Embeddings/count_model.joblib']

Let us test the tokenize-then-embed strategy using one tokenizer–model pair (the BPE tokenizer with the Word2Vec model).

In [46]:
import numpy as np

# Make new dataset that has all body text replaced with their embedding using the BPE tokenizer and Word2Vec model
df = pd.read_csv('../Datasets/train_cleaned.csv')
# Empty bodies come back from CSV as NaN; coerce to '' so the tokenizer always
# receives a string and the corpus stays aligned with the rows of `df`.
corpus = df['body'].fillna('').astype(str).tolist()

bpe_tokenizer = Tokenizer.from_file("../Tokenizers/bpe_tokenizer.json")
tokenized_corpus = [bpe_tokenizer.encode(text).tokens for text in corpus]

model = Word2Vec.load("../Embeddings/word2vec_model.bin")
word2vec_embeddings = []

# Every sample is one embedding that is the average of all the word embeddings in the sample
for sample in tokenized_corpus:
    # Skip tokens the model never learned (e.g. dropped by min_count).
    known_vectors = [model.wv[word] for word in sample if word in model.wv]
    if known_vectors:
        word2vec_embeddings.append(np.mean(known_vectors, axis=0))
    else:
        # No in-vocabulary token: np.mean([]) would yield a scalar NaN (plus a
        # RuntimeWarning) and break the fixed-length embedding shape, so fall
        # back to a zero vector of the model's dimensionality.
        word2vec_embeddings.append(np.zeros(model.wv.vector_size, dtype=np.float32))

# Replace the body text in df with the embeddings
df['body'] = word2vec_embeddings
df.head()

Unnamed: 0,body,subreddit
0,"[-0.003420533, 0.047891144, 0.042080585, 0.009...",Toronto
1,"[-0.0022828227, 0.04957727, 0.0435968, 0.01011...",Toronto
2,"[-0.0026055586, 0.043765545, 0.036465846, 0.00...",Toronto
3,"[-0.0011125315, 0.04322434, 0.037288498, 0.008...",Toronto
4,"[-0.00599845, 0.057221085, 0.049849674, 0.0111...",Toronto


In [48]:
# See how many samples in the tokenized corpus contain the token "This".
# (The previous check searched for the literal string "sample" -- apparently
# confused with the loop variable -- so it never looked for the intended token.)
target_token = "This"
count = sum(1 for sample in tokenized_corpus if target_token in sample)
print(count)

0
