## Train Embedding Model

In [5]:
from tokenizers import Tokenizer, trainers, pre_tokenizers, models
import sentencepiece as spm
from gensim.models import Word2Vec, FastText
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import joblib
import pandas as pd

The cleaned data is loaded and converted into a list of strings to be fed into the tokenizers.

In [6]:
# Load the cleaned training data and turn the text column into a list of strings.
df = pd.read_csv('../Datasets/train_cleaned.csv')
# read_csv parses empty fields as NaN (a float). A float in the corpus would
# break the str-based tokenizers (text.split()) and the vectorizers, so missing
# bodies are replaced with '' and every entry is coerced to str. Row count and
# order are unchanged, so the corpus stays aligned with df.
corpus = df['body'].fillna('').astype(str).tolist()  # Assuming 'body' is the column containing text data

Each tokenizer converts the body text into tokens before they are fed into the embedding models for training. Here we load the two previously trained tokenizers (BPE and SentencePiece) and define two simple ones (n-gram and whitespace).

In [7]:
# Pre-trained BPE tokenizer, restored from its serialized JSON file
bpe_tokenizer = Tokenizer.from_file(
    "../Tokenizers/bpe_tokenizer.json"
)

# Pre-trained SentencePiece model, wrapped in a processor object
sp_tokenizer = spm.SentencePieceProcessor(
    model_file="../Tokenizers/sp_model.model"
)

# Create an n-gram tokenizer
def generate_ngrams(text, n):
    """Return the word-level n-grams of ``text`` as space-joined strings.

    The text is split on whitespace, and every run of ``n`` consecutive
    words becomes one entry, in order of appearance. A text with fewer
    than ``n`` words yields an empty list.
    """
    words = text.split()
    # Index of the last position at which a full window of n words still fits.
    last_start = len(words) - n
    return [' '.join(words[start:start + n]) for start in range(last_start + 1)]

# Create a whitespace tokenizer
def whitespace_tokenizer(text):
    """Split ``text`` on runs of whitespace and return the resulting tokens."""
    tokens = text.split()
    return tokens

Now we can use each tokenizer to train Word2Vec and FastText embedding models on the combined tokens, and fit TF-IDF and count vectorizers on the raw text.

In [8]:
# Tokenize every document once per tokenizer; each result is a list of token lists
whitespace_tokenized_corpus = list(map(whitespace_tokenizer, corpus))
n2gram_tokenized_corpus = [generate_ngrams(doc, 2) for doc in corpus]
n3gram_tokenized_corpus = [generate_ngrams(doc, 3) for doc in corpus]
bpe_tokenized_corpus = [bpe_tokenizer.encode(doc).tokens for doc in corpus]
sp_tokenized_corpus = [sp_tokenizer.encode(doc, out_type=str) for doc in corpus]

# Concatenate the five tokenizations into one training corpus, so each
# document appears once per tokenizer.
tokenized_corpus = [
    *whitespace_tokenized_corpus,
    *n2gram_tokenized_corpus,
    *n3gram_tokenized_corpus,
    *bpe_tokenized_corpus,
    *sp_tokenized_corpus,
]

# Hyperparameters shared by the Word2Vec and FastText runs
embedding_params = dict(
    vector_size=100,
    window=5,
    min_count=5,
    sg=0,
    negative=5,
    epochs=10,
    sample=1e-5,
)

# Word2Vec trained on the combined tokenized corpus
# (save() writes gensim's own model format despite the .bin extension)
model = Word2Vec(sentences=tokenized_corpus, **embedding_params)
model.save("../Embeddings/word2vec_model.bin")

# FastText trained on the same corpus with the same settings
model = FastText(sentences=tokenized_corpus, **embedding_params)
model.save("../Embeddings/fasttext_model.bin")

# TF-IDF vectorizer fitted on the raw (untokenized) text
model = TfidfVectorizer(max_features=3000).fit(corpus)
joblib.dump(model, "../Embeddings/tfidf_model.joblib")

# Count vectorizer fitted on the raw (untokenized) text
model = CountVectorizer(max_features=3000).fit(corpus)
joblib.dump(model, "../Embeddings/count_model.joblib")

['../Embeddings/count_model.joblib']