# FastText Embedding

## Import libraries

In [1]:
import os
import json
import pandas as pd
import numpy as np
import re
import time
from tqdm import tqdm
from collections import Counter
from gensim.models import FastText
import pickle

## Identify file directory

In [2]:
docs_dir = "/Users/davepipon/Desktop/DS397 Data/coleridgeinitiative-show-us-the-data/train/"
train_csv = "/Users/davepipon/Desktop/DS397 Data/coleridgeinitiative-show-us-the-data/train.csv"

## Develop Custom FastText

### Pre-processing functions

In [3]:
def load_and_preprocess_documents(docs_dir, train_csv=None, n_docs=1000):
    """Load and preprocess up to n_docs documents for training."""
    documents = []
    processed_sentences = []

    print(f"Loading {n_docs} documents...")

    if train_csv and os.path.exists(train_csv):
        df = pd.read_csv(train_csv).fillna("")
        df = df.sample(n=min(n_docs, len(df)), random_state=42)

        for _, row in tqdm(df.iterrows(), total=len(df), desc="Loading from CSV"):
            pub_id = row["Id"]
            json_path = os.path.join(docs_dir, f"{pub_id}.json")
            if os.path.exists(json_path):
                try:
                    with open(json_path, "r", encoding="utf-8") as f:
                        doc = json.load(f)
                    text = extract_text_from_json(doc)
                    if text.strip():
                        documents.append(text)
                except:
                    continue
    else:
        json_files = [f for f in os.listdir(docs_dir) if f.endswith(".json")][:n_docs]
        for filename in tqdm(json_files, desc="Loading from directory"):
            json_path = os.path.join(docs_dir, filename)
            try:
                with open(json_path, "r", encoding="utf-8") as f:
                    doc = json.load(f)
                text = extract_text_from_json(doc)
                if text.strip():
                    documents.append(text)
            except:
                continue

    print(f"Loaded {len(documents)} documents")

    # Preprocess into sentences with 3+ character words
    for doc in tqdm(documents, desc="Preprocessing"):
        sentences = doc.split(".")
        for sentence in sentences:
            tokens = re.sub(r"[^a-zA-Z0-9\s]", " ", sentence.lower()).split()
            tokens = [t for t in tokens if len(t) >= 3]
            if len(tokens) >= 3:
                processed_sentences.append(tokens)

    print(f"Created {len(processed_sentences)} sentences for training")
    return processed_sentences


def extract_text_from_json(doc):
    """Recursively extract text from JSON document."""
    texts = []

    def recurse(x):
        if isinstance(x, dict):
            for v in x.values():
                recurse(v)
        elif isinstance(x, list):
            for v in x:
                recurse(v)
        elif isinstance(x, str):
            texts.append(x)

    recurse(doc)
    return " ".join(texts)

### Create FastText wrapper

In [4]:
class CustomFastText:
    def __init__(self, vectors, char_ngrams, vocab, size):
        self.vectors = vectors
        self.char_ngram_vectors = char_ngrams
        self.vocabulary = vocab
        self.vector_size = size

    def get_vector(self, word):
        if word in self.vectors:
            return self.vectors[word]
        w = f"<{word}>"
        vec, count = np.zeros(self.vector_size), 0
        for n in range(3, 7):
            for i in range(len(w) - n + 1):
                ngram = w[i:i+n]
                if ngram in self.char_ngram_vectors:
                    vec += self.char_ngram_vectors[ngram]
                    count += 1
        return vec / max(count, 1)

    def most_similar(self, word, topn=10):
        if word not in self.vocabulary:
            return []
        wv = self.get_vector(word)
        sims = []
        for other, vec in self.vectors.items():
            if other == word:
                continue
            cos = np.dot(wv, vec) / (np.linalg.norm(wv) * np.linalg.norm(vec) + 1e-6)
            sims.append((other, cos))
        sims.sort(key=lambda x: x[1], reverse=True)
        return sims[:topn]

### Create FastText trainer

In [None]:
def train_custom_fasttext(sentences, vector_size=100, window=5, min_count=5, epochs=5):
    """Train a simple custom FastText-like model."""
    print("\nTraining Custom FastText Model...")

    word_counts = Counter(w for s in sentences for w in s if len(w) >= 3)
    vocab = {w: c for w, c in word_counts.items() if c >= min_count}
    print(f"Vocabulary size: {len(vocab)}")

    np.random.seed(42)
    vectors = {w: np.random.normal(0, 0.1, vector_size) for w in vocab}

    char_ngram_vectors = {}
    for word in vocab:
        w = f"<{word}>"
        for n in range(3, 7):
            for i in range(len(w) - n + 1):
                ngram = w[i:i+n]
                if ngram not in char_ngram_vectors:
                    char_ngram_vectors[ngram] = np.random.normal(0, 0.1, vector_size)

    for epoch in range(epochs):
        print(f"Epoch {epoch+1}/{epochs}")
        for sentence in tqdm(sentences, desc=f"Training epoch {epoch+1}"):
            for i, target in enumerate(sentence):
                if target not in vocab:
                    continue
                start = max(0, i - window)
                end = min(len(sentence), i + window + 1)
                for j in range(start, end):
                    if i == j or sentence[j] not in vocab:
                        continue
                    context = sentence[j]

                    dot = np.dot(vectors[target], vectors[context])
                    cos_sim = dot / (np.linalg.norm(vectors[target]) * np.linalg.norm(vectors[context]) + 1e-6)

                    lr = 0.01 * (1.0 / (epoch + 1))
                    vectors[target] += lr * cos_sim * vectors[context]

                    norm = np.linalg.norm(vectors[target])
                    if norm > 0:
                        vectors[target] /= norm

    print("Custom FastText training done.")
    return CustomFastText(vectors, char_ngram_vectors, vocab, vector_size)

## Develop Gensim FastText

In [6]:
def train_gensim_fasttext(sentences, vector_size=100, window=5, min_count=5, epochs=5):
    """Train real FastText model via gensim."""
    print("\nTraining Gensim FastText...")
    model = FastText(
        sentences=sentences,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        min_n=3,
        max_n=6,
        epochs=epochs,
        sg=1,
        workers=1
    )
    print("Gensim FastText training done.")
    return model

## Compare Custom FastText with Gensim FastText

### Define comparator function

In [7]:
def compare_models(custom_model, gensim_model):
    print("\n=== MODEL COMPARISON ===")

    custom_vocab_size = len(custom_model.vocabulary)
    gensim_vocab_size = len(gensim_model.wv.key_to_index)

    print(f"Custom vocab: {custom_vocab_size}, Gensim vocab: {gensim_vocab_size}")

    common = set(custom_model.vocabulary.keys()) & set(gensim_model.wv.key_to_index.keys())
    print(f"Common words: {len(common)}")

    if common:
        test_word = list(common)[0]
        print(f"\nSimilarity test for word '{test_word}':")
        print("Custom:", custom_model.most_similar(test_word, topn=3))
        print("Gensim:", gensim_model.wv.most_similar(test_word, topn=3))

    print("\nOOV test:")
    for w in ["unknownword", "newterm123"]:
        try:
            _ = custom_model.get_vector(w)
            custom_ok = "✓"
        except:
            custom_ok = "✗"
        try:
            _ = gensim_model.wv.get_vector(w)
            gensim_ok = "✓"
        except:
            gensim_ok = "✗"
        print(f"{w}: Custom={custom_ok}, Gensim={gensim_ok}")


### Comparator implementer

In [8]:
def main(docs_dir, train_csv=None, n_docs=1000):
    print("=== FASTTEXT PROGRAM ===")

    sentences = load_and_preprocess_documents(docs_dir, train_csv, n_docs)
    if not sentences:
        print("No data found.")
        return

    start = time.time()
    custom_model = train_custom_fasttext(sentences)
    print(f"Custom training took {time.time()-start:.2f} sec")

    start = time.time()
    gensim_model = train_gensim_fasttext(sentences)
    print(f"Gensim training took {time.time()-start:.2f} sec")

    compare_models(custom_model, gensim_model)

    print("\nSaving models...")
    gensim_model.save("gensim_fasttext.model")
    with open("custom_fasttext.pkl", "wb") as f:
        pickle.dump(custom_model, f)
    print("Models saved.")


## Implement comparison

In [9]:
if __name__ == "__main__":
    docs_dir = docs_dir
    train_csv = train_csv
    main(docs_dir, train_csv, n_docs=1000)

=== FASTTEXT PROGRAM ===
Loading 1000 documents...


Loading from CSV: 100%|██████████| 1000/1000 [00:01<00:00, 779.07it/s]


Loaded 1000 documents


Preprocessing: 100%|██████████| 1000/1000 [00:03<00:00, 250.53it/s]


Created 354736 sentences for training

Training Custom FastText Model...
Vocabulary size: 28999
Epoch 1/5


Training epoch 1: 100%|██████████| 354736/354736 [12:08<00:00, 487.16it/s] 


Epoch 2/5


Training epoch 2: 100%|██████████| 354736/354736 [11:42<00:00, 504.67it/s] 


Epoch 3/5


Training epoch 3: 100%|██████████| 354736/354736 [11:26<00:00, 516.92it/s] 


Epoch 4/5


Training epoch 4: 100%|██████████| 354736/354736 [11:41<00:00, 506.01it/s] 


Epoch 5/5


Training epoch 5: 100%|██████████| 354736/354736 [11:26<00:00, 516.90it/s] 


Custom FastText training done.
Custom training took 3507.18 sec

Training Gensim FastText...
Gensim FastText training done.
Gensim training took 342.77 sec

=== MODEL COMPARISON ===
Custom vocab: 28999, Gensim vocab: 28999
Common words: 28999

Similarity test for word 'barnett':
Custom: [('experimenter', 0.48483675928085657), ('justifies', 0.469858204498151), ('muddy', 0.45716120321185527)]
Gensim: [('bennett', 0.8931403160095215), ('barnes', 0.8920161128044128), ('barkley', 0.8908674716949463)]

OOV test:
unknownword: Custom=✓, Gensim=✓
newterm123: Custom=✓, Gensim=✓

Saving models...
Models saved.


## Key takeaways

The custom FastText model developed is a simple illustration of the core idea on the use of subword embeddings and handles out of vocabulary words. As expected, it performs poorly on the similarity test compared to the Gensim's FastText. Also, despite the simplicity of the custom FastText, the training took a lot longer than Gensim. To improve this model, the following can be done:
1.  Add negative sampling: Sample random "negative words" that don’t occur in context and push their vectors apart.
2. Use a better loss function: Instead of cosine similarity, implement a logistic loss (like in Word2Vec).
3. Normalize embeddings more carefully: After each update, normalize all vectors to unit length to stabilize similarity scores.
4. Train longer and with more data: More epochs and a bigger corpus can help, though it won’t reach Gensim-level quality.