# **Advanced Embedding**

Modified embeddings
* training texts filtered based on id (not on unique text content)
* `min_count` (minimum word frequency) for the fine-tuned and the self-built model = 10
* `window` size for the fine-tuned and the self-built model = 5 (set explicitly for the self-built model; for the fine-tuned model this is apparently the default)



## **Packages**

In [3]:
!pip install sentence-transformers==2.2.2
!pip install huggingface_hub #==0.25.1

Collecting huggingface-hub>=0.4.0 (from sentence-transformers==2.2.2)
  Using cached huggingface_hub-0.30.2-py3-none-any.whl.metadata (13 kB)
Using cached huggingface_hub-0.30.2-py3-none-any.whl (481 kB)
Installing collected packages: huggingface-hub
  Attempting uninstall: huggingface-hub
    Found existing installation: huggingface-hub 0.25.1
    Uninstalling huggingface-hub-0.25.1:
      Successfully uninstalled huggingface-hub-0.25.1
Successfully installed huggingface-hub-0.30.2


In [4]:
from sentence_transformers import SentenceTransformer
from sentence_transformers import util
from sentence_transformers import models
from sentence_transformers.models import WordEmbeddings
from sentence_transformers.models.tokenizer import WhitespaceTokenizer
from nltk.corpus import stopwords
import nltk
import numpy as np
import pandas as pd

from sentence_transformers import SentenceTransformer, models
import torch.nn.functional as F
import torch
import torch.nn as nn

import os
import json

In [5]:
#from token_normalize import TokenNormalize

## **File Paths**

In [17]:
# Make sure the NLTK stopword corpus is available locally, then load the
# English list that is later handed to the whitespace tokenizer.
nltk.download("stopwords")
stop_words = stopwords.words("english")

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [27]:
# All embedding text files live in one folder (relative to the notebook).
embedding_dir = "embedding_files"

# Embedding files as produced by training (first line still present).
finetuned_input_w_dim = f"{embedding_dir}/fine_tuned_glove_word2vec_wordlim_10.txt"
self_build_w_dim = f"{embedding_dir}/self_build_word2vec_wordlim_10_window_5.txt"

# Same embeddings with the first line removed.
finetuned_input = f"{embedding_dir}/finetuned_input_word2vec_wordlim_10_window_5.txt"
self_build_input = f"{embedding_dir}/self_input_word2vec_wordlim_10_window_5.txt"

# Unit-length (L2-normalized) versions of the stripped files.
normalized_finetuned_input = f"{embedding_dir}/normalized_finetuned_word2vec_wordlim_10_window_5.txt"
normalized_self_build_input = f"{embedding_dir}/normalized_self_input_word2vec_wordlim_10_window_5.txt"


## **Remove Dimensions from Finetuned and Self Build Embeddings**

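(Drops the first line of each embedding file — presumably the `vocab_size dimension` header of the word2vec text format — if not already done in the normalization analysis.)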

In [5]:
# Finetuned embeddings

# Copy the embeddings to `finetuned_input` without the leading header line.
# The file is streamed instead of being loaded with readlines(), and it is read
# and written as UTF-8 so it matches how normalize_embeddings() opens it later.
with open(finetuned_input_w_dim, 'r', encoding='utf-8') as src_file, \
        open(finetuned_input, 'w', encoding='utf-8') as new_file:
    first_line = src_file.readline()
    header_fields = first_line.split()
    # A header is assumed to consist of exactly two integers
    # (presumably "<vocab_size> <dimension>" - verify against the training export).
    looks_like_header = len(header_fields) == 2 and all(
        field.isdigit() for field in header_fields
    )
    if not looks_like_header:
        # No header present: the first line is a real vector, so keep it.
        new_file.write(first_line)
    # Write all remaining lines unchanged.
    new_file.writelines(src_file)

In [7]:
# check ft model: echo the first five lines of the header-stripped file

preview_count = 5
with open(finetuned_input, 'r') as preview_file:
    shown = 0
    while shown < preview_count:
        print(preview_file.readline())
        shown += 1

trump -0.6135888 -0.05122478 0.32378477 -0.018552344 -0.04039653 -0.7179959 -1.4525905 0.535624 -1.5831982 -0.32613507 0.6316913 2.1569276 0.34307536 -0.062699385 -0.4873404 0.37337807 0.6948798 -0.8668169 -0.00010414087 0.5997524 -0.62814766 -0.16490321 -1.1172909 2.528996 1.3801602 0.8241034 -0.26796582 -0.671873 0.2545098 0.97736526 1.014585 -0.7936119 -0.105332196 -0.19707057 0.22297364 -0.6990499 -1.6896893 1.7435157 -1.9377967 0.25481412 -0.03202884 3.8575087 -0.060180493 -1.0402737 -0.6887274 -1.640598 0.45344245 -0.16767389 0.84069604 1.5578289 0.9771874 -0.89675 0.9967128 0.90804654 -0.007272807 -0.029031191 0.8605955 2.4673493 1.4746195 -0.4503221 1.8652802 0.41773197 1.3121616 -0.192177 0.77656454 0.04783074 -0.08766447 0.8279369 0.5085095 -1.0751499 1.2548716 -0.5749214 2.3748658 -0.30434716 1.6162771 0.49036238 -0.30241713 1.2241513 0.19421458 1.574174 -0.15816964 -1.6831776 -1.7084638 -0.83086646 -0.89024335 1.0556635 0.24655919 -0.8254698 -1.5905627 -0.37951446 -0.437451

In [7]:
# Self build model

# Copy the embeddings to `self_build_input` without the leading header line.
# The file is streamed instead of being loaded with readlines(), and it is read
# and written as UTF-8 so it matches how normalize_embeddings() opens it later.
with open(self_build_w_dim, 'r', encoding='utf-8') as src_file, \
        open(self_build_input, 'w', encoding='utf-8') as new_file:
    first_line = src_file.readline()
    header_fields = first_line.split()
    # A header is assumed to consist of exactly two integers
    # (presumably "<vocab_size> <dimension>" - verify against the training export).
    looks_like_header = len(header_fields) == 2 and all(
        field.isdigit() for field in header_fields
    )
    if not looks_like_header:
        # No header present: the first line is a real vector, so keep it.
        new_file.write(first_line)
    # Write all remaining lines unchanged.
    new_file.writelines(src_file)

In [8]:
# check self model: echo the first five lines of the header-stripped file

with open(self_build_input, 'r') as preview_file:
    remaining = 5
    while remaining > 0:
        print(preview_file.readline())
        remaining -= 1

trump 0.59948117 -0.28408778 0.6504988 0.43734848 -0.6227624 -0.6972409 0.9597554 -0.008057123 -0.8670799 -0.72377723 -0.82790774 -0.42517745 -0.9437413 0.57381374 -0.373582 0.21953763 -0.51725835 1.2853409 -0.08310709 -1.4900138 0.6531159 0.069898136 -0.22441697 -0.63979894 0.100700974 0.84639233 0.48628098 0.3515258 0.24299438 0.5723756 -0.27149495 0.45208332 -0.16213135 0.13250393 0.24455719 0.5482613 0.8628233 -0.12933758 0.25566643 0.3456846 -0.7715811 -0.08185323 -0.34849134 -0.16647597 0.27219388 -0.34899205 -0.36642495 -0.14317276 0.2615827 0.5620519 -0.14176743 -0.95004207 -0.7693819 -0.79598457 1.3206114 -0.5447436 0.55283064 -0.5398319 -0.40984643 0.44866917 0.5866288 0.2146417 -0.74650306 -0.5409015 -0.11758082 1.1736859 0.564685 0.3442389 -0.28383437 0.035038754 0.13543443 0.11346874 0.0856265 -0.18126954 0.4666765 0.41795218 0.20135987 0.8923833 0.41415718 -0.10463093 -0.5322809 0.097537965 -0.23520711 0.77364516 0.12296794 0.30868208 0.25286117 -0.1536013 1.1614325 -0.36

## **Make Two Types of Sentence Model**

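* one built from word embeddings that are normalized prior to averaging
* one that leaves the word embeddings unnormalized and instead normalizes the sentence embedding after averaging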

In [23]:
# normalize word embeddings


def normalize_embeddings(input_path, output_path):
    """Rewrite a text embedding file with every vector scaled to unit L2 norm.

    Each input line is split on whitespace into a word followed by its vector
    components. Lines with fewer than two tokens are dropped silently; lines
    whose components cannot be parsed as floats are reported on stdout and
    dropped. A vector whose norm is zero is written unchanged.

    Args:
        input_path: Path of the UTF-8 text file to read.
        output_path: Path of the UTF-8 text file to write; each component is
            formatted with six decimal places.
    """
    with open(input_path, 'r', encoding='utf-8') as src:
        with open(output_path, 'w', encoding='utf-8') as dst:
            for raw_line in src:
                stripped = raw_line.strip()
                fields = stripped.split()
                if len(fields) < 2:
                    # Nothing but a word (or an empty line): no vector to scale.
                    continue
                token, raw_values = fields[0], fields[1:]
                try:
                    vector = np.asarray(list(map(float, raw_values)))
                except ValueError:
                    print(f"Skipping line due to parsing error: {stripped}")
                    continue
                length = np.linalg.norm(vector)
                # Divide by 1 for a zero-length vector so it stays all zeros.
                unit = vector / (length if length != 0 else 1)
                dst.write(token + ' ' + ' '.join(map('{:.6f}'.format, unit)) + '\n')

# Write unit-norm copies of both header-stripped embedding files.
for raw_path, unit_path in (
    (finetuned_input, normalized_finetuned_input),
    (self_build_input, normalized_self_build_input),
):
    normalize_embeddings(raw_path, unit_path)



In [24]:
# check
def check_first_n_vectors(file_path, n=5, tolerance=1e-5):
    """Print whether each of the first ``n`` lines of an embedding file is unit-norm.

    Args:
        file_path: Path of the UTF-8 text embedding file to inspect.
        n: Number of leading lines to look at (malformed lines count too).
        tolerance: Largest accepted absolute deviation of the L2 norm from 1.

    For every inspected line one message is printed: the word with its norm
    and verdict, or a note that the line is malformed / could not be parsed.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        for index, raw_line in enumerate(handle):
            if index >= n:
                break
            position = index + 1  # 1-based line number used in the messages
            fields = raw_line.strip().split()
            if len(fields) < 2:
                print(f"Line {position} is malformed.")
                continue
            try:
                components = [float(value) for value in fields[1:]]
            except ValueError:
                print(f"Line {position} has a parsing error.")
                continue
            length = np.linalg.norm(np.array(components))
            verdict = 'OK ✅' if abs(length - 1.0) < tolerance else 'NOT normalized ❌'
            print(f"{fields[0]}: norm = {length:.6f} -> {verdict}")

# Spot-check the first lines of both normalized embedding files.
for normalized_path in (normalized_finetuned_input, normalized_self_build_input):
    check_first_n_vectors(normalized_path)


trump: norm = 1.000000 -> OK ✅
people: norm = 1.000000 -> OK ✅
would: norm = 1.000000 -> OK ✅
like: norm = 1.000000 -> OK ✅
think: norm = 1.000000 -> OK ✅
trump: norm = 1.000000 -> OK ✅
people: norm = 0.999999 -> OK ✅
would: norm = 1.000000 -> OK ✅
like: norm = 0.999999 -> OK ✅
think: norm = 1.000000 -> OK ✅


In [25]:
def model_normalize_before_pooling(embeddings_file, output_path, stopwords):
    """Build a word-embedding + mean-pooling sentence model and save it.

    No normalization happens inside this function. For the "normalize before
    pooling" variant, pass an embeddings file whose vectors are already
    unit-length.

    Args:
        embeddings_file: Path of the word-embedding text file to load.
        output_path: Path passed to ``SentenceTransformer.save``.
        stopwords: Stop words handed to the whitespace tokenizer.

    Returns:
        None. The model is only written to ``output_path``.
    """
    tokenizer = WhitespaceTokenizer(stop_words=stopwords)
    word_vectors = WordEmbeddings.from_text_file(embeddings_file, tokenizer=tokenizer)

    # Mean pooling over the token vectors yields the sentence vector.
    mean_pool = models.Pooling(
        word_embedding_dimension=word_vectors.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
    )

    pipeline = [word_vectors, mean_pool]
    SentenceTransformer(modules=pipeline).save(output_path)

In [46]:
#model_test_pre = model_normalize_before_pooling(self_build_input, output_self_build_pre_pool, stop_words)
#model_test_after = model_normalize_after_pooling(self_build_input, output_self_build_pre_pool, stop_words)

Load Word Embeddings: 70449Embeddings [00:02, 23530.02Embeddings/s]
Load Word Embeddings: 70449Embeddings [00:03, 23406.89Embeddings/s]


In [51]:
#apple = model_test_pre.encode("apple")
#great = model_test_pre.encode("great")
#sentence = model_test_pre.encode("apple great")

#from numpy import linalg as LA
#print(LA.norm(great))
#print(LA.norm(apple))
#print(LA.norm(sentence))

1.0
0.9999999
0.69474727


In [52]:
#apple = model_test_after.encode("apple")
#great = model_test_after.encode("great")
#sentence = model_test_after.encode("apple great")

#from numpy import linalg as LA
#print(LA.norm(great))
#print(LA.norm(apple))
#print(LA.norm(sentence))

1.0
0.9999999
0.99999994


In [61]:
# Post Pooling Normalization


def model_normalize_after_pooling(embeddings_file, output_path, stopwords):
    """Build a word-embedding + mean-pooling + Normalize sentence model and save it.

    The word vectors are used as they appear in ``embeddings_file``; a
    ``models.Normalize`` module is appended after the pooling step.

    Args:
        embeddings_file: Path of the word-embedding text file to load.
        output_path: Path passed to ``SentenceTransformer.save``.
        stopwords: Stop words handed to the whitespace tokenizer.

    Returns:
        None. The model is only written to ``output_path``.
    """
    tokenizer = WhitespaceTokenizer(stop_words=stopwords)
    word_vectors = WordEmbeddings.from_text_file(embeddings_file, tokenizer=tokenizer)

    # Mean pooling over the token vectors yields the sentence vector.
    mean_pool = models.Pooling(
        word_embedding_dimension=word_vectors.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
    )

    # Final module of the pipeline: normalization applied to the pooled output.
    pipeline = [word_vectors, mean_pool, models.Normalize()]
    SentenceTransformer(modules=pipeline).save(output_path)

### Apply

In [28]:
# Output locations of the saved sentence models.
# Every model name ends with the same parameter suffix.
model_suffix = "sentence_model_wordlim_10_window_5"

# Models built from pre-normalized word vectors (normalize before pooling).
output_fine_tuned_model_pre_pool = f"pre_pool_finetuned_{model_suffix}"
output_self_build_pre_pool = f"pre_pool_selfbuild_{model_suffix}"

# Models that normalize after pooling.
output_fine_tuned_model_after_pool = f"after_pool_finetuned_{model_suffix}"
output_self_build_after_pool = f"after_pool_self_build_{model_suffix}"


In [29]:
# pre pool: build both models from the already-normalized word vectors
for vectors_path, model_path in (
    (normalized_finetuned_input, output_fine_tuned_model_pre_pool),
    (normalized_self_build_input, output_self_build_pre_pool),
):
    model_normalize_before_pooling(vectors_path, model_path, stop_words)


Load Word Embeddings: 1921604Embeddings [01:19, 24145.92Embeddings/s]
Load Word Embeddings: 70449Embeddings [00:02, 25379.71Embeddings/s]


In [64]:
# after pooling: build both models from the unnormalized word vectors
for vectors_path, model_path in (
    (finetuned_input, output_fine_tuned_model_after_pool),
    (self_build_input, output_self_build_after_pool),
):
    model_normalize_after_pooling(vectors_path, model_path, stop_words)

Load Word Embeddings: 1921604Embeddings [01:19, 24154.55Embeddings/s]
Load Word Embeddings: 70449Embeddings [00:02, 23972.82Embeddings/s]
