# Word Embeddings

Pipeline to generate essays mathematically, using word-embedding representations of semantics to create perturbations.


Ideas:
- add Gaussian noise in word embeddings to shift meaning subtly (target subjective bias of LLMs)
- use sentence embeddings to rearrange essay structures randomly (target coherence valuations of LLMs)
- find embeddings that change the tone of the content (target persuasiveness)
- use topic modeling to shift essay's focus (target topic distribution)

In [2]:
import pandas as pd
import numpy as np
import nltk
import random
from collections import defaultdict
from sentence_transformers import SentenceTransformer, util
from nltk.tokenize import word_tokenize, sent_tokenize

# Fetch the 'punkt' resource before any tokenizing is done below
nltk.download('punkt')

# Sentence-transformer model used later to embed individual words
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Topics table; process_essays() reads its 'topic' column
df = pd.read_csv("topics.csv")

# Raw text of the corpus the Markov model is learned from
with open("corpus.txt", "r", encoding="utf-8") as f:
    corpus_text = f.read()

# Split the corpus into sentences, then each sentence into word tokens
sentences = sent_tokenize(corpus_text)
tokens = [word_tokenize(sent) for sent in sentences]

# First-order Markov chain: each word maps to the list of every word observed
# immediately after it (duplicates kept, so frequent successors are drawn
# more often by random.choice)
markov_model = defaultdict(list)
for sentence in tokens:
    for current_word, following_word in zip(sentence, sentence[1:]):
        markov_model[current_word].append(following_word)

def generate_essay(topic, word_limit=100):
    """Generate an essay by walking the Markov chain, seeded with the topic.

    The topic's own tokens open the essay. Each subsequent word is drawn at
    random from the successors recorded in ``markov_model`` for the previous
    word. Generation stops when the essay reaches ``word_limit`` tokens or a
    "." is produced, whichever happens first.

    Args:
        topic: Text whose tokens form the start of the essay.
        word_limit: Maximum total number of tokens, including the topic's
            tokens. A topic that is already longer is returned as-is
            (it is not truncated).

    Returns:
        The essay tokens joined by single spaces, or "" when the topic
        contains no tokens.
    """
    essay = word_tokenize(topic)
    if not essay:
        # No seed word to start the chain from; indexing essay[-1] below
        # would raise IndexError.
        return ""

    remaining = word_limit - len(essay)
    for _ in range(remaining):
        # .get() (rather than indexing) avoids inserting empty entries into
        # the defaultdict; a word with no recorded successor ends the essay.
        candidates = markov_model.get(essay[-1], ["."])
        next_word = random.choice(candidates)
        essay.append(next_word)
        if next_word == ".":
            break

    return " ".join(essay)

# Function to add Gaussian noise to word embeddings
def add_noise_to_text(text, epsilon=0.05):
    """Perturb a text by adding Gaussian noise to its word embeddings.

    Every token of ``text`` is embedded with ``embedding_model``, zero-mean
    Gaussian noise is added to each embedding, and each noisy vector is
    mapped back to the token whose clean embedding has the highest dot
    product with it. Candidate replacements are limited to the tokens that
    occur in ``text`` itself.

    Args:
        text: Text to perturb.
        epsilon: Standard deviation of the Gaussian noise.

    Returns:
        The perturbed tokens joined by single spaces, or "" when ``text``
        contains no tokens.
    """
    words = word_tokenize(text)
    if not words:
        return ""

    word_embeddings = embedding_model.encode(words, convert_to_numpy=True)

    # np.random.normal yields float64; cast the noise to the embeddings' own
    # dtype so the noisy and clean matrices share one dtype.
    noise = np.random.normal(0, epsilon, word_embeddings.shape).astype(
        word_embeddings.dtype
    )
    noisy_embeddings = word_embeddings + noise

    # scores[i, j] = dot product of noisy word i with clean word j; the row
    # argmax is the index of the closest clean word.
    scores = noisy_embeddings @ word_embeddings.T
    nearest = scores.argmax(axis=1)

    # argmax over len(words) columns is always a valid index into words.
    return " ".join(words[idx] for idx in nearest)

# Generate and perturb essays
def process_essays(df, output_path="generated_essays.csv"):
    """Generate an essay and a noisy variant for every topic, then save them.

    Adds two columns to ``df`` in place, "Generated_Essay" and
    "Noisy_Essay", writes the resulting frame to ``output_path`` as CSV
    without the index, and prints a confirmation message.

    Args:
        df: DataFrame with a 'topic' column; it is modified in place.
        output_path: Destination of the CSV file. Defaults to the
            previously hard-coded "generated_essays.csv".
    """
    essays = [generate_essay(topic) for topic in df['topic']]
    noisy_essays = [add_noise_to_text(essay) for essay in essays]

    df["Generated_Essay"] = essays
    df["Noisy_Essay"] = noisy_essays
    df.to_csv(output_path, index=False)
    print(f"Saved essays to {output_path}")

# Run pipeline: for each topic in df, generate an essay and a noisy variant,
# then save the results to CSV
process_essays(df)


ModuleNotFoundError: No module named 'pandas'