# Embedding Text Documents 

In [2]:
# Notebook setup: auto-reload edited modules and render matplotlib figures inline.
%load_ext autoreload
%autoreload 2
%matplotlib inline

# Loading the embedding model

In [3]:
from sentence_transformers import SentenceTransformer
from fastcore.all import *
import nltk
import numpy as np

# Embedding model used by the rest of the notebook.
# "all-MiniLM-L6-v2" is a commented-out alternative model name.
model = SentenceTransformer('sentence-transformers/multi-qa-mpnet-base-cos-v1')

Embedding a few sentences.

In [4]:
# A few toy sentences to sanity-check the model.
sentences = [
    "This framework generates embeddings for each input sentence",
    "Sentences are passed as a list of string.",
    "The quick brown fox jumps over the lazy dog.",
]
# Encode the whole list in one call (presumably one embedding per sentence — TODO confirm).
sentence_embeddings = model.encode(sentences)

def encode(txt: str) -> np.ndarray:
    "Embed `txt` with the notebook-level `model` and return the result."
    embedding = model.encode(txt)
    return embedding

In [8]:
## viewing the embeddings
# for sentence, embedding in zip(sentences, sentence_embeddings):
#     print("Sentence:", sentence)
#     print("Embedding:", embedding)
#     print("")

# Loading different splitters

The goal is to compare how the different splitters work and how well they perform.

In [9]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import NLTKTextSplitter
from langchain.text_splitter import SpacyTextSplitter

# one default-configured instance of each splitter, keyed by a short name
splitters = {
    'char': CharacterTextSplitter(),
    'nltk': NLTKTextSplitter(),
    'spacy': SpacyTextSplitter(),
}

# start with the character splitter
# NOTE(review): this comment previously said "ntlk splitter" while the code
# selects 'char' — confirm which splitter was intended
splitter = splitters['char']

# Sample document to embed

In [10]:
# path with the extracted Diátaxis content
docs_path = Path("../content")
# NOTE: not the last expression of the cell, so this listing is not displayed
docs_path.ls()

# start with the how-to guide
fname = "how-to-use-diataxis.txt"


In [11]:
# load the document
# NOTE(review): no explicit `encoding=` is passed, so the platform default is used;
# the text contains non-ASCII characters (e.g. "¶") — TODO confirm the file's encoding
with open(docs_path/fname, 'r') as f:
    content = f.read()

In [12]:
# cleanup the content
# NOTE: `diataxis_preproc` (defined below) applies these same four steps again.
content = content.replace(" ¶ ", '. ')   # " ¶ " markers become ". "
content = content.replace("\r", " ")     # carriage returns become spaces
content = content.replace("\\.", ".")    # backslash-escaped dots become plain dots
# This pattern was written "\." — an invalid escape sequence that Python reads as
# backslash + dot (and warns about). "\\." is the same string, spelled validly.
content = content.replace("\\.", " ")


def diataxis_preproc(txt: str) -> str:
    "Clean `txt`: replace ' ¶ ' with '. ', carriage returns with spaces, and un-escape dots."
    txt = txt.replace(" ¶ ", '. ')
    txt = txt.replace("\r", " ")
    txt = txt.replace("\\.", ".")
    # This pattern was written "\." — an invalid escape sequence that Python reads
    # as backslash + dot (and warns about). "\\." is the same string, spelled validly.
    # It only matches a backslash-dot left over after the previous step.
    txt = txt.replace("\\.", " ")
    return txt

# run the cleanup through the reusable helper
content = diataxis_preproc(content)

# trailing `;` suppresses the cell output
content;

In [16]:
# test the splitter on the cleaned content
# TODO: still not working — the failure itself is not recorded in this notebook
splits = L(splitter.split_text(content));
# trailing `;` suppresses the cell output
splits;

In [17]:
# split the doc into sentences
# NOTE: this reuses the name `sentences`, which earlier held the toy example sentences
sentences = L(nltk.sent_tokenize(content))
sentences

def ntlk_sent_tokenize(txt: str) -> L:
    "Split `txt` into sentences with `nltk.sent_tokenize` and wrap them in an `L`."
    sents = nltk.sent_tokenize(txt)
    return L(sents)

# same tokenization as the inline call above, now through the helper
sentences = ntlk_sent_tokenize(content)

In [18]:
# peek at the first and the last sentence
sentences[0], sentences[-1]

('How to use Diátaxis.',
 'However it can always be complete: useful to users, appropriate to its current stage of development, and in a healthy structural state and ready to go on to the next stage.')

# Embedding the split up sentences

In [19]:
# list every sentence together with its index
list(enumerate(sentences))

[(0, 'How to use Diátaxis.'),
 (1, 'In short, the answer is: pragmatically.'),
 (2,
  'Diátaxis is based on sound theoretical principles and has been proven in practice, but it’s not the final word in documentation.'),
 (3,
  'The only value that it can offer you is to be useful in helping make your documentation better for its users, and easier for you to create and maintain.'),
 (4,
  'The best thing you can do with it therefore is take from it however much seems to work for you: as much or as little as you wish.'),
 (5, 'Use Diátaxis as a guide, not a plan.'),
 (6, 'Diátaxis describes a complete picture of documentation.'),
 (7,
  'However the structure it proposes is not intended to be a plan, something you must complete in your documentation.'),
 (8,
  'It’s a guide, a map to help you check that you’re in the right place and going in the right directions.'),
 (9,
  'The point of Diátaxis is to give you a way to think about and understand your documentation, so that you can make be

In [180]:
# one record per sentence: its index, its text and its embedding
embeds = L(enumerate(sentences)).map(
    lambda pair: dict(idx=pair[0], txt=pair[1], embed=encode(pair[1]))
)

In [181]:
import pandas as pd

In [182]:
# one row per record; columns come from the record keys ('idx', 'txt', 'embed')
df = pd.DataFrame(embeds)

In [183]:
# display the frame
df

Unnamed: 0,idx,txt,embed
0,0,How to use Diátaxis.,"[0.049325798, -0.04237737, -0.0069784094, 0.01..."
1,1,"In short, the answer is: pragmatically.","[0.0022748124, 0.050606053, 0.004992658, -0.02..."
2,2,Diátaxis is based on sound theoretical princip...,"[0.03447037, 0.029921759, -0.013568475, 0.0060..."
3,3,The only value that it can offer you is to be ...,"[0.009317927, -0.0018541908, -0.014145311, -0...."
4,4,The best thing you can do with it therefore is...,"[0.03571525, 0.007364556, -0.003224689, 0.0388..."
...,...,...,...
62,62,"As a living, growing organism, a plant is neve...","[0.02837495, 0.038374577, -0.011913994, -0.022..."
63,63,"But, at every stage of its development, from s...","[0.03240156, 0.079276085, -0.029753845, 0.0153..."
64,64,"At any point, it is in a state that is appropr...","[-0.026164208, -0.029552646, 0.005358615, -0.0..."
65,65,"Similarly, documentation is also never finishe...","[0.06910032, -0.0031751303, -0.03958649, -0.05..."


# Saving the output embeddings

In [188]:
# check the shape of a single embedding
embeds[0]['embed'].shape

(768,)

In [184]:

# save it to embeds output path
out_path = Path('../embeds')
# `to_csv` does not create missing directories, so make sure the target exists
out_path.mkdir(parents=True, exist_ok=True)

out_fname = 'test_embeds_v0.csv'
# NOTE(review): the `embed` column holds numpy arrays, which a CSV can only keep
# as text — TODO confirm the saved file can be parsed back as needed
df.to_csv(out_path / out_fname, index=False)