Providing and cleaning up the documents

In [None]:
import os
from tqdm.auto import tqdm
import random
def text_formatter_md(text: str) -> str:
    """Flatten Markdown text onto a single line.

    Every newline character is replaced by one space, then leading and
    trailing whitespace is removed from the result.
    """
    single_line = " ".join(text.split("\n"))
    return single_line.strip()
def open_and_read_md_folder(folder_path: str) -> list[dict]:
    """Read every ``.md`` file in a folder and collect basic text statistics.

    Args:
        folder_path: Directory that is scanned (non-recursively) for files
            whose name ends with ``.md``.

    Returns:
        One dict per Markdown file, in sorted filename order, with the keys
        ``filename``, ``char_count``, ``word_count``, ``sentence_count_raw``,
        ``token_count`` and ``text`` (the file content after it has been
        passed through ``text_formatter_md``).
    """
    all_texts = []
    # os.listdir returns entries in arbitrary order; sorting keeps the record
    # order (and therefore any later positional indexing) reproducible.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".md"):
            continue
        md_path = os.path.join(folder_path, filename)
        with open(md_path, 'r', encoding='utf-8') as file:
            text = file.read()
        text = text_formatter_md(text=text)
        all_texts.append({
            "filename": filename,
            "char_count": len(text),
            # split() without arguments ignores runs of whitespace, so
            # consecutive spaces are not counted as empty "words".
            "word_count": len(text.split()),
            "sentence_count_raw": len(text.split(". ")),
            # rough estimate: ~5 characters per subword token in English
            "token_count": len(text) / 5,
            "text": text
        })
    return all_texts
# Folder that holds the .md files to ingest
folder_path = "content_pull_request"
texts = open_and_read_md_folder(folder_path)
# Peek at one randomly chosen document record
random.sample(texts, 1)

Displaying the document details for the first 5 documents

In [None]:
import pandas as pd
# Tabulate the per-document records and preview the first five rows
df = pd.DataFrame(data=texts)
df.head(n=5)

In [None]:
# Summary statistics of the numeric columns, rounded to two decimals
df.describe().round(decimals=2)

Splitting the text by sentences

In [None]:
#!pip install -qU langchain-text-splitters
from langchain_text_splitters import Language, RecursiveCharacterTextSplitter
# Initialize the Markdown-aware splitter (pieces of at most 100 characters, no overlap)
md_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.MARKDOWN, chunk_size=100, chunk_overlap=0
)
for item in tqdm(texts):
    # split_text returns the pieces as plain strings, so there is no need to
    # build Document objects and read their page_content back out.
    item["sentences"] = md_splitter.split_text(item["text"])
    # Number of pieces the document was split into
    item["sentence_count_md"] = len(item["sentences"])

In [None]:
# Inspect one random document record, now including its "sentences" list
random.sample(texts, 1)

In [None]:
import pandas as pd
# Rebuild the table from the updated records and preview the first five rows
df = pd.DataFrame(data=texts)
df.head(n=5)

In [None]:
# Rebuild the table and summarise its numeric columns to two decimals
df = pd.DataFrame(data=texts)
df.describe().round(decimals=2)

Splitting the text into sentence chunks

In [None]:
# Number of sentences grouped into one chunk
num_sentence_chunk_size = 18


def split_list(input_list: list,
               slice_size: int = num_sentence_chunk_size) -> list[list]:
    """Split a list into consecutive slices of at most ``slice_size`` items.

    Args:
        input_list: The list to split; it is not modified.
        slice_size: Maximum number of items per slice. Must be positive.

    Returns:
        The slices in their original order. Every slice has ``slice_size``
        items except possibly the last one; an empty input gives ``[]``.

    Raises:
        ValueError: If ``slice_size`` is not a positive integer.
    """
    # A negative step would make range() empty and silently drop all data,
    # and a zero step raises an unrelated-looking error, so reject both here.
    if slice_size <= 0:
        raise ValueError(f"slice_size must be a positive integer, got {slice_size}")
    return [input_list[i:i + slice_size]
            for i in range(0, len(input_list), slice_size)]


# Quick demo: 25 items with the default size give one slice of 18 and one of 7
test_list = list(range(25))
split_list(test_list)

In [None]:
# Group every document's sentences into chunks and record how many chunks resulted
for item in tqdm(texts):
    item["sentence_chunks"] = split_list(item["sentences"], num_sentence_chunk_size)
    item["num_chunks"] = len(item["sentence_chunks"])

In [None]:
# Inspect one random document record, now including its sentence chunks
random.sample(texts, 1)

In [None]:
# Rebuild the table from the chunked records and summarise it to two decimals
df = pd.DataFrame(data=texts)
df.describe().round(decimals=2)

Splitting each chunk into its own item

In [None]:
import re
# Flatten the per-document chunk lists: every chunk becomes its own record
files_and_chunks = []
for item in tqdm(texts):
    for sentence_chunk in item["sentence_chunks"]:
        # Join the pieces back into a paragraph-like string. The text splitter
        # strips surrounding whitespace from each piece by default, so joining
        # with "" would glue the last and first words of neighbouring pieces.
        joined_sentence_chunk = " ".join(sentence_chunk).replace("  ", " ").strip()
        # Add a space after a full stop that is directly followed by a
        # capital letter: ".A" -> ". A"
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk)
        files_and_chunks.append({
            "filename": item["filename"],
            "sentence_chunk": joined_sentence_chunk,
            # statistics
            "chunk_char_count": len(joined_sentence_chunk),
            # split() without arguments does not count empty strings as words
            "chunk_word_count": len(joined_sentence_chunk.split()),
            # rough estimate: with subword tokenizing, 1 token = ~5 chars
            "chunk_token_count": len(joined_sentence_chunk) / 5,
        })
len(files_and_chunks)

In [None]:
# Inspect one random chunk record
random.sample(files_and_chunks, 1)

In [None]:
# Tabulate the chunk records and summarise their numeric columns to two decimals
df = pd.DataFrame(data=files_and_chunks)
df.describe().round(decimals=2)

Embedding the text chunks

In [None]:
# Vector size of the embedding model
# NOTE(review): `embeddings` is not assigned in any cell visible before this
# one, so on a fresh kernel (Restart & Run All) this line would raise
# NameError unless the name is defined elsewhere — confirm. The embedding
# model itself is only created in a later cell, so this cell probably belongs
# after the embedding step.
embeddings[0].shape

In [None]:
import torch
# Report the installed PyTorch version, CUDA availability and CUDA version
for label, value in (
    ("PyTorch Version:", torch.__version__),
    ("CUDA Available:", torch.cuda.is_available()),
    ("CUDA Version:", torch.version.cuda),
):
    print(label, value)

In [None]:
%%time
#prerequisite: pip install -U sentence-transformers
from sentence_transformers import SentenceTransformer
import torch
# Load the sentence-embedding model and place it on the GPU when one is available
embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2")
if torch.cuda.is_available():
    embedding_model.to("cuda")
else:
    embedding_model.to("cpu")
# Embed the chunks one by one, storing each vector on its own record
for item in tqdm(files_and_chunks):
    item["embedding"] = embedding_model.encode(item["sentence_chunk"])

In [None]:
# Collect just the chunk text of every record into one flat list
text_chunks = [chunk_record["sentence_chunk"] for chunk_record in files_and_chunks]

In [None]:
%%time
# Embed all chunk texts in batches
text_chunk_embeddings = embedding_model.encode(
    text_chunks,
    batch_size=32,  # select the batch size that works best
    convert_to_tensor=True,  # return the embeddings as a tensor instead of an array
)
text_chunk_embeddings

Save the embeddings to a file

In [None]:
# Display an example record. The index is clamped to the last record so the
# cell does not fail with IndexError when fewer than 120 chunks were produced.
example_index = min(119, len(files_and_chunks) - 1)
files_and_chunks[example_index]

In [None]:
# Write all chunk records (text, statistics and embedding) to a single .csv file
embeddings_df_save_path = "text_chunks_and_embeddings_df.csv"
text_chunks_and_embeddings_df = pd.DataFrame(data=files_and_chunks)
text_chunks_and_embeddings_df.to_csv(path_or_buf=embeddings_df_save_path, index=False)

In [None]:
# Read the saved .csv back in and preview its first five rows
text_chunks_and_embeddings_df_load = pd.read_csv(filepath_or_buffer=embeddings_df_save_path)
text_chunks_and_embeddings_df_load.head(n=5)