In [5]:
import markdown
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import re

# Processing markdown documents into raw text

In [6]:
def read_markdown_folder_to_df(folder_path):
    """
    Reads all .md files from a given folder and returns a DataFrame
    with 'title' (from filename) and 'content' (from file text).
    
    Parameters:
        folder_path (str): Path to the folder containing .md files.
    
    Returns:
        pd.DataFrame: DataFrame with columns ['title', 'content']
    """
    data = []

    for filename in os.listdir(folder_path):
        if filename.endswith(".md"):
            filepath = os.path.join(folder_path, filename)
            with open(filepath, "r", encoding="utf-8") as file:
                content = file.read()
            title = os.path.splitext(filename)[0]
            data.append({"title": title, "content": content})

    return pd.DataFrame(data)


In [7]:
df = read_markdown_folder_to_df("./documents")

In [8]:
from bs4 import BeautifulSoup

def markdown_to_text(md_content):
    # 1. Convert markdown to HTML
    html = markdown.markdown(md_content)
    
    # 2. Extract text
    soup = BeautifulSoup(html, "html.parser")
    text = soup.get_text(separator="\n")

    # 3. Remove multiple newlines
    text = re.sub(r'\n+', '\n', text)

    # 4. Remove newline before and after colon
    text = re.sub(r'\n\s*:\s*', ': ', text)
    text = re.sub(r':\s*\n', ': ', text)

    # 5. Remove newline before '('
    text = re.sub(r'\n\s*\(', '(', text)

    # 6. Clean extra spaces at line ends and join lines
    lines = [line.strip() for line in text.split('\n') if line.strip()]
    clean_text = '\n'.join(lines)

    return clean_text


In [9]:
df['content'] = df['content'].apply(markdown_to_text)

In [10]:
df

Unnamed: 0,title,content
0,relevant_projects,Relevant Projects\nNode.js web application for...
1,languages,Languages\nEnglish\n– Level C1\nGerman\n– Leve...
2,contact,Benedek Czotter\nEmail: czottibeni@gmail.com\n...
3,work_experience,Work experience\nData Engineer Trainee\nRobert...
4,skills,Skills\nCurrently studying in the specializati...
5,certificates,Certificates\nAWS Certified Cloud Practitioner...
6,education,Education\n2017 – 2022: Zrínyi Miklós High Sch...
7,about_me,"As a 22-year-old university student, I priorit..."


In [11]:
print(df.loc[5, 'content'])

Certificates
AWS Certified Cloud Practitioner(issued on 08.08.2024)


# Embedding

In [12]:
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
model = SentenceTransformer('all-MiniLM-L6-v2')

In [14]:
df['embedding'] = df['content'].apply(lambda x: model.encode(x))

In [15]:
df

Unnamed: 0,title,content,embedding
0,relevant_projects,Relevant Projects\nNode.js web application for...,"[-0.08795594, -0.029305805, -0.010981977, 0.01..."
1,languages,Languages\nEnglish\n– Level C1\nGerman\n– Leve...,"[-0.030317135, -0.07237996, 0.061659515, -0.05..."
2,contact,Benedek Czotter\nEmail: czottibeni@gmail.com\n...,"[-0.1331627, 0.049100894, -0.008418502, -0.038..."
3,work_experience,Work experience\nData Engineer Trainee\nRobert...,"[-0.08950661, 0.025780125, 0.022420205, -0.020..."
4,skills,Skills\nCurrently studying in the specializati...,"[-0.04616482, -0.07183436, 0.010111546, 0.0411..."
5,certificates,Certificates\nAWS Certified Cloud Practitioner...,"[-0.012080462, 0.02366498, 0.012267847, 0.0002..."
6,education,Education\n2017 – 2022: Zrínyi Miklós High Sch...,"[-0.058915474, -0.033082172, 0.07754192, 0.010..."
7,about_me,"As a 22-year-old university student, I priorit...","[0.08529875, 0.027285557, 0.02865278, -0.01366..."


In [16]:
len(df.loc[0, 'embedding'])

384

# Getting keywords

In [19]:
from keybert import KeyBERT

In [20]:
kw_model = KeyBERT("all-MiniLM-L6-v2")

In [21]:
def extract_keywords_keybert(text, top_n=10):
    keywords = kw_model.extract_keywords(text, keyphrase_ngram_range=(1, 1), stop_words='english', top_n=top_n)
    return [kw for kw, score in keywords]

In [22]:
df["keywords"] = df["content"].apply(lambda x: extract_keywords_keybert(x))

In [23]:
df.loc[0, 'keywords']

['mongodb',
 'node',
 'dataset',
 'middleware',
 'forecasting',
 'data',
 'prediction',
 'loan',
 'pet',
 'clustering']

# Practice returning the best document for given question

In [24]:
from sklearn.metrics.pairwise import cosine_similarity

In [25]:
def retrieve_most_relevant(query: str, df):
    # Step 1: Embed the query
    query_embedding = model.encode([query])[0]  # shape: (dim,)
    
    # Step 2: Compute cosine similarities
    doc_embeddings = np.vstack(df['embedding'].values)
    similarities = cosine_similarity([query_embedding], doc_embeddings)[0]
    
    # Step 3: Find the index of the most similar document
    best_idx = np.argmax(similarities)
    
    # Step 4: Return the most relevant row
    return df.iloc[best_idx]

In [26]:
query = "school"

In [27]:
result = retrieve_most_relevant(query, df)

print("Title:", result['title'])
print("Content:", result['content'])

Title: education
Content: Education
2017 – 2022: Zrínyi Miklós High School, Zalaegerszeg
2022 – Present: Budapest University of Technology and Economics
Faculty of Electrical Engineering and Informatics, Computer Science Engineering major
Currently writing my thesis about the segmentation of time series with ensemble models.
