## 1. Define Your Keyword/Phrase List
First, collect the terms you care about into two categories:

In [1]:
import re
import spacy
from spacy.matcher import PhraseMatcher

with open("output.md", encoding="utf-8") as f:
    raw_md = f.read()

### 2. Preprocess the Resume Text

In [2]:
# 2. Define your normalize() helper
def normalize(text):
    text = text.lower() # lowercase everything
    # strip out link syntax [text](url) → text
    text = re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", text)
    # remove Markdown headers (##, ###, …), bullets (-, *) and code backticks
    text = re.sub(r"^#{1,6}\s*|^[-*]\s+|`+", "", text, flags=re.MULTILINE)
    # collapse any other punctuation to spaces
    text = re.sub(r"[^\w\s]", " ", text)
    return text

In [3]:
# 3. Normalize the raw Markdown
clean_text = normalize(raw_md)

In [4]:
required = [t.lower() for t in [
  "LLMs", "prompt engineering", "embeddings", "RAG workflows",
  "Python", "LangChain", "spaCy", "knowledge extraction", "Data Scientist"
]]
optional = [t.lower() for t in  [
  "TensorFlow", "Keras", "PyTorch",
  "vector database", "SQL", "Pandas", "FAISS", "document AI"
]]

### 3. Scoring

In [5]:
# 5. Score!
score = 0
found_terms = set()

for term in required:
    if term in clean_text:
        score +=1 # full point
        found_terms.add(term)

    for term in optional:
        if term in clean_text:
            score += 0.5 #half-point
            found_terms.add(term)

In [12]:
# normalzie score scale
max_points = len(required) * 1 + len(optional) * 0.5
percent = (score / max_points) * 100

print(f"Candidate score: {score} out of {max_points} ({percent:.0f}%)")

Candidate score: 21.0 out of 13.0 (162%)


In [7]:
# 6. Inspect intermediate text (optional)
print(clean_text[:200], "...")  # see first 200 chars of cleaned text

elizaveta guseva  phd

liza guseva hey com

 657 141 280  barcelona  spain   linkedin  elizaguseva    ml ai skills   ml ai research  system design  productionization  a b testing  genai  agentic ai  r ...


In [22]:
# 6. Report
print(f"Matched terms: {sorted(found_terms)}")
print(f"Raw score: {score} / {max_points}")
print(f"Percentage match: {percent:.1f}%")

Matched terms: ['data scientist', 'keras', 'pandas', 'python', 'sql']
Raw score: 15.5 / 13.0
Percentage match: 119.2%


In [13]:
nlp = spacy.load("en_core_web_sm")
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
patterns = [nlp.make_doc(term) for term in required + optional]
matcher.add("SKILL_MATCH", patterns)

doc = nlp(raw_md)
matches = matcher(doc)




In [14]:
# 1. Confirm presence
for term in ["python", "pandas", "keras"]:
    print(term, "→", term in clean_text)

python → True
pandas → True
keras → True


In [15]:

# 2. If found, show a snippet around the first occurrence
for term in ["python", "pandas", "keras"]:
    idx = clean_text.find(term)
    if idx != -1:
        start = max(0, idx - 50)
        end   = min(len(clean_text), idx + 50)
        snippet = clean_text[start:end].replace("\n", " ")
        print(f"\n…{snippet}…")


…enai  agentic ai  retrieval  evals    techstack   python  sql  docker  fastapi flask  rest  langchai…

…flask  rest  langchain  pydanticai  ml libraries  pandas  numpy   sklearn  keras  experience  self e…

…ydanticai  ml libraries  pandas  numpy   sklearn  keras  experience  self employed  founder  ml ai c…
