## 1. Define Your Keyword/Phrase List
First, collect the terms you care about into two categories:

In [1]:
import re
import spacy
from spacy.matcher import PhraseMatcher

file_path = "/mnt/data/KANDACE_LOUDOR.md"
with open("KANDACE_LOUDOR.md", encoding="utf-8") as f:
    raw_md = f.read()

### 2. Preprocess the Resume Text

In [2]:
# 2. Define your normalize() helper
def normalize(text):
    text = text.lower() # lowercase everything
    # strip out link syntax [text](url) → text
    text = re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", text)
    # remove Markdown headers (##, ###, …), bullets (-, *) and code backticks
    text = re.sub(r"^#{1,6}\s*|^[-*]\s+|`+", "", text, flags=re.MULTILINE)
    # collapse any other punctuation to spaces
    text = re.sub(r"[^\w\s]", " ", text)
    return text

In [3]:
# 3. Normalize the raw Markdown
clean_text = normalize(raw_md)

In [4]:
required = [t.lower() for t in [
  "LLMs", "prompt engineering", "embeddings", "RAG workflows",
  "Python", "LangChain", "spaCy", "knowledge extraction", "Data Scientist"
]]
optional = [t.lower() for t in  [
  "TensorFlow", "Keras", "PyTorch",
  "vector database", "SQL", "Pandas", "FAISS", "document AI"
]]

In [5]:
# sanity: any accidental overlap?
overlap = set(required) & set(optional)
if overlap:
    print("WARNING overlap between required/optional:", overlap)

In [6]:
# --- match as unique sets ---
req_found = {t for t in required if t in clean_text}
opt_found = {t for t in optional if t in clean_text}

### 3. Scoring

In [7]:
# 5. Score!
score = 0
found_terms = set()

# Score required = 1 point each
for term in required:
    if term in clean_text:
        score +=1 # full point
        found_terms.add(term)
        
# Score optional = 0.5 points each
    for term in optional:
        if term in clean_text:
            score += 0.5 #half-point
            found_terms.add(term)

In [8]:
# compute percentage
max_points = len(required) * 1 + len(optional) * 0.5
percent = (score / max_points) * 100 if max_points else 0

In [9]:
# 6. Inspect intermediate text (optional)
print(clean_text[:200], "...")  # see first 200 chars of cleaned text

kandace loudor

data scientist

contact

kloudor email com

 123  456 7890 mount laurel  nj

linkedin

github

education

b s  statistics rutgers university september 2011   april 2015

new brunswick  ...


In [None]:
# 6. Report
print("Required matched:", sorted(req_found))
print("Optional matched:", sorted(opt_found))
print(f"Raw score: {score} / {max_points}")
print(f"Percentage match: {percent:.1f}%")

Required matched: ['data scientist', 'python']
Optional matched: ['keras', 'pandas', 'sql']
Raw score: 15.5 / 13.0
Percentage match: 119.2%


In [10]:
nlp = spacy.load("en_core_web_sm")
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
patterns = [nlp.make_doc(term) for term in required + optional]
matcher.add("SKILL_MATCH", patterns)

doc = nlp("KANDACE_LOUDOR.md")
matches = matcher(doc)




In [11]:
# 1. Confirm presence
for term in ["python", "pandas", "keras"]:
    print(term, "→", term in clean_text)

python → True
pandas → True
keras → True


In [12]:

# 2. If found, show a snippet around the first occurrence
for term in ["python", "pandas", "keras"]:
    idx = clean_text.find(term)
    if idx != -1:
        start = max(0, idx - 50)
        end   = min(len(clean_text), idx + 50)
        snippet = clean_text[start:end].replace("\n", " ")
        print(f"\n…{snippet}…")


…ber 2011   april 2015  new brunswick  nj  skills  python  numpy  pandas   scikit learn  keras  flask…

…l 2015  new brunswick  nj  skills  python  numpy  pandas   scikit learn  keras  flask   sql  mysql  …

…nj  skills  python  numpy  pandas   scikit learn  keras  flask   sql  mysql  postgres   git time ser…
