# Dataset

[Resume Corpus](https://github.com/florex/resume_corpus)

# Setup

In [10]:
import pandas as pd
import os
import codecs
from bs4 import BeautifulSoup 
import codecs
import spacy
import anthropic
import json
import tqdm

# Extract

In [11]:
#nlp = spacy.load("en_core_web_lg")

def xfm(raw_text):
    """Strip markup from ``raw_text`` and return the remaining text.

    The parser is named explicitly. Without it BeautifulSoup picks whichever
    parser happens to be installed (and emits ``GuessedAtParserWarning``), so
    the extracted text could differ from one environment to the next.
    ``html.parser`` ships with Python, so no extra dependency is needed.

    Args:
        raw_text: Resume content that may contain HTML markup.

    Returns:
        The text content of the parsed document as a string.
    """
    return BeautifulSoup(raw_text, "html.parser").get_text()

In [12]:
# Names of all label files (*.lab) found in the corpus directory.
files = list(
    filter(
        lambda name: name.endswith(".lab"),
        os.listdir("../data/resumes_corpus"),
    )
)

In [13]:
# Build one record per labelled resume: its id (the file stem), its labels and
# the resume text with markup stripped by xfm().
corpus_dir = "../data/resumes_corpus"
recs = []
for fn in files:
    fn_prefix = os.path.splitext(fn)[0]

    # errors='ignore' drops undecodable bytes instead of aborting the run.
    with open(os.path.join(corpus_dir, fn_prefix + ".lab"),
              encoding='utf-8', errors='ignore') as f:
        labels = f.read()
    # splitlines() copes with both \n and \r\n endings; blank lines (such as
    # the one produced by a trailing newline) are not labels and are dropped.
    labels_split = [line.strip() for line in labels.splitlines() if line.strip()]

    # newline="" leaves line endings untranslated, which is what codecs.open
    # did, so the text handed to xfm() is unchanged.
    with open(os.path.join(corpus_dir, fn_prefix + ".txt"),
              encoding='utf-8', errors='ignore', newline="") as f:
        raw_text = f.read()
    plain_text = xfm(raw_text)

    recs.append({"id": fn_prefix, "labels": labels_split, "cv": plain_text})
    

# Transform

In [14]:
# Anthropic API client. No credentials are passed here, so the SDK has to find
# them itself (normally the ANTHROPIC_API_KEY environment variable).
client = anthropic.Anthropic()

# System prompt sent with every request. The encoding is explicit so the prompt
# is read the same way regardless of the platform's default locale encoding.
with open("./system.txt", encoding="utf-8") as f:
    system_msg = f.read()

def entity_parsing(cv):
    """Extract structured entities from a resume via the Anthropic Messages API.

    Sends ``cv`` as the single user message, with the module-level
    ``system_msg`` as the system prompt, and decodes the first content block
    of the reply as JSON after removing Markdown code-fence markers.

    Args:
        cv: Plain-text resume.

    Returns:
        The decoded JSON object as a dict. An empty dict is returned when the
        request fails, the reply is not valid JSON, or the decoded JSON is not
        an object. Failures are logged instead of raised so that one bad
        resume does not abort a whole batch.
    """
    import logging  # function-scope import: the file does not import logging

    log = logging.getLogger(__name__)

    try:
        message = client.messages.create(
            model="claude-3-7-sonnet-20250219",
            max_tokens=4096,
            system=system_msg,
            messages=[{"role": "user", "content": cv}],
        )
        # The model may wrap its answer in a ```json ... ``` fence.
        payload = message.content[0].text.replace("```json", "").replace("```", "")
        parsed = json.loads(payload)
    except Exception:
        # Deliberately best-effort (API errors, empty replies, bad JSON), but
        # the failure is recorded so it is no longer invisible.
        log.exception("entity_parsing failed; returning an empty result")
        return {}

    if not isinstance(parsed, dict):
        # Callers add keys to the result, which only works on a dict.
        log.warning(
            "entity_parsing expected a JSON object, got %s; returning an empty result",
            type(parsed).__name__,
        )
        return {}
    return parsed


In [None]:
# Run entity extraction on the first 1000 resumes and append one JSON line per
# resume to output.jsonl. The file is opened once instead of once per record;
# flushing after every line keeps finished records on disk if the run is
# interrupted.
# NOTE: the file is opened in append mode, so re-running this cell adds the
# same resumes again.
with open("output.jsonl", "a", encoding="utf-8") as f:
    for rec in tqdm.tqdm(recs[0:1000]):
        res = entity_parsing(rec["cv"])
        if not isinstance(res, dict):
            # Keys are added below, so anything but a dict is treated like a
            # failed parse.
            res = {}
        res["cv"] = rec["cv"]
        res["id"] = rec["id"]
        f.write(json.dumps(res) + "\n")
        f.flush()

# Load

In [19]:
# MyGraphDB comes from a `database` module that is not shown in this file
# (presumably a project-local graph database wrapper - confirm). Its add_*
# methods are used below to create nodes and relationships.
from database import MyGraphDB
# Faker generates the synthetic names and e-mail addresses passed to
# db.add_person() below.
from faker import Faker
# Single database handle shared by all of the loading code that follows.
db = MyGraphDB()

In [17]:
# Read the extraction results back: output.jsonl holds one JSON document per
# line, and each line becomes one record.
with open("output.jsonl") as f:
    jsonl = list(f)
recs = list(map(json.loads, jsonl))

In [20]:
# Create one person node per record. Name and e-mail are synthetic (generated
# by Faker); rec["id"] is the resume id the node is keyed on.
# The generator is built once: constructing a new Faker for every record was
# repeated, loop-invariant work.
fake = Faker()
for rec in tqdm.tqdm(recs):
    db.add_person(fake.name(), fake.email(), rec["id"])

# Load the graph in two passes: first create every distinct company,
# certification and skill node, then link each person to them.
#
# A record whose parsing failed holds only "cv" and "id" (entity_parsing
# returns {} on error), so a missing or null entity list is treated as empty
# instead of raising KeyError.

# NOTE(review): relationship dates are a fixed placeholder even though each
# role carries 'start_date' / 'end_date' (free text such as 'June 2011' and
# 'Present'). Confirm which date format MyGraphDB expects before passing the
# real values through.
PLACEHOLDER_DATE = "2025-01-01"

# Pass 1a: collect the distinct entity names across all records.
companies = set()
certs = set()
skills = set()
for rec in tqdm.tqdm(recs):
    companies.update(role['company'] for role in (rec.get('past_roles') or []))
    certs.update(cert['title'] for cert in (rec.get('certifications') or []))
    skills.update(rec.get('skills') or [])

companies = list(companies)
certs = list(certs)
skills = list(skills)

# Pass 1b: create one node per distinct entity.
for c in companies:
    db.add_company(c)

for c in certs:
    db.add_certification(c)

for s in skills:
    db.add_skill(s)

# Pass 2: connect every person to the entities found in their resume.
for rec in tqdm.tqdm(recs):
    personId = rec["id"]

    for role in rec.get('past_roles') or []:
        db.add_WORKED_AT_rel(personId, role['company'], role['role_level'],
                             PLACEHOLDER_DATE, PLACEHOLDER_DATE)

    for cert in rec.get('certifications') or []:
        db.add_HAS_CERTIFICATION_rel(personId, cert['title'], PLACEHOLDER_DATE)

    for skill in rec.get('skills') or []:
        db.add_HAS_SKILL_rel(personId, skill)


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 340/340 [00:01<00:00, 208.57it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 340/340 [00:00<00:00, 137557.96it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 340/340 [01:11<00:00,  4.73it/s]


In [8]:
recs[0]["id"]

'16939'

In [11]:
recs[0]["past_roles"]

[{'company': 'United States Air Force',
  'start_date': 'June 2011',
  'end_date': 'Present',
  'role_name': 'Network Administrator',
  'role_level': 'Manager'}]