# Jobify — SkillNER Pretrained Model

This notebook contains everything needed to run skill extraction with the SkillNER pretrained model:

1. Install dependencies (Transformers, Torch, pdfplumber, pypandoc + Pandoc).
2. Import libraries needed for NLP + file handling.
3. Convert input file to text (PDF / DOCX / TXT) + clean the text.
4. Load the SkillNER pretrained model (ihk/skillner) and tokenizer.
5. Build the token-classification pipeline (with aggregation to merge subwords/spans).
6. Run skill extraction on the cleaned text and display the extracted skills + confidence.

## 1. Install Requirements

In [None]:
!pip -q install transformers pdfplumber pypandoc
!apt-get -qq update
!apt-get -qq install -y pandoc

## 2. Imports

In [None]:
import os
import re
import pdfplumber
import pypandoc

from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

## 3. Utils

In [None]:
# clean extracted text from docx and pdfs
def clean_cv_text(text):
    """Normalize raw CV text before it is handed to the skill-extraction model.

    Steps, in order: lowercase the text, drop "page X of Y" footers, replace
    bullet characters and long dashes with spaces, then collapse every run of
    whitespace (spaces, tabs, newlines) into one space and trim both ends.

    Whitespace is collapsed last on purpose: removing footers and bullets
    leaves gaps behind, so collapsing any earlier lets double spaces through.

    Args:
        text: Raw text extracted from a CV. ``None`` or an empty string yields "".

    Returns:
        The cleaned, lowercase, single-spaced string.
    """
    if not text:
        return ""

    # Convert all text to lowercase
    text = text.lower()

    # Remove page number artifacts commonly found in CV footers.
    # \s+ (not a literal space) because whitespace has not been normalized yet.
    text = re.sub(r'page\s+\d+\s+of\s+\d+', '', text)

    # Replace bullet characters and long dashes with a space so neighbours stay separated
    text = re.sub(r'[•●▪■–—]', ' ', text)

    # Collapse spaces, tabs and newlines (including the gaps created above) into one space
    text = re.sub(r'\s+', ' ', text)

    # Remove leading and trailing whitespace before sending text to the ML model
    return text.strip()



# transform input from docx to text
def docx_to_text(filepath):
    """Convert the document at ``filepath`` to plain text through pypandoc.

    Args:
        filepath: Path of the document to convert.

    Returns:
        The value returned by ``pypandoc.convert_file`` for the ``plain`` output format.
    """
    return pypandoc.convert_file(filepath, to='plain')

# transform input from pdf to text
def pdf_to_text(filepath):
    """Extract the text of every page of a PDF and join the pages with newlines.

    The newline separator matters: without it the last word of one page is
    glued to the first word of the next page, producing a token that is
    neither of the two original words.

    Args:
        filepath: Path of the PDF file to read.

    Returns:
        One string holding the text of all pages. A page for which
        ``extract_text()`` returns nothing contributes an empty string.
    """
    with pdfplumber.open(filepath) as pdf:
        # `or ""` guards against pages where extract_text() yields None
        page_texts = [page.extract_text() or "" for page in pdf.pages]

    return "\n".join(page_texts)

# extract text from input (docx/pdf/txt)
def extract_text(filepath):
    """Read a CV file and return its cleaned text.

    The format is chosen from the file extension, compared case-insensitively.

    Args:
        filepath: Path to a ``.docx``, ``.pdf`` or ``.txt`` file.

    Returns:
        The file's text after it has been passed through ``clean_cv_text``.

    Raises:
        ValueError: If the extension is not one of the three supported ones.
    """
    # Named `extension` rather than `type` so the builtin is not shadowed.
    extension = os.path.splitext(filepath)[1].lower()

    if extension == '.docx':
        raw_text = docx_to_text(filepath)

    elif extension == '.pdf':
        raw_text = pdf_to_text(filepath)

    elif extension == '.txt':
        # utf-8-sig reads plain UTF-8 as well, and additionally drops a leading
        # byte-order mark that would otherwise end up inside the first word.
        with open(filepath, "r", encoding="utf-8-sig") as f:
            raw_text = f.read()

    else:
        raise ValueError(f"Unsupported '{extension}' file type.")

    return clean_cv_text(raw_text)

## 4. Model Loader

In [None]:
# load model
def load_skillner(model_name="ihk/skillner"):
    """Load the tokenizer and token-classification model for a checkpoint.

    Args:
        model_name: Identifier passed to both ``from_pretrained`` calls.
            Defaults to ``"ihk/skillner"``, so calling without arguments
            behaves as before.

    Returns:
        A dict with the keys ``'tokenizer'`` and ``'model'``.
    """
    # A single name feeds both calls so tokenizer and model cannot drift apart.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForTokenClassification.from_pretrained(model_name)

    return {'tokenizer': tokenizer, 'model': model}

## 5. Skills Extraction

In [None]:
# Build the skill-extraction pipeline (model + tokenizer -> callable pipeline)
def build_skill_pipeline(model, tokenizer):
    """Create the token-classification pipeline used for skill extraction.

    Args:
        model: Token-classification model handed to ``pipeline``.
        tokenizer: Tokenizer handed to ``pipeline``.

    Returns:
        The object returned by ``pipeline`` for the settings below.
    """
    pipeline_settings = {
        "task": "token-classification",
        "model": model,
        "tokenizer": tokenizer,
        # merges B/I tokens
        "aggregation_strategy": "simple",
    }
    return pipeline(**pipeline_settings)

def extract_skills(input, pipeline):
    """Run ``pipeline`` on ``input`` and return its output unchanged.

    Args:
        input: Text to analyse.
        pipeline: Callable that accepts the text and returns the predictions.

    Returns:
        Whatever ``pipeline(input)`` returns.
    """
    # NOTE(review): very long inputs may exceed the model's maximum length — confirm how the pipeline handles them.
    predictions = pipeline(input)
    return predictions

## 6. Main Pipeline

In [None]:
# Load the SkillNER model and tokenizer (run once).
# The result is kept in `loaded` so the following cells can be re-run without repeating this call.
loaded = load_skillner()

In [None]:
# Build the skill-extraction pipeline (run once).
# `skillner` and `tokenizer` stay as separate names because later cells use them directly.
skillner, tokenizer = loaded['model'], loaded['tokenizer']
ner = build_skill_pipeline(model=skillner, tokenizer=tokenizer)

In [None]:
def run_main_pipeline(filepath, userID, userType):
    """Extract skills from a CV file, print them and return them.

    Uses the module-level ``ner`` pipeline, so the cell that builds it must
    have been run first.

    Args:
        filepath: Path to the CV (.docx, .pdf or .txt).
        userID: Identifier of the user the CV belongs to. Only needed by the
            database write, which is not implemented yet.
        userType: ``'company'`` marks a company; any other value is treated
            as a job seeker.

    Returns:
        The list of extracted skills as produced by ``extract_skills``.
        Each entry is read below through its ``'word'`` and ``'score'`` keys.
    """
    # Convert file to text for input (not named `input`, to leave the builtin alone)
    cv_text = extract_text(filepath=filepath)

    # Extract skills from input
    extracted_skills = extract_skills(cv_text, ner)

    # check if the user is a JobSeeker or a Company
    isJobSeeker = userType != 'company'

    # TODO: send skills to db once the persistence layer exists:
    # send_to_db(extracted_skills, userID, isJobSeeker=isJobSeeker)

    # print skills extracted
    for i, skill in enumerate(extracted_skills):
        print(f"{i}. skill: {skill['word']}, confidence: {skill['score']}")

    # Returned so callers that store the result get the skills instead of None.
    return extracted_skills

## 7. Run Main Pipeline

In [None]:
# Replace the placeholder with the path to a real CV before running this cell.
filepath = 'PUT FILEPATH TO THE CV HERE (DOCX/PDF/TXT)'

# Any userType other than 'company' is handled as a job seeker.
results = run_main_pipeline(filepath=filepath, userID=0, userType="admin")

## 8. Quick Test

In [None]:
# Sanity check on a hand-written sentence.
# Built through the shared helper so its settings cannot drift from the main pipeline.
test = build_skill_pipeline(model=skillner, tokenizer=tokenizer)

text = "Experienced with Python, SQL, Docker, FastAPI, and React."
test(text)