In [2]:
import functools
import os
import re

import pytesseract
import textract
import torch
from PIL import Image
from transformers import BertTokenizer, BertModel

def preprocess_text(text):
    """Normalise free text before it is tokenised.

    Steps, in order:
      1. lower-case the text;
      2. drop every character that is neither a word character (letter,
         digit, underscore) nor whitespace;
      3. collapse each run of whitespace into a single space and trim the
         ends.

    Punctuation is removed *before* whitespace is collapsed. Doing it the
    other way round leaves a double space wherever a punctuation mark stood
    between two spaces (e.g. "a - b" -> "a  b").

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    # Collapse last so gaps opened up by removed punctuation are merged too.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def extract_text(file_path):
    """Extract the text content of a file.

    Files whose name ends in .jpg, .jpeg or .png (compared
    case-insensitively, so "SCAN.PNG" is also treated as an image) are
    opened with PIL and passed to ``pytesseract.image_to_string``. Every
    other file is handed to ``textract.process`` and the returned bytes are
    decoded as UTF-8.

    Args:
        file_path: Path of the file to read, as ``str`` or ``os.PathLike``.

    Returns:
        A ``(text, error)`` tuple. On success ``error`` is ``None``; on any
        failure ``text`` is ``None`` and ``error`` is the exception message.
        No exception is propagated to the caller.
    """
    image_suffixes = ('.jpg', '.jpeg', '.png')
    try:
        # Accept pathlib.Path and friends; raises TypeError (reported through
        # the error slot) for objects that are not path-like.
        path = os.fspath(file_path)
        if path.lower().endswith(image_suffixes):
            # Context manager closes the underlying file handle once OCR is done.
            with Image.open(path) as image:
                text = pytesseract.image_to_string(image)
        else:
            text = textract.process(path).decode('utf-8')
        return text, None
    except Exception as e:
        return None, str(e)

def get_bert_embedding(text, model, tokenizer):
    """Embed text as the mean of the model's last-layer token vectors.

    The text is tokenised (truncated to at most 512 tokens, padded, returned
    as PyTorch tensors) and run through ``model`` with gradients disabled.
    The token vectors in ``last_hidden_state`` are then averaged over
    dimension 1, weighting each position by the tokenizer's attention mask
    so that padding positions do not dilute the average. When nothing is
    padded (e.g. a single string) the mask is all ones and this is the plain
    mean over dimension 1.

    Args:
        text: Input passed straight to ``tokenizer``.
        model: Callable accepting the tokenizer output as keyword arguments
            and returning an object with a ``last_hidden_state`` tensor.
        tokenizer: Callable returning a mapping that includes an
            ``attention_mask`` tensor.

    Returns:
        ``last_hidden_state`` reduced over dimension 1
        (assumes a (batch, tokens, hidden) layout - TODO confirm for
        non-BERT models).
    """
    tokens = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**tokens)

    hidden = outputs.last_hidden_state
    # Broadcast the mask over the trailing (hidden) dimension.
    mask = tokens["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    summed = (hidden * mask).sum(dim=1)
    # clamp guards against division by zero for a fully masked row.
    counts = mask.sum(dim=1).clamp(min=1)
    return summed / counts

@functools.lru_cache(maxsize=None)
def _load_bert(model_name="bert-base-uncased"):
    """Load the tokenizer and model for ``model_name``, once per name."""
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertModel.from_pretrained(model_name)
    return tokenizer, model


def compatibility(resume_text, job_desc):
    """Score how similar a resume is to a job description.

    Both texts are normalised with ``preprocess_text``, embedded with
    ``get_bert_embedding`` using ``bert-base-uncased``, and compared with
    cosine similarity.

    The tokenizer and model are loaded on the first call and reused
    afterwards; loading them again for every comparison repeats the same
    expensive ``from_pretrained`` work without changing the result.

    Args:
        resume_text: Resume text as a single string.
        job_desc: Job-description text as a single string.

    Returns:
        The cosine similarity multiplied by 100 and rounded to two decimals.
    """
    tokenizer, model = _load_bert()

    resume_embedding = get_bert_embedding(preprocess_text(resume_text), model, tokenizer)
    job_desc_embedding = get_bert_embedding(preprocess_text(job_desc), model, tokenizer)

    score = torch.nn.functional.cosine_similarity(resume_embedding, job_desc_embedding)
    # .item() requires a single-element tensor, i.e. one resume vs one description.
    return round(score.item() * 100, 2)

In [1]:
!pip install textract

distutils: /usr/local/include/python3.11/UNKNOWN
sysconfig: /usr/include/python3.11/UNKNOWN
user = False
home = None
root = None
prefix = None


In [3]:
!pip install pip==21.2

Collecting pip==21.2
  Downloading pip-21.2-py3-none-any.whl.metadata (4.2 kB)
Reason for being yanked: See https://github.com/pypa/pip/issues/8711
Downloading pip-21.2-py3-none-any.whl (1.6 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.6/1.6 MB 23.4 MB/s eta 0:00:00
Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-21.2


In [4]:
!pip install pytesseract

distutils: /usr/local/include/python3.11/UNKNOWN
sysconfig: /usr/include/python3.11/UNKNOWN
user = False
home = None
root = None
prefix = None
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
  distutils: /usr/local/include/python3.11/pytesseract
  sysconfig: /usr/include/python3.11/pytesseract
Successfully installed pytesseract-0.3.13
