In [None]:
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from PIL import Image
import textract
import pytesseract


def preprocess_text(text):
    """Normalise free text before it is vectorised.

    The text is lower-cased, every character that is neither a word character
    nor whitespace is turned into a space, and runs of whitespace are then
    collapsed to a single space with leading/trailing whitespace removed.

    Args:
        text: The raw string to normalise.

    Returns:
        The normalised string.
    """
    text = text.lower()
    # Substitute a space (not the empty string) so that tokens joined only by
    # punctuation, e.g. "xml/json", stay separate words instead of fusing.
    text = re.sub(r'[^\w\s]', ' ', text)
    # Collapse whitespace last: the substitution above can introduce new runs
    # of spaces that an earlier collapse would have missed.
    return re.sub(r'\s+', ' ', text).strip()

def extract_text(file_path):
    """Read the textual content of a document or image file.

    Files whose name ends in .jpg, .jpeg or .png (in any letter case) are run
    through pytesseract OCR; every other file is handed to textract and the
    resulting bytes are decoded as UTF-8.

    Args:
        file_path: Path of the file to read (a string or path-like object).

    Returns:
        The extracted text, or None if anything went wrong while reading; the
        error is printed rather than raised.
    """
    image_suffixes = ('.jpg', '.jpeg', '.png')
    try:
        path = str(file_path)
        # Compare on the lower-cased name so "SCAN.JPG" is treated as an image.
        if path.lower().endswith(image_suffixes):
            # The context manager closes the underlying file handle once OCR
            # has finished.
            with Image.open(path) as image:
                return pytesseract.image_to_string(image)
        return textract.process(path).decode('utf-8')
    except Exception as e:
        # Deliberate best-effort: report the problem and let the caller decide.
        print(f"Error reading file: {e}")
        return None

def compatibility(resume_text, job_desc):
    """Score how closely a resume matches a job description, as a percentage.

    Both texts are normalised with ``preprocess_text``, turned into TF-IDF
    vectors over unigrams and bigrams (English stop words removed, at most
    5000 features) and compared with cosine similarity.

    Args:
        resume_text: Raw text of the resume.
        job_desc: Raw text of the job description.

    Returns:
        The cosine similarity scaled to the 0-100 range and rounded to two
        decimals; 0.0 when the vectoriser finds no usable term in the texts.
    """
    clean = preprocess_text(resume_text)
    job_desc_clean = preprocess_text(job_desc)

    vectorizer = TfidfVectorizer(
        ngram_range=(1, 2),
        max_features=5000,
        # The corpus is exactly two documents, so min_df=2 would keep only the
        # terms present in BOTH texts and discard everything that tells them
        # apart, inflating the score. min_df=1 keeps each document's own terms.
        min_df=1,
        stop_words='english',
    )
    try:
        matrix = vectorizer.fit_transform([clean, job_desc_clean])
    except ValueError:
        # Raised by the vectoriser when no term survives (e.g. empty input or
        # stop words only); there is nothing to compare, so report no match.
        return 0.0

    score = cosine_similarity(matrix[0:1], matrix[1:2])
    return round(float(score[0][0]) * 100, 2)

if __name__ == "__main__":
    # Interactive driver: ask for the resume location and the posting text,
    # then report how well the two match.
    resume_file = input("Enter path to resume(supported: .pdf, .jpg, .txt): ")
    job_desc = input("Enter job description: ")

    resume_text = extract_text(resume_file)

    if not resume_text:
        # Covers both a failed read (None) and a file that produced no text.
        print("Failed to process resume. Please check file")
    else:
        score = compatibility(resume_text, job_desc)
        print(f"Compatibility Score: {score!s}%")

Enter path to resume(supported: .pdf, .jpg, .txt): /content/Resume-Abhishek Ghaisas-1 (1).pdf
Enter job description: High School diploma required.  Current student pursuing a bachelor's degree in Computer Science, Data Engineering, Data Science, Informatics or related field. Previous internship is a plus.  Ability to validate data received from various web services and other integration technologies (REST, XML/JSON, SOAP, ETL).  Knowledge of relational databases, SQL queries, relational and dimensional data modeling and data warehouses.  Strong attention to detail.
Compatibility Score: 94.81%


In [None]:
!pip install textract

distutils: /usr/local/include/python3.11/UNKNOWN
sysconfig: /usr/include/python3.11/UNKNOWN[0m
user = False
home = None
root = None
prefix = None[0m
Collecting textract
  Downloading textract-1.6.5-py3-none-any.whl (23 kB)
Collecting SpeechRecognition~=3.8.1
  Downloading SpeechRecognition-3.8.1-py2.py3-none-any.whl (32.8 MB)
[K     |████████████████████████████████| 32.8 MB 79 kB/s 
[?25hCollecting argcomplete~=1.10.0
  Downloading argcomplete-1.10.3-py2.py3-none-any.whl (36 kB)
Collecting xlrd~=1.2.0
  Downloading xlrd-1.2.0-py2.py3-none-any.whl (103 kB)
[K     |████████████████████████████████| 103 kB 58.7 MB/s 
[?25hCollecting extract-msg<=0.29.*
  Downloading extract_msg-0.28.7-py2.py3-none-any.whl (69 kB)
[K     |████████████████████████████████| 69 kB 6.4 MB/s 
[?25hCollecting pdfminer.six==20191110
  Downloading pdfminer.six-20191110-py2.py3-none-any.whl (5.6 MB)
[K     |████████████████████████████████| 5.6 MB 48.6 MB/s 
[?25hCollecting python-pptx~=0.6.18
  Download

In [None]:
!pip install pip==21.2

Collecting pip==21.2
  Downloading pip-21.2-py3-none-any.whl.metadata (4.2 kB)
Reason for being yanked: See https://github.com/pypa/pip/issues/8711[0m[33m
[0mDownloading pip-21.2-py3-none-any.whl (1.6 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.6 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.1/1.6 MB[0m [31m4.6 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━[0m [32m0.8/1.6 MB[0m [31m13.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-21.2


In [None]:
!pip install pytesseract

Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.13
