EXTRACT TEXT FROM FOLDERS OF RESUME AND JOB REQUIREMENTS THEN TRANSPORT IT TO CSV FOR DATA TRAINING PROCESS

In [24]:
import os
import textract
import PyPDF2
import docx
import pandas as pd
import pytesseract
from PIL import Image

# Set Tesseract OCR Path (Windows Only)
# Update this if Tesseract is installed in a different location
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

def extract_text_from_docx(file_path):
    """Extract text from a DOCX file."""
    try:
        doc = docx.Document(file_path)
        return "\n".join([para.text for para in doc.paragraphs])
    except Exception as e:
        print(f"Error extracting text from {file_path} using python-docx: {e}")
        return None

def extract_text_from_pdf(file_path):
    """Extract text from a PDF file using PyPDF2."""
    try:
        with open(file_path, "rb") as file:
            reader = PyPDF2.PdfReader(file)
            text = "\n".join([page.extract_text() for page in reader.pages if page.extract_text()])
            return text if text else None
    except Exception as e:
        print(f"Error extracting text from {file_path} using PyPDF2: {e}")
        return None

def extract_text_with_textract(file_path):
    """Fallback text extraction using Textract for unsupported formats."""
    try:
        return textract.process(file_path).decode("utf-8")
    except Exception as e:
        print(f"Error extracting text from {file_path} using textract: {e}")
        return None

def extract_text_from_image(file_path):
    """Extract text from an image file using Tesseract OCR."""
    try:
        image = Image.open(file_path)
        text = pytesseract.image_to_string(image, config='--psm 6')
        return text.strip() if text else None
    except Exception as e:
        print(f"Error extracting text from {file_path} using OCR: {e}")
        return None

def extract_text(file_path):
    """Detect file type and extract text accordingly."""
    text = None

    if file_path.endswith(".docx"):
        text = extract_text_from_docx(file_path)
    elif file_path.endswith(".pdf"):
        text = extract_text_from_pdf(file_path)
    elif file_path.endswith((".jpg", ".jpeg", ".png")):
        text = extract_text_from_image(file_path)
    else:
        print(f"Unsupported file format: {file_path}")
        return None
    
    # Fallback to Textract if other methods fail
    if text is None:
        text = extract_text_with_textract(file_path)
    
    return text

def process_files(resume_folder, job_folder, output_csv):
    data = []
    
    resumes = []
    for filename in os.listdir(resume_folder):
        file_path = os.path.join(resume_folder, filename)
        if os.path.isfile(file_path) and file_path.endswith((".pdf", ".docx", ".jpg", ".jpeg", ".png")):
            print(f"Extracting text from resume: {filename}")
            text = extract_text(file_path)
            if text:
                resumes.append({"Resume Text":text})
    
    job_requirements = []
    for filename in os.listdir(job_folder):
        file_path = os.path.join(job_folder, filename)
        if os.path.isfile(file_path) and file_path.endswith((".pdf", ".docx", ".jpg", ".jpeg", ".png")):
            print(f"Extracting text from job requirement: {filename}")
            text = extract_text(file_path)
            if text:
                job_requirements.append({"Job Requirement Text":text})
    
    # Save to CSV
    df = pd.DataFrame(data)
    df.to_csv(output_csv, index=False, encoding="utf-8")
    print(f"Extraction complete. Data saved to {output_csv}")

# Example usage
resume_folder = "resumes"  # Change this to your resume folder path
job_folder = "job_requirements"  # Change this to your job requirements folder path
output_csv = "output/dataset/resume-job-requirements.csv"
process_files(resume_folder, job_folder, output_csv)


Extracting text from resume: Accounting Profesional.pdf
Extracting text from resume: Call Center Agent.pdf
Extracting text from resume: Computer-Engineer.pdf
Extracting text from resume: Cybersecurity.pdf
Extracting text from resume: Data-Encoder.pdf
Extracting text from resume: Data-Scientist.pdf
Extracting text from resume: Freelance.pdf
Extracting text from resume: Graphic Artist.pdf
Extracting text from resume: Instructor.pdf
Extracting text from resume: IT-student.pdf
Extracting text from resume: Profile (0).pdf
Extracting text from resume: Profile (1).pdf
Extracting text from resume: Profile (10).pdf
Extracting text from resume: Profile (100).pdf
Extracting text from resume: Profile (101).pdf
Extracting text from resume: profile (102).pdf
Extracting text from resume: Profile (103).pdf
Extracting text from resume: Profile (104).pdf
Extracting text from resume: Profile (105).pdf
Extracting text from resume: Profile (106).pdf
Extracting text from resume: Profile (107).pdf
Extracting

FileNotFoundError: [WinError 3] The system cannot find the path specified: 'job_requirements'

PRE-PROCESSING

In [None]:
import re
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)  # Remove special characters
    text = " ".join([word for word in text.split() if word not in stop_words])  # Remove stop words
    return text

SCORE RESUMES USING JACCARD SIMILARITY BASED ON JOB REQUIREMENTS READY FOR AUTO LABELLING

In [None]:
from sklearn.metrics import jaccard_score
from sklearn.feature_extraction.text import CountVectorizer

def calculate_jaccard_similarity(text1, text2):
    vectorizer = CountVectorizer(binary=True)
    X = vectorizer.fit_transform([text1, text2]).toarray()
    return jaccard_score(X[0], X[1])

# Create all possible resume-job requirement pairs and compute Jaccard similarity
for resume_text in resumes:
    for job_text in job_requirements:
        jaccard_score_value = calculate_jaccard_similarity(resume_text, job_text)
        data.append({
            "Resume Text": resume_text,
            "Job Requirement Text": job_text,
            "Jaccard Score": jaccard_score_value
        })

AUTO-LABELLING

In [None]:
import pandas as pd



# Load the extracted text dataset
df = pd.read_csv("scored_dataset.csv")

# If Jaccard Score is <= 0, label as 0 (Not Suitable), else 1 (Suitable)
df["Label"] = df["Jaccard Score"].apply(lambda x: 1 if x > 0.5 else 0)

# Save the labeled dataset / Labelled for XGBoost Classification
df.to_csv("labeled_data.csv", index=False)

# Display label distribution
print(df["Label"].value_counts())  # Check how many are suitable (1) vs. not suitable (0)


In [20]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report

# Load the labeled dataset
df = pd.read_csv("labeled_data.csv")

# Feature extraction: Use TF-IDF to convert text into numerical format
vectorizer = TfidfVectorizer(max_features=5000)  # Limit to 5000 features for efficiency
X = vectorizer.fit_transform(df["Resume Text"] + " " + df["Job Requirement Text"])  # Combine both texts
y = df["Label"]  # Target variable (0 = Not Suitable, 1 = Suitable)

# Split data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert to DMatrix for XGBoost
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Train XGBoost model
params = {
    "objective": "binary:logistic",  # Binary classification
    "eval_metric": "logloss",
    "max_depth": 6,
    "learning_rate": 0.1,
    "n_estimators": 100,
}
model = xgb.train(params, dtrain, num_boost_round=100)

# Predict on test data
y_pred = model.predict(dtest)
y_pred = [1 if p >= 0.5 else 0 for p in y_pred]

# Evaluate model
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Save the trained model
model.save_model("xgboost_resume_classifier.json")


Parameters: { "n_estimators" } are not used.



Accuracy: 0.8288639687957094
              precision    recall  f1-score   support

           0       0.85      0.83      0.84      1099
           1       0.81      0.83      0.82       952

    accuracy                           0.83      2051
   macro avg       0.83      0.83      0.83      2051
weighted avg       0.83      0.83      0.83      2051



📌 Steps to Predict New Resumes

✅ 1. Load Your Trained XGBoost Model

In [21]:
import xgboost as xgb

# Load the trained model correctly
xgb_model = xgb.Booster()
xgb_model.load_model("xgboost_resume_classifier.json")  # Load the JSON model

✅ 2. Preprocess New Resumes

In [22]:
def preprocess_text(text):
    text = text.lower()  
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)  
    text = " ".join([word for word in text.split() if word not in stop_words])  
    return text


✅ 3. Compute Jaccard Score Against Job Descriptions

In [None]:
from sklearn.metrics import jaccard_score
from sklearn.feature_extraction.text import CountVectorizer

def compute_jaccard(resume_text, job_text):
    vectorizer = CountVectorizer(binary=True)
    text_data = [resume_text, job_text]
    
    # Convert text to binary vectors
    binary_vectors = vectorizer.fit_transform(text_data).toarray()
    
    # Compute Jaccard score
    return jaccard_score(binary_vectors[0], binary_vectors[1])

# Apply Jaccard similarity for new resumes
new_resumes["Jaccard Score"] = new_resumes.apply(
    lambda row: compute_jaccard(row["Resume Text"], row["Job Requirement Text"]), axis=1
)


✅ 4. Predict Suitability (0 = Not Suitable, 1 = Suitable)

In [None]:
# Select the feature used in training (Jaccard Score)
X_new = new_resumes[["Jaccard Score"]]

# Predict using the trained model
new_resumes["Prediction"] = xgb_model.predict(X_new)

# Map Predictions
new_resumes["Prediction Label"] = new_resumes["Prediction"].map({0: "Not Suitable", 1: "Suitable"})

✅ 5. Save the Predictions

In [None]:
new_resumes.to_csv("predicted_resumes.csv", index=False)
print("Predictions saved to predicted_resumes.csv")

To implement job recommendations for rejected resumes, you can follow these steps:

Identify Rejected Resumes: Filter resumes where the model predicts them as "Not Suitable" (label = 0).
Compare with Other Job Descriptions: Compute similarity scores (e.g., Jaccard, cosine similarity) between rejected resumes and other job descriptions.
Rank Suitable Jobs: Sort job descriptions based on similarity scores and suggest the top matches.
Save Recommendations: Store job suggestions in a CSV or database for further analysis.

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load datasets
resumes_df = pd.read_csv("resumes.csv")  # Your dataset containing resumes and suitability labels
jobs_df = pd.read_csv("job_descriptions.csv")  # Your dataset containing job descriptions

# Filter out rejected resumes
rejected_resumes = resumes_df[resumes_df["Label"] == 0]

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(stop_words='english')
resume_vectors = vectorizer.fit_transform(rejected_resumes["Resume Text"])
job_vectors = vectorizer.transform(jobs_df["Job Requirement Text"])

# Compute Cosine Similarity
similarity_matrix = cosine_similarity(resume_vectors, job_vectors)

# Generate job recommendations
job_suggestions = []
for i, resume in rejected_resumes.iterrows():
    top_indices = similarity_matrix[i].argsort()[::-1][:3]  # Get top 3 job matches
    suggested_jobs = jobs_df.iloc[top_indices]["Job Requirement Text"].tolist()
    
    job_suggestions.append({
        "Resume": resume["Resume Text"],
        "Suggested Jobs": suggested_jobs
    })

# Save recommendations
recommendations_df = pd.DataFrame(job_suggestions)
recommendations_df.to_csv("job_recommendations.csv", index=False)

print("Job recommendations saved to job_recommendations.csv")
