PRE-PROCESSING STEPS

In [None]:
import os
import textract
import PyPDF2
import docx
import pandas as pd
import pytesseract
from PIL import Image
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import inflect

# Download necessary nltk data (if needed)
# nltk.download('stopwords')
# nltk.download('wordnet')

# Set up inflect engine for number-to-word conversion
p = inflect.engine()

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Set Tesseract OCR Path (Windows Only)
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

#----------------------------
# TEXT EXTRACTION FUNCTIONS
#----------------------------
def extract_text_from_docx(file_path):
    try:
        doc = docx.Document(file_path)
        return "\n".join([para.text for para in doc.paragraphs])
    except Exception as e:
        print(f"Error extracting text from {file_path} using python-docx: {e}")
        return None

def extract_text_from_pdf(file_path):
    try:
        with open(file_path, "rb") as file:
            reader = PyPDF2.PdfReader(file)
            text = "\n".join([page.extract_text() for page in reader.pages if page.extract_text()])
            return text if text else None
    except Exception as e:
        print(f"Error extracting text from {file_path} using PyPDF2: {e}")
        return None

def extract_text_from_image(file_path):
    try:
        image = Image.open(file_path)
        text = pytesseract.image_to_string(image, config='--psm 6')
        return text.strip() if text else None
    except Exception as e:
        print(f"Error extracting text from {file_path} using OCR: {e}")
        return None

def extract_text(file_path):
    """Detect file type and extract text accordingly."""
    text = None
    if file_path.endswith(".docx"):
        text = extract_text_from_docx(file_path)
    elif file_path.endswith(".pdf"):
        text = extract_text_from_pdf(file_path)
    elif file_path.endswith((".jpg", ".jpeg", ".png")):
        text = extract_text_from_image(file_path)
    
    # Fallback to Textract if primary extraction fails
    if text is None:
        try:
            text = textract.process(file_path).decode("utf-8")
        except Exception as e:
            print(f"Error extracting text from {file_path} using textract: {e}")
            return None
    
    return text

#----------------------------
# TEXT PREPROCESSING
#----------------------------
def preprocess_text(text):
    """Lowercase, remove special characters, normalize numbers, lemmatize, and remove stopwords."""
    if not text:
        return ""

    text = text.lower()
    
    # Normalize numbers (convert digits to words)
    words = text.split()
    normalized_words = []
    for word in words:
        if word.isdigit():  # Check if it's a number
            try:
                word = p.number_to_words(int(word))  # Convert to words
            except:
                word = p.number_to_words(int(word))
                # pass  # If conversion fails, keep the original number
        normalized_words.append(word)
    
    text = " ".join(normalized_words)

    # Remove special characters
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    
    # Lemmatization and Stopword Removal
    text = " ".join([lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words])

    return text

#----------------------------
# JACCARD SIMILARITY
#----------------------------
def calculate_jaccard_similarity(text1, text2):
    set1, set2 = set(text1.split()), set(text2.split())
    intersection = set1.intersection(set2)
    union = set1.union(set2)
    return len(intersection) / len(union) if union else 0, intersection

#----------------------------
# PROCESS FILES & GENERATE CSV
#----------------------------
def process_files(resume_folder, job_folder, output_csv):
    data = []
    resumes = []
    job_requirements = []
    
    # Extract and preprocess resumes
    for filename in os.listdir(resume_folder):
        file_path = os.path.join(resume_folder, filename)
        if file_path.endswith((".pdf", ".docx", ".jpg", ".jpeg", ".png")):
            print(f"Extracting text from resume: {filename}")
            text = extract_text(file_path)
            if text:
                resumes.append({"filename": filename, "text": preprocess_text(text)})

    # Extract and preprocess job descriptions
    for filename in os.listdir(job_folder):
        file_path = os.path.join(job_folder, filename)
        if file_path.endswith((".pdf", ".docx", ".jpg", ".jpeg", ".png")):
            print(f"Extracting text from job requirement: {filename}")
            text = extract_text(file_path)
            if text:
                job_requirements.append({"filename": filename, "text": preprocess_text(text)})

    # Compute Jaccard similarity
    for resume in resumes:
        for job in job_requirements:
            jaccard_score_value, common_words = calculate_jaccard_similarity(resume["text"], job["text"])
            data.append({
                "Resume Text": resume["text"],
                "Job Text": job["text"],
                "Jaccard Score": jaccard_score_value,
                "Common Words": " ".join(common_words)  # Convert set to string
            })

    # Save to CSV
    df = pd.DataFrame(data)
    df.to_csv(output_csv, index=False, encoding="utf-8")
    print(f"Extraction complete. Data saved to {output_csv}")

# Example usage
resume_folder = "data-for-training/resumes"  # Change this to your resume folder path
job_folder = "data-for-training/job requirements"  # Change this to your job requirements folder path
output_csv = "output/scored-dataset.csv"
process_files(resume_folder, job_folder, output_csv)


Extracting text from resume: Accounting Profesional.pdf
Extracting text from resume: Call Center Agent.pdf
Extracting text from resume: Computer-Engineer.pdf
Extracting text from resume: Cybersecurity.pdf
Extracting text from resume: Data-Encoder.pdf
Extracting text from resume: Data-Scientist.pdf
Extracting text from resume: Freelance.pdf
Extracting text from resume: Graphic Artist.pdf
Extracting text from resume: Instructor.pdf
Extracting text from resume: IT-student.pdf
Extracting text from resume: Profile (0).pdf
Extracting text from resume: Profile (1).pdf
Extracting text from resume: Profile (10).pdf
Extracting text from resume: Profile (100).pdf
Extracting text from resume: Profile (101).pdf
Extracting text from resume: profile (102).pdf
Extracting text from resume: Profile (103).pdf
Extracting text from resume: Profile (104).pdf
Extracting text from resume: Profile (105).pdf
Extracting text from resume: Profile (106).pdf
Extracting text from resume: Profile (107).pdf
Extracting

AUTO-LABELLING BASED ON JACCARD SCORES

In [41]:
import pandas as pd

# Load the extracted text dataset
df = pd.read_csv("output/scored-dataset.csv")

# Ensure "Jaccard Score" exists
if "Jaccard Score" not in df.columns:
    raise ValueError("Jaccard Score column is missing from the dataset!")

# # Convert Jaccard Score to float, handling errors
# df["Jaccard Score"] = pd.to_numeric(df["Jaccard Score"], errors="coerce")

# # Handle NaN values (replace with 0 to avoid errors)
# df["Jaccard Score"] = df["Jaccard Score"].fillna(0)

# Apply label based on Jaccard Score threshold
df["Label"] = df["Jaccard Score"].apply(lambda x: 1 if x > 0.05 else 0)

# Save the labeled dataset for XGBoost classification
df.to_csv("output/labeled-data.csv", index=False)

# Display label distribution
print("Label distribution:\n", df["Label"].value_counts())


Label distribution:
 Label
0    11780
1     2908
Name: count, dtype: int64


In [20]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report

# Load the labeled dataset
df = pd.read_csv("output/labeled-data.csv")

# Feature extraction: Use TF-IDF to convert text into numerical format
vectorizer = TfidfVectorizer(max_features=5000)  # Limit to 5000 features for efficiency
X = vectorizer.fit_transform(df["Resume Text"] + " " + df["Job Text"])  # Combine both texts
y = df["Label"]  # Target variable (0 = Not Suitable, 1 = Suitable)

# Split data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert to DMatrix for XGBoost
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Train XGBoost model
params = {
    "objective": "binary:logistic",  # Binary classification
    "eval_metric": "logloss",
    "max_depth": 6,
    "learning_rate": 0.1,
    "n_estimators": 100,
}
model = xgb.train(params, dtrain, num_boost_round=100)

# Predict on test data
y_pred = model.predict(dtest)
y_pred = [1 if p >= 0.05 else 0 for p in y_pred]

# Evaluate model
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Save the trained model
model.save_model("xgboost_resume_classifier.json")


Parameters: { "n_estimators" } are not used.



Accuracy: 0.6099387338325392
              precision    recall  f1-score   support

           0       0.99      0.52      0.69      2387
           1       0.32      0.98      0.49       551

    accuracy                           0.61      2938
   macro avg       0.66      0.75      0.59      2938
weighted avg       0.87      0.61      0.65      2938



📌 Steps to Predict New Resumes

✅ 1. Load Trained XGBoost Model

In [21]:
import xgboost as xgb

# Load the trained model correctly
xgb_model = xgb.Booster()
xgb_model.load_model("xgboost_resume_classifier.json")  # Load the JSON model

✅ Step 2: Load & Preprocess New Resumes

In [22]:
import pandas as pd

# Load new resumes dataset (Assuming it's a CSV)
new_resumes = pd.read_csv("output/new_resumes.csv")  # Update with your filename

# Apply the same text preprocessing function
new_resumes["processed_text"] = new_resumes["Resume Text"].apply(preprocess_text)


✅ 3. Compute Jaccard Score Against Job Descriptions

To implement job recommendations for rejected resumes, you can follow these steps:

Identify Rejected Resumes: Filter resumes where the model predicts them as "Not Suitable" (label = 0).
Compare with Other Job Descriptions: Compute similarity scores (e.g., Jaccard, cosine similarity) between rejected resumes and other job descriptions.
Rank Suitable Jobs: Sort job descriptions based on similarity scores and suggest the top matches.
Save Recommendations: Store job suggestions in a CSV or database for further analysis.

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load datasets
resumes_df = pd.read_csv("resumes.csv")  # Your dataset containing resumes and suitability labels
jobs_df = pd.read_csv("job_descriptions.csv")  # Your dataset containing job descriptions

# Filter out rejected resumes
rejected_resumes = resumes_df[resumes_df["Label"] == 0]

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(stop_words='english')
resume_vectors = vectorizer.fit_transform(rejected_resumes["Resume Text"])
job_vectors = vectorizer.transform(jobs_df["Job Requirement Text"])

# Compute Cosine Similarity
similarity_matrix = cosine_similarity(resume_vectors, job_vectors)

# Generate job recommendations
job_suggestions = []
for i, resume in rejected_resumes.iterrows():
    top_indices = similarity_matrix[i].argsort()[::-1][:3]  # Get top 3 job matches
    suggested_jobs = jobs_df.iloc[top_indices]["Job Requirement Text"].tolist()
    
    job_suggestions.append({
        "Resume": resume["Resume Text"],
        "Suggested Jobs": suggested_jobs
    })

# Save recommendations
recommendations_df = pd.DataFrame(job_suggestions)
recommendations_df.to_csv("job_recommendations.csv", index=False)

print("Job recommendations saved to job_recommendations.csv")
