TEXT EXTRACTION


In [20]:
import os
import textract
import PyPDF2
import docx
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.metrics import jaccard_score
from sklearn.feature_extraction.text import CountVectorizer

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def extract_text_from_docx(file_path):
    try:
        doc = docx.Document(file_path)
        return "\n".join([para.text for para in doc.paragraphs])
    except Exception as e:
        print(f"Error extracting text from {file_path} using python-docx: {e}")
        return None

def extract_text_from_pdf(file_path):
    try:
        with open(file_path, "rb") as file:
            reader = PyPDF2.PdfReader(file)
            text = "\n".join([page.extract_text() for page in reader.pages if page.extract_text()])
            return text if text else None
    except Exception as e:
        print(f"Error extracting text from {file_path} using PyPDF2: {e}")
        return None

def extract_text_with_textract(file_path):
    try:
        return textract.process(file_path).decode("utf-8")
    except Exception as e:
        print(f"Error extracting text from {file_path} using textract: {e}")
        return None

def extract_text(file_path):
    if file_path.endswith(".docx"):
        text = extract_text_from_docx(file_path)
    elif file_path.endswith(".pdf"):
        text = extract_text_from_pdf(file_path)
    else:
        print(f"Unsupported file format: {file_path}")
        return None
    
    # Fallback to textract if needed
    if text is None:
        text = extract_text_with_textract(file_path)
    
    return text

def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)  # Remove special characters
    text = " ".join([word for word in text.split() if word not in stop_words])  # Remove stop words
    return text

def calculate_jaccard_similarity(text1, text2):
    vectorizer = CountVectorizer(binary=True)
    X = vectorizer.fit_transform([text1, text2]).toarray()
    return jaccard_score(X[0], X[1])

def process_files(resume_folder, job_folder, output_csv):
    data = []
    
    resumes = []
    for filename in os.listdir(resume_folder):
        file_path = os.path.join(resume_folder, filename)
        if os.path.isfile(file_path) and file_path.endswith((".pdf", ".docx")):
            print(f"Extracting text from resume: {filename}")
            text = extract_text(file_path)
            if text:
                resumes.append(preprocess_text(text))
    
    job_requirements = []
    for filename in os.listdir(job_folder):
        file_path = os.path.join(job_folder, filename)
        if os.path.isfile(file_path) and file_path.endswith((".pdf", ".docx")):
            print(f"Extracting text from job requirement: {filename}")
            text = extract_text(file_path)
            if text:
                job_requirements.append(preprocess_text(text))
    
    # Create all possible resume-job requirement pairs and compute Jaccard similarity
    for resume_text in resumes:
        for job_text in job_requirements:
            jaccard_score_value = calculate_jaccard_similarity(resume_text, job_text)
            data.append({
                "Resume Text": resume_text,
                "Job Requirement Text": job_text,
                "Jaccard Score": jaccard_score_value
            })
    
    # Save to CSV
    df = pd.DataFrame(data)
    df.to_csv(output_csv, index=False, encoding="utf-8")
    print(f"Extraction complete. Data saved to {output_csv}")

# Example usage
resume_folder = "resumes"  # Change this to your resume folder path
job_folder = "job_requirements"  # Change this to your job requirements folder path
output_csv = "scored_dataset.csv"
process_files(resume_folder, job_folder, output_csv)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Extracting text from resume: Accounting Profesional.pdf
Extracting text from resume: Call Center Agent.pdf
Extracting text from resume: Computer-Engineer.pdf
Extracting text from resume: Cybersecurity.pdf
Extracting text from resume: Data-Encoder.pdf
Extracting text from resume: Data-Scientist.pdf
Extracting text from resume: Freelance.pdf
Extracting text from resume: Graphic Artist.pdf
Extracting text from resume: Instructor.pdf
Extracting text from resume: IT-student.pdf
Extracting text from resume: Profile (0).pdf
Extracting text from resume: Profile (1).pdf
Extracting text from resume: Profile (10).pdf
Extracting text from resume: Profile (100).pdf
Extracting text from resume: Profile (101).pdf
Extracting text from resume: profile (102).pdf
Extracting text from resume: Profile (103).pdf
Extracting text from resume: Profile (104).pdf
Extracting text from resume: Profile (105).pdf
Extracting text from resume: Profile (106).pdf
Extracting text from resume: Profile (107).pdf
Extracting

In [22]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.feature_extraction.text import TfidfVectorizer

# Load Data
df = pd.read_csv("scored_dataset.csv")

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=5000)
resume_tfidf = vectorizer.fit_transform(df['Resume Text'])
job_tfidf = vectorizer.transform(df['Job Requirement Text'])

# Convert to dense arrays
resume_tfidf = resume_tfidf.toarray()
job_tfidf = job_tfidf.toarray()

# Combine features (Resume + Job + Jaccard Score)
X = np.hstack((resume_tfidf, job_tfidf, df[['Jaccard Score']].values))

# Grouping data by Job Requirement (Each job gets its own ranking)
df['group'] = df.groupby('Job Requirement Text').ngroup()

# XGBoost Ranker Training
params = {
    'objective': 'rank:pairwise',  # Pairwise ranking
    'eta': 0.1,
    'gamma': 1.0,
    'min_child_weight': 0.1,
    'max_depth': 6,
    'n_estimators': 100
}

# Convert data to DMatrix (XGBoost format)
dtrain = xgb.DMatrix(X, label=df['Jaccard Score'], group=df.groupby('group').size().values)

# Train XGBoost Ranker
model = xgb.train(params, dtrain, num_boost_round=100)

# Predict rankings
df['Rank Score'] = model.predict(dtrain)

# Sort resumes by rank for each job
df = df.sort_values(by=['Job Requirement Text', 'Rank Score'], ascending=[True, False])

# Save ranked results
df.to_csv("ranked_resumes.csv", index=False)
print("Ranking Complete. Results saved to ranked_resumes.csv")
model.save_model("xgboost_ranking_model.json")


Parameters: { "n_estimators" } are not used.



Ranking Complete. Results saved to ranked_resumes.csv


In [23]:
from sklearn.metrics import ndcg_score

# Convert Rank Scores to Predictions (Higher Score = Higher Rank)
df['Predicted Rank'] = df.groupby('Job Requirement Text')['Rank Score'].rank(ascending=False)

# Define a function to calculate NDCG
def calculate_ndcg(df):
    ndcg_scores = []
    for job, group in df.groupby('Job Requirement Text'):
        true_relevance = group['Jaccard Score'].values  # Using Jaccard as relevance
        predicted_relevance = group['Predicted Rank'].values
        ndcg_scores.append(ndcg_score([true_relevance], [predicted_relevance]))
    return np.mean(ndcg_scores)

# Compute NDCG
ndcg_value = calculate_ndcg(df)
print(f"NDCG Score: {ndcg_value:.4f}")  # Closer to 1 means better ranking

# Save evaluation results
with open("ranking_evaluation.txt", "w") as f:
    f.write(f"NDCG Score: {ndcg_value:.4f}")


NDCG Score: 0.7331


In [24]:
loaded_model = xgb.Booster()
loaded_model.load_model("xgboost_ranking_model.json")


1️⃣ Generate Job Suggestions for Low-Scoring Resumes

In [10]:
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# Load Data
df = pd.read_csv("scored_dataset.csv")

# Print available columns
print("Available Columns:", df.columns)

# Ensure the correct column names exist
resume_col = "Resume Text"  # Change if different
job_col = "Job Requirement Text"  # Change if different
jaccard_col = "Jaccard Score"  # Change if different

# Load Model
model = xgb.Booster()
model.load_model("xgboost_ranking_model.json")

# Prepare TF-IDF Features
vectorizer = TfidfVectorizer(max_features=5000)
resume_tfidf = vectorizer.fit_transform(df[resume_col])
job_tfidf = vectorizer.transform(df[job_col])

resume_tfidf = resume_tfidf.toarray()
job_tfidf = job_tfidf.toarray()

# Combine Features (Resume + Job + Jaccard Score)
X = np.hstack((resume_tfidf, job_tfidf, df[[jaccard_col]].values))

# Convert Data to DMatrix for XGBoost
dtest = xgb.DMatrix(X)

# Predict Rankings
df['Rank Score'] = model.predict(dtest)

# Sort to get low-ranked resumes
df_sorted = df.sort_values(by=['Rank Score'])

# Get bottom 10 resumes
low_rank_resumes = df_sorted.head(10)

job_suggestions = []
for index, row in low_rank_resumes.iterrows():
    resume_text = row[resume_col]

    # Compare this resume against all job descriptions
    temp_df = df[df[resume_col] != resume_text].copy()

    # Prepare test set for this resume against all jobs
    job_tfidf = vectorizer.transform(temp_df[job_col]).toarray()
    resume_tfidf = np.tile(vectorizer.transform([resume_text]).toarray(), (len(temp_df), 1))

    X_test = np.hstack((resume_tfidf, job_tfidf, temp_df[[jaccard_col]].values))
    dtest_temp = xgb.DMatrix(X_test)

    # Predict scores
    temp_df['Re-Rank Score'] = model.predict(dtest_temp)

    top_suggested_jobs = temp_df.sort_values(by='Re-Rank Score', ascending=False).head(3)

    job_suggestions.append({
        "Resume": row.get("Resume Filename", "Unknown Resume"),
        "Suggested Jobs": top_suggested_jobs["Job Requirement Text"].tolist()  # ✅ FIXED
    })

# Save suggestions to a CSV
pd.DataFrame(job_suggestions).to_csv("job_suggestions.csv", index=False)

print("Job Suggestions Generated! Check 'job_suggestions.csv'")


Available Columns: Index(['Resume Text', 'Job Requirement Text', 'Jaccard Score'], dtype='object')
Job Suggestions Generated! Check 'job_suggestions.csv'
