# Introduction: AI-Powered Resume Screening System
This notebook demonstrates how to process resumes using NLP and rank candidates based on job descriptions.


# Install & Import Dependencies

In [4]:
!pip install pandas scikit-learn joblib numpy nltk




In [5]:
import os
import time
import nltk
import re
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, precision_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity

# Download NLTK dependencies and define paths

In [6]:
start_time = time.time()  # timer for the preprocessing section

# Resources needed by preprocess_text: tokenizer models, stop-word list, WordNet.
# Recent NLTK releases make word_tokenize look up 'punkt_tab' instead of 'punkt',
# so both are fetched to keep the notebook runnable on either.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))  # set gives O(1) membership tests

BASE_DIR = os.getcwd()  # Get the current working directory
DATA_PATH = os.path.join(BASE_DIR, "resume_dataset.csv")  # raw input
OUTPUT_PATH = os.path.join(BASE_DIR, "processed_resume_dataset.csv")  # cleaned output


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Ayogaius\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ayogaius\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Ayogaius\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Function definitions for text preprocessing

In [7]:
# Function to normalize text
def remove_non_ascii(text):
    """Return ``text`` with every character outside the ASCII range dropped."""
    kept = [ch for ch in text if ord(ch) <= 127]
    return ''.join(kept)

def preprocess_text(text):
    """Normalize a resume or job-description string before vectorization.

    Steps: lowercase, drop non-ASCII characters, replace every non-word
    character with a space, collapse whitespace, tokenize, remove English
    stop words and lemmatize the remaining tokens.

    Args:
        text: Raw text. Non-string values (e.g. ``None`` or the NaN that
            pandas uses for an empty CSV cell) are treated as empty text.

    Returns:
        The cleaned tokens joined by single spaces; ``""`` for non-string input.
    """
    # Missing cells arrive as float NaN / None; calling .lower() on them would raise.
    if not isinstance(text, str):
        return ""

    text = text.lower()
    text = remove_non_ascii(text)  # Remove non-ASCII characters
    text = re.sub(r'\W', ' ', text)  # Replace anything that is not a letter, digit or underscore
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    words = word_tokenize(text)
    # Stop words are filtered before lemmatizing, i.e. on the surface form of each token
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return " ".join(words)

# Read the raw resume CSV
df = pd.read_csv(DATA_PATH, encoding="utf-8")

# Derive the normalized text column from the raw "Resume" column, one row at a time
df["Cleaned_Resume"] = df["Resume"].map(preprocess_text)

# Generate processed data

In [8]:

# Quick sanity checks on the NLTK resources and the cleaned column
print("Sample stopwords:", list(stop_words)[:5])  # Check stopwords
# NOTE(review): lemmatize() is called without a part-of-speech argument; the recorded
# output shows "running" coming back unchanged.
print("Lemmatization example:", lemmatizer.lemmatize("running"))  # Check lemmatization
print(df["Cleaned_Resume"].head())
# Persist the frame (raw columns plus "Cleaned_Resume") for the later sections
df.to_csv(OUTPUT_PATH, index=False, encoding="utf-8")
# start_time was set in the NLTK setup cell, so this covers downloads, loading and cleaning
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Processing Time: {elapsed_time:.2f} seconds")

Sample stopwords: ['being', 'won', 'am', 'against', "i'll"]
Lemmatization example: running
0    skill programming language python panda numpy ...
1    education detail may 2013 may 2017 b e uit rgp...
2    area interest deep learning control system des...
3    skill r python sap hana tableau sap hana sql s...
4    education detail mca ymcaust faridabad haryana...
Name: Cleaned_Resume, dtype: object
Processing Time: 10.22 seconds


# Load preprocessed dataset

In [9]:
start_time = time.time()  # restart the timer; later "Processing Time" prints measure from here
MODEL_PATH = os.path.join(BASE_DIR, "models/")  # folder the fitted artifacts are written to
df = pd.read_csv(OUTPUT_PATH, encoding="utf-8")  # reload the frame written by the preprocessing section

## Convert Text to Numerical Features

In [10]:
# Keep only the 5000 highest-ranked terms as TF-IDF features
vectorizer = TfidfVectorizer(max_features=5000)
# A resume that cleaned down to an empty string is read back from the CSV as NaN,
# which the vectorizer rejects; turn those back into empty documents.
# NOTE(review): the vectorizer is fit on every row before the train/test split,
# so vocabulary and IDF weights have seen the held-out rows.
X = vectorizer.fit_transform(df["Cleaned_Resume"].fillna(""))

## Encode Categories (Convert Labels to Numbers)

In [11]:
# Map each distinct value of the "Category" column to an integer class id
encoder = LabelEncoder()
y = encoder.fit_transform(df["Category"])  # encoder is kept so it can be saved with the model

## Train/test split (to evaluate model performance on held-out data)

In [12]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Model training

In [13]:
# Logistic regression with library-default settings, trained on the training split only
model = LogisticRegression()
model.fit(X_train, y_train)

## Accuracy and precision test

In [14]:
# Predict on the held-out split and score the predictions
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
# 'weighted' averages the per-class precision, weighting each class by its support
precision = precision_score(y_test, y_pred, average='weighted')
print(f"Accuracy: {accuracy}, Precision: {precision}")
print(f" Model Accuracy: {accuracy:.2f}")  # same accuracy value, rounded to two decimals
# Rows of the report are the integer ids produced by the label encoder, not category names
print("\n Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9948186528497409, Precision: 0.9954663212435233
 Model Accuracy: 0.99

 Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         3
           1       1.00      1.00      1.00         6
           2       1.00      1.00      1.00         5
           3       1.00      1.00      1.00         7
           4       1.00      1.00      1.00         4
           5       1.00      1.00      1.00         9
           6       1.00      1.00      1.00         5
           7       1.00      1.00      1.00         8
           8       1.00      0.93      0.96        14
           9       1.00      1.00      1.00         5
          10       1.00      1.00      1.00         7
          11       1.00      1.00      1.00         6
          12       1.00      1.00      1.00        12
          13       1.00      1.00      1.00         4
          14       1.00      1.00      1.00         7
          15       1.00    

## Save the trained model & vectorizer

In [15]:
# Create the target folder if needed, then persist each fitted object under its file name
os.makedirs(MODEL_PATH, exist_ok=True)
for artifact, filename in (
    (model, "resume_classifier.pkl"),
    (vectorizer, "tfidf_vectorizer.pkl"),
    (encoder, "category_encoder.pkl"),
):
    joblib.dump(artifact, os.path.join(MODEL_PATH, filename))
print(" Model and vectorizer saved successfully!")

# Report the time elapsed since start_time was last reset
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Processing Time: {elapsed_time:.2f} seconds")

 Model and vectorizer saved successfully!
Processing Time: 12.26 seconds


# Load trained model & vectorizer

In [16]:

# Load from MODEL_PATH, the folder the training section wrote the artifacts to.
# A "../models/" prefix would resolve to the parent directory and miss those files.
VECTOR_PATH = os.path.join(MODEL_PATH, "tfidf_vectorizer.pkl")
CLASSIFIER_PATH = os.path.join(MODEL_PATH, "resume_classifier.pkl")
# joblib.load unpickles the file, so only load artifacts produced by a trusted run
vectorizer = joblib.load(VECTOR_PATH)
model = joblib.load(CLASSIFIER_PATH)

# Load dataset and apply vectorizer

In [17]:
df = pd.read_csv(OUTPUT_PATH, encoding="utf-8")
# The processed CSV already contains "Cleaned_Resume", so the costly preprocessing is
# not repeated. Rows whose cleaned text was empty are read back as NaN, which the
# vectorizer rejects, so they are restored to empty strings first.
df["Cleaned_Resume"] = df["Cleaned_Resume"].fillna("")
X = vectorizer.transform(df["Cleaned_Resume"])

# Job Description Input

In [18]:
# Free-text job description to match resumes against; edit this string to change the query
job_description = """Data Science"""
# Clean the query with the same function used for resumes, then project it into the
# fitted TF-IDF space (a one-element list, so the result has a single row)
job_vector = vectorizer.transform([preprocess_text(job_description)])

# Compute similarity


In [19]:
# Cosine similarity between the job-description vector and every resume vector
similarity_scores = cosine_similarity(job_vector, X)
# Add scores to dataset: row 0 holds the scores for the single job description
df["Similarity_Score"] = similarity_scores[0]

# Sort candidates in descending order of similarity

In [20]:

# Best match first; sort_values returns a new frame, so df keeps its original order
ranked_resumes = df.sort_values(by="Similarity_Score", ascending=False)
# Flatten Windows line breaks so each resume previews on a single line
ranked_resumes["Resume"] = ranked_resumes["Resume"].str.replace(r"\r\n", " ", regex=True)


# List top matches

In [21]:
# NOTE(review): this import sits outside the notebook's import cell; consider moving it there
from IPython.display import display
# Show the ten highest-scoring resumes with their similarity scores
display(ranked_resumes[["Resume", "Similarity_Score"]].head(10))
# start_time was last reset in the "Load preprocessed dataset" cell, so this figure
# covers training, saving and ranking together
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Processing Time: {elapsed_time:.2f} seconds")

Unnamed: 0,Resume,Similarity_Score
29,Expertise â Data and Quantitative Analysis â...,0.497922
19,Expertise â Data and Quantitative Analysis â...,0.497922
39,Expertise â Data and Quantitative Analysis â...,0.497922
9,Expertise â Data and Quantitative Analysis â...,0.497922
16,Skills â¢ Python â¢ Tableau â¢ Data Visuali...,0.394161
26,Skills â¢ Python â¢ Tableau â¢ Data Visuali...,0.394161
6,Skills â¢ Python â¢ Tableau â¢ Data Visuali...,0.394161
36,Skills â¢ Python â¢ Tableau â¢ Data Visuali...,0.394161
25,"SKILLS C Basics, IOT, Python, MATLAB, Data Sci...",0.283778
35,"SKILLS C Basics, IOT, Python, MATLAB, Data Sci...",0.283778


Processing Time: 16.50 seconds
