In [1]:
pip install pandas numpy scikit-learn nltk matplotlib


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
import numpy as np
import re

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

import nltk
nltk.download('punkt')
nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\disha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\disha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [33]:
df = pd.read_csv("AI_Resume_Screening.csv")
df.head()


Unnamed: 0,Resume_ID,Name,Skills,Experience (Years),Education,Certifications,Job Role,Recruiter Decision,Salary Expectation ($),Projects Count,AI Score (0-100)
0,1,Ashley Ali,"TensorFlow, NLP, Pytorch",10,B.Sc,,AI Researcher,Hire,104895,8,100
1,2,Wesley Roman,"Deep Learning, Machine Learning, Python, SQL",10,MBA,Google ML,Data Scientist,Hire,113002,1,100
2,3,Corey Sanchez,"Ethical Hacking, Cybersecurity, Linux",1,MBA,Deep Learning Specialization,Cybersecurity Analyst,Hire,71766,7,70
3,4,Elizabeth Carney,"Python, Pytorch, TensorFlow",7,B.Tech,AWS Certified,AI Researcher,Hire,46848,0,95
4,5,Julie Hill,"SQL, React, Java",4,PhD,,Software Engineer,Hire,87441,9,100


In [35]:
df.info()
df.columns


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Resume_ID               1000 non-null   int64 
 1   Name                    1000 non-null   object
 2   Skills                  1000 non-null   object
 3   Experience (Years)      1000 non-null   int64 
 4   Education               1000 non-null   object
 5   Certifications          726 non-null    object
 6   Job Role                1000 non-null   object
 7   Recruiter Decision      1000 non-null   object
 8   Salary Expectation ($)  1000 non-null   int64 
 9   Projects Count          1000 non-null   int64 
 10  AI Score (0-100)        1000 non-null   int64 
dtypes: int64(5), object(6)
memory usage: 86.1+ KB


Index(['Resume_ID', 'Name', 'Skills', 'Experience (Years)', 'Education',
       'Certifications', 'Job Role', 'Recruiter Decision',
       'Salary Expectation ($)', 'Projects Count', 'AI Score (0-100)'],
      dtype='object')

In [36]:
df_decision = df[
    ['Skills', 'Experience (Years)', 'Education', 'Recruiter Decision']
].dropna()




In [37]:
df['Resume_Text'] = (
    df['Skills'].astype(str) + ' ' +
    df['Experience (Years)'].astype(str) + ' ' +
    df['Education'].astype(str)
)


In [38]:
from nltk.corpus import stopwords
import re

stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    words = text.split()   # <-- no nltk tokenizer
    words = [w for w in words if w not in stop_words]
    return ' '.join(words)


In [39]:
df['Cleaned_Text'] = df['Resume_Text'].apply(clean_text)

In [40]:
tfidf = TfidfVectorizer(max_features=3000)
X = tfidf.fit_transform(df['Cleaned_Text'])
y = df['Job Role']


In [41]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [42]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)


In [43]:
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 1.0
                       precision    recall  f1-score   support

        AI Researcher       1.00      1.00      1.00        52
Cybersecurity Analyst       1.00      1.00      1.00        51
       Data Scientist       1.00      1.00      1.00        45
    Software Engineer       1.00      1.00      1.00        52

             accuracy                           1.00       200
            macro avg       1.00      1.00      1.00       200
         weighted avg       1.00      1.00      1.00       200



In [44]:
df['Job Role'].value_counts()


Job Role
AI Researcher            257
Data Scientist           255
Cybersecurity Analyst    255
Software Engineer        233
Name: count, dtype: int64

In [45]:
#Predict Job Role for New Resume
def predict_job_role(skills, experience, education):
    text = skills + ' ' + experience + ' ' + education
    cleaned = clean_text(text)
    vector = tfidf.transform([cleaned])
    return model.predict(vector)[0]


In [46]:
#Test is
predict_job_role(
    "Python, SQL, Machine Learning",
    "2 years experience as data analyst",
    "B.Tech in Computer Science"
)


'Data Scientist'

In [47]:
predict_job_role(
    "Network security, penetration testing, firewalls",
    "3 years experience in cyber security operations",
    "BSc in Information Technology"
)


'Software Engineer'

In [48]:
df.columns.tolist()


['Resume_ID',
 'Name',
 'Skills',
 'Experience (Years)',
 'Education',
 'Certifications',
 'Job Role',
 'Recruiter Decision',
 'Salary Expectation ($)',
 'Projects Count',
 'AI Score (0-100)',
 'Resume_Text',
 'Cleaned_Text']

In [49]:
#Note:The dataset is synthetic and highly structured, which resultedin very high accuracy. In real-world scenarios, accuracy may vary.


In [50]:
#Recruiter Decision Prediction
#Predict whether a resume will be shortlisted or rejected.
df_decision = df.copy()

df_decision = df_decision[
    ['Skills', 'Experience (Years)', 'Education', 'Recruiter Decision']
].dropna()


In [51]:
df_decision['Resume_Text'] = (
    df_decision['Skills'].astype(str) + ' ' +
    df_decision['Experience (Years)'].astype(str) + ' ' +
    df_decision['Education'].astype(str)
)


In [52]:
#Using the same clean_text() function you already have:
df_decision['Cleaned_Text'] = df_decision['Resume_Text'].apply(clean_text)


In [53]:
#td-idf + model
X = tfidf.fit_transform(df_decision['Cleaned_Text'])
y = df_decision['Recruiter Decision']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

decision_model = LogisticRegression(max_iter=1000)
decision_model.fit(X_train, y_train)


In [54]:
y_pred = decision_model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.77
              precision    recall  f1-score   support

        Hire       0.77      1.00      0.87       154
      Reject       0.00      0.00      0.00        46

    accuracy                           0.77       200
   macro avg       0.39      0.50      0.44       200
weighted avg       0.59      0.77      0.67       200



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [63]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# ----- Recruiter Decision Model -----

# Select correct columns
X_decision = (
    df['Skills'].astype(str) + " " +
    df['Education'].astype(str) + " " +
    df['Experience (Years)'].astype(str)
)

y_decision = df['Recruiter Decision']

# Use SAME TF-IDF
X_decision_vec = tfidf.transform(X_decision)

# Train-test split
X_train_d, X_test_d, y_train_d, y_test_d = train_test_split(
    X_decision_vec, y_decision, test_size=0.2, random_state=42
)

# Train model
recruiter_model = LogisticRegression(max_iter=1000)
recruiter_model.fit(X_train_d, y_train_d)

# Accuracy check
print("Recruiter Decision Accuracy:",
      accuracy_score(y_test_d, recruiter_model.predict(X_test_d)))


Recruiter Decision Accuracy: 0.77


In [64]:
#resume matching job description
from sklearn.metrics.pairwise import cosine_similarity
#job fit function
def job_fit_score(resume_text, job_description):
    resume_clean = clean_text(resume_text)
    job_clean = clean_text(job_description)

    vectors = tfidf.transform([resume_clean, job_clean])
    score = cosine_similarity(vectors[0], vectors[1])[0][0]

    return round(float(score * 100), 2)


In [65]:
resume = "Python, SQL, Machine Learning, 2 years experience, B.Tech CSE"
job_desc = "Looking for a Data Scientist with Python, SQL and ML experience"

job_fit_score(resume, job_desc)


56.74

In [66]:
pickle.dump(model, open("job_role_model.pkl", "wb"))

# Save TF-IDF vectorizer
pickle.dump(tfidf, open("tfidf.pkl", "wb"))

# Save recruiter model
pickle.dump(recruiter_model, open("recruiter_model.pkl", "wb"))


print("All models saved successfully!")

All models saved successfully!


In [67]:
import os
os.listdir()


['.ipynb_checkpoints',
 '.RData',
 '.Rhistory',
 '1.jpg',
 '2.jpg',
 '3.jpg',
 '4.jpg',
 '5.jpg',
 '6.jpeg',
 '6.jpg',
 'AI_Resume_Screening.csv',
 'all_stocks_5yr.csv',
 'amitha.py',
 'an2.py',
 'anaconda_projects',
 'Arduino',
 'asi2.py',
 'asi3.py',
 'asi4.py',
 'asi_2.docx',
 'Book5.csv',
 'Book6.csv',
 'Breast Cancer Classification using Machine.docx',
 'canva4.jpg',
 'canva4.png',
 'CONCATENATION.docx',
 'CONCATENATION.pdf',
 'Custom Office Templates',
 'Dell',
 'desktop.ini',
 'Disha v suvarna resume.docx',
 'download (1).jpg',
 'East.xlsx',
 'ex1.java',
 'java_prgm1.java',
 'job_role_model.pkl',
 'joing.jpg',
 'kannada question paper.docx',
 'lab record2.docx',
 'lab1.py',
 'lab2.py',
 'lab3.py',
 'lab4.py',
 'lab5.py',
 'macro1.xlsm',
 'main.jpeg',
 'My Tableau Repository',
 'New folder',
 'NISAR[1].pptx',
 'office',
 'Python_Record[1].pdf',
 'recruiter_model.pkl',
 'Rescued document 1.doc',
 'Rescued document.doc',
 'resume_app.py',
 'Resume_Screening_Project.ipynb',
 'sales.

In [60]:
df.columns.tolist()


['Resume_ID',
 'Name',
 'Skills',
 'Experience (Years)',
 'Education',
 'Certifications',
 'Job Role',
 'Recruiter Decision',
 'Salary Expectation ($)',
 'Projects Count',
 'AI Score (0-100)',
 'Resume_Text',
 'Cleaned_Text']

In [None]:
#model = pickle.load(open("job_role_model.pkl", "rb"))   tfidf = pickle.load(open("tfidf.pkl", "rb"))