In [1]:
import pandas as pd

In [2]:
resumes = pd.read_csv("D:/resume screener/my_datasets/resume_dataset.csv")
jobs = pd.read_csv("D:/resume screener/my_datasets/job_posts.csv")

In [3]:
print(resumes.shape)
print(resumes.columns)
print(resumes.head())

(962, 2)
Index(['Category', 'Resume'], dtype='object')
       Category                                             Resume
0  Data Science  Skills * Programming Languages: Python (pandas...
1  Data Science  Education Details \r\nMay 2013 to May 2017 B.E...
2  Data Science  Areas of Interest Deep Learning, Control Syste...
3  Data Science  Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
4  Data Science  Education Details \r\n MCA   YMCAUST,  Faridab...


In [4]:
print(jobs.shape)
print(jobs.columns)
print(jobs.head())

(19001, 24)
Index(['jobpost', 'date', 'Title', 'Company', 'AnnouncementCode', 'Term',
       'Eligibility', 'Audience', 'StartDate', 'Duration', 'Location',
       'JobDescription', 'JobRequirment', 'RequiredQual', 'Salary',
       'ApplicationP', 'OpeningDate', 'Deadline', 'Notes', 'AboutC', 'Attach',
       'Year', 'Month', 'IT'],
      dtype='object')
                                             jobpost          date  \
0  AMERIA Investment Consulting Company\r\nJOB TI...   Jan 5, 2004   
1  International Research & Exchanges Board (IREX...   Jan 7, 2004   
2  Caucasus Environmental NGO Network (CENN)\r\nJ...   Jan 7, 2004   
3  Manoff Group\r\nJOB TITLE:  BCC Specialist\r\n...   Jan 7, 2004   
4  Yerevan Brandy Company\r\nJOB TITLE:  Software...  Jan 10, 2004   

                                               Title  \
0                            Chief Financial Officer   
1  Full-time Community Connections Intern (paid i...   
2                                Country Coordinator  

In [5]:
import re

In [6]:
def clean_text(text):
    text = re.sub(r'<.*?>', ' ', str(text))
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()
    return text

resumes['cleaned_text'] = resumes['Resume'].apply(clean_text)
jobs['Category'] = resumes['Category'].str.lower().str.strip()

In [7]:
jobs = jobs[['Title', 'JobDescription', 'RequiredQual']]
jobs = jobs.fillna("")
for col in ['Title', 'JobDescription', 'RequiredQual']:
    jobs[col] = jobs[col].apply(clean_text)

In [8]:
import random 
pairs = []

for idx, row in resumes.iterrows():
    resume_text = row['cleaned_text']
    category = row['Category']

    # Positive job match
    positive_jobs = jobs[jobs['Title'].str.contains(category.split()[0], case=False, na=False)]
    if not positive_jobs.empty:
        job_row = positive_jobs.sample(1).iloc[0]
        pairs.append([resume_text, job_row['JobDescription'], 1])

    # Negative job match
    negative_jobs = jobs[~jobs['Title'].str.contains(category.split()[0], case=False, na=False)]
    if not negative_jobs.empty:
        job_row = negative_jobs.sample(1).iloc[0]
        pairs.append([resume_text, job_row['JobDescription'], 0])

pairs_df = pd.DataFrame(pairs, columns=['Resume', 'Job_Desc', 'Label'])
print(pairs_df.head())


                                              Resume  \
0  skills programming languages python pandas num...   
1  skills programming languages python pandas num...   
2  education details may 2013 to may 2017 b e uit...   
3  education details may 2013 to may 2017 b e uit...   
4  areas of interest deep learning control system...   

                                            Job_Desc  Label  
0  software engineer will take part in design and...      1  
1  under the overall guidance of the undp climate...      0  
2  epam systems is actively looking for ms sql da...      1  
3  veya investments limited is looking for a prof...      0  
4  it department of chamber of commerce and indus...      1  


In [9]:
pairs_df.to_csv("D:/resume screener/my_datasets/training_pairs.csv", index=False)


In [10]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [11]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gg185\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\gg185\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [12]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [13]:
def clean_text(text):
    text = text.lower()
    text = text.translate(str.maketrans("","", string.punctuation))
    text = re.sub(r'/d+', '', text)
    text = text.strip()
    words = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words]
    return " ".join(words)

pairs_df['Resume_clean'] = pairs_df['Resume'].apply(clean_text)
pairs_df['Job_Desc_clean'] = pairs_df['Job_Desc'].apply(clean_text)

print(pairs_df[['Resume_clean', 'Job_Desc_clean', 'Label']].head())

                                        Resume_clean  \
0  skill programming language python panda numpy ...   
1  skill programming language python panda numpy ...   
2  education detail may 2013 may 2017 b e uit rgp...   
3  education detail may 2013 may 2017 b e uit rgp...   
4  area interest deep learning control system des...   

                                      Job_Desc_clean  Label  
0  software engineer take part design implementat...      1  
1  overall guidance undp climate change programme...      0  
2  epam system actively looking m sql database de...      1  
3  veya investment limited looking professional o...      0  
4  department chamber commerce industry ra associ...      1  


In [14]:
from sentence_transformers import SentenceTransformer
import numpy as np

In [15]:
model_bert = SentenceTransformer('all-MiniLM-L6-v2')

In [16]:
resume_embeddings = model_bert.encode(pairs_df['Resume_clean'].tolist(), convert_to_numpy=True)
job_embeddings = model_bert.encode(pairs_df['Job_Desc_clean'].tolist(), convert_to_numpy=True)

In [17]:
X = np.hstack([resume_embeddings, job_embeddings])
y = pairs_df['Label'].values

In [18]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = LogisticRegression(max_iter=200)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.7079889807162535

Classification Report:
               precision    recall  f1-score   support

           0       0.71      0.79      0.74       196
           1       0.71      0.62      0.66       167

    accuracy                           0.71       363
   macro avg       0.71      0.70      0.70       363
weighted avg       0.71      0.71      0.71       363



In [19]:
import joblib

In [20]:
joblib.dump(clf, "resume_match_model.pkl")
joblib.dump(model_bert, "sbert_model.pkl")


['sbert_model.pkl']

Fine Tuning BERT Model

In [21]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report


In [22]:
pairs_df = pd.read_csv("D:/resume screener/my_datasets/training_pairs.csv")

train_texts, val_texts, train_labels, val_labels = train_test_split(
    pairs_df[['Resume', 'Job_Desc']].values.tolist(),
    pairs_df['Label'].tolist(),
    test_size=0.2,
    random_state=42
)


In [23]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

class ResumeJobDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=256):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        resume, job = self.texts[idx]
        encoding = self.tokenizer(
            resume,
            job,
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }


In [24]:
train_dataset = ResumeJobDataset(train_texts, train_labels, tokenizer)
val_dataset = ResumeJobDataset(val_texts, val_labels, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)


In [25]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model.to(device)

optimizer = AdamW(model.parameters(), lr=2e-5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
