In [1]:

import json
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import warnings
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset_json = [] 
import json

try:
    with open('data.json', 'r') as file:
        dataset_json = json.load(file)
    print("JSON data from file:")
except FileNotFoundError:
    print("Error: The file 'data.json' was not found.")
except json.JSONDecodeError:
    print("Error: Failed to decode JSON from the file (malformed JSON).")


JSON data from file:


In [3]:
# Convert to Dataset
texts = [item["text"] for item in dataset_json]
labels = [item["label"] for item in dataset_json]

# Encode labels to integers
le = LabelEncoder()
labels_encoded = le.fit_transform(labels)

In [4]:
# Train/test split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels_encoded, test_size=0.2, random_state=42
)

train_dataset = Dataset.from_dict({"text": train_texts, "label": train_labels})
val_dataset = Dataset.from_dict({"text": val_texts, "label": val_labels})

In [5]:
# ========================
# Step 2: Tokenization
# ========================
model_name = "prajjwal1/bert-tiny"  # lightweight and fast
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    return tokenizer(batch["text"], padding='max_length', truncation=True, max_length=128)

train_dataset = train_dataset.map(tokenize, batched=True)
val_dataset = val_dataset.map(tokenize, batched=True)

# Set format for PyTorch
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

Map: 100%|████████████████████████████████████████████████████████████████| 1260/1260 [00:00<00:00, 7227.25 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████████| 316/316 [00:00<00:00, 6898.20 examples/s]


In [6]:


from transformers import AutoModelForSequenceClassification

num_labels = len(le.classes_)

# This will print progress while downloading/loading
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    force_download=False,  # Set True if you want to re-download
    local_files_only=False  # Allow download if not cached
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# ========================
# Step 4: Define Metrics
# ========================
def compute_metrics(p):
    preds = p.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(p.label_ids, preds, average="weighted")
    acc = accuracy_score(p.label_ids, preds)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

In [8]:
training_args = TrainingArguments(
    output_dir="./cv_classifier_model",
    num_train_epochs=30,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir="./logs",
    learning_rate=2e-5,
    load_best_model_at_end=False  # no need for best model
)


In [9]:
# ========================
# Step 6: Trainer
# ========================
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

In [10]:
# ========================
# Step 7: Train
# ========================
trainer.train()

Step,Training Loss
500,0.7307
1000,0.2821
1500,0.1696
2000,0.1324
2500,0.1217
3000,0.1082
3500,0.1015
4000,0.0919
4500,0.0954


TrainOutput(global_step=4740, training_loss=0.19785178582879562, metrics={'train_runtime': 721.4024, 'train_samples_per_second': 52.398, 'train_steps_per_second': 6.571, 'total_flos': 12009847449600.0, 'train_loss': 0.19785178582879562, 'epoch': 30.0})

In [11]:
# ========================
# Step 8: Save model & tokenizer
# ========================
model.save_pretrained("./cv_classifier_model")
tokenizer.save_pretrained("./cv_classifier_model")

('./cv_classifier_model\\tokenizer_config.json',
 './cv_classifier_model\\special_tokens_map.json',
 './cv_classifier_model\\vocab.txt',
 './cv_classifier_model\\added_tokens.json',
 './cv_classifier_model\\tokenizer.json')

In [12]:
# ========================
# Step 9: Example Inference
# ========================
def predict(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    outputs = model(**inputs)
    preds = torch.argmax(outputs.logits, dim=1).item()
    label = le.inverse_transform([preds])[0]
    return label

# Example prediction
example_text = " artificial intelligence "
print("Predicted Label:", predict(example_text))

Predicted Label: QUALIFICATION


In [13]:
import pandas as pd 

In [14]:
df_csv_test = pd.read_csv('data 02.csv')

In [15]:
df_csv_test = df_csv_test [253:]

In [16]:
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dehem\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [17]:
unlabeled_texts = []
for i in df_csv_test['Resume']:
    sentences = sent_tokenize(i)
    unlabeled_texts.extend(sentences)

In [18]:
# Step 2: Tokenize unlabeled data
unlabeled_dataset = Dataset.from_dict({"text": unlabeled_texts})
unlabeled_dataset = unlabeled_dataset.map(tokenize, batched=True)
unlabeled_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])

Map: 100%|██████████████████████████████████████████████████████████████████| 882/882 [00:00<00:00, 6530.40 examples/s]


In [19]:
# Step 3: Get pseudo-labels from your trained model
model.eval()
pseudo_labels = []
for batch in unlabeled_dataset:
    input_ids = batch["input_ids"].unsqueeze(0)  # batch of 1
    attention_mask = batch["attention_mask"].unsqueeze(0)
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        pred = torch.argmax(outputs.logits, dim=1).item()
    pseudo_labels.append(pred)

In [20]:
# Step 4: Create pseudo-labeled dataset
pseudo_dataset = Dataset.from_dict({
    "text": unlabeled_texts,
    "label": pseudo_labels
})

In [21]:
train_dataset = train_dataset.map(tokenize, batched=True)
pseudo_dataset = pseudo_dataset.map(tokenize, batched=True)

train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
pseudo_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])


Map: 100%|████████████████████████████████████████████████████████████████| 1260/1260 [00:00<00:00, 5469.30 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████████| 882/882 [00:00<00:00, 5266.34 examples/s]


In [22]:
from datasets import concatenate_datasets
# Step 5: Combine with original labeled data
combined_dataset = concatenate_datasets([train_dataset, pseudo_dataset])

In [23]:
# Step 6: Retrain / fine-tune the model on combined dataset
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=combined_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)
trainer.train()


Step,Training Loss
500,0.0776
1000,0.0712
1500,0.0599
2000,0.0616
2500,0.0499
3000,0.055
3500,0.0502
4000,0.0459
4500,0.0396
5000,0.0525


TrainOutput(global_step=8040, training_loss=0.05028470446221271, metrics={'train_runtime': 1171.949, 'train_samples_per_second': 54.832, 'train_steps_per_second': 6.86, 'total_flos': 20416740664320.0, 'train_loss': 0.05028470446221271, 'epoch': 30.0})

## Upload CV and Classify the data 

In [142]:

def classify_line(line):
    inputs = tokenizer(line, return_tensors="pt", truncation=True)
    outputs = model(**inputs)
    return outputs.logits.argmax(dim=1).item()


In [144]:
import pdfplumber
import re
text = ""
with pdfplumber.open("resume.pdf") as pdf:
    for page in pdf.pages:
        text += page.extract_text() + "\n"


In [145]:
github_links = re.findall(r"github.com/[A-Za-z0-9_-]+", text)
github_link = "https://"+github_links[0]
print(github_link)

https://github.com/dehemiweerakoon


In [126]:
# import re

# clean_text = re.sub(r"[^a-zA-Z0-9\s]", "", text.lower())


In [146]:
lines = [l.strip() for l in text.split("\n") if len(l.strip()) > 3]


In [147]:
LABELS = {
    0: "SKILL",
    1: "QUALIFICATION",
    2: "EXPERIENCE"
}


In [148]:
result = {
    "skills": [],
    "qualifications": [],
    "experience": []
}

for line in lines:
    label = LABELS[classify_line(line)]
    if label == "SKILL":
        result["skills"].append(line)
    elif label == "QUALIFICATION":
        result["qualifications"].append(line)
    elif label == "EXPERIENCE":
        result["experience"].append(line)


## Job posting classifications 

In [149]:
with open('Dataset_jotpars.csv', 'rb') as f:
    content = f.read()

# Decode safely, ignoring bad bytes
text = content.decode('utf-8', errors='ignore')

# Save a clean CSV
with open('Dataset_jotpars_clean.csv', 'w', encoding='utf-8') as f:
    f.write(text)

# Read the clean CSV
df_job = pd.read_csv('Dataset_jotpars_clean.csv')


In [150]:
df_job['full job post'] = df_job['requirment'] + ' ' + df_job['description']

In [151]:
df_job['full job post']

0       r2  ,  soa  ,  t-sql  ,  database  ,  security...
1       software development  ,  satellite  ,  android...
2       hibernate  ,  java developer  ,  spring  ,  sp...
3       css  ,  team player  ,  ajax  ,  javascript  ,...
4       css  ,  html5  ,  ajax  ,  oop  ,  windows pla...
                              ...                        
8792    css  ,  wordpress  ,  php  ,  html  ,  mysql  ...
8793    css  ,  wordpress  ,  wordpress cms  ,  bootst...
8794    wordpress  ,  html5  ,  symfony  ,  jquery  , ...
8795    css  ,  wordpress  ,  mobile  ,  php  ,  web d...
8796    css  ,  wordpress  ,  php  ,  html  ,  mysql  ...
Name: full job post, Length: 8797, dtype: object

In [152]:
job_lines = []

for rec in df_job['full job post'][4237:4238]:  # slice first 500
    lines_1 = [line.strip() for line in rec.split(",") or rec.split('\n')  if len(line.strip()) > 3]
    job_lines.extend(lines_1)  # add all lines to job_lines


In [153]:
result_jd = {
    "skills": [],
    "qualifications": [],
    "experience": [],
    "other":[]
}

for line in job_lines:
    label = LABELS[classify_line(line)]
    if label == "SKILL":
        result_jd["skills"].append(line)
    elif label == "QUALIFICATION":
        result_jd["qualifications"].append(line)
    elif label == "EXPERIENCE":
        result_jd["experience"].append(line)
    else:
         result_jd['other'].append(line)

# Similarity Score is generated in here 

In [155]:


from sentence_transformers import SentenceTransformer, util

# Load a pre-trained embedding model
model2 = SentenceTransformer('all-MiniLM-L6-v2')

# Encode texts
cv_embedding_skill = model2.encode(result['skills'], convert_to_tensor=True)
job_embedding_skill = model2.encode(result_jd['skills'], convert_to_tensor=True)


In [156]:
similarity_score_skill = util.cos_sim(cv_embedding_skill, job_embedding_skill).max().item()
print(f"Semantic Similarity Score skills : {similarity_score_skill*100:.2f}%")


Semantic Similarity Score skills : 43.70%


In [157]:
# quqlification meching for the cv 
cv_embedding_qualification = model2.encode(result['qualifications'], convert_to_tensor=True)
job_embedding_qualification = model2.encode(result_jd['qualifications'], convert_to_tensor=True)

similarity_score_qualification = util.cos_sim(cv_embedding_qualification,job_embedding_qualification).max().item()
print(f"Qualification Similarity Score : {similarity_score_qualification*100:.2f}%")

Qualification Similarity Score : 59.92%


In [158]:
# experience meching for the cv 
cv_embedding_experience = model2.encode(result['experience'], convert_to_tensor=True)
job_embedding_experience = model2.encode(result_jd['experience'], convert_to_tensor=True)

similarity_score_experience = util.cos_sim(cv_embedding_experience,job_embedding_experience).max().item()
print(f"Experience Similarity Score : {similarity_score_experience*100:.2f}%")

Experience Similarity Score : 66.16%
