In [1]:
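# Install the required libraries first (skip if they are already installed).
# Use %pip rather than !pip so the packages land in this kernel's environment.
# %pip install transformers datasets torch scikit-learn pandas nltk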
1
import json
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import warnings
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import json

# Start from an empty list so later cells still see a defined name if loading fails.
dataset_json = []

try:
    # JSON text is UTF-8; without an explicit encoding, open() falls back to the
    # platform locale (e.g. a Windows code page) and can mis-decode or reject
    # non-ASCII characters in the records.
    with open('data.json', 'r', encoding='utf-8') as file:
        dataset_json = json.load(file)
    print("JSON data from file:")
except FileNotFoundError:
    print("Error: The file 'data.json' was not found.")
except json.JSONDecodeError:
    print("Error: Failed to decode JSON from the file (malformed JSON).")


JSON data from file:


In [3]:
# Split the loaded records into two parallel lists: raw texts and their labels.
texts, labels = [], []
for record in dataset_json:
    texts.append(record["text"])
    labels.append(record["label"])

# Map every distinct label to an integer id; `le` is kept around so that
# predicted ids can be translated back to label names later on.
le = LabelEncoder()
le.fit(labels)
labels_encoded = le.transform(labels)

In [4]:
# Hold out 20% of the examples for validation; the fixed seed keeps the split repeatable.
split_parts = train_test_split(texts, labels_encoded, test_size=0.2, random_state=42)
train_texts, val_texts, train_labels, val_labels = split_parts

# Wrap each split as a Dataset with a "text" and a "label" column.
train_dataset = Dataset.from_dict(dict(text=train_texts, label=train_labels))
val_dataset = Dataset.from_dict(dict(text=val_texts, label=val_labels))

In [5]:
# ========================
# Step 2: Tokenization
# ========================
# Checkpoint identifier; the tokenizer below is loaded from it.
model_name = "prajjwal1/bert-tiny"  # lightweight and fast
# Tokenizer matching the checkpoint named above.
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    """Tokenize the "text" entries of a batch with the notebook-level tokenizer.

    Sequences are truncated and padded using a maximum length of 128.
    """
    tokenizer_options = {
        "padding": 'max_length',
        "truncation": True,
        "max_length": 128,
    }
    return tokenizer(batch["text"], **tokenizer_options)

# Add the tokenizer outputs to both splits.
train_dataset = train_dataset.map(tokenize, batched=True)
val_dataset = val_dataset.map(tokenize, batched=True)

# Expose only the model inputs and the label, as PyTorch tensors.
for tokenized_split in (train_dataset, val_dataset):
    tokenized_split.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

Map: 100%|████████████████████████████████████████████████████████████████| 1260/1260 [00:00<00:00, 8565.98 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████████| 316/316 [00:00<00:00, 9270.80 examples/s]


In [6]:


from transformers import AutoModelForSequenceClassification

num_labels = len(le.classes_)

# Record the human-readable label names in the model config so that a checkpoint
# written with save_pretrained() can be decoded without the in-memory
# LabelEncoder (which is not saved anywhere in the visible cells).
id2label = {idx: str(name) for idx, name in enumerate(le.classes_)}
label2id = {name: idx for idx, name in id2label.items()}

# This will print progress while downloading/loading
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
    force_download=False,  # Set True if you want to re-download
    local_files_only=False  # Allow download if not cached
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# ========================
# Step 4: Define Metrics
# ========================
def compute_metrics(p):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation pass.

    Args:
        p: Evaluation output exposing `predictions` (per-class scores, or a
            tuple whose first element holds them) and `label_ids` (true ids).

    Returns:
        dict with the keys "accuracy", "f1", "precision" and "recall".
    """
    # Predictions can arrive as a tuple when the model returns extra outputs;
    # the per-class scores are then the first element.
    logits = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    preds = logits.argmax(-1)
    # zero_division=0 yields the same scores as the default (0 for a class that
    # is never predicted) without raising a warning on every evaluation.
    precision, recall, f1, _ = precision_recall_fscore_support(
        p.label_ids, preds, average="weighted", zero_division=0
    )
    acc = accuracy_score(p.label_ids, preds)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

In [8]:
# Gather the hyperparameters in one mapping, then build the arguments object from it.
training_config = {
    "output_dir": "./cv_classifier_model",
    "num_train_epochs": 30,
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 8,
    "logging_dir": "./logs",
    "learning_rate": 2e-5,
    "load_best_model_at_end": False,  # no need for best model
}
training_args = TrainingArguments(**training_config)


In [9]:
# ========================
# Step 6: Trainer
# ========================
# Bundle the model, hyperparameters, both splits and the metric callback.
trainer_components = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_components)

In [10]:
# ========================
# Step 7: Train
# ========================
# Run fine-tuning; as the cell's last expression, the returned training summary is displayed.
trainer.train()

Step,Training Loss
500,0.7312
1000,0.2806
1500,0.1655
2000,0.1281
2500,0.117
3000,0.1044
3500,0.0959
4000,0.0938
4500,0.0919


TrainOutput(global_step=4740, training_loss=0.19546120398155245, metrics={'train_runtime': 453.1977, 'train_samples_per_second': 83.407, 'train_steps_per_second': 10.459, 'total_flos': 12009847449600.0, 'train_loss': 0.19546120398155245, 'epoch': 30.0})

In [11]:
# ========================
# Step 8: Save model & tokenizer
# ========================
# Write the model and the tokenizer files into the same directory.
save_dir = "./cv_classifier_model"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

('./cv_classifier_model\\tokenizer_config.json',
 './cv_classifier_model\\special_tokens_map.json',
 './cv_classifier_model\\vocab.txt',
 './cv_classifier_model\\added_tokens.json',
 './cv_classifier_model\\tokenizer.json')

In [12]:
# ========================
# Step 9: Example Inference
# ========================
def predict(text):
    """Classify a single piece of text and return its decoded label.

    Args:
        text: The raw string to classify.

    Returns:
        The label produced by `le.inverse_transform` for the highest-scoring class.
    """
    # Training leaves the model in train mode; switch to eval so dropout does
    # not make predictions vary between calls.
    model.eval()
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    # The model may have been moved to another device during training, so the
    # input tensors have to follow it.
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}
    # No gradients are needed for inference.
    with torch.no_grad():
        outputs = model(**inputs)
    preds = torch.argmax(outputs.logits, dim=1).item()
    label = le.inverse_transform([preds])[0]
    return label

# Smoke-test the classifier on a short sample string.
example_text = "Python , java "
predicted_label = predict(example_text)
print("Predicted Label:", predicted_label)

Predicted Label: SKILL


In [14]:
import pandas as pd 

In [15]:
# Load the CSV whose 'Resume' column supplies the unlabeled text used further down.
df_csv_test = pd.read_csv('data 02.csv')

In [16]:
# Keep everything from row position 253 onward. NOTE: this rebinds the same
# name, so re-running the cell trims another 253 rows each time.
df_csv_test = df_csv_test.iloc[253:]

In [17]:
import nltk
# Download the 'punkt' NLTK package; presumably the data behind sent_tokenize —
# confirm the resource name against the installed NLTK version.
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dehem\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [18]:
# Flatten every resume into its sentences, keeping the original order.
unlabeled_texts = [
    sentence
    for resume in df_csv_test['Resume']
    for sentence in sent_tokenize(resume)
]

In [19]:
# Step 2: Tokenize unlabeled data
# Build a text-only dataset and pass it through tokenize() in batches.
unlabeled_dataset = Dataset.from_dict(dict(text=unlabeled_texts)).map(tokenize, batched=True)
# There is no "label" column yet, so only the model inputs are exposed as tensors.
unlabeled_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])

Map: 100%|██████████████████████████████████████████████████████████████████| 882/882 [00:00<00:00, 6374.74 examples/s]


In [20]:
# Step 3: Get pseudo-labels from your trained model
# Inference setup: eval mode, and tensors are sent to whatever device the model
# currently lives on (training may have moved it off the CPU).
model.eval()
device = model.device
# Batched, unshuffled iteration keeps predictions aligned with unlabeled_texts
# while avoiding one forward pass per sentence.
inference_loader = torch.utils.data.DataLoader(unlabeled_dataset, batch_size=32)

# NOTE(review): every prediction is kept regardless of its confidence —
# confirm that unfiltered pseudo-labels are intended.
pseudo_labels = []
with torch.no_grad():
    for batch in inference_loader:
        outputs = model(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
        )
        # One predicted class id (plain int) per sentence.
        pseudo_labels.extend(torch.argmax(outputs.logits, dim=1).tolist())

In [21]:
# Step 4: Create pseudo-labeled dataset
# Pair each unlabeled sentence with the class id predicted for it.
pseudo_dataset = Dataset.from_dict(dict(text=unlabeled_texts, label=pseudo_labels))

In [35]:
# Run both datasets through tokenize(), training split first.
train_dataset, pseudo_dataset = (
    dataset.map(tokenize, batched=True) for dataset in (train_dataset, pseudo_dataset)
)

# Expose the same tensor columns on both datasets.
for dataset in (train_dataset, pseudo_dataset):
    dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])


Map: 100%|████████████████████████████████████████████████████████████████| 1260/1260 [00:00<00:00, 3099.33 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████████| 882/882 [00:00<00:00, 3508.78 examples/s]


In [36]:
from datasets import concatenate_datasets
# Step 5: Combine with original labeled data
# The labeled training set is listed first, followed by the pseudo-labeled sentences.
combined_dataset = concatenate_datasets([train_dataset, pseudo_dataset])

In [37]:
# Step 6: Retrain / fine-tune the model on combined dataset
# The same model object and training arguments are reused; only the training data differs.
retrain_components = dict(
    model=model,
    args=training_args,
    train_dataset=combined_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**retrain_components)
trainer.train()


Step,Training Loss
500,0.0782
1000,0.0719
1500,0.063
2000,0.0555
2500,0.0514
3000,0.0541
3500,0.0503
4000,0.0463
4500,0.0367
5000,0.0487


TrainOutput(global_step=8040, training_loss=0.05024357460328002, metrics={'train_runtime': 1307.4476, 'train_samples_per_second': 49.149, 'train_steps_per_second': 6.149, 'total_flos': 20416740664320.0, 'train_loss': 0.05024357460328002, 'epoch': 30.0})