In [235]:
# Install required libraries first (if not installed)
# !pip install transformers datasets torch scikit-learn
1
import json
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import warnings
warnings.filterwarnings("ignore")

In [238]:
dataset_json = [] 
import json

try:
    with open('data.json', 'r') as file:
        dataset_json = json.load(file)
    print("JSON data from file:", dataset_json)
except FileNotFoundError:
    print("Error: The file 'data.json' was not found.")
except json.JSONDecodeError:
    print("Error: Failed to decode JSON from the file (malformed JSON).")


JSON data from file: [{'text': 'As a seasoned Frontend Developer, I have a proven track record of crafting stunning and responsive user interfaces for web applications.', 'label': 'QUALIFICATION'}, {'text': 'With over 5 years of experience, I am proficient in HTML, CSS, and JavaScript.', 'label': 'EXPERIENCE'}, {'text': 'I have a deep understanding of modern frontend frameworks such as React, Vue.js, and Angular.', 'label': 'SKILL'}, {'text': 'I excel in translating design mockups into pixel-perfect, cross-browser compatible UIs, ensuring an optimal user experience across devices.', 'label': 'SKILL'}, {'text': 'Additionally, I am well-versed in CSS preprocessors like Sass and LESS, enabling me to write clean and maintainable stylesheets.', 'label': 'SKILL'}, {'text': 'My passion for UI/UX design drives me to stay updated with the latest trends and best practices in the industry, allowing me to create visually appealing and intuitive interfaces that engage users.', 'label': 'QUALIFICATI

In [239]:
# Convert to Dataset
texts = [item["text"] for item in dataset_json]
labels = [item["label"] for item in dataset_json]

# Encode labels to integers
le = LabelEncoder()
labels_encoded = le.fit_transform(labels)

In [240]:
# Train/test split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels_encoded, test_size=0.2, random_state=42
)

train_dataset = Dataset.from_dict({"text": train_texts, "label": train_labels})
val_dataset = Dataset.from_dict({"text": val_texts, "label": val_labels})

In [241]:
# ========================
# Step 2: Tokenization
# ========================
model_name = "prajjwal1/bert-tiny"  # lightweight and fast
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True, max_length=128)

train_dataset = train_dataset.map(tokenize, batched=True)
val_dataset = val_dataset.map(tokenize, batched=True)

# Set format for PyTorch
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

Map: 100%|█████████████████████████████████████████████████████████████████████| 63/63 [00:00<00:00, 654.64 examples/s]
Map: 100%|█████████████████████████████████████████████████████████████████████| 16/16 [00:00<00:00, 752.38 examples/s]


In [242]:


from transformers import AutoModelForSequenceClassification

num_labels = len(le.classes_)

# This will print progress while downloading/loading
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    force_download=False,  # Set True if you want to re-download
    local_files_only=False  # Allow download if not cached
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [243]:
# ========================
# Step 4: Define Metrics
# ========================
def compute_metrics(p):
    preds = p.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(p.label_ids, preds, average="weighted")
    acc = accuracy_score(p.label_ids, preds)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

In [244]:
training_args = TrainingArguments(
    output_dir="./cv_classifier_model",
    num_train_epochs=30,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir="./logs",
    learning_rate=2e-5,
    load_best_model_at_end=False  # no need for best model
)


In [245]:
# ========================
# Step 6: Trainer
# ========================
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

In [246]:
# ========================
# Step 7: Train
# ========================
trainer.train()

Step,Training Loss


TrainOutput(global_step=240, training_loss=1.034695307413737, metrics={'train_runtime': 30.3707, 'train_samples_per_second': 62.231, 'train_steps_per_second': 7.902, 'total_flos': 248641372980.0, 'train_loss': 1.034695307413737, 'epoch': 30.0})

In [247]:
# ========================
# Step 8: Save model & tokenizer
# ========================
model.save_pretrained("./cv_classifier_model")
tokenizer.save_pretrained("./cv_classifier_model")

('./cv_classifier_model\\tokenizer_config.json',
 './cv_classifier_model\\special_tokens_map.json',
 './cv_classifier_model\\vocab.txt',
 './cv_classifier_model\\added_tokens.json',
 './cv_classifier_model\\tokenizer.json')

In [248]:
# ========================
# Step 9: Example Inference
# ========================
def predict(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    outputs = model(**inputs)
    preds = torch.argmax(outputs.logits, dim=1).item()
    label = le.inverse_transform([preds])[0]
    return label

# Example prediction
example_text = "I have a skill in understanding of modern frontend frameworks such as Java, react "
print("Predicted Label:", predict(example_text))

Predicted Label: SKILL


In [249]:
print("hi")

hi
