In [3]:
!pip install transformers sentence-transformers datasets gradio



In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import pandas as pd

file_path = "/content/drive/My Drive/resume_dataset.csv"

df_resumes = pd.read_csv(file_path)

df_resumes.head()

Unnamed: 0,Category,Resume
0,Data Science,Skills * Programming Languages: Python (pandas...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...
2,Data Science,"Areas of Interest Deep Learning, Control Syste..."
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab..."


In [6]:
df_resumes.columns

Index(['Category', 'Resume'], dtype='object')

In [7]:
df_resumes.isnull().sum()

Unnamed: 0,0
Category,0
Resume,0


In [8]:
df_resumes.dropna(inplace=True)

In [9]:
df_resumes.head()

Unnamed: 0,Category,Resume
0,Data Science,Skills * Programming Languages: Python (pandas...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...
2,Data Science,"Areas of Interest Deep Learning, Control Syste..."
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab..."


In [10]:
from sentence_transformers import SentenceTransformer, util
import torch

In [11]:
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [12]:
resume_texts = df_resumes["Resume"].tolist()
resume_embeddings = embedding_model.encode(resume_texts, convert_to_tensor=True)

print(f"Generated {len(resume_embeddings)} resume embeddings")

Generated 962 resume embeddings


In [13]:
file_path_jobs = "/content/drive/My Drive/job_description.csv"
df_jobs = pd.read_csv(file_path_jobs)

df_jobs.head()

Unnamed: 0.1,Unnamed: 0,Job Title,Job Description
0,0,Flutter Developer,We are looking for hire experts flutter develo...
1,1,Django Developer,PYTHON/DJANGO (Developer/Lead) - Job Code(PDJ ...
2,2,Machine Learning,"Data Scientist (Contractor)\n\nBangalore, IN\n..."
3,3,iOS Developer,JOB DESCRIPTION:\n\nStrong framework outside o...
4,4,Full Stack Developer,job responsibility full stack engineer – react...


In [14]:
job_texts = df_jobs["Job Description"].tolist()
job_embeddings = embedding_model.encode(job_texts, convert_to_tensor=True)

print(f"Generated {len(job_embeddings)} job description embeddings")

Generated 2277 job description embeddings


In [15]:
def find_best_match(resume_text):
    resume_embedding = embedding_model.encode(resume_text, convert_to_tensor=True)
    similarity_scores = util.pytorch_cos_sim(resume_embedding, job_embeddings)
    best_match_idx = torch.argmax(similarity_scores).item()
    return df_jobs.iloc[best_match_idx]["Job Title"], df_jobs.iloc[best_match_idx]["Job Description"]

In [16]:
sample_resume = df_resumes.iloc[0]["Resume"]
matched_job, matched_description = find_best_match(sample_resume)

print("Matched Job Title:", matched_job)
print("Matched Job Description:", matched_description)

Matched Job Title: Machine Learning
Matched Job Description: Key Requirements of the Role:
Bachelor's degree in a quantitative or related field
MS/PhD in a quantitative discipline such as Statistics, Physics, Economics, Applied Math, Computer Science, Operations Research, or Computational Sciences, with coursework and projects in machine learning and data analysis
3+ years of related experience
Strong understanding of advanced data mining techniques, curating, processing and transforming data to produce sound datasets.
Strong understanding of the Machine Learning lifecycle - feature engineering, training, validation, scaling, deployment, scoring, monitoring, and feedback loop
Experience in analyzing complex problems and translating it into an analytical approach.
Experience in Supervised and Unsupervised Machine Learning including Classification, Forecasting, Anomaly detection, Pattern detection, Text Mining, using variety of techniques such as Decision trees, Time Series Analysis, Bag

In [17]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [18]:
tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [19]:
resume_feedback_data = [
    {"input": f"Resume: {resume} | Job: {job}", "output": "Your resume is strong in [X] but lacks [Y]."}
    for resume, job in zip(df_resumes["Resume"], df_jobs["Job Description"])
]

df_feedback = pd.DataFrame(resume_feedback_data)

In [20]:
from datasets import Dataset

dataset = Dataset.from_pandas(df_feedback)

def preprocess_data(examples):
    model_inputs = tokenizer(examples["input"], padding="max_length", truncation=True, max_length=512)

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["output"], padding="max_length", truncation=True, max_length=128)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

dataset = dataset.map(preprocess_data)

Map:   0%|          | 0/962 [00:00<?, ? examples/s]



In [30]:
dataset = dataset.map(preprocess_data, batched=True)
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="no",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    save_steps=1000,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
)

trainer.train()

Map:   0%|          | 0/962 [00:00<?, ? examples/s]



Step,Training Loss
500,0.0004


TrainOutput(global_step=605, training_loss=0.0003302135497085319, metrics={'train_runtime': 216.8501, 'train_samples_per_second': 22.181, 'train_steps_per_second': 2.79, 'total_flos': 650994065080320.0, 'train_loss': 0.0003302135497085319, 'epoch': 5.0})

In [31]:
from sklearn.model_selection import train_test_split
from transformers import TrainingArguments, Trainer

print("Columns in dataset before processing:", dataset.column_names)

if "input_ids" in dataset.column_names and "labels" in dataset.column_names:
    print("Dataset is already tokenized. Skipping preprocessing.")
else:
    dataset = dataset.map(preprocess_data, batched=True)

dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

train_size = int(0.8 * len(dataset))
train_dataset = dataset.select(range(train_size))
eval_dataset = dataset.select(range(train_size, len(dataset)))

print(f"Training examples: {len(train_dataset)}")
print(f"Evaluation examples: {len(eval_dataset)}")

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    save_steps=1000,
    save_total_limit=2,
    remove_unused_columns=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

trainer.train()

Columns in dataset before processing: ['input', 'output', 'input_ids', 'attention_mask', 'labels']
Dataset is already tokenized. Skipping preprocessing.
Training examples: 769
Evaluation examples: 193




Epoch,Training Loss,Validation Loss
1,No log,1e-06
2,No log,0.0
3,No log,0.0
4,No log,0.0
5,No log,0.0


TrainOutput(global_step=485, training_loss=3.700103111488303e-05, metrics={'train_runtime': 188.5755, 'train_samples_per_second': 20.39, 'train_steps_per_second': 2.572, 'total_flos': 520389226659840.0, 'train_loss': 3.700103111488303e-05, 'epoch': 5.0})

In [32]:
trainer.evaluate()

{'eval_loss': 3.779824453431502e-07,
 'eval_runtime': 2.8876,
 'eval_samples_per_second': 66.838,
 'eval_steps_per_second': 8.658,
 'epoch': 5.0}

In [258]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# ✅ Load BART, optimized for summarization
model_name = "facebook/bart-large-cnn"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

def summarize_resume(resume_text):
    """Extracts key skills and experiences from a resume using BART."""

    input_text = (
        f"{resume_text}\n\n"
    )

    # 🔹 Tokenize input
    input_ids = tokenizer(input_text, return_tensors="pt", max_length=1024, truncation=True).input_ids

    # 🔹 Generate structured summary with improved parameters
    output_ids = model.generate(
        input_ids,
        max_length=350,  # Allow detailed response
        temperature=0.82,  # ✅ Increased randomness for variation
        top_k=40,  # Controls vocabulary diversity
        top_p=0.9,  # Nucleus sampling for coherent responses
        do_sample=True  # Prevents deterministic output
    )

    # 🔹 Decode response
    summary = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    return summary

In [259]:
!pip install gradio
import gradio as gr



In [260]:
def resume_evaluator(resume_text):

    matched_job, matched_description = find_best_match(resume_text)

    feedback = summarize_resume(resume_text)

    return matched_job, feedback

iface = gr.Interface(
    fn=resume_evaluator,
    inputs=gr.Textbox(label="Paste Your Resume"),
    outputs=[gr.Textbox(label="Best Matched Tech Job"), gr.Textbox(label="Resume Summary")],
    title="AI Resume Evaluator",
    description="Upload your resume and get AI-powered feedback!"
)

iface.launch()

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://9be54476fdb90d7e31.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [261]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Define save path in Google Drive
save_directory = "/content/drive/My Drive/trained_resume_model"

# Save model & tokenizer
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

print(f"✅ Model saved at {save_directory}")



✅ Model saved at /content/drive/My Drive/trained_resume_model


In [262]:
import os

print("Saved model files:", os.listdir("/content/drive/My Drive/trained_resume_model"))

Saved model files: ['config.json', 'generation_config.json', 'model.safetensors', 'tokenizer_config.json', 'special_tokens_map.json', 'spiece.model', 'tokenizer.json', 'vocab.json', 'merges.txt', 'added_tokens.json']


In [263]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Load model from Drive
model_path = "/content/drive/My Drive/trained_resume_model"

model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

print("✅ Model loaded successfully!")



✅ Model loaded successfully!
