In [None]:
!pip install transformers sentence-transformers datasets gradio



In [None]:
# Mount Google Drive under /content/drive; the CSV paths used later point into it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd

# Location of the resume dataset on the mounted Drive.
file_path = "/content/drive/My Drive/resume_dataset.csv"

# Load the resumes into a DataFrame.
df_resumes = pd.read_csv(file_path)

# Preview the first rows.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like a saved
# index written into the CSV — confirm, and consider index_col=0 if so.
df_resumes.head()

Unnamed: 0,Category,Resume
0,Data Science,Skills * Programming Languages: Python (pandas...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...
2,Data Science,"Areas of Interest Deep Learning, Control Syste..."
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab..."


In [None]:
# Show the column labels of the resume frame.
df_resumes.columns

Index(['Category', 'Resume'], dtype='object')

In [None]:
# Count missing values per column before cleaning.
df_resumes.isnull().sum()

Unnamed: 0,0
Category,0
Resume,0


In [None]:
# Drop rows with missing values by rebinding the name rather than mutating in place,
# so the frame object displayed by earlier cells is not changed underneath the reader.
df_resumes = df_resumes.dropna()

In [None]:
# Re-inspect the first rows after dropping incomplete records.
df_resumes.head()

Unnamed: 0,Category,Resume
0,Data Science,Skills * Programming Languages: Python (pandas...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...
2,Data Science,"Areas of Interest Deep Learning, Control Syste..."
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab..."


In [None]:
from sentence_transformers import SentenceTransformer, util
import torch

In [None]:
# Load the "all-MiniLM-L6-v2" sentence-embedding model; it is used below to embed
# both the resumes and the job descriptions.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Embed every resume in one call; convert_to_tensor=True returns a tensor rather than a list.
resume_texts = df_resumes["Resume"].tolist()
resume_embeddings = embedding_model.encode(resume_texts, convert_to_tensor=True)

# Report how many embeddings were produced (one per resume row).
print(f"Generated {len(resume_embeddings)} resume embeddings")

Generated 962 resume embeddings


In [None]:
# Load the job-description dataset from the mounted Drive.
file_path_jobs = "/content/drive/My Drive/job_description.csv"
df_jobs = pd.read_csv(file_path_jobs)

# Preview the first rows.
# NOTE(review): the preview shows "Unnamed: 0.1" and "Unnamed: 0" columns, which look
# like saved index columns — confirm before relying on them.
df_jobs.head()

Unnamed: 0.1,Unnamed: 0,Job Title,Job Description
0,0,Flutter Developer,We are looking for hire experts flutter develo...
1,1,Django Developer,PYTHON/DJANGO (Developer/Lead) - Job Code(PDJ ...
2,2,Machine Learning,"Data Scientist (Contractor)\n\nBangalore, IN\n..."
3,3,iOS Developer,JOB DESCRIPTION:\n\nStrong framework outside o...
4,4,Full Stack Developer,job responsibility full stack engineer – react...


In [None]:
# Embed every job description once up front; find_best_match compares resumes against this tensor.
job_texts = df_jobs["Job Description"].tolist()
job_embeddings = embedding_model.encode(job_texts, convert_to_tensor=True)

# Report how many embeddings were produced (one per job row).
print(f"Generated {len(job_embeddings)} job description embeddings")

Generated 2277 job description embeddings


In [None]:
def find_best_match(resume_text):
    """Return the job posting most similar to the given resume.

    The resume is embedded with ``embedding_model`` and scored by cosine
    similarity against the precomputed ``job_embeddings``.

    Args:
        resume_text: Resume content to match.

    Returns:
        A ``(job_title, job_description)`` pair read from the ``df_jobs`` row
        with the highest similarity score.
    """
    query_vector = embedding_model.encode(resume_text, convert_to_tensor=True)
    scores = util.pytorch_cos_sim(query_vector, job_embeddings)

    # Look the winning row up once, then read both fields from it.
    top_row = df_jobs.iloc[torch.argmax(scores).item()]
    return top_row["Job Title"], top_row["Job Description"]

In [None]:
# Smoke-test the matcher with the first resume in the dataset.
sample_resume = df_resumes.iloc[0]["Resume"]
matched_job, matched_description = find_best_match(sample_resume)

# Show the title and full description of the best-scoring job.
print("Matched Job Title:", matched_job)
print("Matched Job Description:", matched_description)

Matched Job Title: Machine Learning
Matched Job Description: Key Requirements of the Role:
Bachelor's degree in a quantitative or related field
MS/PhD in a quantitative discipline such as Statistics, Physics, Economics, Applied Math, Computer Science, Operations Research, or Computational Sciences, with coursework and projects in machine learning and data analysis
3+ years of related experience
Strong understanding of advanced data mining techniques, curating, processing and transforming data to produce sound datasets.
Strong understanding of the Machine Learning lifecycle - feature engineering, training, validation, scaling, deployment, scoring, monitoring, and feedback loop
Experience in analyzing complex problems and translating it into an analytical approach.
Experience in Supervised and Unsupervised Machine Learning including Classification, Forecasting, Anomaly detection, Pattern detection, Text Mining, using variety of techniques such as Decision trees, Time Series Analysis, Bag

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [None]:
# Load the pretrained t5-small tokenizer and seq2seq model; the training cells below fine-tune this model.
tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
# Build the (input, output) text pairs used to fine-tune the feedback model.
# NOTE(review): zip() pairs resumes and job descriptions purely by row position and
# stops at the shorter column, so the pairs are not matched by relevance.
# NOTE(review): every target is the same placeholder sentence, so a model trained on
# this data can only learn to emit that constant string — replace with real feedback labels.
resume_feedback_data = [
    {"input": f"Resume: {resume} | Job: {job}", "output": "Your resume is strong in [X] but lacks [Y]."}
    for resume, job in zip(df_resumes["Resume"], df_jobs["Job Description"])
]

# Tabular form of the pairs, with "input" and "output" columns.
df_feedback = pd.DataFrame(resume_feedback_data)

In [None]:
from datasets import Dataset

# Wrap the feedback pairs in a Hugging Face Dataset so they can be tokenized with .map().
dataset = Dataset.from_pandas(df_feedback)

def preprocess_data(examples):
    """Tokenize prompt/target text into model inputs and loss-ready labels.

    Accepts either a single example (string fields, as passed by a non-batched
    ``Dataset.map``) or a batch (lists of strings, as passed with ``batched=True``).

    Args:
        examples: Mapping with an "input" field (prompt text) and an "output"
            field (target text).

    Returns:
        The tokenizer output for the prompt (padded/truncated to 512 tokens)
        with an added "labels" entry holding the target token ids
        (padded/truncated to 128 tokens), where padding positions are set to
        -100 so the loss ignores them.
    """
    model_inputs = tokenizer(examples["input"], padding="max_length", truncation=True, max_length=512)

    # `text_target=` is the supported replacement for the deprecated
    # `tokenizer.as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=examples["output"], padding="max_length", truncation=True, max_length=128)

    pad_id = tokenizer.pad_token_id

    def _mask_padding(token_ids):
        # -100 is the index ignored by the cross-entropy loss; without this the
        # model is also trained (and scored) on predicting padding tokens.
        return [-100 if token_id == pad_id else token_id for token_id in token_ids]

    label_ids = labels["input_ids"]
    if label_ids and isinstance(label_ids[0], (list, tuple)):
        # Batched call: one list of ids per example.
        model_inputs["labels"] = [_mask_padding(sequence) for sequence in label_ids]
    else:
        # Single-example call: a flat list of ids.
        model_inputs["labels"] = _mask_padding(label_ids)
    return model_inputs

# Tokenize in batches: same resulting columns as a per-row map, with far fewer tokenizer calls.
dataset = dataset.map(preprocess_data, batched=True)

Map:   0%|          | 0/962 [00:00<?, ? examples/s]

In [None]:
# Tokenize in batches and expose only the tensor columns the model consumes.
# NOTE(review): the previous cell already mapped preprocess_data over this dataset,
# so this call re-tokenizes the same rows.
dataset = dataset.map(preprocess_data, batched=True)
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

from transformers import TrainingArguments, Trainer

# Training configuration for a first run on the full dataset with evaluation disabled.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="no",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    save_steps=1000,
    save_total_limit=2,
)

# No eval_dataset is passed: every row of `dataset` is used for training here.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
)

# Fine-tunes `model` in place; later cells reuse the same model object.
trainer.train()

Map:   0%|          | 0/962 [00:00<?, ? examples/s]

Step,Training Loss


TrainOutput(global_step=363, training_loss=0.0014248623335657042, metrics={'train_runtime': 135.2236, 'train_samples_per_second': 21.342, 'train_steps_per_second': 2.684, 'total_flos': 390596439048192.0, 'train_loss': 0.0014248623335657042, 'epoch': 3.0})

In [None]:
from sklearn.model_selection import train_test_split
from transformers import TrainingArguments, Trainer

print("Columns in dataset before processing:", dataset.column_names)

# Tokenize only if an earlier cell has not already produced the model columns.
if "input_ids" in dataset.column_names and "labels" in dataset.column_names:
    print("Dataset is already tokenized. Skipping preprocessing.")
else:
    dataset = dataset.map(preprocess_data, batched=True)

dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# Positional 80/20 split: the first 80% of rows train, the remaining rows evaluate.
# NOTE(review): rows are not shuffled before splitting — confirm the row order is not
# grouped (e.g. by category), otherwise the evaluation slice is not representative.
train_size = int(0.8 * len(dataset))
train_dataset = dataset.select(range(train_size))
eval_dataset = dataset.select(range(train_size, len(dataset)))

print(f"Training examples: {len(train_dataset)}")
print(f"Evaluation examples: {len(eval_dataset)}")

# `eval_strategy` is the current name of this option (the earlier training cell
# already uses it); `evaluation_strategy` is the deprecated spelling.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    save_steps=1000,
    save_total_limit=2,
    remove_unused_columns=False,
)

# NOTE(review): `model` is the same object the earlier cell already fine-tuned on the
# full dataset, which includes these evaluation rows — the validation loss reported
# here is therefore not a held-out measurement.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

trainer.train()

Columns in dataset before processing: ['input_ids', 'attention_mask', 'labels']
Dataset is already tokenized. Skipping preprocessing.
Training examples: 769
Evaluation examples: 193




Epoch,Training Loss,Validation Loss
1,No log,4e-06
2,No log,2e-06
3,No log,2e-06


TrainOutput(global_step=291, training_loss=0.00023835638851644247, metrics={'train_runtime': 116.9474, 'train_samples_per_second': 19.727, 'train_steps_per_second': 2.488, 'total_flos': 312233535995904.0, 'train_loss': 0.00023835638851644247, 'epoch': 3.0})

In [None]:
# Evaluate the fine-tuned model on the trainer's eval_dataset and display the metrics.
trainer.evaluate()

{'eval_loss': 2.0033714918099577e-06,
 'eval_runtime': 2.9997,
 'eval_samples_per_second': 64.341,
 'eval_steps_per_second': 8.334,
 'epoch': 3.0}

In [None]:
def generate_feedback(resume_text, job_description):
    """Generate feedback text for a resume measured against a job description.

    Args:
        resume_text: Resume content.
        job_description: Description of the job the resume is compared with.

    Returns:
        The decoded model output as a string, with special tokens removed.
    """
    prompt = f"Resume: {resume_text} | Job: {job_description}"

    # Tokenize (truncating to 512 tokens) and move the ids to the model's device.
    encoded = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)
    prompt_ids = encoded.input_ids.to(model.device)

    generated = model.generate(prompt_ids, max_length=150)
    return tokenizer.decode(generated[0], skip_special_tokens=True)

# Smoke-test feedback generation with the first resume and the first job description.
sample_resume = df_resumes.iloc[0]["Resume"]
sample_job = df_jobs.iloc[0]["Job Description"]

feedback = generate_feedback(sample_resume, sample_job)
print("Generated Feedback:", feedback)

Generated Feedback: Your resume is strong in [X] but lacks [Y].


In [None]:
# gradio is already listed in the install command of the notebook's first cell;
# this repeated install is left unchanged.
!pip install gradio
import gradio as gr



In [None]:
def evaluate_resume_with_feedback(resume_text):
    """Match a resume to a job and generate feedback against that job.

    Args:
        resume_text: Resume content pasted by the user.

    Returns:
        A ``(matched_job_title, feedback)`` pair. For empty or whitespace-only
        input the title is an empty string and the feedback slot carries a
        message asking for resume text.
    """
    # Guard against an empty submission instead of matching a blank string.
    if not resume_text or not resume_text.strip():
        return "", "Please paste your resume text to get a match and feedback."

    matched_job, matched_description = find_best_match(resume_text)
    feedback = generate_feedback(resume_text, matched_description)
    return matched_job, feedback


def resume_evaluator(resume_text):
    """Return the title of the job that best matches the resume.

    Signature and return value are unchanged. The feedback that used to be
    generated here and then thrown away is now produced (and returned) by
    ``evaluate_resume_with_feedback`` instead.

    Args:
        resume_text: Resume content.

    Returns:
        The best-matching job title.
    """
    matched_job, _ = find_best_match(resume_text)
    return matched_job


# Two output boxes, one per element of the tuple returned by the callback, so the
# generated feedback promised in the description is actually shown to the user.
iface = gr.Interface(
    fn=evaluate_resume_with_feedback,
    inputs=gr.Textbox(label="Paste Your Resume"),
    outputs=[
        gr.Textbox(label="Best Matched Technology Job"),
        gr.Textbox(label="Resume Feedback"),
    ],
    title="AI Resume Evaluator",
    description="Upload your resume and get AI-powered feedback!"
)

iface.launch()

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://38e054c3240367373f.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


