# Install and load the libraries.

In [1]:
!pip install -q chromadb==0.4.22
!pip install -q langchain==0.1.4
!pip install -q sentence_transformers==2.3.0
!pip install -q accelerate==0.26.1

# Load the Dataset

In [2]:
import numpy as np
import pandas as pd

In [None]:
#Download and unzip  the dataset from kaggle:
#https://www.kaggle.com/datasets/snehaanbhawal/resume-dataset?resource=download

#Pass the directory where the .csv file is stored to read_csv
resumes = pd.read_csv('../data/Resume.csv', sep=',')

#Using plain text resumes for simplicity
DOCUMENT="Resume_str" #Replace with "Resume_html" for HTML resumes
CATEGORY="Category"
ID="ID"

In [4]:
resumes.head(2)

Unnamed: 0,ID,Resume_str,Resume_html,Category
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR


In [5]:
#Show distinct resume categories
categories = resumes[CATEGORY].unique()
print(f"Categories: {categories}")

Categories: ['HR' 'DESIGNER' 'INFORMATION-TECHNOLOGY' 'TEACHER' 'ADVOCATE'
 'BUSINESS-DEVELOPMENT' 'HEALTHCARE' 'FITNESS' 'AGRICULTURE' 'BPO' 'SALES'
 'CONSULTANT' 'DIGITAL-MEDIA' 'AUTOMOBILE' 'CHEF' 'FINANCE' 'APPAREL'
 'ENGINEERING' 'ACCOUNTANT' 'CONSTRUCTION' 'PUBLIC-RELATIONS' 'BANKING'
 'ARTS' 'AVIATION']


In [6]:
#Because it is just a example we select a small portion of resumes
#Per the 3/11/25 meeting, we'll focus on tech resumes
CATEGORY = "INFORMATION-TECHNOLOGY"
subset_resumes = resumes[resumes["Category"] == CATEGORY].reset_index(drop=True)

In [7]:
subset_resumes.head(2)

Unnamed: 0,ID,Resume_str,Resume_html,Category
0,36856210,INFORMATION TECHNOLOGY Summar...,"<div class=""fontsize fontface vmargins hmargin...",INFORMATION-TECHNOLOGY
1,21780877,INFORMATION TECHNOLOGY SPECIALIST\tGS...,"<div class=""fontsize fontface vmargins hmargin...",INFORMATION-TECHNOLOGY


## CREATE THE DOCUMENT FROM THE DATAFRAME

In [8]:
from langchain.document_loaders import DataFrameLoader
from langchain.vectorstores import Chroma


First, we create the loader, indicating the data source and the name of the column in the DataFrame where we store what we could consider as the document, that is, the information we want to pass to the model so that it takes it into account in its responses.

In [9]:
df_loader = DataFrameLoader(subset_resumes, page_content_column=DOCUMENT)

Then, we use the loader to load the document.

In [10]:
df_document = df_loader.load()

In [11]:
display(df_document[:2])

[Document(page_content="         INFORMATION TECHNOLOGY         Summary     Dedicated  Information Assurance Professional \xa0well-versed in analyzing and mitigating risk and finding cost-effective solutions. Excels at boosting performance and productivity by establishing realistic goals and enforcing deadlines.\xa0 Versatile IT professional with 37 years of Enterprise design and engineering methodology.       Skills          Enterprise platforms  Knowledge of Product Lifecycle Management (PLM)  Project tracking  Hardware and software upgrade planning  Product requirements documentation  Self-directed  MS Visio  Decisive  Collaborative  Domain Active Directory Layout  Data storage engineering      Information Assurance  Risk Management Framework (RMF)  Active Directory design and deployment  Workstation build and deployment  Systems Accreditation Packages  Red Hat Enterprise Linux installation and hardening  Network Design & Troubleshooting  \xa0High Performance Computing            Ex

# Creating the embeddings

We split the data into manageable chunks to store as vectors using **CharacterTextSplitter**. There isn't an exact way to do this, more chunks means more detailed context, but will increase the size of our vectorstore.

There are no magic numbers to inform. It is important to consider that the larger the chunk size, the more context the model will have, but the size of our vector store will also increase.

In [33]:
from langchain.text_splitter import CharacterTextSplitter

In [34]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=3000,  # Large enough to keep resume context
    chunk_overlap=400  # Allows minor overlap for coherence
)

texts = text_splitter.split_documents(df_document)

In [35]:
display(texts[:2])

[Document(page_content='INFORMATION TECHNOLOGY         Summary     Dedicated  Information Assurance Professional \xa0well-versed in analyzing and mitigating risk and finding cost-effective solutions. Excels at boosting performance and productivity by establishing realistic goals and enforcing deadlines.\xa0 Versatile IT professional with 37 years of Enterprise design and engineering methodology.       Skills          Enterprise platforms  Knowledge of Product Lifecycle Management (PLM)  Project tracking  Hardware and software upgrade planning  Product requirements documentation  Self-directed  MS Visio  Decisive  Collaborative  Domain Active Directory Layout  Data storage engineering      Information Assurance  Risk Management Framework (RMF)  Active Directory design and deployment  Workstation build and deployment  Systems Accreditation Packages  Red Hat Enterprise Linux installation and hardening  Network Design & Troubleshooting  \xa0High Performance Computing            Experience 

We load the library to create the pre trained model from HuggingFace to create the embeddings from sentences.


In [13]:
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

  from .autonotebook import tqdm as notebook_tqdm


# Creating the Index With Chroma
Here we are creating the index of embeddings. Using the document, and the embedding function created above.

In [36]:
directory_cdb = './chromadb'
chroma_db = Chroma.from_documents(
    texts, # using the entire df_document is too slow
    embedding_function, 
    persist_directory=directory_cdb
)

## LANGCHAIN

### Using the new LCEL Architecture from LangChain.
LangChain recommends using LCEL (LangChain Expression Language) over Chains. 

In [15]:
from langchain.llms import HuggingFacePipeline
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

Now we create the retriever object, the responsible to return the data contained in the ChromaDB Database.

In [37]:
retriever = chroma_db.as_retriever()

In [17]:
!pip install huggingface_hub==0.23.0

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting huggingface_hub==0.23.0
  Using cached huggingface_hub-0.23.0-py3-none-any.whl.metadata (12 kB)
Using cached huggingface_hub-0.23.0-py3-none-any.whl (401 kB)
Installing collected packages: huggingface_hub
  Attempting uninstall: huggingface_hub
    Found existing installation: huggingface-hub 0.29.3
    Uninstalling huggingface-hub-0.29.3:
      Successfully uninstalled huggingface-hub-0.29.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
transformers 4.49.0 requires huggingface-hub<1.0,>=0.26.0, but you have huggingface-hub 0.23.0 which is incompatible.[0m[31m
[0mSuccessfully installed huggingface_hub-0.23.0


In [18]:
from getpass import getpass
hf_key = getpass("Hugging Face Key: ")

In [19]:
!huggingface-cli login --token $hf_key

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /Users/ebengunadi/.cache/huggingface/token
Login successful


## Importing LangChain Libraries.

In [20]:
from langchain import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

from langchain.llms import HuggingFacePipeline
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

In [21]:
import torch
from torch import cuda

In [22]:
#In a MAC Silicon the device must be 'mps'
# device = torch.device('mps') #to use with MAC Silicon
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

In [23]:
device

'cpu'

##Load the Model .

In [24]:
#You can try with any llama model, but you will need more GPU and memory as you
#increase the size of the model.
model_id = "meta-llama/Llama-3.2-1B-Instruct" #As agreed per the 3/11/25 meeting

In [25]:
%%time

# begin initializing HF items, need auth token for these
model_config = transformers.AutoConfig.from_pretrained(
    model_id,
    token=hf_key
)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    device_map='auto',
    token=hf_key
)
model.eval()
print(f"Model loaded on {device}")


Model loaded on cpu


In [26]:
tokenizer = AutoTokenizer.from_pretrained(model_id,
                                          use_aut_token=hf_key)


In [27]:
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=128,
    temperature=0.1,
    #do_sample=False,
    top_p=0,
    #trust_remote_code=True,
    eos_token_id=tokenizer.eos_token_id,
    repetition_penalty=1.1,
    return_full_text=True,
    device_map='auto'
)

hf_llm = HuggingFacePipeline(pipeline=pipe)

Device set to use mps


## Testing

Query documents from Vector DB

In [38]:
retrieved_docs = retriever.get_relevant_documents("Tech resume summary:")
for i, doc in enumerate(retrieved_docs):
    print(f"Doc {i+1}:\n{doc.page_content}\n")

Doc 1:
Microsoft Certified Professional, Tech Skills   :     June 1999            Associate of Science   :   Computer Technology  ,   June 1993    American River College          Computer Technology        Accomplishments      Guided company to comply with PCI Data Security Standard and got it certifiedMigrated Analog phone system to VOIP, saving over $40k a year in costMigrated 80% of physical servers to Hyper-V to save cost and improve productivity.        Skills    Active Directory, Adobe, Antivirus, Backup Exec, Backup, Budget, business processes, call center, call center, capacity planning, Cisco, computer assembly, Hardware, contracts, Direct Attached Storage, DAS, Database, Dell, Dell Servers, Designing, Desktops, Documentation, Firewall, HP, hiring, information systems, information systems, ISO 9000, Leadership, Linux, Logistics, Logistics and Management, Mac, MAC OS, Managing, Access, Microsoft Certified Professional, Microsoft Exchange, Microsoft Office Professional, office, 

Have LLM generate a summary without RAG

In [30]:
%%time
response = hf_llm("Tech resume summary:")
print(response)

  warn_deprecated(
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


 Highlighting technical skills and experience in software development, data analysis, machine learning, cloud computing, cybersecurity, and IT infrastructure.

**Summary**

Highly skilled software developer with 5+ years of experience in designing, developing, testing, and deploying scalable and secure software applications. Proficient in a range of programming languages, including Java, Python, C++, and JavaScript. Strong expertise in data analysis using tools like Excel, SQL, and Tableau. Profound knowledge of machine learning algorithms and frameworks such as TensorFlow, PyTorch, and Scikit-learn. Skilled in cloud computing platforms like AWS, Azure, and Google Cloud Platform
CPU times: user 7.71 s, sys: 1.21 s, total: 8.92 s
Wall time: 16.5 s


In [39]:
%%time

# Function to clean and format retrieved documents
def format_docs(docs):
    return "\n\n".join([doc.page_content.strip() for doc in docs])

# Update the LCEL chain
chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}  # Apply formatting
    | ChatPromptTemplate.from_template(
        "Answer the question based on the following context:\n{context}\n\nQuestion: {question}"
    )
    | hf_llm
    | StrOutputParser()
)

response = chain.invoke("Tech resume summary:")
print(response)

Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


 What are some of your accomplishments in the field of information technology?

Answer: Based on the provided tech resume, here are some of the notable accomplishments:

* Designed and implemented a new security infrastructure, reducing costs by 30%
* Developed and maintained a robust security architecture, ensuring compliance with industry standards
* Implemented a new vulnerability assessment platform, resulting in a 50% reduction in security breaches
* Coordinated the migration of 500+ users to a new email server, improving email efficiency and reducing downtime
* Conducted a thorough risk assessment and developed a mitigation plan, identifying and addressing potential security threats
* Collaborated with cross-functional
CPU times: user 11.7 s, sys: 3.1 s, total: 14.8 s
Wall time: 42.9 s
