In [1]:
# download data to local

import os

def detect_environment():
    """Detects the environment: Kaggle, Google Colab, or Local."""
    if "KAGGLE_KERNEL_RUN_TYPE" in os.environ:
        return "Kaggle"
    elif "COLAB_GPU" in os.environ:
        return "Google Colab"
    else:
        return "Local"

def create_folder_if_not_exists(path):
    """Creates a folder if it does not already exist."""
    if not os.path.exists(path):
        os.makedirs(path)
        print(f"Folder created at: {path}")
    else:
        print(f"Folder already exists at: {path}")

def setup_kaggle_credentials():
    """Ensures Kaggle API credentials are available."""
    if detect_environment() == "Google Colab":
      from google.colab import userdata
      os.environ["KAGGLE_KEY"] = userdata.get('KAGGLE_KEY')
      os.environ["KAGGLE_USERNAME"] = userdata.get('KAGGLE_USERNAME')
    elif detect_environment() == "Local":
      kaggle_json_path = os.path.expanduser("~/.kaggle/kaggle.json")
      if not os.path.exists(kaggle_json_path):
        # Prompt user to upload `kaggle.json`
        print("Please upload your `kaggle.json` file.")
        from google.colab import files
        uploaded = files.upload()
        with open(kaggle_json_path, "wb") as f:
            f.write(uploaded['kaggle.json'])
      os.chmod(kaggle_json_path, 0o600)  # Set correct permissions for the file

def download_kaggle_resource(input_path, resource_name):
    """
    Downloads a dataset or notebook output from Kaggle using the resource name.
    Automatically unzips if needed and logs useful diagnostic information.
    """
    from kaggle.api.kaggle_api_extended import KaggleApi
    import os

    setup_kaggle_credentials()

    # Initialize Kaggle API
    api = KaggleApi()
    api.authenticate()

    try:
        # Attempt to download as a dataset
        print(f"Attempting to download dataset: {resource_name}")
        api.dataset_download_files(resource_name, path=input_path, unzip=True)
        print(f"Dataset downloaded and extracted to {input_path}")
    except Exception as dataset_error:
        print(f"Dataset not found or inaccessible: {dataset_error}. Attempting as notebook output.")
        try:
            # Attempt to download as notebook output
            print(f"Attempting to download notebook output: {resource_name}")
            api.kernel_output(resource_name, path=input_path, unzip=True)
            print(f"Notebook output downloaded and extracted to {input_path}")
        except Exception as notebook_error:
            print(f"Failed to download resource: {resource_name}.")
            print(f"Dataset error: {dataset_error}")
            print(f"Notebook output error: {notebook_error}")
            print("Please verify the resource name, access permissions, and your Kaggle API credentials.")

In [2]:
!pip install kaggle
resource_name = "arshkon/linkedin-job-postings"


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [3]:
# Step 1: Detect environment
environment = detect_environment()
print(f"Environment detected: {environment}")

# Step 2: Create input folder
if environment == "Kaggle":
    input_path = "/kaggle/input"
elif environment == "Google Colab":
    input_path = "/content/input"
elif environment == "Local":
    input_path = "./input"
else:
    raise ValueError("Unknown environment detected!")

create_folder_if_not_exists(input_path)
print(f"Input folder path: {input_path}")

# Step 3: Create output folder
if environment == "Kaggle":
    output_path = "/kaggle/working/output"
elif environment == "Google Colab":
    output_path = "/content/output"
elif environment == "Local":
    output_path = "./output"
else:
    raise ValueError("Unknown environment detected!")

create_folder_if_not_exists(output_path)
print(f"Output folder path: {output_path}")

# Step 4: Download dataset or notebook output
if environment != "Kaggle":
    pass
    # resource_name = input("Enter the Kaggle dataset name or notebook name to download: ")
    download_kaggle_resource(input_path, resource_name)

# initialize the data prefix
if environment == "Kaggle":
    dataprefix = input_path + "/" + resource_name.split("/")[1]
elif environment == "Google Colab":
    dataprefix = input_path
elif environment == "Local":
    dataprefix = input_path
# mount cloud drive if necessary
if environment == "Google Colab":
  from google.colab import drive
  drive.mount('/content/drive')

Environment detected: Local
Folder already exists at: ./input
Input folder path: ./input
Folder already exists at: ./output
Output folder path: ./output
Attempting to download dataset: arshkon/linkedin-job-postings
Dataset URL: https://www.kaggle.com/datasets/arshkon/linkedin-job-postings
Dataset downloaded and extracted to ./input


In [4]:
import pandas as pd
# Load the Job Postings CSV file into a Pandas DataFrame
postings_df = pd.read_csv(f"{dataprefix}/postings.csv")
# Print number of rows and columns
print('jd shape=',postings_df.shape)

# Display the first few rows of the dataset to understand its structure
postings_df.head()

jd shape= (123849, 31)


Unnamed: 0,job_id,company_name,title,description,max_salary,pay_period,location,company_id,views,med_salary,...,skills_desc,listed_time,posting_domain,sponsored,work_type,currency,compensation_type,normalized_salary,zip_code,fips
0,921716,Corcoran Sawyer Smith,Marketing Coordinator,Job descriptionA leading real estate firm in N...,20.0,HOURLY,"Princeton, NJ",2774458.0,20.0,,...,Requirements: \n\nWe are seeking a College or ...,1713398000000.0,,0,FULL_TIME,USD,BASE_SALARY,38480.0,8540.0,34021.0
1,1829192,,Mental Health Therapist/Counselor,"At Aspen Therapy and Wellness , we are committ...",50.0,HOURLY,"Fort Collins, CO",,1.0,,...,,1712858000000.0,,0,FULL_TIME,USD,BASE_SALARY,83200.0,80521.0,8069.0
2,10998357,The National Exemplar,Assitant Restaurant Manager,The National Exemplar is accepting application...,65000.0,YEARLY,"Cincinnati, OH",64896719.0,8.0,,...,We are currently accepting resumes for FOH - A...,1713278000000.0,,0,FULL_TIME,USD,BASE_SALARY,55000.0,45202.0,39061.0
3,23221523,"Abrams Fensterman, LLP",Senior Elder Law / Trusts and Estates Associat...,Senior Associate Attorney - Elder Law / Trusts...,175000.0,YEARLY,"New Hyde Park, NY",766262.0,16.0,,...,This position requires a baseline understandin...,1712896000000.0,,0,FULL_TIME,USD,BASE_SALARY,157500.0,11040.0,36059.0
4,35982263,,Service Technician,Looking for HVAC service tech with experience ...,80000.0,YEARLY,"Burlington, IA",,3.0,,...,,1713452000000.0,,0,FULL_TIME,USD,BASE_SALARY,70000.0,52601.0,19057.0


In [5]:
# Load the Job Postings CSV file into a Pandas DataFrame
jimap_df = pd.read_csv(f"{dataprefix}/jobs/job_industries.csv")
ins_df = pd.read_csv(f"{dataprefix}/mappings/industries.csv")

# print the industries related to IT
IT_IND_KEYWORDS = ['computer', 'IT', 'network']
# print('|'.join(IT_IND_KEYWORDS))
ins_filter = ins_df['industry_name'].str.contains('|'.join(IT_IND_KEYWORDS))

ins_it = ins_df[ins_filter.fillna(False)]
print(ins_it)

     industry_id                          industry_name
93            96          IT Services and IT Consulting
268         1855              IT System Design Services
343         3102  IT System Custom Software Development
344         3103   IT System Operations and Maintenance
345         3104    IT System Installation and Disposal
346         3105         IT System Training and Support
347         3106                IT System Data Services
348         3107       IT System Testing and Evaluation


  ins_it = ins_df[ins_filter.fillna(False)]


In [6]:
df_itpos = pd.merge(pd.merge(ins_it,jimap_df,on='industry_id'), postings_df, on='job_id')
print(df_itpos.shape)
df_itpos.head()

(10410, 33)


Unnamed: 0,industry_id,industry_name,job_id,company_name,title,description,max_salary,pay_period,location,company_id,...,skills_desc,listed_time,posting_domain,sponsored,work_type,currency,compensation_type,normalized_salary,zip_code,fips
0,96,IT Services and IT Consulting,3884431567,"Reveille Technologies,Inc",ServiceNow Developer,Role : Senior ServiceNow DeveloperLocation – A...,,,"Austin, Texas Metropolitan Area",729238.0,...,,1712346000000.0,,0,CONTRACT,,,,,
1,96,IT Services and IT Consulting,3884916106,"Direct Sales Recruiting, LLC",Sales Manager - Public Relations & Investor Re...,Sales Manager - Public Relations & Investor Re...,,,New York City Metropolitan Area,766849.0,...,,1712346000000.0,,0,FULL_TIME,,,,,
2,96,IT Services and IT Consulting,3884431568,ApTask,Information Security Manager,Role: Information Security ManagerLocation: RE...,,,United States,2934678.0,...,,1712346000000.0,,0,FULL_TIME,,,,,
3,96,IT Services and IT Consulting,3861704803,Kastech Software Solutions Group,Business Development Manager,"Hello folks,Hope you are well and doing great,...",,,"Austin, Texas Metropolitan Area",15984730.0,...,,1712346000000.0,,0,FULL_TIME,,,,,
4,96,IT Services and IT Consulting,3884915161,"Direct Sales Recruiting, LLC",Business Development Rep - Digital Media - Chi...,Business Development Rep - Digital Media \nOur...,80000.0,YEARLY,Greater Chicago Area,766849.0,...,,1712346000000.0,,0,FULL_TIME,USD,BASE_SALARY,65000.0,,


In [7]:
postings_sample_df = df_itpos.sample(1000)
print(postings_sample_df.shape)
print(postings_sample_df.count())
print(postings_sample_df[pd.isna(postings_sample_df['skills_desc'])==False]['skills_desc'])

(1000, 33)
industry_id                   1000
industry_name                 1000
job_id                        1000
company_name                   995
title                         1000
description                   1000
max_salary                     253
pay_period                     282
location                      1000
company_id                     995
views                          987
med_salary                      29
min_salary                     253
formatted_work_type           1000
applies                        324
original_listed_time          1000
remote_allowed                 241
job_posting_url               1000
application_url                494
application_type              1000
expiry                        1000
closed_time                      9
formatted_experience_level     636
skills_desc                     15
listed_time                   1000
posting_domain                 467
sponsored                     1000
work_type                     1000
currency 

In [8]:
# Analyze the key elements from sample postings, such as responsibilities, qualifications
print("containing responsibilities:",str(postings_sample_df['description'].str.contains('responsibilities|responsibility').value_counts()[True]))
print("containing qualification:",postings_sample_df['description'].str.contains('qualification|qualifications').value_counts()[True])
print("containing keywords:",postings_sample_df['description'].str.contains('company').value_counts()[True])

# filter out the job postings that contains both responsibilities and qualifications
postings_sample_df = postings_sample_df[postings_sample_df['description'].str.contains('responsibilities|responsibility')]
postings_sample_df = postings_sample_df[postings_sample_df['description'].str.contains('qualification|qualifications')]

# Display the shape of the filtered DataFrame
print(postings_sample_df.shape)

# Display the first few rows of the filtered DataFrame
print(postings_sample_df[['job_id', 'description','job_posting_url']].head())

containing responsibilities: 182
containing qualification: 97
containing keywords: 401
(31, 33)
          job_id                                        description  \
6126  3903471780  Convene is an Equal Employment Opportunity Emp...   
1090  3889148724  Comcast brings together the best in media and ...   
6070  3904988854  Job Description\n\nQuem desempenhar a função s...   
9839  3906254078  Job Description\n\nHybrid:\n\nThis role is cat...   
8172  3905667790  Boutique buy side client is seeking a top tier...   

                                        job_posting_url  
6126  https://www.linkedin.com/jobs/view/3903471780/...  
1090  https://www.linkedin.com/jobs/view/3889148724/...  
6070  https://www.linkedin.com/jobs/view/3904988854/...  
9839  https://www.linkedin.com/jobs/view/3906254078/...  
8172  https://www.linkedin.com/jobs/view/3905667790/...  


In [9]:
JOB_DETAILS_EXTRACTOR = """
<task>
Identify the key details from a job description and company overview to create a structured JSON output. Focus on extracting the most crucial and concise information that would be most relevant for tailoring a resume to this specific job.
</task>

<job_description>
{job_description}
</job_description>

Note: The "job_duties_and_responsibilities", and "required_qualifications" sections are particularly important for resume tailoring. Ensure these are as comprehensive and accurate as possible.

{format_instructions}
"""

In [10]:
from typing import List,Optional
from pydantic import BaseModel, Field

class JobDetails(BaseModel):
    # job_title: str = Field(description="The specific role, its level, and scope within the organization.")
    job_purpose: Optional[str] = Field(description="A high-level overview of the role and why it exists in the organization.")
    # keywords: Optional[List[str]] = Field(description="Key expertise, skills, and requirements the job demands.")
    job_duties_and_responsibilities: List[str] = Field(description="Focus on essential functions, their frequency and importance, level of decision-making, areas of accountability, and any supervisory responsibilities.")
    required_qualifications: List[str] = Field(description="Including education, minimum experience, specific knowledge, skills, abilities, and any required licenses or certifications.")
    preferred_qualifications: Optional[List[str]] = Field(description="Additional \"nice-to-have\" qualifications that could set a candidate apart.")
    # eduation: Optional[str] = Field(description="The minimum education level required for the role.")
    # company_name: Optional[str] = Field(description="The name of the hiring organization.")
    # company_details: Optional[str] = Field(description="Overview, mission, values, or way of working that could be relevant for tailoring a resume or cover letter.")

In [11]:
from langchain_core.prompts import PromptTemplate
from langchain_core.messages import HumanMessage
from langchain_ollama import ChatOllama
from langchain_core.output_parsers import JsonOutputParser

llm = ChatOllama(
    model="llama3.1",
    # model="gemma2",
    temperature=0,
    # other params...
)

json_parser = JsonOutputParser(pydantic_object=JobDetails)

prompt_template = PromptTemplate(
    template=JOB_DETAILS_EXTRACTOR,
    input_variables=["job_description"],
    partial_variables={"format_instructions": json_parser.get_format_instructions()}
    )

responses = []
job_descptions_idx = []
for i in range(10):
    prompt = prompt_template.format(job_description=postings_sample_df.iloc[i]['description'])
    structured_llm = llm.with_structured_output(JobDetails)
    response = structured_llm.invoke(prompt)
    if response:
        # make sure only valid response is appended
        responses.append(response)
        job_descptions_idx.append(i)
    print(response)

job_purpose='Convene is a global lifestyle hospitality company that designs and operates premium meeting, event, and flexible office spaces.' job_duties_and_responsibilities=['Provide genuine anticipatory service to clients and participants in all aspects of hospitality and conference services.', 'Greet all clients with enthusiasm and friendliness', 'Maintain a warm and friendly demeanor at all times', 'Provide world-class service, in accordance with our Brand Standards', 'Set up, replenish, maintain, and breakdown Food and Beverage stations', 'Accommodate special client needs and last minute requests', 'Develop relationships with clients', 'Accurately answer client questions about culinary and our spaces in a friendly manner', 'Read, understand, and execute Program Execution Orders', 'Follow checklists and Standard Operating Procedures', 'Set up and breakdown conference rooms and refreshing rooms as needed', 'Perform facility maintenance', 'Maintain safe, clean, organized, and well-st

In [12]:
FILENAME_JOB_EXTRACTOR = "job_extractor.json"
FILENAME_JOB_DESCRIPTIONS = "job_descriptions.json"

output_ext = [json_parser.parse(response.json()) for response in responses]

# save responses to json
import json
with open(f"{output_path}/{FILENAME_JOB_EXTRACTOR}", "w") as f:
    json.dump(output_ext, f, indent=2)
    f.close()

# save job descriptions to json
with open(f"{output_path}/{FILENAME_JOB_DESCRIPTIONS}", "w") as f:
    json.dump(job_descptions_idx, f, indent=2)
    f.close()

In [18]:
from pydantic_core import from_json

# load job details from json
with open(f"{output_path}/{FILENAME_JOB_EXTRACTOR}", "r") as f:
    # returns JSON object as a dictionary
    data = json.load(f)

    # Closing file
    f.close()
# print(data)
# convert json to pydantic object
job_extracts = []
for d in data:
    # print(d)
    jd = from_json(json.dumps(d))
    job_extracts.append(jd)
    # print(jd)

# load job descriptions from txt
job_descptions_idx = []
with open(f"{output_path}/{FILENAME_JOB_DESCRIPTIONS}", "r") as f:
    job_descptions_idx = json.load(f)
    f.close()


* Whether the extraction(esp: skills) contains enough information from the original text 
  * leveraging existing skill extraction model to extract 
  * compare the data with the 
* Whether the extraction contains any contradictions or hallucinations from the original text.
  * use rouge score, which was used for evaluation of content conformation of text summarizaion as reference

In [14]:
!pip install rouge-score


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [42]:
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)

# print(job_extracts[0].required_qualifications)
# print(job_descptions[0]['description'])

for i in range(len(job_descptions_idx)):
    scores = []
    # print(job_extracts[i]['required_qualifications'])
    for qualification in job_extracts[i]['required_qualifications']:
        scores.append(scorer.score(postings_sample_df.iloc[job_descptions_idx[i]]['description'], qualification))
    print(sum([s['rougeL'].precision for s in scores])/len(scores))
    # print('Average score of job {i} is {:.2}'.format(sum([s['rougeL'].fmeasure for s in scores])/len(scores))

0.8913299663299664
0.7857142857142857
0.4374149659863945
0.576923076923077
0.8888417206803054
0.7857142857142857
0.903125
1.0
0.9188034188034188
0.8630952380952381


In [None]:
!pip install transformers datasets evaluate accelerate

python(27904) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [None]:
from transformers import AutoTokenizer,pipeline

model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

print(tokenizer.is_fast)

example = "My name is Sylvain and I work at Hugging Face in Brooklyn."
encoding = tokenizer(example)
print(type(encoding))
encoding.tokens()
encoding.word_ids()

start, end = encoding.word_to_chars(3)
example[start:end]

ner = pipeline("ner", grouped_entities=True)
ner("My name is Sylvain and I work at Hugging Face in Brooklyn.")

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision 4c53496 (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


True
<class 'transformers.tokenization_utils_base.BatchEncoding'>


Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use mps:0


[{'entity_group': 'PER',
  'score': 0.9981694,
  'word': 'Sylvain',
  'start': 11,
  'end': 18},
 {'entity_group': 'ORG',
  'score': 0.9796019,
  'word': 'Hugging Face',
  'start': 33,
  'end': 45},
 {'entity_group': 'LOC',
  'score': 0.9932106,
  'word': 'Brooklyn',
  'start': 49,
  'end': 57}]

In [None]:
# from transformers import AutoModel
# model = AutoModel.from_pretrained("jjzha/jobspanbert-base-cased") 
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

model_name = "dslim/bert-base-NER"
# model_name = "jjzha/jobspanbert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=32)
model = AutoModelForTokenClassification.from_pretrained(model_name)

nlp = pipeline("ner", model=model, tokenizer=tokenizer, grouped_entities=True)
example = postings_sample_df.iloc[0]['description']

ner_results = nlp(example)

# preds = [
#     {
#         "entity": pred["entity"],
#         "score": round(pred["score"], 4),
#         "index": pred["index"],
#         "word": pred["word"],
#         "start": pred["start"],
#         "end": pred["end"],
#     }
#     for pred in ner_results
# ]
# print (preds)
print(ner_results)

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use mps:0


NameError: name 'postings_sample_df' is not defined