In [1]:
import pandas as pd

# Login using e.g. `huggingface-cli login` to access this dataset
df = pd.read_json("hf://datasets/NxtGenIntern/IT_Job_Roles_Skills_Certifications_Dataset/Top_207_IT_Job_Roles_Skills_Database.json")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df.head()

Unnamed: 0,Job Title,Job Description,Skills,Certifications
0,Admin Big Data,Responsible for managing and overseeing big da...,"Hadoop, Spark, MapReduce, Data Lakes, Data War...","Cloudera Certified Professional (CCP), Hortonw..."
1,Ansible Operations Engineer,Focuses on automating IT processes using Ansib...,"Ansible, Linux, Automation, Cloud Platforms, C...",Red Hat Certified Specialist in Ansible Automa...
2,Artifactory Administrator,Manages the Artifactory repository for build a...,"Artifactory, CI/CD, Jenkins, Docker, Maven, Gr...","JFrog Artifactory Certification, DevOps Instit..."
3,Artificial intelligence / Machine Learning Eng...,No description available,,
4,Artificial Intelligence / Machine Learning Leader,"Leads AI/ML projects and teams, defining strat...","AI Strategy, Machine Learning, Team Management...","AI-900: Microsoft Azure AI Fundamentals, Certi..."


In [3]:
df['Job Title'].nunique()

200

In [4]:
df.describe()

Unnamed: 0,Job Title,Job Description,Skills,Certifications
count,207,207,207,207.0
unique,200,194,175,125.0
top,DevOps Architect,No description available,"JavaScript, Java, Python, Git, SQL, HTML, CSS,...",
freq,2,7,8,50.0


In [5]:
df['Job Title'].value_counts()

Job Title
DevOps Architect                 2
Machine Learning Engineer        2
Splunk Engineer                  2
Senior DevOps Engineer           2
ELK Engineer                     2
                                ..
Web Designer (UI/UX Designer)    1
Web Developer                    1
WordPress Developer              1
XL Deploy Engineer               1
Zabbix Engineer                  1
Name: count, Length: 200, dtype: int64

In [6]:
df["Skills"] = df["Skills"].apply(lambda x: [s.strip() for s in x.split(",")])
df["Certifications"] = df["Certifications"].apply(lambda x: [c.strip() for c in x.split(",")])

In [7]:
# Step 1: Create an empty dictionary to hold merged data
merged_data = {}

# Step 2: Loop over each row in the DataFrame
for index, row in df.iterrows():
    job = row["Job Title"]
    skills = row["Skills"]           # should be a list
    certifications = row["Certifications"]  # should be a list
    
    if job not in merged_data:
        merged_data[job] = {
            "Skills": set(skills),
            "Certifications": set(certifications)
        }
    else:
        merged_data[job]["Skills"].update(skills)
        merged_data[job]["Certifications"].update(certifications)

# Step 3: Convert the dictionary back to a DataFrame
df_merged = pd.DataFrame({
    "Job Title": list(merged_data.keys()),
    "Skills": [list(v["Skills"]) for v in merged_data.values()],
    "Certifications": [list(v["Certifications"]) for v in merged_data.values()]
})

df_merged.head()


Unnamed: 0,Job Title,Skills,Certifications
0,Admin Big Data,"[AWS, MapReduce, Data Governance, Cloud Comput...","[Hortonworks Certified Associate (HCA), AWS Ce..."
1,Ansible Operations Engineer,"[AWS, CI/CD, Linux, Kubernetes, Terraform, Net...",[Red Hat Certified Specialist in Ansible Autom...
2,Artifactory Administrator,"[Gradle, Maven, CI/CD, Cloud Computing, Linux,...","[JFrog Artifactory Certification, DevOps Insti..."
3,Artificial intelligence / Machine Learning Eng...,[],[]
4,Artificial Intelligence / Machine Learning Leader,"[AI Strategy, Stakeholder Management, Cloud Co...",[Certified Artificial Intelligence Practitione...


In [8]:
df_merged.describe()

Unnamed: 0,Job Title,Skills,Certifications
count,200,200,200
unique,200,174,121
top,Admin Big Data,"[Algorithms, SQL, HTML, Data Structures, Git, ...",[]
freq,1,8,50


In [9]:
skill_dict = {}

for index, row in df.iterrows():
    job = row["Job Title"]
    for skill in row["Skills"]:
        skill_lower = skill.lower()
        if skill_lower not in skill_dict:
            skill_dict[skill_lower] = [job]
        else:
            skill_dict[skill_lower].append(job)

print(skill_dict)

{'hadoop': ['Admin Big Data', 'Big Data Architect', 'Big Data Engineer', 'Big Data Specialist', 'Principle Engineer in Big Data', 'Data Engineer', 'DATA SCIENTIST'], 'spark': ['Admin Big Data', 'Big Data Architect', 'Big Data Engineer', 'Big Data Specialist', 'Principle Engineer in Big Data', 'Data Engineer', 'DATA SCIENTIST'], 'mapreduce': ['Admin Big Data', 'Big Data Architect'], 'data lakes': ['Admin Big Data', 'Big Data Architect'], 'data warehousing': ['Admin Big Data', 'Big Data Architect', 'Big Data Engineer', 'Data Architect', 'Principle Engineer in Big Data', 'Data Engineer', 'DATA MODELER'], 'big data architecture': ['Admin Big Data', 'Principle Engineer in Big Data'], 'nosql': ['Admin Big Data', 'Big Data Architect', 'Big Data Engineer', 'Big Data Specialist', 'Data Architect'], 'data modeling': ['Admin Big Data', 'Big Data Architect', 'Big Data Specialist', 'Data Analysts', 'Data Architect', 'Principle Engineer in Data Analysis', 'DATA ANALYST', 'Data Engineer', 'DATA MODEL

In [13]:
resume_skills = ["C#"]

# Normalize to lowercase
resume_skills = [s.lower() for s in resume_skills]

# Count how many skills match per job
job_match_count = {}

for skill in resume_skills:
    if skill in skill_dict:
        for job in skill_dict[skill]:
            job_match_count[job] = job_match_count.get(job, 0) + 1

job_match_list = list(job_match_count.items())
job_match_list.sort(key=lambda item: item[1], reverse=True)
job_match_sorted = job_match_list
for job, count in job_match_sorted:
    print(f"{job}: {count} matching skills")

print(job_match_sorted)


.NET Developer: 1 matching skills
C# Developer: 1 matching skills
Game Developer: 1 matching skills
Sharepoint Developer: 1 matching skills
Unity Developer: 1 matching skills
[('.NET Developer', 1), ('C# Developer', 1), ('Game Developer', 1), ('Sharepoint Developer', 1), ('Unity Developer', 1)]
