In [3]:
import pandas as pd
from pathlib import Path

# Folder layout is resolved from the parent of the current working directory.
# NOTE(review): presumably the notebook is launched from a sub-folder of the
# project root — confirm before running from another location.
BASE_DIR = Path.cwd().parent
DATA_DIR = BASE_DIR.joinpath("data")
RAW_DATA_DIR = DATA_DIR.joinpath("raw")
PROCESSED_DATA_DIR = DATA_DIR.joinpath("processed")

# Load the raw job postings into the working frame used by the cells below.
csv_path = RAW_DATA_DIR.joinpath("us_tech_jobs.csv")
df = pd.read_csv(csv_path)

# Preview the first rows to check that the file parsed as expected.
df.head()


Unnamed: 0,advertiserurl,company,employmenttype_jobstatus,jobdescription,jobid,joblocation_address,jobtitle,postdate,shift,site_name,skills,uniq_id
0,https://www.dice.com/jobs/detail/AUTOMATION-TE...,"Digital Intelligence Systems, LLC","C2H Corp-To-Corp, C2H Independent, C2H W2, 3 M...",Looking for Selenium engineers...must have sol...,Dice Id : 10110693,"Atlanta, GA",AUTOMATION TEST ENGINEER,1 hour ago,Telecommuting not available|Travel not required,,SEE BELOW,418ff92580b270ef4e7c14f0ddfc36b4
1,https://www.dice.com/jobs/detail/Information-S...,University of Chicago/IT Services,Full Time,The University of Chicago has a rapidly growin...,Dice Id : 10114469,"Chicago, IL",Information Security Engineer,1 week ago,Telecommuting not available|Travel not required,,"linux/unix, network monitoring, incident respo...",8aec88cba08d53da65ab99cf20f6f9d9
2,https://www.dice.com/jobs/detail/Business-Solu...,"Galaxy Systems, Inc.",Full Time,"GalaxE.SolutionsEvery day, our solutions affec...",Dice Id : CXGALXYS,"Schaumburg, IL",Business Solutions Architect,2 weeks ago,Telecommuting not available|Travel not required,,"Enterprise Solutions Architecture, business in...",46baa1f69ac07779274bcd90b85d9a72
3,https://www.dice.com/jobs/detail/Java-Develope...,TransTech LLC,Full Time,Java DeveloperFull-time/direct-hireBolingbrook...,Dice Id : 10113627,"Bolingbrook, IL","Java Developer (mid level)- FT- GREAT culture,...",2 weeks ago,Telecommuting not available|Travel not required,,Please see job description,3941b2f206ae0f900c4fba4ac0b18719
4,https://www.dice.com/jobs/detail/DevOps-Engine...,Matrix Resources,Full Time,Midtown based high tech firm has an immediate ...,Dice Id : matrixga,"Atlanta, GA",DevOps Engineer,48 minutes ago,Telecommuting not available|Travel not required,,"Configuration Management, Developer, Linux, Ma...",45efa1f6bc65acc32bbbb953a1ed13b7


Temizlik 1 — Kolonları at

In [4]:
# Drop the columns that the cleaning cells further down never read.
# errors="ignore" keeps this cell idempotent: because it rebinds `df`, running
# it a second time would otherwise raise KeyError for the already-removed
# columns.
df = df.drop(
    columns=["site_name", "shift", "advertiserurl", "jobid", "uniq_id"],
    errors="ignore",
)
df.head()


Unnamed: 0,company,employmenttype_jobstatus,jobdescription,joblocation_address,jobtitle,postdate,skills
0,"Digital Intelligence Systems, LLC","C2H Corp-To-Corp, C2H Independent, C2H W2, 3 M...",Looking for Selenium engineers...must have sol...,"Atlanta, GA",AUTOMATION TEST ENGINEER,1 hour ago,SEE BELOW
1,University of Chicago/IT Services,Full Time,The University of Chicago has a rapidly growin...,"Chicago, IL",Information Security Engineer,1 week ago,"linux/unix, network monitoring, incident respo..."
2,"Galaxy Systems, Inc.",Full Time,"GalaxE.SolutionsEvery day, our solutions affec...","Schaumburg, IL",Business Solutions Architect,2 weeks ago,"Enterprise Solutions Architecture, business in..."
3,TransTech LLC,Full Time,Java DeveloperFull-time/direct-hireBolingbrook...,"Bolingbrook, IL","Java Developer (mid level)- FT- GREAT culture,...",2 weeks ago,Please see job description
4,Matrix Resources,Full Time,Midtown based high tech firm has an immediate ...,"Atlanta, GA",DevOps Engineer,48 minutes ago,"Configuration Management, Developer, Linux, Ma..."


Temizlik 2 — jobdescription normalize

In [5]:
import re

def clean_text(text):
    """Normalize a free-text job description.

    Steps, in order: lowercase, remove URLs, replace every character that is
    not an ASCII letter, digit or whitespace with a space, collapse runs of
    whitespace into a single space, and trim both ends.

    Args:
        text: Raw description. Anything that is not a ``str`` (for example
            the float NaN pandas uses for empty CSV cells, or ``None``) is
            treated as missing.

    Returns:
        str: The cleaned text, or ``""`` when the input is missing.
    """
    # Missing cells arrive as float NaN, which has no .lower(); return an
    # empty string instead of failing the whole column-wise apply.
    if not isinstance(text, str):
        return ""

    text = text.lower()
    text = re.sub(r"http\S+", "", text)         # remove URLs
    text = re.sub(r"[^a-zA-Z0-9\s]", " ", text) # punctuation/symbols -> space
    text = re.sub(r"\s+", " ", text)            # collapse extra whitespace
    return text.strip()

# Store the normalized description in its own column (the raw column is kept)
# and preview the result.
df["jobdescription_clean"] = df["jobdescription"].map(clean_text)
df["jobdescription_clean"].head()


0    looking for selenium engineers must have solid...
1    the university of chicago has a rapidly growin...
2    galaxe solutionsevery day our solutions affect...
3    java developerfull time direct hirebolingbrook...
4    midtown based high tech firm has an immediate ...
Name: jobdescription_clean, dtype: object

Temizlik 3 — skills temizleme

In [6]:
import numpy as np

def clean_skills(text):
    """Turn a raw ``skills`` cell into a list of lowercase skill names.

    The text is lowercased, known placeholder phrases are removed, and the
    remainder is split on ``,``, ``/`` and ``|``. Each piece is stripped of
    surrounding whitespace; empty pieces and single non-alphanumeric
    characters (stray ``-``, ``)`` ...) are discarded.

    Args:
        text: Raw skills string. Non-string values (NaN, ``None``, numbers)
            are treated as missing.

    Returns:
        list[str] | float: The skill names in their original order, or
        ``np.nan`` when the input is missing or nothing usable remains.
    """
    # Missing or non-text cells carry no skills; this also avoids calling
    # .lower() on a non-string value.
    if not isinstance(text, str):
        return np.nan

    text = text.lower()

    # Placeholder phrases that stand in for a real skill list. Order matters:
    # the variant with the closing parenthesis must be removed before the
    # truncated one, otherwise a lone ")" would be left behind.
    noise = ["see below", "please see job description", "(see job description)", "(see job description"]
    for n in noise:
        text = text.replace(n, "")

    # Split on the separators and keep the meaningful pieces.
    skills = []
    for part in re.split(r"[,/|]", text):
        skill = part.strip()
        # One-character names such as "c" or "r" are real skills, so keep them
        # when alphanumeric; "".isalnum() is False, so empty pieces are dropped.
        if len(skill) > 1 or skill.isalnum():
            skills.append(skill)

    if not skills:
        return np.nan

    return skills

# Parse every raw skills cell into a list (or NaN) in a new column, then
# preview the first ten rows.
df["skills_clean"] = df["skills"].map(clean_skills)
df["skills_clean"].head(10)


0                                                  NaN
1    [linux, unix, network monitoring, incident res...
2    [enterprise solutions architecture, business i...
3                                                  NaN
4    [configuration management, developer, linux, m...
5               [fico, ar, ap, asset management, haha]
6    [cisco, dns, http, networking, network enginee...
7    [.net, c#, mvc, restful web services, http, aw...
8    [c++, developer, development, javascript, user...
9                                                  NaN
Name: skills_clean, dtype: object

In [7]:
output_path = PROCESSED_DATA_DIR / "jobs_clean.csv"

# to_csv does not create missing folders; create the target directory first so
# the export also works when the processed-data folder does not exist yet.
output_path.parent.mkdir(parents=True, exist_ok=True)

# NOTE: list values in skills_clean are written as their Python string form
# (e.g. "['linux', 'unix']"); whoever reads this file has to parse that column
# back into lists.
df.to_csv(output_path, index=False)

output_path


WindowsPath('c:/Projects/ai-job-skill-gap-analyzer/data/processed/jobs_clean.csv')