In [14]:
import os
os.getcwd()


'c:\\Projects\\ai-job-skill-gap-analyzer\\notebooks'

In [15]:
import sys
from pathlib import Path

BASE_DIR = Path.cwd().parent
sys.path.append(str(BASE_DIR))


In [25]:
from src.features import (
    parse_skills_column,
    rebuild_skills_list,
    build_skill_profiles
)


# Feature Engineering & Baseline Model (v1)

Bu notebook'ta:
1. Temizlenmiş iş ilanı verisini yüklüyoruz
2. TF-IDF tabanlı feature engineering yapıyoruz
3. K-means ile job role clustering yapıyoruz
4. Cluster bazlı skill profilleri çıkarıyoruz


In [17]:
import pandas as pd
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import MiniBatchKMeans
from collections import Counter
import ast
import re

BASE_DIR = Path.cwd().parent
DATA_DIR = BASE_DIR / "data"
PROCESSED_DATA_DIR = DATA_DIR / "processed"

csv_path = PROCESSED_DATA_DIR / "jobs_clean.csv"
df = pd.read_csv(csv_path)

df.head()


Unnamed: 0,company,employmenttype_jobstatus,jobdescription,joblocation_address,jobtitle,postdate,skills,jobdescription_clean,skills_clean
0,"Digital Intelligence Systems, LLC","C2H Corp-To-Corp, C2H Independent, C2H W2, 3 M...",Looking for Selenium engineers...must have sol...,"Atlanta, GA",AUTOMATION TEST ENGINEER,1 hour ago,SEE BELOW,looking for selenium engineers must have solid...,
1,University of Chicago/IT Services,Full Time,The University of Chicago has a rapidly growin...,"Chicago, IL",Information Security Engineer,1 week ago,"linux/unix, network monitoring, incident respo...",the university of chicago has a rapidly growin...,"['linux', 'unix', 'network monitoring', 'incid..."
2,"Galaxy Systems, Inc.",Full Time,"GalaxE.SolutionsEvery day, our solutions affec...","Schaumburg, IL",Business Solutions Architect,2 weeks ago,"Enterprise Solutions Architecture, business in...",galaxe solutionsevery day our solutions affect...,"['enterprise solutions architecture', 'busines..."
3,TransTech LLC,Full Time,Java DeveloperFull-time/direct-hireBolingbrook...,"Bolingbrook, IL","Java Developer (mid level)- FT- GREAT culture,...",2 weeks ago,Please see job description,java developerfull time direct hirebolingbrook...,
4,Matrix Resources,Full Time,Midtown based high tech firm has an immediate ...,"Atlanta, GA",DevOps Engineer,48 minutes ago,"Configuration Management, Developer, Linux, Ma...",midtown based high tech firm has an immediate ...,"['configuration management', 'developer', 'lin..."


## 1. TF-IDF Feature Engineering ve K-means Clustering


In [18]:
vectorizer = TfidfVectorizer(
    stop_words="english",
    max_features=5000
)

X = vectorizer.fit_transform(df["jobdescription_clean"])
X.shape


(22000, 5000)

In [19]:
k = 10
kmeans = MiniBatchKMeans(n_clusters=k, random_state=42)
clusters = kmeans.fit_predict(X)

df["cluster"] = clusters
df["cluster"].value_counts()


cluster
4    6056
7    4037
1    3065
2    2501
5    2327
3    1969
0     788
8     539
6     416
9     302
Name: count, dtype: int64

In [20]:
df.groupby("cluster")["jobtitle"].apply(lambda x: x.value_counts().head(5))


cluster                                                         
0        Security Engineer                                           34
         Security Analyst                                            19
         Security Architect                                          11
         Senior Security Engineer                                    11
         Information Security Analyst                                 9
1        Java Developer                                             100
         .Net Developer                                              52
         Web Developer                                               37
         Software Engineer                                           37
         .NET Developer                                              32
2        Network Engineer                                           117
         Systems Administrator                                       42
         Systems Engineer                                            40

## 2. Skill Listelerini Parse Etme ve Temizleme


In [21]:
# skills_clean string -> list dönüşümü
df_skills = df.dropna(subset=["skills_clean"]).copy()
df_skills["skills_clean"] = df_skills["skills_clean"].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)
df_skills["skills_clean"].head()


1    [linux, unix, network monitoring, incident res...
2    [enterprise solutions architecture, business i...
4    [configuration management, developer, linux, m...
5               [fico, ar, ap, asset management, haha]
6    [cisco, dns, http, networking, network enginee...
Name: skills_clean, dtype: object

In [22]:
def clean_skill(skill):
    skill = skill.lower().strip()
    
    remove_list = ["see below", "please see job description", "haha"]
    for bad in remove_list:
        if bad in skill:
            return None
    
    skill = re.sub(r"[^a-z0-9#+. ]", "", skill)
    skill = skill.strip(" .,+-")
    
    if len(skill) == 0:
        return None
    
    return skill

def rebuild_skills_list(skill_list):
    if not isinstance(skill_list, list):
        return None
    
    cleaned = []
    for sk in skill_list:
        sk2 = clean_skill(sk)
        if sk2:
            cleaned.append(sk2)
    
    return cleaned if cleaned else None

df_skills["skills_filtered"] = df_skills["skills_clean"].apply(rebuild_skills_list)
df_skills[["skills_clean", "skills_filtered"]].head(10)


Unnamed: 0,skills_clean,skills_filtered
1,"[linux, unix, network monitoring, incident res...","[linux, unix, network monitoring, incident res..."
2,"[enterprise solutions architecture, business i...","[enterprise solutions architecture, business i..."
4,"[configuration management, developer, linux, m...","[configuration management, developer, linux, m..."
5,"[fico, ar, ap, asset management, haha]","[fico, ar, ap, asset management]"
6,"[cisco, dns, http, networking, network enginee...","[cisco, dns, http, networking, network enginee..."
7,"[.net, c#, mvc, restful web services, http, aw...","[net, c#, mvc, restful web services, http, aws..."
8,"[c++, developer, development, javascript, user...","[c, developer, development, javascript, user i..."
10,[openstack],[openstack]
11,"[unix, iam, scripting knowledge, oim, windows,...","[unix, iam, scripting knowledge, oim, windows,..."
12,"[java, oss]","[java, oss]"


## 3. Cluster Bazlı Skill Profilleri

In [23]:
cluster_skill_profiles = {}

for c in sorted(df_skills["cluster"].unique()):
    skills = []
    for row in df_skills[df_skills["cluster"] == c]["skills_filtered"]:
        if isinstance(row, list):
            skills.extend(row)
    cluster_skill_profiles[c] = Counter(skills).most_common(20)


In [24]:
for c, skills in cluster_skill_profiles.items():
    print(f"\nCLUSTER {c}")
    for skill, count in skills[:10]:
        print(f"{skill} → {count}")



CLUSTER 0
security → 227
cissp → 141
management → 77
development → 53
analysis → 51
linux → 50
unix → 39
siem → 39
ids → 39
python → 39

CLUSTER 1
javascript → 758
java → 704
css → 460
html → 455
sql → 444
c# → 437
jquery → 341
net → 316
asp.net → 273
j2ee → 271

CLUSTER 2
windows → 332
cisco → 313
vmware → 293
linux → 263
security → 218
active directory → 202
management → 192
hardware → 174
networking → 162
wan → 162

CLUSTER 3
sql → 502
hadoop → 178
etl → 175
java → 163
python → 149
oracle → 139
development → 127
business intelligence → 120
management → 110
analysis → 103

CLUSTER 4
sql → 333
java → 332
development → 217
management → 192
linux → 163
javascript → 158
sales → 153
project → 151
oracle → 136
security → 133

CLUSTER 5
java → 630
linux → 385
python → 371
development → 343
javascript → 279
aws → 247
agile → 211
management → 210
c → 205
programming → 188

CLUSTER 6
javascript → 179
sql → 107
python → 83
java → 79
css → 76
c# → 72
c → 69
html → 66
mysql → 63
linux → 62

CLUS