Temiz dataset’i yükleme

In [20]:
import pandas as pd
from pathlib import Path

BASE_DIR = Path.cwd().parent
DATA_DIR = BASE_DIR / "data"
PROCESSED_DATA_DIR = DATA_DIR / "processed"

csv_path = PROCESSED_DATA_DIR / "jobs_clean.csv"
df = pd.read_csv(csv_path)

df.head()


Unnamed: 0,company,employmenttype_jobstatus,jobdescription,joblocation_address,jobtitle,postdate,skills,jobdescription_clean,skills_clean
0,"Digital Intelligence Systems, LLC","C2H Corp-To-Corp, C2H Independent, C2H W2, 3 M...",Looking for Selenium engineers...must have sol...,"Atlanta, GA",AUTOMATION TEST ENGINEER,1 hour ago,SEE BELOW,looking for selenium engineers must have solid...,
1,University of Chicago/IT Services,Full Time,The University of Chicago has a rapidly growin...,"Chicago, IL",Information Security Engineer,1 week ago,"linux/unix, network monitoring, incident respo...",the university of chicago has a rapidly growin...,"['linux', 'unix', 'network monitoring', 'incid..."
2,"Galaxy Systems, Inc.",Full Time,"GalaxE.SolutionsEvery day, our solutions affec...","Schaumburg, IL",Business Solutions Architect,2 weeks ago,"Enterprise Solutions Architecture, business in...",galaxe solutionsevery day our solutions affect...,"['enterprise solutions architecture', 'busines..."
3,TransTech LLC,Full Time,Java DeveloperFull-time/direct-hireBolingbrook...,"Bolingbrook, IL","Java Developer (mid level)- FT- GREAT culture,...",2 weeks ago,Please see job description,java developerfull time direct hirebolingbrook...,
4,Matrix Resources,Full Time,Midtown based high tech firm has an immediate ...,"Atlanta, GA",DevOps Engineer,48 minutes ago,"Configuration Management, Developer, Linux, Ma...",midtown based high tech firm has an immediate ...,"['configuration management', 'developer', 'lin..."


TF-IDF Feature extraction (baseline model)

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    stop_words="english",
    max_features=5000
)

X = vectorizer.fit_transform(df["jobdescription_clean"])

X.shape


(22000, 5000)

İş ilanlarını role göre clustera hazırlama kısmı

In [22]:
from sklearn.cluster import MiniBatchKMeans

k = 10  # ilk deneme, sonra optimize edeceğiz
kmeans = MiniBatchKMeans(n_clusters=k, random_state=42)
clusters = kmeans.fit_predict(X)

df["cluster"] = clusters
df["cluster"].value_counts()


cluster
4    6056
7    4037
1    3065
2    2501
5    2327
3    1969
0     788
8     539
6     416
9     302
Name: count, dtype: int64

İlk cluster özetini yazdırma

In [23]:
df.groupby("cluster")["jobtitle"].apply(lambda x: x.value_counts().head(3))


cluster                                                     
0        Security Engineer                                       34
         Security Analyst                                        19
         Security Architect                                      11
1        Java Developer                                         100
         .Net Developer                                          52
         Web Developer                                           37
2        Network Engineer                                       117
         Systems Administrator                                   42
         Systems Engineer                                        40
3        Data Analyst                                            36
         Data Scientist                                          26
         Data Architect                                          24
4        Business Analyst                                        35
         Technical Recruiter                           

In [24]:
df_skills = df.dropna(subset=["skills_clean"])
len(df_skills)


21923

In [25]:
from collections import Counter

cluster_skills = {}

for c in sorted(df_skills["cluster"].unique()):
    skills = []
    for row in df_skills[df_skills["cluster"] == c]["skills_clean"]:
        skills.extend(row)
    cluster_skills[c] = Counter(skills).most_common(20)

cluster_skills


{np.int32(0): [("'", 9435),
  (' ', 7740),
  ('e', 6010),
  ('i', 5233),
  ('s', 4747),
  ('t', 4635),
  ('n', 4166),
  ('r', 4074),
  ('a', 4058),
  (',', 3935),
  ('c', 3525),
  ('o', 2935),
  ('l', 1977),
  ('p', 1874),
  ('u', 1771),
  ('m', 1641),
  ('y', 1487),
  ('d', 1302),
  ('g', 1140),
  ('[', 787)],
 np.int32(1): [("'", 37547),
  (' ', 27283),
  ('e', 18722),
  (',', 15719),
  ('a', 15183),
  ('s', 14009),
  ('r', 11947),
  ('t', 10689),
  ('n', 9532),
  ('i', 9519),
  ('o', 8716),
  ('l', 7951),
  ('c', 7912),
  ('p', 6847),
  ('m', 5281),
  ('v', 5169),
  ('d', 5014),
  ('j', 4854),
  ('u', 3900),
  ('g', 3636)],
 np.int32(2): [("'", 30852),
  (' ', 25581),
  ('e', 19106),
  ('i', 14202),
  ('n', 13641),
  ('s', 13416),
  ('r', 13232),
  (',', 12948),
  ('t', 12689),
  ('a', 11875),
  ('o', 11465),
  ('c', 10188),
  ('p', 6427),
  ('l', 6180),
  ('d', 5473),
  ('m', 5297),
  ('w', 4658),
  ('u', 4174),
  ('g', 3853),
  ('v', 3038)],
 np.int32(3): [("'", 22340),
  (' ', 18

In [26]:
for c, skills in cluster_skills.items():
    print(f"\nCLUSTER {c}")
    for skill, count in skills:
        print(skill, count)



CLUSTER 0
' 9435
  7740
e 6010
i 5233
s 4747
t 4635
n 4166
r 4074
a 4058
, 3935
c 3525
o 2935
l 1977
p 1874
u 1771
m 1641
y 1487
d 1302
g 1140
[ 787

CLUSTER 1
' 37547
  27283
e 18722
, 15719
a 15183
s 14009
r 11947
t 10689
n 9532
i 9519
o 8716
l 7951
c 7912
p 6847
m 5281
v 5169
d 5014
j 4854
u 3900
g 3636

CLUSTER 2
' 30852
  25581
e 19106
i 14202
n 13641
s 13416
r 13232
, 12948
t 12689
a 11875
o 11465
c 10188
p 6427
l 6180
d 5473
m 5297
w 4658
u 4174
g 3853
v 3038

CLUSTER 3
' 22340
  18646
e 13640
a 13602
s 10358
t 9440
, 9215
i 8924
n 8275
r 7677
o 7217
l 6877
c 5188
d 5071
m 3949
p 3781
g 3281
u 2714
h 2242
b 2215

CLUSTER 4
' 48765
  43307
e 35269
a 25999
i 23312
n 22246
s 22157
t 21849
r 21002
o 19011
, 18349
c 16320
l 14310
p 10781
m 10730
d 9862
g 7776
u 7587
[ 6046
] 6046

CLUSTER 5
' 31544
  22633
e 17041
, 13467
a 13246
n 10676
s 10376
i 10316
t 10212
r 9906
o 9587
l 6992
c 6677
p 5906
d 5259
m 5036
u 4365
g 4266
v 3401
h 2903

CLUSTER 6
' 9606
  8059
e 4909
, 4390
a 4228


In [27]:
import re

def filter_skills(skill_list):
    if isinstance(skill_list, list):
        cleaned = []
        for skill in skill_list:
            skill = skill.strip().lower()

            # remove punctuation leftovers
            skill = re.sub(r"[^a-z0-9#+. ]", "", skill)
            
            # remove very short tokens
            if len(skill) < 3:
                continue
            
            cleaned.append(skill)

        return cleaned if len(cleaned) > 0 else None
    return None

df_2 = df_skills.copy()
df_2["skills_filtered"] = df_2["skills_clean"].apply(filter_skills)

df_2[["skills_clean", "skills_filtered"]].head(10)


Unnamed: 0,skills_clean,skills_filtered
1,"['linux', 'unix', 'network monitoring', 'incid...",
2,"['enterprise solutions architecture', 'busines...",
4,"['configuration management', 'developer', 'lin...",
5,"['fico', 'ar', 'ap', 'asset management', 'haha']",
6,"['cisco', 'dns', 'http', 'networking', 'networ...",
7,"['.net', 'c#', 'mvc', 'restful web services', ...",
8,"['c++', 'developer', 'development', 'javascrip...",
10,['openstack'],
11,"['unix', 'iam', 'scripting knowledge', 'oim', ...",
12,"['java', 'oss']",


In [28]:
from collections import Counter

cluster_skill_profiles = {}

for c in sorted(df_2["cluster"].unique()):
    skills = []
    for row in df_2[df_2["cluster"] == c]["skills_filtered"]:
        if isinstance(row, list):
            skills.extend(row)
    cluster_skill_profiles[c] = Counter(skills).most_common(20)

cluster_skill_profiles


{np.int32(0): [],
 np.int32(1): [],
 np.int32(2): [],
 np.int32(3): [],
 np.int32(4): [],
 np.int32(5): [],
 np.int32(6): [],
 np.int32(7): [],
 np.int32(8): [],
 np.int32(9): []}

In [29]:
for skill, count in cluster_skill_profiles[1][:10]:
    print(skill, count)


In [30]:
import re

def clean_skill(skill):
    skill = skill.lower().strip()
    
    # remove only extreme garbage
    remove_list = ["see below", "please see job description", "haha"]
    for bad in remove_list:
        if bad in skill:
            return None

    # basic punctuation cleanup (keep +, #, .)
    skill = re.sub(r"[^a-z0-9#+. ]", "", skill)
    
    # remove trailing/leading punctuation
    skill = skill.strip(" .,+-")
    
    # discard if empty
    if len(skill) == 0:
        return None
    
    return skill

def rebuild_skills_list(skill_list):
    if not isinstance(skill_list, list):
        return None
    
    cleaned = []
    for sk in skill_list:
        sk2 = clean_skill(sk)
        if sk2:
            cleaned.append(sk2)
    
    return cleaned if cleaned else None

df2 = df_skills.copy()
df2["skills_filtered"] = df2["skills_clean"].apply(rebuild_skills_list)

df2[["skills_clean", "skills_filtered"]].head(10)


Unnamed: 0,skills_clean,skills_filtered
1,"['linux', 'unix', 'network monitoring', 'incid...",
2,"['enterprise solutions architecture', 'busines...",
4,"['configuration management', 'developer', 'lin...",
5,"['fico', 'ar', 'ap', 'asset management', 'haha']",
6,"['cisco', 'dns', 'http', 'networking', 'networ...",
7,"['.net', 'c#', 'mvc', 'restful web services', ...",
8,"['c++', 'developer', 'development', 'javascrip...",
10,['openstack'],
11,"['unix', 'iam', 'scripting knowledge', 'oim', ...",
12,"['java', 'oss']",


In [31]:
from collections import Counter

cluster_skill_profiles_2 = {}

for c in sorted(df2["cluster"].unique()):
    skills = []
    for row in df2[df2["cluster"] == c]["skills_filtered"]:
        if isinstance(row, list):
            skills.extend(row)
    cluster_skill_profiles_2[c] = Counter(skills).most_common(20)

cluster_skill_profiles_2


{np.int32(0): [],
 np.int32(1): [],
 np.int32(2): [],
 np.int32(3): [],
 np.int32(4): [],
 np.int32(5): [],
 np.int32(6): [],
 np.int32(7): [],
 np.int32(8): [],
 np.int32(9): []}

In [32]:
for c, skills in cluster_skill_profiles_2.items():
    print(f"\nCLUSTER {c}")
    for skill, count in skills[:10]:
        print(f"{skill} → {count}")



CLUSTER 0

CLUSTER 1

CLUSTER 2

CLUSTER 3

CLUSTER 4

CLUSTER 5

CLUSTER 6

CLUSTER 7

CLUSTER 8

CLUSTER 9


In [None]:
import ast

df2 = df_skills.copy()

df2["skills_clean"] = df2["skills_clean"].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)

df2["skills_clean"].head()
