In [1]:
!pip install sentence-transformers
!pip install nltk

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers)
 

In [2]:
import pandas as pd
import nltk
import re
from sentence_transformers import SentenceTransformer, util

# Download nltk data untuk lemmatize, stopwords (jika mau pakai)
nltk.download('wordnet')
nltk.download('omw-1.4')

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # lowercase
    text = text.lower()
    # hapus non-alphabetic chars
    text = re.sub(r'[^a-z\s]', '', text)
    # tokenize & lemmatize
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
    return ' '.join(tokens)

2025-08-12 07:53:45.941302: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1754985226.286987      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1754985226.390006      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...


In [3]:
# Load Data
postings = pd.read_csv('/kaggle/input/linkedin-job-postings/postings.csv')
job_skills = pd.read_csv('/kaggle/input/linkedin-job-postings/jobs/job_skills.csv')
skills = pd.read_csv('/kaggle/input/linkedin-job-postings/mappings/skills.csv')

In [4]:
# Gabungkan job_skills dengan skills untuk nama skill
job_skills = job_skills.merge(skills, how='left', on='skill_abr')

In [5]:
job_skills

Unnamed: 0,job_id,skill_abr,skill_name
0,3884428798,MRKT,Marketing
1,3884428798,PR,Public Relations
2,3884428798,WRT,Writing/Editing
3,3887473071,SALE,Sales
4,3887465684,FIN,Finance
...,...,...,...
213763,3902876855,HR,Human Resources
213764,3902878689,MGMT,Management
213765,3902878689,MNFC,Manufacturing
213766,3902883233,SALE,Sales


In [6]:
# Buat dictionary mapping job_id ke list skill
job_skill_dict = job_skills.groupby('job_id')['skill_name'].apply(lambda x: ' '.join(x)).to_dict()

In [7]:
list(job_skill_dict.items())[:10]

[(921716, 'Marketing Sales'),
 (1218575, 'Health Care Provider'),
 (1829192, 'Health Care Provider'),
 (2264355, 'Design Art/Creative Information Technology'),
 (10998357, 'Management Manufacturing'),
 (11009123, 'Design Art/Creative Information Technology'),
 (23221523, 'Other'),
 (35982263, 'Information Technology'),
 (56924323, 'Engineering'),
 (69333422, 'Marketing Sales')]

In [8]:
# Gabungkan title, description, skill jadi satu text field
def combine_text(row):
    skills_text = job_skill_dict.get(row['job_id'], '')
    parts = [str(row['title']), str(row['description']), skills_text]
    combined = ' '.join([p for p in parts if p and p != 'nan'])
    return preprocess_text(combined)

In [9]:
postings['combined_text'] = postings.apply(combine_text, axis=1)

In [10]:
# Load model sentence-transformers
model = SentenceTransformer('all-MiniLM-L6-v2')

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [11]:
# Generate embeddings untuk semua job postings
embeddings = model.encode(postings['combined_text'].tolist(), show_progress_bar=True)

Batches:   0%|          | 0/3871 [00:00<?, ?it/s]

In [14]:
# Fungsi mencari job mirip query teks
def search_jobs(query, top_k=5):
    query_processed = preprocess_text(query)
    query_emb = model.encode(query_processed)
    cos_scores = util.cos_sim(query_emb, embeddings)[0]
    top_results = cos_scores.topk(top_k)
    
    for score, idx in zip(top_results.values, top_results.indices):
        idx = int(idx)
        print(f"Score: {score.item():.4f}")
        print(f"Title: {postings.iloc[idx]['title']}")
        print(f"Description: {postings.iloc[idx]['description'][:200]}...")  # preview deskripsi
        print('---')

In [18]:
# Contoh pakai
search_jobs("sales analyst")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Score: 0.7258
Title: Sales Analyst
Description: Sales Analyst
Are you a finance whiz with a passion for business and marketing? Do you have a knack for spotting trends and identifying opportunities for growth? Our client has an exciting opportunity...
---
Score: 0.6568
Title: Sales Operations Analyst-US Remote
Description: It's fun to work in a company where people truly BELIEVE in what they're doing!

We're committed to bringing passion and customer focus to the business.

The Role 

In this entry-level Sales Operation...
---
Score: 0.6539
Title: Sales Analyst
Description: PowerStop stands out as the pioneering force within the automotive aftermarket brake sector. Being part of PowerStop’s team entails becoming a member of a company that is reshaping the industry’s land...
---
Score: 0.6520
Title: Junior Analyst
Description: https://www.indeed.com/viewjob?from=appshareios&jk=0ffb85ac5977a783...
---
Score: 0.6481
Title: Sales Analyst
Description: POSITION:Sales Analyst
JOB TYPE:W2, Con