In [8]:
import requests
from bs4 import BeautifulSoup
from time import sleep
import pandas as pd
import random

In [None]:
# Search configuration for the scraping cells below.
# `keyword` is the query sent to the LinkedIn job search.
keyword = 'data scientist'

# A job title is kept only when it contains at least one of these substrings
# (compared against the lower-cased title). Note the trailing space in 'ml '.
trigger_words = [
    'data',
    'machine learning',
    'ai',
    'ml ',
    'statist',
    'artificial intelligence',
    'python',
]
def _job_id_from_href(href):
    """Return the text after the last '-' of a job-card link's path, or None when there is no link."""
    if not href:
        return None
    # Drop the query string first so a '-' inside a tracking parameter cannot be
    # mistaken for the separator in front of the job ID.
    path = href.split('?')[0]
    slug = ''.join(path.split('/')[5:])
    return slug.split('-')[-1] or None


def get_job_ids(trigger_words, keyword, geoid=102890719, search_count=250, headers=None, internship=False):
    """
    Get job IDs from LinkedIn based on trigger words and keyword.

    :param trigger_words: List of lower-case substrings; a posting is kept only if its
        lower-cased title contains at least one of them.
    :param keyword: Search keyword sent to LinkedIn (spaces are replaced by '%2B').
    :param geoid: Geographical ID for the job search location. Default is 102890719 (Netherlands).
    :param search_count: Number of job postings to request. Rounded down to a multiple
        of 25 and clamped to the range 25..1000. Default is 250.
    :param headers: Optional headers for the request. If None, default browser-like
        headers are used. If False, no custom headers are sent.
    :param internship: If False (default), internship postings are skipped. If True,
        only internship postings (that also match a trigger word) are collected.
    :return: List of job IDs (strings) whose titles passed the filters.
    """
    # Substring markers that flag a title as an internship / graduation assignment.
    # NOTE: plain substring matching, so e.g. 'international' also contains 'intern'.
    internship_markers = ['intern', 'afstudeeropdracht', 'stage']
    wants_internship = bool(internship)

    job_ids = []
    keyword = keyword.replace(' ', '%2B')

    # Round search_count down to a multiple of 25 (the loop below requests results
    # at offsets 0, 25, 50, ...), then clamp it.
    search_count = (search_count // 25) * 25
    if search_count > 1000:
        search_count = 1000
        print("Search count exceeds 1000, setting to 1000.")
    if search_count < 25:
        search_count = 25
        print("Search count is less than 25, setting to 25.")

    if headers is None:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
            'Accept': '*/*',
            'Accept-Language': 'en-US,en;q=0.9',
            # 'br' is deliberately not advertised: requests only decodes Brotli
            # responses when an optional Brotli package is installed.
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Referer': f'https://www.linkedin.com/jobs/search/?keywords={keyword}&location=Nederland&geoId={geoid}&trk=homepage-basic_jobs-search-bar_search-submit&position=1&pageNum=0',
        }

    # A timeout keeps a stalled connection from hanging the notebook forever.
    request_kwargs = {'timeout': 30}
    if headers is not False:
        request_kwargs['headers'] = headers

    for start in range(0, search_count, 25):
        URL = f"https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords={keyword}&location=Nederland&geoId={geoid}&start={start}"
        try:
            response = requests.get(URL, **request_kwargs)
        except requests.RequestException as exc:
            # Keep whatever was collected so far instead of losing it to a traceback.
            print(f"Error: Unable to fetch data from LinkedIn. {exc}")
            break
        if response.status_code != 200:
            print(f"Error: Unable to fetch data from LinkedIn. Status code: {response.status_code}")
            # leave the loop if the request fails
            break
        print("Data fetched successfully!")

        soup = BeautifulSoup(response.text, 'html.parser')
        job_listings = soup.find_all('a', class_='base-card__full-link absolute top-0 right-0 bottom-0 left-0 p-0 z-[2]')

        for job in job_listings:
            title = job.text.strip()
            lowered = title.lower()
            is_internship = any(marker in lowered for marker in internship_markers)
            has_trigger = any(word in lowered for word in trigger_words)

            if is_internship != wants_internship:
                # Wrong kind of posting for this search: report and skip it.
                if is_internship:
                    print(f"--- Job ID {title} is an intern position.")
                else:
                    print(f"--- Job ID {title} is not an intern position.")
            elif has_trigger:
                job_id = _job_id_from_href(job.get('href'))
                if job_id is None:
                    print(f"??? Job ID {title} has no usable link, skipping.")
                    continue
                job_ids.append(job_id)
                print(f"!!! Job ID {title} contains trigger words.")
            else:
                print(f"Job ID {title} does not contain trigger words.")
        print('')
        # Small random pause between result pages.
        sleep(random.randint(0, 2))

    print(f"Total job IDs found: {len(job_ids)}")
    return job_ids

In [None]:
# Run the search with the configuration defined above and keep the matching IDs.
job_ids = get_job_ids(
    trigger_words,
    keyword,
    geoid=102890719,
    search_count=250,
    headers=None,
    internship=False,
)

Data fetched successfully!
!!! Job ID Machine Learning Engineer contains trigger words.
!!! Job ID Data Scientist contains trigger words.
!!! Job ID Machine Learning Engineer contains trigger words.
!!! Job ID Junior Data Scientist contains trigger words.
!!! Job ID Machine Learning Engineer contains trigger words.
!!! Job ID Data Scientist contains trigger words.
--- Job ID Machine Learning Engineer - Internship is an intern position.
!!! Job ID Data Scientist contains trigger words.
!!! Job ID Data Scientist contains trigger words.
!!! Job ID Data Scientist contains trigger words.

Data fetched successfully!
!!! Job ID Data Scientist contains trigger words.
!!! Job ID Data Scientist contains trigger words.
--- Job ID Data Science Intern(Analytics) is an intern position.
!!! Job ID Data Scientist AI contains trigger words.
!!! Job ID Data Scientist contains trigger words.
!!! Job ID Data Scientist AI contains trigger words.
!!! Job ID Data Scientist contains trigger words.
!!! Job ID 

['4207969860',
 '4190285194',
 '4081414959',
 '4187507643',
 '4193157782',
 '4206694302',
 '4202522884',
 '4166385107',
 '4190510848',
 '4166729606',
 '4167579820',
 '4207968748',
 '4125832494',
 '4207964942',
 '4197872763',
 '4207966830',
 '4195237062',
 '4199325380',
 '4044337676',
 '4202496500',
 '4192808833',
 '4198857360',
 '4185793767',
 '4198539672',
 '3875010257',
 '4174430599',
 '4081419833',
 '4200815282',
 '4204519336',
 '4196603277',
 '4198080271',
 '4172657877',
 '4203216584',
 '4191894579',
 '4186710091',
 '4192057590',
 '4210012538',
 '4203778839',
 '4078850232',
 '4202876806',
 '4190265425',
 '4175549195',
 '4148666946',
 '4200169359',
 '4211407011',
 '4210910725',
 '4188318993',
 '4211905147',
 '4207774997',
 '4205806612',
 '4204358737',
 '4161434847',
 '4180254954',
 '4211799731',
 '4211734065',
 '4193718162',
 '4202839744',
 '4197901555',
 '4208667000',
 '4153880617',
 '4202328052',
 '4206208512',
 '4209786990',
 '4207625812',
 '4192872796',
 '4207599086',
 '41778326

In [57]:
# Fetch every posting in `job_ids` and collect its fields column-wise in `data`.
# Every job ID contributes exactly one row (blank when the fetch failed), so row i
# of `data` belongs to job_ids[i].
data = {'job_title' : [], 'job_company': [], 'job_location': [], 'days_ago': [], 'company_description': [], 'job_description': []}

# Reuse a notebook-level `headers` dict when one exists; otherwise fall back to a
# minimal browser-like set so this cell also runs on a fresh kernel.
try:
    headers
except NameError:
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
        'Accept-Language': 'en-US,en;q=0.9',
    }

error_count = 0  # consecutive failures; reset after every successful fetch
for idx, job_id in enumerate(job_ids):
    URL = f"https://www.linkedin.com/jobs-guest/jobs/api/jobPosting/{job_id}"
    try:
        # The timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(URL, headers=headers, timeout=30)
        failure = None if response.status_code == 200 else f"Status code: {response.status_code}"
    except requests.RequestException as exc:
        failure = str(exc)

    if failure is not None:
        print(f"Error: Unable to fetch data from LinkedIn. {failure}")
        # Keep a blank placeholder row so rows stay aligned with job_ids.
        for column in data:
            data[column].append('')
        error_count += 1
        if error_count > 5:
            print("Too many errors, stopping the script.")
            break
        sleep(3)
        continue

    error_count = 0
    print("Data fetched successfully!")
    data2 = response.text
    soup2 = BeautifulSoup(data2, 'html.parser')
    text = soup2.get_text(strip=True, separator="~~")
    parts = text.split('~~')

    # The fields are picked by position in the page's text fragments.
    # NOTE(review): these offsets presumably mirror the page layout at the time of
    # writing - confirm they still hold if the extracted fields look wrong.
    # Pad so an unexpectedly short page yields blanks instead of an IndexError.
    title, company, location, posted = (parts[:4] + [''] * 4)[:4]
    data['job_title'].append(title)
    data['job_company'].append(company)
    data['job_location'].append(location)
    data['days_ago'].append(posted.split(' ')[0])
    data['company_description'].append((' ').join(parts[10:12]))
    job_desc = (' ').join(parts[13:-12])
    data['job_description'].append(job_desc)

    # Parentheses matter: `idx+1 % 10` parses as `idx + (1 % 10)`.
    if (idx + 1) % 10 == 0:
        print(f"Processed {idx+1} job postings.")
    sleep(random.randint(0, 1))

Data fetched successfully!
Data fetched successfully!
Data fetched successfully!
Data fetched successfully!
Data fetched successfully!
Data fetched successfully!
Data fetched successfully!
Data fetched successfully!
Data fetched successfully!
Data fetched successfully!
Data fetched successfully!
Data fetched successfully!
Data fetched successfully!
Data fetched successfully!
Data fetched successfully!
Data fetched successfully!
Data fetched successfully!
Data fetched successfully!
Data fetched successfully!
Data fetched successfully!
Data fetched successfully!
Data fetched successfully!
Data fetched successfully!
Data fetched successfully!
Data fetched successfully!
Data fetched successfully!
Data fetched successfully!
Data fetched successfully!
Data fetched successfully!
Data fetched successfully!
Data fetched successfully!
Data fetched successfully!
Data fetched successfully!
Data fetched successfully!
Data fetched successfully!
Data fetched successfully!
Data fetched successfully!
D

In [61]:
# Cleaning LinkedIn prefixes
# Some descriptions start with boilerplate text; it is cut off at fixed character offsets.
# NOTE(review): the offsets (1212, 291, 1642) are hard-coded - presumably the length of
# the boilerplate at the time of writing; confirm against current pages.
for idx, desc in enumerate(data['job_description']):
    if desc.startswith('Sign in to'):
        data['job_description'][idx] = desc[1212:]
    elif desc.startswith('Remove photo'):
        if desc[291:297] == 'Use AI':
            data['job_description'][idx] = desc[1642:]
        else:
            data['job_description'][idx] = desc[291:]

job_links = [f"https://www.linkedin.com/jobs/search/?currentJobId={job_id}" for job_id in job_ids]

df = pd.DataFrame(data)
if len(df) > len(job_links):
    raise ValueError(
        f"{len(df)} scraped rows but only {len(job_links)} job IDs - "
        "re-run the fetch cell so `data` and `job_ids` match."
    )
# Attach the links BEFORE removing duplicates: row i of `data` belongs to job_ids[i],
# and that pairing is lost once rows are dropped. The slice covers the case where the
# fetch loop stopped early and `data` holds fewer rows than there are job IDs.
df['job_link'] = job_links[:len(df)]

# Check for duplicates (the link is not part of the comparison, so the same posting
# listed under two IDs is still collapsed into one row).
df = (
    df
    .drop_duplicates(
        subset=['job_title', 'job_company', 'job_location', 'days_ago', 'company_description', 'job_description'],
        keep='first',
    )
    .reset_index(drop=True)
)
df.to_csv('linkedin_jobs.csv', index=False)

In [None]:

# Reload the scraped postings from disk so the analysis below can run without re-scraping.
df = pd.read_csv('linkedin_jobs.csv')

# Postings whose fetch failed were saved as empty strings, which read_csv parses as NaN.
# Restore empty strings in the text columns so the text processing below receives str values.
text_columns = ['job_title', 'job_company', 'job_location', 'company_description', 'job_description']
df[text_columns] = df[text_columns].fillna('')

In [None]:
import re
import nltk
nltk.download('punkt_tab')
from nltk.tokenize import sent_tokenize

# Step 1: Expanded keywords in Dutch and English
# A sentence containing any of these phrases is treated as part of the
# requirements / profile section of a job posting.
section_keywords = [
    # English
    "skills", "requirements", "responsibilities", "who you are", "who are you",
    "qualifications", "desired profile", "your background", "what you bring",
    "about the candidate", "candidate profile",
    "nice to have", "we are looking for", "we're looking for",
    "looking for someone who", "experience and skills", "python", "years of experience", "SQL",
    # ^ the trailing comma matters: without it Python joins "SQL" with the next
    #   string literal and both keywords are lost.

    # Dutch
    "wat ga je doen", "wat je gaat doen", "functie-eisen", "wie zoeken wij",
    "wat breng je mee", "vaardigheden", "wie ben jij", "jouw profiel",
    "gewenst profiel", "eisen", "jij bent", "jij hebt", "jouw kwalificaties",
    "over jou", "wat wij zoeken", "wie we zoeken", "je profiel",
    "wij zoeken iemand die", "wij zijn op zoek naar",
]

# Step 2: Compile patterns for efficient matching (case-insensitive, whole words/phrases)
patterns = [re.compile(rf"\b{re.escape(token)}\b", re.IGNORECASE) for token in section_keywords]

def extract_relevant_sections(job_posting_text, context_window=1):
    """
    Return the sentences of a job posting that mention a section keyword, with context.

    :param job_posting_text: Full text of the job posting. Anything that is not a
        non-blank string (e.g. NaN from a missing CSV field) yields ''.
    :param context_window: Number of sentences to keep before and after each
        matching sentence. Default is 1.
    :return: The selected sentences joined by single spaces, in their original
        order; each sentence appears at most once. '' when nothing matches.
    """
    if not isinstance(job_posting_text, str) or not job_posting_text.strip():
        return ''

    # Step 3: Tokenize text into sentences
    sentences = sent_tokenize(job_posting_text)

    # Step 4: Collect the indices of matching sentences plus their context.
    # Using a set means overlapping context windows do not repeat a sentence.
    selected = set()
    for i, sentence in enumerate(sentences):
        if any(p.search(sentence) for p in patterns):
            start = max(0, i - context_window)
            end = min(len(sentences), i + context_window + 1)
            selected.update(range(start, end))

    return ' '.join(sentences[i] for i in sorted(selected))

# Job descriptions as plain strings. Missing descriptions (NaN after the CSV
# round-trip) become '' so the tokenizer and regex below never receive a float.
descriptions = df['job_description'].fillna('').astype(str)

# Requirement-related excerpt of every posting (two sentences of context per match).
jobs_relevant = [
    extract_relevant_sections(description, context_window=2)
    for description in descriptions
]

# Find the first "<number> year(s)/jaar" mention in the full job description.
# NOTE: this takes the first match anywhere in the text, so it can pick up numbers
# that are not about required experience.
years_pattern = re.compile(r"(\d+)\s*[-]?\s*(?:years?|jaar)", re.IGNORECASE)
years_of_experience = []
for description in descriptions:
    match = years_pattern.search(description)
    years_of_experience.append(int(match.group(1)) if match else None)

# Check each job description for a fixed list of key skills.
# Candidates mentioned in earlier notes but not (yet) in the list include:
# unsupervised learning, supervised learning, apache airflow, apache kafka,
# data science, data analytics, docker, kubernetes, machine learning frameworks,
# deep learning frameworks.

skills = [
    'python', 'sql', 'machine learning', 'data analysis', 'statistics', 'matplotlib', 'pandas', 'numpy',
    'scikit-learn', 'tensorflow', 'pytorch', 'keras', 'data visualization', 'data wrangling',
    'big data', 'cloud computing', 'pyspark', 'hadoop', 'spark', 'tableau', 'power bi',
    'data mining', 'data engineering', 'data modeling', 'data governance',
    'data quality', 'data architecture', 'data strategy',
    'data storytelling', 'data ethics',
    'data privacy', 'data security',
    'data compliance', 'data management',
    'data integration', 'data transformation',
    'data pipeline', 'data lake', 'data warehouse',
    'data mart', 'data catalog', 'data lineage',
    'data profiling', 'data cleansing',
    'data enrichment', 'data visualization tools',
    'business intelligence tools',
    'machine learning algorithms',
    'deep learning algorithms',
    'natural language processing (NLP)',
    'computer vision',
    'reinforcement learning',
    # Add more skills as needed
]

# Compile one case-insensitive pattern per skill, once, instead of per row.
# Lookarounds are used instead of \b: \b after a skill that ends in a non-word
# character such as ')' would require a word character to follow, so
# 'natural language processing (NLP)' would almost never match. For skills that
# start and end with word characters the two forms are equivalent.
skill_patterns = [
    (skill, re.compile(rf"(?<!\w){re.escape(skill)}(?!\w)", re.IGNORECASE))
    for skill in skills
]

# Missing descriptions (NaN after the CSV round-trip) are treated as empty text.
skill_texts = df['job_description'].fillna('').astype(str)

skills_found = []
for job in skill_texts:
    found_skills = [skill for skill, pattern in skill_patterns if pattern.search(job)]
    skills_found.append(found_skills)

df['skills'] = skills_found
df['years_of_experience'] = years_of_experience
df['relevant_text'] = jobs_relevant

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.
