# Scraping the No Fluff Jobs site

In [1]:
import requests
from bs4 import BeautifulSoup
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

In [2]:
def get_first_item(string):
    """Return the first comma-separated item of ``string`` without padding.

    Parameters
    ----------
    string : str
        Text such as ``" Junior , Mid "`` taken from a scraped element.

    Returns
    -------
    str
        The text before the first comma with leading and trailing
        whitespace removed; ``''`` when that part is empty.
    """
    # strip() copes with any amount of padding (including none) and with an
    # empty first item, where indexing the last character would raise.
    return string.split(',')[0].strip()

In [12]:
# Scrape every listing page for Warsaw and, for each posting, its detail page.
# The six lists below are parallel: index k of each one describes the same
# posting, so every posting must append exactly one value to every list.
position_names = []
salary_ranges = []
skills = []
seniorities = []
categories = []
hrefs = []

# Seconds to wait for any single HTTP response; without a timeout one stalled
# connection would block the whole scrape indefinitely.
REQUEST_TIMEOUT = 30


def _first_listed_value(element):
    """Return the first comma-separated value of ``element.text``, or ''.

    ``element`` is a parsed tag or None. A missing tag, or one whose first
    comma-separated part is empty, yields '' so the parallel lists stay
    aligned instead of the scrape aborting part-way through.
    """
    if element is None or not element.text.split(',')[0]:
        return ''
    return get_first_item(element.text)


# The page count is read from the second-to-last pagination link.
no_fluff_jobs = requests.get('https://nofluffjobs.com/pl/warszawa?page=1&lang=en', timeout=REQUEST_TIMEOUT)
no_fluff_jobs.raise_for_status()
soap = BeautifulSoup(no_fluff_jobs.content, 'html.parser')
pages_num = int(soap.find_all('a', class_='page-link')[-2].text.strip())

for i in tqdm(range(1, pages_num+1)):
    no_fluff_jobs = requests.get(f'https://nofluffjobs.com/pl/warszawa?page={i}', timeout=REQUEST_TIMEOUT)
    # Fail loudly rather than silently recording zero postings for this page.
    no_fluff_jobs.raise_for_status()
    soap = BeautifulSoup(no_fluff_jobs.content, 'html.parser')

    postings = soap.find_all('a', class_='posting-list-item')
    for posting in postings:
        position_name = posting.find('h3', class_='posting-title__position').text
        position_names.append(position_name.strip())

        # Always store exactly [lower, upper]; [0, 0] means "no salary found".
        salary_element = posting.find('span', class_='salary')
        salary_range = [0, 0]
        if salary_element is not None:
            # Remove whitespace first so thousands separators do not split a number.
            compact_text = re.sub(r'\s+', '', salary_element.text)
            amounts = [int(digits) for digits in re.findall(r'[0-9]+', compact_text)]
            if len(amounts) == 1:
                # A single figure is both the lower and the upper bound.
                salary_range = [amounts[0], amounts[0]]
            elif len(amounts) >= 2:
                # Keep the first two figures; any further numbers in the text
                # would make the rows ragged.
                salary_range = amounts[:2]
        salary_ranges.append(salary_range)

        skill_element = posting.find('a')
        skill = '' if skill_element is None else skill_element.text.strip()
        skills.append(skill.lower())

    # The listing anchors carry the relative link to each detail page.
    job_hrefs = ['https://nofluffjobs.com' + posting['href'] + '?lang=en' for posting in postings]
    hrefs += job_hrefs
    for href in job_hrefs:
        job_page = requests.get(href, timeout=REQUEST_TIMEOUT)
        page_soap = BeautifulSoup(job_page.content, 'html.parser')

        seniority_item = page_soap.find('li', id='posting-seniority')
        seniority_span = seniority_item.find('span') if seniority_item is not None else None
        seniorities.append(_first_listed_value(seniority_span))

        category_row = page_soap.find('ul', class_='posting-info-row')
        category_item = category_row.find('li') if category_row is not None else None
        category_link = category_item.find('a') if category_item is not None else None
        categories.append(_first_listed_value(category_link))
    

100%|██████████████████████████████████████████████████████████████████████████████████| 58/58 [18:46<00:00, 19.42s/it]


In [13]:
# Assemble the postings table from the parallel lists filled by the scraping cell.
# The salary pairs are viewed as an (N, 2) array under a new name rather than
# overwriting `salary_ranges`, so re-running this cell gives the same result;
# reshape(-1, 2) also yields two empty columns when nothing was scraped.
salary_bounds = np.asarray(salary_ranges).reshape(-1, 2)
contents_dict = {
    "position" : position_names,
    "salary_lower": salary_bounds[:, 0],
    "salary_upper": salary_bounds[:, 1],
    "main_skill": skills,
    "seniority": seniorities,
    "main_category": categories,
    "posting_href": hrefs
}
df = pd.DataFrame.from_dict(contents_dict)

In [14]:
# Show the assembled postings table as this cell's output.
df

Unnamed: 0,position,salary_lower,salary_upper,main_skill,seniority,main_category,posting_href
0,Remote Enrichment Content Manager,6000,11000,,Junior,Big Data,https://nofluffjobs.com/pl/job/remote-enrichme...
1,ServiceNow Business Analyst,12000,20000,,Mid,Business Analysis,https://nofluffjobs.com/pl/job/servicenow-busi...
2,Data Engineer with SQL,17000,22000,sql,Mid,Big Data,https://nofluffjobs.com/pl/job/data-engineer-w...
3,Remote React Fullstack Developer,14000,20000,react,Mid,Frontend,https://nofluffjobs.com/pl/job/remote-react-fu...
4,[Remote] Data Analyst,26250,31500,,Mid,Business Intelligence,https://nofluffjobs.com/pl/job/remote-data-ana...
...,...,...,...,...,...,...,...
1148,Senior .NET Test Automation Engineer,23520,26880,.net,Senior,Testing,https://nofluffjobs.com/pl/job/senior-net-test...
1149,QA Engineer,12000,16000,python,Mid,Testing,https://nofluffjobs.com/pl/job/qa-engineer-syn...
1150,Junior HR Administrative Assistant,2800,3600,,Trainee,HR,https://nofluffjobs.com/pl/job/junior-hr-admin...
1151,Agile Consultant (aka Agile Coach),21000,25200,,Senior,Agile,https://nofluffjobs.com/pl/job/agile-consultan...


In [18]:
def standarize_main_skills(main_skill):
    """Collapse variants of a skill name into one canonical label.

    Rules, checked in this order:
      * any value containing 'sql'      -> 'sql'
      * exactly 'google cloud platform' -> 'gcp'
      * exactly 'go'                    -> 'golang'
      * any value containing 'azure'    -> 'microsoft azure'
    Anything else is returned unchanged.
    """
    if 'sql' in main_skill:
        canonical = 'sql'
    else:
        canonical = main_skill
        exact_aliases = (('google cloud platform', 'gcp'), ('go', 'golang'))
        for alias, replacement in exact_aliases:
            if main_skill == alias:
                canonical = replacement
                break
        else:
            # No exact alias matched; fall back to the substring rule.
            if 'azure' in main_skill:
                canonical = 'microsoft azure'
    return canonical

def rename_other_category(category):
    """Return 'Other' for the category 'Inne'; pass every other value through."""
    return 'Other' if category == 'Inne' else category

In [19]:
# Normalise the skill and category labels, then write the table to disk.
# Each transform reads one column, so Series.map applies it element-wise
# without going through a row-by-row DataFrame.apply.
df['main_skill'] = df['main_skill'].map(standarize_main_skills)
df['main_category'] = df['main_category'].map(rename_other_category)
# The BOM written by 'utf-8-sig' is presumably for spreadsheet tools - confirm.
df.to_csv('job_postings.csv', index=False, encoding='utf-8-sig')