In [1]:
import sys, os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import string
import re
import collections

# Put the directory that contains `src` (the parent of the current working
# directory, since '../src' is resolved relative to it) at the front of the
# import path so that `src` can be imported as a package.
sys.path.insert(0, os.path.dirname(os.path.abspath('../src')))
# Import the one helper this notebook calls by name rather than `import *`:
# a wildcard import hides where names come from and can silently shadow the
# modules imported above.
from src.getjobsch import get_job_keywords

### Web Scraping

The file `src/getjobsch` contains the necessary functions to pull information from https://www.jobs.ch/en/vacancies/. The scraping process works in the following way:
- It receives a list of job positions in natural language
- The function `clean_job_keywords` transforms those keywords into search keywords by replacing white spaces with `%20` characters
- Once the necessary keywords have been obtained, the function `df_full_data` pulls the info for each job in the following way:
  - Get the number of available pages for each job position
  - For each of the available pages, scrape each individual text box using the function `get_data_one_job` and concatenate the info using the function `df_all_jobs`
  - In case no job postings are found, an error should be printed (see example below).

In [2]:
# Read the stored job postings from disk, using the first CSV column as the
# row index, then preview the first ten rows.
df_jobs = pd.read_csv(
    filepath_or_buffer="../data/raw/df_jobs_ch.csv",
    index_col=[0],
)
df_jobs.head(10)

Unnamed: 0,title,publication_date,location,workload,job_type,company,job_link,keyword
0,Data Engineer temp. 24 months (w/m/d),25 avril 2023,Baden,100%,Temporary,Axpo Group,https://www.jobs.ch/en/vacancies/detail/3fa23b...,data engineer
1,Big Data Engineer (w/m/d),24 mai 2023,St. Gallen,80% – 100%,Unlimited employment,Raiffeisen Schweiz,https://www.jobs.ch/en/vacancies/detail/450b00...,data engineer
2,CSV Engineer MedTech 80% (5306 – KFR),17 mai 2023,Ostschweiz,80%,Temporary,CTC Resourcing Solutions,https://www.jobs.ch/en/vacancies/detail/cf1d02...,data engineer
3,Azure Data Ingenieur (m/w/d),23 mai 2023,Zell LU,80% – 100%,Unlimited employment,ROCKEN,https://www.jobs.ch/en/vacancies/detail/7720c6...,data engineer
4,Low Code Solution Engineer (80-100 %),15 mai 2023,Ittigen,80% – 100%,Unlimited employment,Gebäudeversicherung Bern – GVB Gruppe,https://www.jobs.ch/en/vacancies/detail/96ba41...,data engineer
5,"DevOps Engineer (part-time possible, all genders)",04 mai 2023,Basel|Bern|Genf|Lugano|Zürich,100%,Unlimited employment,Accenture,https://www.jobs.ch/en/vacancies/detail/09f695...,data engineer
6,Produktentwicklungsingenieur*,17 avril 2023,Biel,100%,Unlimited employment,HARTING AG,https://www.jobs.ch/en/vacancies/detail/6b99d0...,data engineer
7,Head Engineering & Development (m/w/d) - Digit...,11 mai 2023,Oftringen AG,100%,Unlimited employment,Mercuri Urval AG,https://www.jobs.ch/en/vacancies/detail/54aa6c...,data engineer
8,Expert Engineer Digital Workplace & Microsoft ...,22 mai 2023,Winterthur,50% – 100%,Unlimited employment,AXA,https://www.jobs.ch/en/vacancies/detail/6f4c28...,data engineer
9,Service Engineer (m/w/d) 80-100 %,12 avril 2023,Winterthur,80% – 100%,Unlimited employment,Swiss Birdradar Solution AG,https://www.jobs.ch/en/vacancies/detail/cf9b05...,data engineer


In [None]:
# Run the project helper `get_job_keywords` on the loaded postings. The value it
# returns is indexed by string keys in the next cell ("programming_summary",
# "skills_summary", "python_summary", "errors").
# NOTE(review): presumably the helper visits each posting's `job_link`, which
# would make this cell slow and network-dependent — confirm against src/getjobsch.
results = get_job_keywords(df_jobs)

In [None]:
# Split the keyword-extraction result into one variable per section, looking up
# each section by its key in `results`.
programming_summary, skills_summary, python_summary, errors = (
    results[section]
    for section in (
        "programming_summary",
        "skills_summary",
        "python_summary",
        "errors",
    )
)

In [None]:
# `errors` is the "errors" entry of the get_job_keywords result; its length is
# reported here as the number of positions without available information.
print(f"There were {len(errors)} positions without available information")

In [None]:
# Show the "programming_summary" entry of the keyword-extraction result.
# NOTE(review): presumably keyword/count pairs, since the export cell wraps it
# in dict(...) — confirm against src/getjobsch.
print(programming_summary)

In [None]:
# Show the "python_summary" entry of the keyword-extraction result.
# NOTE(review): presumably keyword/count pairs, since the export cell wraps it
# in dict(...) — confirm against src/getjobsch.
print(python_summary)

In [None]:
# Show the "skills_summary" entry of the keyword-extraction result.
# NOTE(review): presumably keyword/count pairs, since the export cell wraps it
# in dict(...) — confirm against src/getjobsch.
print(skills_summary)

### Store Raw Data

In [None]:
# Export of the raw artefacts to ../data/raw. Every statement below is commented
# out, so running this cell writes nothing.
# NOTE(review): the first line targets the same CSV path that the load cell reads,
# so enabling it would overwrite that file — presumably why the export is
# disabled; confirm before uncommenting.
# df_jobs.to_csv("../data/raw/df_jobs_ch.csv")
# pd.DataFrame(dict(programming_summary).items()).to_csv("../data/raw/programming_summary_2.csv")
# pd.DataFrame(dict(python_summary).items()).to_csv("../data/raw/python_summary_2.csv")
# pd.DataFrame(dict(skills_summary).items()).to_csv("../data/raw/skills_summary_2.csv")