In [None]:
import sys, os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import string
import re
import collections

sys.path.insert(0, os.path.dirname(os.path.abspath('../src')))
from src.getjobsch import *

# Scraping Jobs.ch

As a job seeker, you have to search through job portals to find the jobs most relevant to your profile. In this exercise, the goal is to find all jobs related to the keywords “Data Scientist”, “Data Analyst”, “Python Developer”, “Data Engineer”, “Data Manager”, “Data Architect”, “Big Data Analyst” and “Data Python” on jobs.ch.
1. Download all necessary information (including job title, date, company name, location…) for all webpages.
2. Using the information obtained, perform a descriptive analysis on this data including questions:
   - How many jobs are shared between these categories?
   - How much do the keywords “Data Analyst” and “Big Data Analyst” overlap?
   - Are there some companies doing more hires than average?
   - How many jobs are there in different Kantons?
   - Does the keyword “machine learning” appear more often in data scientist or data analyst jobs?
   - What is the distribution of most common keywords between and across categories?
3. Produce a report in the form of a clean notebook (or jupyter slides), with commented code and markdown cells for structuring and interpretations.

### Web Scraping

The file `src/getjobsch` contains the necessary functions to pull information from https://www.jobs.ch/en/vacancies/. The scraping works in the following way:
- It receives a list of job positions in natural language
- The function `clean_job_keywords` transforms those keywords into search keywords by removing white spaces and replacing them with `%20` characters
- Once the necessary keywords are obtained, the function `df_full_data` pulls the info for each job in the following way:
  - Get the number of available pages for each job position
  - For each of the available pages, scrape an individual text box using the function `get_data_one_job` and concatenate the info using the function `df_all_jobs`
  - In case no job postings are found an error should be printed (see example below).

In [None]:
# Job titles (in natural language) that will be searched on jobs.ch
job_positions = [
    "Data Engineer",
    "Data Scientist",
    "Data Analyst",
    "Python Developer",
    "Data Manager",
    "Data Architect",
    "Big Data Analyst",
    "Data Python",
]

In [None]:
# # Run the function to get both errors and results
# df_all = df_full_data(job_positions)

# # In this case we should not have errors
# errors = df_all["errors"]
# errors

In [None]:
# # Print the found jobs
# df_jobs = df_all["results"]
# df_jobs.head(10)

In [None]:
# Read the previously stored job postings from disk and preview the first rows
df_jobs = pd.read_csv(
    "../data/raw/df_jobs_ch.csv",
    index_col=[0],
)
df_jobs.head(10)

In [None]:
# List the distinct job types. Some values do not make sense because of an
# index problem affecting some of the rows.
df_jobs["job_type"].unique()

In [None]:
# Unpack the four values returned by get_job_keywords for the loaded postings
(
    programming_summary,
    skills_summary,
    python_summary,
    errors,
) = get_job_keywords(df_jobs)

In [None]:
# Report how many entries are stored under the "errors" key of the `errors`
# object returned by get_job_keywords.
print(f"There were {len(errors['errors'])} positions without available information")

### Store Raw Data

In [None]:
# df_jobs.to_csv("../data/raw/df_jobs_ch.csv")
# pd.DataFrame(dict(programming_summary).items()).to_csv("../data/raw/programming_summary.csv")
# pd.DataFrame(dict(python_summary).items()).to_csv("../data/raw/python_summary.csv")
# pd.DataFrame(dict(skills_summary).items()).to_csv("../data/raw/skills_summary.csv")

## Web Scraping -- Continued

In [None]:
# Array with the link of every job posting, taken from the "job_link" column
job_urls = df_jobs.loc[:, "job_link"].values

In [None]:
# Visit every job posting, extract its description text and tally the tracked
# keywords that occur in it.
#
# Results left in the notebook namespace:
#   programming_count / skills_count / python_count : one list entry per keyword hit
#   programming_summary / skills_summary / python_summary : Counters over those lists
#   errors : URLs whose page could not be fetched or had no description
#
# NOTE: `errors` is rebound here to a plain list of URLs, replacing whatever an
# earlier cell assigned to that name.
programming_count = []
skills_count = []
python_count = []
errors = []

# Seconds to wait for a single page, so that one stalled request cannot hang
# the whole run.
REQUEST_TIMEOUT = 30

# Punctuation stripped before splitting into words. '#' and '+' are not part of
# this set, so tokens such as "c#" or "c++" keep those characters.
# Built once, outside the loop.
punctuation_table = str.maketrans("", "", '!"$%&\'()*,-./:;<=>?@[\\]^_`{|}~')

for ju in job_urls:
    try:
        sec_page = requests.get(ju, timeout=REQUEST_TIMEOUT)
    except requests.RequestException:
        # Network failure or timeout: record the URL and keep going instead of
        # aborting the whole scrape.
        errors.append(ju)
        continue
    sec_soup = BeautifulSoup(sec_page.content, "html.parser")

    # First attempt: description container embedded directly in the page.
    job_desc = sec_soup.find("div", {"data-cy" : "vacancy-description"})
    if job_desc is None:
        # Fallback: the element that follows the vacancy iframe, if there is one.
        iframe = sec_soup.find("iframe", {"data-cy" : "detail-vacancy-iframe-content"})
        if iframe is not None:
            job_desc = iframe.find_next()

    if job_desc is None:
        # Neither lookup produced a description for this posting.
        errors.append(ju)
        job_desc_text = ""
    else:
        job_desc_text = job_desc.text

    # Normalise: drop punctuation, lowercase, then split on whitespace.
    job_desc_words = job_desc_text.translate(punctuation_table).lower().split()

    for word in job_desc_words:
        if word in keywords_programming:
            programming_count.append(word)
        if word in keywords_skills:
            skills_count.append(word)
        if word in keywords_python:
            python_count.append(word)

# Build the summaries once, after all pages were processed, rather than on
# every iteration. This also yields empty Counters when there are no URLs.
programming_summary = collections.Counter(programming_count)
skills_summary = collections.Counter(skills_count)
python_summary = collections.Counter(python_count)

In [None]:
# Show the counts collected for the programming keywords
print(programming_summary)

In [None]:
# Show the counts collected for the Python-related keywords
print(python_summary)

In [None]:
# Show the counts collected for the skills keywords
print(skills_summary)

### Web Scraping -- Raw

In [None]:
# Take the link of the second posting (position 1) and download and parse its page.
# Example of such a link:
# 'https://www.jobs.ch/en/vacancies/detail/3fa23bd2-215b-4500-83cf-317d15b71d13/?source=vacancy_search'
sec_url = df_jobs["job_link"].iloc[1]
sec_page = requests.get(sec_url)
sec_soup = BeautifulSoup(
    sec_page.content,
    "html.parser",
)

In [None]:
# Check whether "Git" is mentioned anywhere in the vacancy description.
# The `in` operator is used instead of `str.find(...) > 0`, because `find`
# returns 0 for a match at the very start of the text, which `> 0` would
# wrongly report as absent.
"Git" in sec_soup.find("div", {"data-cy" : "vacancy-description"}).text

In [None]:
# Show the three characters at positions 936-938 of the description text.
# NOTE(review): presumably these offsets point at a keyword match located
# during exploration; they are specific to this one page. TODO confirm.
sec_soup.find("div", {"data-cy" : "vacancy-description"}).text[936:939]

In [None]:
# Collect the <ul> elements inside the vacancy description that carry this
# exact class string.
# NOTE(review): the class names look auto-generated and may change between
# site builds. TODO confirm before relying on them.
ll = sec_soup.find("div", {"data-cy" : "vacancy-description"}).find_all("ul", {"class" : "Ul-sc-1n42qu0-0 Ul-sc-1otw97l-0 JJPIu kNGQob"}) # .find_all("span")
for l in ll:
    # NOTE(review): the value returned by find() is discarded, so this loop
    # produces no output; presumably leftover exploration.
    l.find()

In [None]:
# Position of the first "Python" occurrence in the text of the element that
# follows the vacancy iframe (str.find gives -1 when there is no match).
sec_soup.find("iframe", {"data-cy" : "detail-vacancy-iframe-content"}).find_next().text.find("Python") # find_all("p")