<a href="https://colab.research.google.com/github/enyeneraph/Python-Jobs-Analysis/blob/main/Webscraping_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Importing the necessary libraries

In [1]:
import requests
from html.parser import HTMLParser
import re

In [2]:
def a_href(word, tag, attrs, tag_check='a'):
  '''Look for an ``href`` attribute whose value matches ``word``.

  The search pattern is the text form of the tuple ``('href', word)``, so
  the tuple's own parentheses become a regex capture group. That pattern is
  searched against the text form of every ``(name, value)`` pair in
  ``attrs``.

  Returns the ``groups()`` tuple of the first pair that matches, or ``None``
  when ``tag`` is not ``tag_check`` or no pair matches.
  '''
  if tag != tag_check:
    return None

  href_pattern = str(('href', word))
  for attr_pair in attrs:
    found = re.search(href_pattern, str(attr_pair))
    if found:
      return found.groups()
  return None

In [3]:

class JobDetailsParser(HTMLParser):
  '''Extract the details of one job posting from the HTML of its page.

  Feed the page with ``feed``. Stripped text found in the title/company
  span, the location span, the ``<time>`` element and the category span is
  appended to ``ress`` in the order it occurs in the document. When the
  category text is reached, the job-type text and the "Requirements" text
  gathered so far are appended right after it.
  '''

  def __init__(self):
    super().__init__()
    self.ress = list()            # extracted values, in document order
    # Flags that record which part of the page is currently being read.
    self.job_title = False        # inside the title/company span
    self.category = False         # inside the category span
    self.location = False         # inside the location span
    self.req = False              # collecting the requirements text
    self.date = False             # the most recent start tag was <time>
    self.h2 = False               # inside an <h2> heading
    self.company_details = False  # the title/company span has been opened
    self.company_name = False     # a <br> was closed after that span opened
    self.handle_req = False       # not read anywhere in this class
    self.job_type = False         # inside the job-type span
    self.req_str = ""             # requirements text, space separated
    self.type_str = ""            # job-type text, concatenated

  def handle_starttag(self, tag, attrs):
    '''Raise the flag that corresponds to the element being opened.'''
    if tag == 'span':
      if ('class', 'listing-company-name') in attrs:
        # Parent span of the job title and company name.
        self.company_details = True
        self.job_title = True
      elif ('class', 'listing-new') in attrs:
        # Sibling badge span: its text must not be taken for the title.
        self.job_title = False
      elif ('class', 'listing-location') in attrs:
        self.location = True
      elif ('class', 'listing-company-category') in attrs:
        self.category = True
      elif ('class', 'listing-job-type') in attrs:
        self.job_type = True
    elif tag == 'h2':
      self.h2 = True
    # Re-evaluated on every start tag: true only directly after <time>.
    self.date = tag == 'time'

  def handle_endtag(self, tag):
    '''Lower flags as elements close.

    Only the first matching branch of the chain runs for a given end tag,
    so the order of the checks is significant.
    '''
    # Leaving a heading: text that follows is no longer heading text.
    # Without this reset the flag stayed set after the first <h2>, and any
    # later text equal to "Requirements" restarted requirements capture.
    if tag == 'h2':
      self.h2 = False

    if self.job_title and tag == 'span':
      self.job_title = False
    elif self.location:
      # Cleared by whichever end tag comes first, not only </span>.
      self.location = False
    elif self.category:
      self.category = False
    elif self.date:
      self.date = False
    elif self.job_type and tag == 'span':
      self.job_type = False
    elif tag == 'br' and self.company_details:
      # A closed <br> after the title/company span opened: the text that
      # follows is treated as the company name.
      self.company_name = True
    elif tag == 'span' and self.company_name:
      self.company_name, self.company_details = False, False
    elif self.req and tag in ('ul', 'ol', 'dl'):
      # The requirements text is taken to end with the closing list tag.
      self.req = False

  def handle_data(self, data):
    '''Store the stripped text according to the flag that is set.'''
    text = data.strip()
    if not text:
      # Whitespace-only chunks never matched any branch.
      return

    if self.job_title:
      self.ress.append(text)
    elif self.location:
      self.ress.append(text)
    elif self.category:
      self.ress.append(text)
      # NOTE(review): these append the strings as they are at this moment;
      # text collected into type_str / req_str afterwards is not reflected
      # in ress. Confirm the category element comes after both on the page.
      self.ress.append(self.type_str)
      self.ress.append(self.req_str)  # requirements kept as the last item
    elif self.date:
      self.ress.append(text)
    elif self.job_type:
      self.type_str += text  # all job-type chunks joined into one string
    elif self.company_name:
      self.ress.append(text)
    elif self.h2 and text == 'Requirements':
      # The "Requirements" heading starts requirements capture.
      self.req = True
    elif self.req:
      self.req_str += text + ' '

  def return_ress(self):
    '''Return the list of values extracted so far.'''
    return self.ress

In [4]:
class JobListParser(HTMLParser):
  '''Collect the details of every job linked from a job listing page.

  While a listing page is fed in, each anchor inside the
  ``<ol class="list-recent-jobs list-row-container menu">`` element whose
  ``href`` matches ``/jobs/[0-9]+/`` is downloaded and parsed with
  ``JobDetailsParser``. The first anchor after ``<li class="next">`` is
  followed with a new ``JobListParser``, and the jobs it finds are merged
  into this parser's results.

  Results are kept per instance in ``list_of_list`` (one list of values per
  job), so creating and feeding a second parser does not repeat the rows
  gathered by an earlier one.
  '''

  BASE_URL = 'https://www.python.org'
  JOB_LINK_PATTERN = '/jobs/[0-9]+/'
  REQUEST_TIMEOUT = 30  # seconds to wait for each HTTP response

  def __init__(self):
    super().__init__()
    self.list_of_list = []     # one entry per job, in crawl order
    self.data_handler = False  # inside a job link
    self.ol_handle = False     # inside the job list <ol>
    self.next_page = False     # <li class="next"> seen, its link is pending

  def handle_starttag(self, tag, attrs):
    '''Track the job list and follow job links and the next-page link.'''
    if tag == 'ol' and ('class', 'list-recent-jobs list-row-container menu') in attrs:
      self.ol_handle = True
      return

    # Job links only count inside the job list; match once and reuse it.
    job_match = a_href(self.JOB_LINK_PATTERN, tag, attrs) if self.ol_handle else None
    if job_match:
      self.data_handler = True
      # The match reads "'href', '/jobs/<id>/'": the path is the second
      # whitespace-separated token without its quotes.
      job_path = job_match[0].split()[1].strip("'")
      self.parse_url(self.BASE_URL + job_path, JobDetailsParser())
    elif tag == 'li' and ('class', 'next') in attrs:
      self.next_page = True
    elif self.next_page and tag == 'a':
      self.next_page = False
      # NOTE(review): uses the value of the anchor's first attribute and
      # assumes it is the relative page link on an active control and the
      # literal 'disabled' on an inactive one. Confirm against the markup.
      target = attrs[0][1] if attrs else None
      # An anchor without attributes, or with an empty first value, is
      # skipped instead of raising or re-crawling the first page.
      if target and target != 'disabled':
        self.parse_url(self.BASE_URL + '/jobs/' + target, JobListParser())

  def parse_url(self, job_url, parser):
    '''Download ``job_url``, feed it to ``parser`` and keep its results.

    A parser exposing ``ress`` contributes one entry; a parser exposing
    ``list_of_list`` (a follow-up listing page) contributes all of its
    entries. Exceptions raised by ``requests.get`` (including a timeout
    after ``REQUEST_TIMEOUT`` seconds) are not caught here.
    '''
    self.text_ = requests.get(job_url, timeout=self.REQUEST_TIMEOUT).text
    self.parser_ = parser
    parser.feed(self.text_)
    if hasattr(parser, 'ress'):
      self.list_of_list.append(parser.ress)
    elif hasattr(parser, 'list_of_list'):
      self.list_of_list.extend(parser.list_of_list)

  def handle_endtag(self, tag):
    '''Leave the job link or the job list when its element closes.'''
    if tag == 'a' and self.data_handler:
      self.data_handler = False
    elif tag == 'ol' and self.ol_handle:
      self.ol_handle = False

  def return_list(self):
    '''Return the list of per-job value lists collected by this parser.'''
    return self.list_of_list
      
  

In [5]:
# Download the first page of the job board. The timeout keeps the cell from
# waiting indefinitely, and raise_for_status() stops here on an HTTP error
# instead of handing an error page to the parser.
response = requests.get('https://www.python.org/jobs/', timeout=30)
response.raise_for_status()
text = response.text

In [6]:
# Run this cell once: it downloads every job page, which is slow, and
# running it again may append the same jobs a second time.
parser = JobListParser()
parser.feed(text)
lists_ = parser.return_list()
len(lists_)

150

In [7]:
# Turn the per-job lists into a DataFrame: one row per job, with the
# collected values labelled by the column names below.
import pandas as pd
columns = (
  'Job_Title',
  'Company',
  'Location',
  'Date_Posted',
  'Category',
  'Looking_for',
  'Description',
)
df = pd.DataFrame(data=lists_, columns=columns)
 

In [8]:
# Write the CSV first so the file is saved even when the notebook is not
# running on Google Colab.
df.to_csv('pythonjobs.csv', index=False)

# The browser download is only offered when the Colab helper can be imported.
try:
  from google.colab import files
except ImportError:
  print('google.colab is unavailable; pythonjobs.csv was saved locally.')
else:
  files.download('pythonjobs.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>