In [181]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
import time
import re
import numpy as np

In [167]:
# Lookup from a <span> position to a human-readable field label.
# The keys (1, 2, 5, 6) are the same positions Job.extract_features indexes
# into `select('span')`.
# NOTE(review): 'Number of Starts' is presumably meant to read "Stars" - confirm
# before relying on this label anywhere.
spans_loc = {
    1: 'Number of Starts',
    2: 'Comp Name',
    5: 'Job Desc',
    6: 'Location',
}

In [204]:
class Scrapper:
    """Fetch a run of numbered result pages and accumulate rows of job data.

    Page URLs are built as ``base_url + <page number> + ".htm"`` for page
    numbers ``1..num_pages``. Rows are collected column-wise in ``data`` and
    can be turned into a DataFrame (``build_dataframe``) and written to CSV
    (``save_data``).

    Attributes set while scraping:
        page:     the last ``requests`` response (``None`` until ``request_page``).
        soup:     the parsed document of ``page`` (``None`` until ``init``).
        result:   the element found by id in ``init`` (``None`` if not found).
        elements: the elements found by the last ``get_elements`` call.
        df:       the DataFrame built by ``build_dataframe`` (``None`` until then).
    """

    def __init__(self, base_url, num_pages=1):
        """Store the URL prefix and page count and precompute the page URLs.

        Args:
            base_url: URL prefix to which the page number and ".htm" are appended.
            num_pages: number of pages to visit; coerced with ``int()``.
        """
        self._base_url = base_url
        self._num_pages = int(num_pages)
        self._pages_urls = self.build_pages_result_list()
        self._headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'}
        self._data = {'Company': [],
                      'Rating': [],
                      'Job_Title': [],
                      'Location': [],
                      'Salary': [],
                      'Job_Description': [],
                      'URL': []}

        # Scraping state: defined up front so reading an attribute before the
        # corresponding step has run gives None / [] instead of AttributeError.
        self.page = None
        self.soup = None
        self.result = None
        self.elements = []
        self.df = None

    def build_pages_result_list(self):
        """Return the list of page URLs for pages 1 through ``num_pages``."""
        return [f"{self._base_url}{page_number}.htm"
                for page_number in range(1, self._num_pages + 1)]

    @property
    def pages(self):
        """List of page URLs built at construction time."""
        # Previously returned the never-assigned `self._pages`.
        return self._pages_urls

    @property
    def headers(self):
        """HTTP headers sent with every request."""
        return self._headers

    @property
    def data(self):
        """Column-name -> list-of-values dict holding the collected rows."""
        return self._data

    def build_data_dict(self, **kwargs):
        """Append one value to each named column of ``data``.

        Args:
            **kwargs: column name -> value. Every name must be an existing
                column of ``data``.

        Raises:
            KeyError: if any name is not a known column. Nothing is appended
                in that case, so the columns never end up with unequal lengths
                because of a half-applied call.
        """
        unknown = [key for key in kwargs if key not in self._data]
        if unknown:
            raise KeyError(f"Unknown column(s): {unknown}")

        for key, val in kwargs.items():
            self._data[key].append(val)

    def request_page(self, url, timeout=30):
        """GET ``url`` with the stored headers and keep the response in ``page``.

        Args:
            url: address to fetch.
            timeout: seconds handed to ``requests.get`` so a stalled server
                cannot block the run indefinitely.

        Raises:
            ValueError: if the response status code is not 200.
        """
        self.page = requests.get(url, headers=self.headers, timeout=timeout)
        if self.page.status_code != 200:
            raise ValueError(f"Expected to get 200\nInstead got: {self.page.status_code}")

    def get_elements(self, html_tag, html_class, **kwargs):
        """Store in ``elements`` every match of tag/class under ``result``.

        Args:
            html_tag: tag name passed to ``find_all``.
            html_class: value passed as ``class_`` to ``find_all``.
            **kwargs: forwarded unchanged to ``find_all``.

        Requires ``init`` to have located a container; with ``result`` still
        ``None`` the ``find_all`` call raises ``AttributeError``.
        """
        self.elements = self.result.find_all(html_tag, class_=html_class, **kwargs)

    def init(self, unique_tag_id, html_tag=None, html_class=None):
        """Parse the last fetched page and locate the container element by id.

        Args:
            unique_tag_id: ``id`` attribute of the container to search within.
            html_tag: optional tag name of the elements to collect.
            html_class: optional class of the elements to collect.

        When ``html_tag`` and/or ``html_class`` is given and the container was
        found, ``get_elements`` is called with them. Otherwise ``elements`` is
        reset to an empty list so elements of a previously parsed page are
        never processed again by mistake.
        """
        self.soup = BeautifulSoup(self.page.content, features='html.parser')
        self.result = self.soup.find(id=unique_tag_id)

        wants_elements = html_tag is not None or html_class is not None
        if self.result is not None and wants_elements:
            self.get_elements(html_tag, html_class)
        else:
            self.elements = []

    def build_dataframe(self):
        """Build ``df`` from the current contents of ``data``."""
        self.df = pd.DataFrame(self.data)

    def save_data(self, path):
        """Write ``df`` to ``path`` as CSV, appending ".csv" if it is missing.

        Args:
            path: destination file path (str or path-like).

        ``df`` is built first if ``build_dataframe`` has not been called yet.
        An already-built ``df`` is written as-is, so call ``build_dataframe``
        again if rows were added afterwards.
        """
        path = os.fspath(path)
        # Compare the extension itself; a membership test on the
        # (root, ext) tuple could also match the root.
        if os.path.splitext(path)[1] != ".csv":
            path += ".csv"

        if getattr(self, "df", None) is None:
            self.build_dataframe()
        self.df.to_csv(path)


class Job:
    """One job posting, read out of a single listing element.

    Every field starts as ``np.nan`` and is only overwritten when the
    corresponding piece of markup is present, so a sparse or malformed
    listing yields missing values instead of aborting the whole scrape.

    Attributes:
        id: 1-based running number, incremented for every Job created.
        host_url: the URL passed at construction (stored, not otherwise used here).
        url, stars, comp_name, title, location, salary: extracted fields.
    """

    _count = 0

    # Positions within `select('span')` that each field is read from.
    _STARS_SPAN = 1
    _COMPANY_SPAN = 2
    _TITLE_SPAN = 5
    _LOCATION_SPAN = 6

    _SALARY_SELECTOR = "span[class*=css-18034rf]"
    # Captures e.g. "100K", "-", "150K" out of "$100K-$150K".
    _SALARY_PATTERN = re.compile(r"(\d+K)(-)\$(\d+K)")
    _SITE_ROOT = 'https://www.glassdoor.com'

    def __init__(self, job_tag, host_url):
        """Register a new job and initialise all fields to ``np.nan``.

        Args:
            job_tag: parsed element for one listing; must offer ``select``,
                ``select_one`` and ``find``.
            host_url: URL the listing was found on.
        """
        Job._count += 1
        self.id = Job._count

        self.host_url = host_url
        self._job_tag = job_tag

        self.url = np.nan
        self.stars = np.nan
        self.comp_name = np.nan
        self.title = np.nan
        self.location = np.nan
        self.salary = np.nan

    @staticmethod
    def _span_text(spans, index):
        """Return the text of ``spans[index]``, or ``np.nan`` if it does not exist."""
        return spans[index].text if index < len(spans) else np.nan

    def extract_features(self):
        """Fill stars, company name, title, location and salary from the listing.

        The first four are taken from fixed ``<span>`` positions; a position
        beyond the available spans leaves that field as ``np.nan``. The salary
        is set to "<low>K-<high>K" only when the salary element exists and its
        text matches the expected "$<low>K-$<high>K" form.
        """
        spans = self._job_tag.select('span')  # query once, index many times
        self.stars = self._span_text(spans, self._STARS_SPAN)
        self.comp_name = self._span_text(spans, self._COMPANY_SPAN)
        self.title = self._span_text(spans, self._TITLE_SPAN)
        self.location = self._span_text(spans, self._LOCATION_SPAN)

        salary_tag = self._job_tag.select_one(self._SALARY_SELECTOR)
        if salary_tag is None:
            return

        # The original searched an undefined name here (NameError) and indexed
        # the first match unconditionally (IndexError when nothing matched).
        found = self._SALARY_PATTERN.search(salary_tag.text)
        if found is not None:
            self.salary = "".join(found.groups())

    def extract_url(self):
        """Set ``url`` to the site root plus the href of the listing's first link.

        Leaves ``url`` as ``np.nan`` when there is no ``<a>`` or it has no href.
        """
        link = self._job_tag.find('a')
        href = link.get('href') if link is not None else None
        if href is not None:
            self.url = self._SITE_ROOT + href

In [205]:
# Scraper over the first 30 result pages; each page URL is this prefix
# followed by the page number and ".htm".
scrap = Scrapper(
    base_url="https://www.glassdoor.com/Job/palo-alto-data-scientist-jobs-SRCH_IL.0,9_IC1147434_KO10,24_IP",
    num_pages=30,
)

In [13]:
# Which elements inside the 'MainCol' container represent one job listing.
# NOTE(review): the original cell never told the scraper what to collect
# (`init` was called with the container id only and `get_elements` was never
# called), so these two values are an assumption, not something recorded in
# this notebook - TODO confirm against the current page markup.
JOB_CARD_TAG = 'li'
JOB_CARD_CLASS = 'jl'

# Pause between page requests, in seconds.
REQUEST_PAUSE_SECONDS = 0.5

for url in scrap.pages:

    scrap.request_page(url)

    # Parse the page and locate the results container, then collect the
    # listing elements explicitly so `scrap.elements` is always populated
    # for the page just fetched.
    scrap.init('MainCol', JOB_CARD_TAG, JOB_CARD_CLASS)
    scrap.get_elements(JOB_CARD_TAG, JOB_CARD_CLASS)

    for job_ele in scrap.elements:

        job_object = Job(job_ele, url)

        job_object.extract_features()

        job_object.extract_url()

        scrap.build_data_dict(Company=job_object.comp_name, Rating=job_object.stars,
                              Job_Title=job_object.title, Location=job_object.location,
                              Salary=job_object.salary, Job_Description="Unavailable ATM", URL=job_object.url)

    time.sleep(REQUEST_PAUSE_SECONDS)

In [15]:
# `df` was never defined in this notebook (it only exists as `scrap.df`, and
# only after `build_dataframe` has run), so build the frame from the collected
# rows first and then write it out.
scrap.build_dataframe()
scrap.df.to_csv('Temp_res.csv')