In [74]:
import random
import time

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

from predictNew import PreprocessData, Predict, ScrapeGlass, LinkedinScraper

In [148]:
from selenium.webdriver import Firefox

In [149]:
# Exploratory session: start a Selenium-driven Firefox instance.
browser = Firefox()

In [150]:
# Load the Glassdoor home page in that browser.
browser.get('https://www.glassdoor.com')

In [151]:
# Collect every element on the page whose CSS class is 'hotListing'.
hot_listings = browser.find_elements_by_class_name('hotListing')

In [152]:
# Check how many elements matched (the recorded cell output below is 11).
len(hot_listings)

11

In [153]:
# Click the first matched element.
hot_listings[0].click()

In [None]:
class ScrapeGlass():
    """ ScrapeGlass is used to scrape job postings from Glassdoor.

    Basic Process:

    • The search method opens Glassdoor and searches for the query.

    • The click_wait method clicks the second job posting to
      trigger the automatic pop-up window, then it finds the
      x-button in the corner and clicks it.

    • The loop_pages method scrapes the current results page, clicks
      the "next" control and repeats until no usable "next" control is
      left (or an optional page cap is reached).

    • The get_job_postings method is called for every results page in
      order to iterate through every job listing on the page and scrape
      its title, company and description into the instance lists.

    • Finally the return_job_descriptions method is called to prompt the user
      to decide whether or not to save the scraped results to a csv file.

    All of this is being done through the transform method.

    example:

        query = 'Data Scientist'

        test = ScrapeGlass()
        jobs = test.transform(query)
        test.close()

    Final Notes:

        If scraping is interrupted for any reason (including Ctrl-C) the
        user will still be prompted on whether or not to proceed with
        exporting to csv.

    """

    # Destination of exported results; '%s' is replaced by the name the user
    # types at the prompt.
    CSV_PATH_TEMPLATE = '~/galvanize/capstone/Ember-Job-Recommender/data/%s.csv'

    # How long click_wait keeps looking for the pop-up's x-button
    # (attempts * seconds) before giving up and carrying on.
    POPUP_POLL_ATTEMPTS = 10
    POPUP_POLL_SECONDS = 0.5

    def __init__(self, param=None):
        """ Open a Firefox browser and start with empty result lists.

        `param` is accepted for backward compatibility and is not used.
        """
        self.browser = Firefox()
        self.titles = []
        self.companies = []
        self.job_descriptions = []

    def __enter__(self):
        """ Allow `with ScrapeGlass() as scraper:` so the browser is always closed."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """ Close the browser when the `with` block ends; never suppress errors."""
        self.close()
        return False

    def close(self):
        """ Quit the Firefox browser opened by __init__."""
        self.browser.quit()

    def click_wait(self):
        """ This method triggers the automatic pop-up window and exits out of it.

        Does nothing when fewer than two listings are present. If no x-button
        shows up within the polling window, scraping continues without it.
        """

        listings = self.browser.find_elements_by_class_name('jl')
        if len(listings) < 2:
            # Nothing to click: the original indexing would raise IndexError.
            return
        listings[1].click()

        # The pop-up is rendered after the click, so poll briefly instead of
        # failing immediately when the x-button is not there yet.
        for _ in range(self.POPUP_POLL_ATTEMPTS):
            close_buttons = self.browser.find_elements_by_class_name('xBtn')
            if close_buttons:
                close_buttons[0].click()
                return
            time.sleep(self.POPUP_POLL_SECONDS)

    def return_job_descriptions(self):
        """ This method prompts the user whether or not to convert to csv.

        Prints the number of titles, companies and descriptions collected,
        then asks for confirmation. Returns None after exporting to csv,
        otherwise returns the list of scraped job descriptions.
        """

        print(len(self.titles), len(self.companies), len(self.job_descriptions))
        check_variable = input('\nProceed? (yes / no) ')
        # Tolerate stray whitespace / capitalisation in the answer.
        if check_variable.strip().lower() not in ('yes', 'y'):
            return self.job_descriptions

        name = input('Enter name of data')
        self.convert_to_csv(self.titles,
                            self.companies,
                            self.job_descriptions,
                            self.CSV_PATH_TEMPLATE % name)
        return None

    def sleep(self, start=5, end=15):
        """ Pause for a random whole number of seconds in [start, end].

        Raises ValueError (from random.randint) when start > end.
        """
        return time.sleep(random.randint(start, end))

    def search(self, query):
        """ This method opens Glassdoor and searches for the query."""

        self.browser.get('https://www.glassdoor.com')
        self.sleep()
        keyword_search = self.browser.find_element_by_css_selector('#KeywordSearch')
        keyword_search.click()
        keyword_search.send_keys(query)
        start_search = self.browser.find_element_by_css_selector('#HeroSearchButton')
        start_search.click()

    def _find_next_button(self):
        """ Return the usable "next page" control, or None when there is none."""
        candidates = self.browser.find_elements_by_class_name('next')
        if not candidates:
            return None
        button = candidates[0]
        # NOTE(review): assumes the last page marks the control as disabled,
        # either on the element itself or on a child with class 'disabled' --
        # confirm against the live page markup.
        if not button.is_enabled() or button.find_elements_by_class_name('disabled'):
            return None
        return button

    def _scrape_all_pages(self, max_pages=None):
        """ Scrape the current page and every following page, without prompting."""
        pages_done = 0
        while True:
            self.get_job_postings()
            pages_done += 1
            if max_pages is not None and pages_done >= max_pages:
                break
            # Re-query the pager on every iteration: the page has changed.
            next_button = self._find_next_button()
            if next_button is None:
                break
            next_button.click()
            self.sleep()

    def loop_pages(self, max_pages=None):
        """ This method iterates through all available pages.

        max_pages optionally caps how many result pages are scraped
        (None means no cap). Returns whatever return_job_descriptions returns.
        """

        self._scrape_all_pages(max_pages)
        return self.return_job_descriptions()

    def get_job_postings(self):
        """ This method iterates through all of the listings on the current
        page and appends each title, company and description to
        self.titles, self.companies and self.job_descriptions.

        Returns self.job_descriptions.
        """

        job_listings = self.browser.find_elements_by_class_name('jl')
        self.sleep()
        for job in job_listings:
            # Reading this property is what scrolls the listing into view.
            job.location_once_scrolled_into_view
            job.click()
            self.sleep()

            # Read all three fields before storing any of them, so a failed
            # lookup cannot leave the three lists with different lengths
            # (which would make the csv export fail).
            title = self.browser.find_element_by_class_name('header').text
            company = self.browser.find_element_by_class_name('compInfo').text
            content = self.browser.find_element_by_class_name('jobDescriptionContent').text
            self.titles.append(title)
            self.companies.append(company)
            self.job_descriptions.append(content)

            # Roughly one listing in three, also click one of the detail
            # tabs -- presumably to make the traffic look less automated.
            if random.randint(1, 3) == 2:
                tabs = self.browser.find_elements_by_class_name('tabLabel')
                try:
                    tabs[random.randint(1, 2)].click()
                except IndexError:
                    # Fewer tabs than expected; the extra click is optional.
                    pass
            self.sleep()
        return self.job_descriptions

    def convert_to_csv(self, titles, companies, final_content, name):
        """ This method builds a pandas dataframe with the columns 'titles',
        'companies' and 'jobs' and writes it to the csv file path `name`.

        The three lists must have the same length, otherwise pandas raises
        ValueError.
        """

        final_txdf = pd.DataFrame({'titles': titles,
                                   'companies': companies,
                                   'jobs': final_content})
        final_txdf.to_csv(name)

    def transform(self, query, max_pages=None):
        """ This method takes the query as input and outputs either
        an exported csv file or a list of jobs.

        max_pages optionally caps how many result pages are scraped.
        The user is prompted exactly once, whether scraping finished or was
        cut short; returns whatever return_job_descriptions returns.
        """
        self.search(query)
        self.click_wait()
        try:
            self._scrape_all_pages(max_pages)
        except (Exception, KeyboardInterrupt) as error:
            # Deliberate best-effort: keep what was scraped so far, but say
            # why scraping stopped instead of hiding it.
            print('Scraping stopped early: %r' % (error,))
        return self.return_job_descriptions()