In [105]:
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException

class JobCrawler(object):
    """Scrapes job postings from a site described by a parser object.

    The parser passed to `start` must expose four attributes:
    `site_url`, `list_detail_page_urls_fn`, `field_css_selectors` and
    `field_element_processors`.
    """

    def start(self, job_site_parser, browser_factory=None):
        """Crawl every detail page of a job site and return the scraped jobs.

        Args:
          job_site_parser: object exposing `site_url`,
            `list_detail_page_urls_fn`, `field_css_selectors` and
            `field_element_processors`.
          browser_factory: optional zero-argument callable returning the
            browser to drive. Defaults to `webdriver.Firefox`.

        Returns:
          A list with one dict per detail page URL (see
          `__parse_job_detail_page` for its layout).
        """
        if browser_factory is None:
            # Looked up lazily so the default stays Firefox, as before.
            browser_factory = webdriver.Firefox

        browser = browser_factory()
        try:
            browser.get(job_site_parser.site_url)
            urls = job_site_parser.list_detail_page_urls_fn(browser)

            # Read the parser configuration once instead of once per URL.
            selectors = job_site_parser.field_css_selectors
            processors = job_site_parser.field_element_processors

            jobs = []
            for url in urls:
                jobs.append(
                    self.__parse_job_detail_page(browser, url, selectors, processors))
            return jobs
        finally:
            # Always release the browser, even when a page fails to load or
            # parse; otherwise every failed run leaves a browser process behind.
            browser.quit()

    def __parse_job_detail_page(self, browser, url, field_css_selectors, field_element_processors=None):
        """Open `url` and extract every configured field from the page.

        Args:
          browser: the browser used to load the page.
          url: address of the job detail page.
          field_css_selectors: dict of field name -> CSS selector. A falsy
            selector means the field is not available; it is stored as None.
          field_element_processors: optional dict of field name -> callable
            that turns the matched element into the stored value.

        Returns:
          A dict holding "url" plus one entry per field name.
        """
        browser.get(url)
        processors = field_element_processors or {}

        job = {"url": url}
        for name, css_selector in field_css_selectors.items():
            if not css_selector:
                job[name] = None
                continue

            job[name] = self.__parse_field_content(
                browser, css_selector, processors.get(name))

        return job

    def __parse_field_content(self, browser, field_css_selector, element_text_processor=None):
        """Return the processed content of the element matching a selector.

        Returns None (after printing a notice) when no element matches.
        Without a processor the element's `text` is returned.
        """
        try:
            element = browser.find_element_by_css_selector(field_css_selector)
        except NoSuchElementException:
            print("can not find the element with css selector: [%s] in: %s" % (field_css_selector, browser.current_url))
            return None

        if element_text_processor is None:
            return element.text

        return element_text_processor(element)

In [63]:
def list_shixiseng_job_details_page_urls(browser):
    """Collect the detail-page links shown on the page the browser is on.

    For every element matching `.jib_inf_inf`, the `href` attribute of its
    `.under_ani_jobname` descendant is taken.

    Args:
      browser: the browser object, already showing the listing page.

    Returns:
      A list of href values, in page order.
    """
    job_cards = browser.find_elements_by_css_selector('.jib_inf_inf')
    return [
        card.find_element_by_css_selector(".under_ani_jobname").get_attribute("href")
        for card in job_cards
    ]

# Field name -> CSS selector used to locate that field on a job detail page.
shixisheng_css_selectors = dict(
    job_name='.jb_det_left .job_name',
    updated_date='.jb_det_left .update_time',
    location='.jb_det_left .city',
    deadline='.closing_date + .date',
    company='.jb_det_right .jb_det_right_top > a + p',
    industry='.jb_det_right .jb_det_right_top .domain',
)

def update_date_processor(element):
    """Return the element text without its last two characters.

    Text of two characters or fewer is returned unchanged.
    (Presumably the two trailing characters are a label after the
    timestamp - verify against the page markup.)
    """
    text = element.text
    return text if len(text) <= 2 else text[:-2]
    
def industry_processor(element):
    """Split the element text on "," and return the pieces as a list."""
    return element.text.split(",")


from collections import namedtuple

# Field name -> callable that converts the matched element into the stored value.
post_processors = {
    'updated_date': update_date_processor,
    'industry': industry_processor,
}

# JobCrawler.start() takes ONE parser object and reads these four attributes
# from it; passing the settings positionally raises a TypeError, so they are
# bundled into a small record first.
SiteSettings = namedtuple(
    "SiteSettings",
    ["site_url", "list_detail_page_urls_fn", "field_css_selectors", "field_element_processors"])

shixiseng_site = SiteSettings(
    site_url='http://www.shixiseng.com',
    list_detail_page_urls_fn=list_shixiseng_job_details_page_urls,
    field_css_selectors=shixisheng_css_selectors,
    field_element_processors=post_processors)

crawler = JobCrawler()

jobs = crawler.start(shixiseng_site)

print(jobs)


[{'updated_date': u'2016-09-20 14:22:52', 'url': u'http://www.shixiseng.com/intern/inn_acy6tmvojksi', 'industry': [u'\u8f6f\u4ef6', u'\u786c\u4ef6'], 'deadline': u'2016-10-20', 'location': u'\u4e0a\u6d77 \u4e28', 'company': u'IBM', 'job_name': u'OpenStack \u5f00\u53d1-Java / C /C+'}, {'updated_date': u'2016-09-20 14:32:17', 'url': u'http://www.shixiseng.com/intern/inn_l6rgp7ie6imr', 'industry': [u'\u4e92\u8054\u7f51'], 'deadline': u'2016-10-20', 'location': u'\u5317\u4eac \u4e28', 'company': u'\u817e\u8baf', 'job_name': u'\u4ea7\u54c1\u5b9e\u4e60\u751f'}, {'updated_date': u'2016-09-21 08:30:32', 'url': u'http://www.shixiseng.com/intern/inn_3to1j8g1gxle', 'industry': [u'\u4f01\u4e1a\u670d\u52a1'], 'deadline': u'2016-10-13', 'location': u'\u4e0a\u6d77 \u4e28', 'company': u'CEB', 'job_name': u'\u9879\u76ee\u6280\u672f\u652f\u6301'}, {'updated_date': u'2016-09-20 11:53:15', 'url': u'http://www.shixiseng.com/intern/inn_qzbmflvidhon', 'industry': [u'\u80fd\u6e90', u'\u5316\u5b66\u5316\u5de5'

In [87]:
def list_qiaobutang_job_details_page_urls(browser, max_urls=10):
    """Collect detail-page links from the page the browser is on.

    Args:
      browser: the browser object, already showing the listing page.
      max_urls: maximum number of links to return (default 10, the limit
        that was previously hard-coded). Pass None for no limit.

    Returns:
      A list with the `href` of the first `max_urls` matching links.
    """
    links = browser.find_elements_by_css_selector(".job__tab.job__tab_top .job__item a.job__title")

    # Trim before reading attributes so no `href` is fetched from the browser
    # only to be thrown away.
    return [link.get_attribute("href") for link in links[:max_urls]]

# Field name -> CSS selector on a job detail page. An empty selector marks a
# field with no selector configured; the crawler stores None for it.
qiaobutang_css_selectors = dict(
    job_name='.job-intro .job-intro__title',
    updated_date='.job-intro .job-intro__bottom  .job-intro__bottom_right .job-intro__info_content',
    location='.job-info .job-info-addr',
    deadline='',
    company='.job-intro .job-subtitle',
    industry='.job-sidebar .job-sidebar__company_bottom .job-sidebar__slogan .job-require',
)

from collections import namedtuple

# JobCrawler.start() takes ONE parser object and reads these four attributes
# from it; passing the settings positionally raises a TypeError, so they are
# bundled into a small record first.
SiteSettings = namedtuple(
    "SiteSettings",
    ["site_url", "list_detail_page_urls_fn", "field_css_selectors", "field_element_processors"])

qiaobutang_site = SiteSettings(
    site_url='http://job.qiaobutang.com/',
    list_detail_page_urls_fn=list_qiaobutang_job_details_page_urls,
    field_css_selectors=qiaobutang_css_selectors,
    field_element_processors=None)  # no per-field post-processing for this site

crawler = JobCrawler()
jobs = crawler.start(qiaobutang_site)

print(jobs)


can not find the element with css selector: [.job-sidebar .job-sidebar__company_bottom .job-sidebar__slogan .job-require] in: http://job.qiaobutang.com/1007714
[{'updated_date': u'2016-09-18', 'url': u'http://job.qiaobutang.com/1073591', 'industry': u'\u4e92\u8054\u7f51', 'deadline': None, 'location': u'\u4e0a\u6d77-\u9ec4\u6d66', 'company': u'\u5c0f\u7ea2\u4e66', 'job_name': u'\u7ba1\u7406\u57f9\u8bad\u751f'}, {'updated_date': u'2016-09-13', 'url': u'http://job.qiaobutang.com/1044595', 'industry': u'\u9152\u5e97', 'deadline': None, 'location': u'\u5168\u56fd', 'company': u'\u5982\u5bb6\u9152\u5e97', 'job_name': u'\u8fd0\u8425\u7ba1\u7406\u57f9\u8bad\u751f'}, {'updated_date': u'2016-09-18', 'url': u'http://job.qiaobutang.com/1027071', 'industry': u'\u4e0a\u5e02\u516c\u53f8', 'deadline': None, 'location': u'\u6e56\u5317', 'company': u'\u4e2d\u56fd\u7535\u5efa', 'job_name': u'\u5b9e\u4e60\u751f'}, {'updated_date': u'2016-09-18', 'url': u'http://job.qiaobutang.com/1007714', 'industry': No

In [70]:
# Show the first job's industry as readable text. The call form of print
# matches the other cells and is valid on both Python 2 and Python 3.
print(jobs[0]['industry'])

互联网


In [39]:
# Check that splitting a unicode string on an ASCII comma yields the
# separate pieces. The call form of print is valid on Python 2 and 3.
txt = u'软件,硬件'
print(txt.split(","))

[u'\u8f6f\u4ef6', u'\u786c\u4ef6']


In [108]:
import abc

class JobSiteParser(object):
    """Base class describing how to scrape one job site.

    The public read-only properties delegate to underscore-prefixed hooks.
    Sub-classes must override `_list_detail_page_urls` and
    `_get_field_css_selectors`; `_get_field_element_processors` is optional.
    """

    def __init__(self, url):
        self._url = url

    @property
    def site_url(self):
        """The URL given to the constructor."""
        return self._url

    @property
    def list_detail_page_urls_fn(self):
        """Callable taking a browser and returning the detail page URLs."""
        return self._list_detail_page_urls

    @property
    def field_css_selectors(self):
        """Dict of field name -> CSS selector, as built by the sub-class."""
        return self._get_field_css_selectors()

    @property
    def field_element_processors(self):
        """Dict of field name -> element processor, or None when unset."""
        return self._get_field_element_processors()

    # `abstractmethod` (not `abstractproperty`) is the right marker here:
    # these hooks are called with arguments, so they must stay plain methods.
    # The class does not use ABCMeta, so the explicit NotImplementedError is
    # what actually signals a missing override.
    @abc.abstractmethod
    def _list_detail_page_urls(self, browser):
        """Method that retrieves all detail page urls.

        Expected to be overridden by sub-classes.

        Args:
          browser: the 'Browser' object.

        Returns:
          The list of all urls
        """
        raise NotImplementedError(
            "%s must override _list_detail_page_urls" % type(self).__name__)

    @abc.abstractmethod
    def _get_field_css_selectors(self):
        """Method that gets a dictionary of field to its css selector in the web page.

        Expected to be overridden by sub-classes.

        Returns:
          The dict of field to css selector
        """
        raise NotImplementedError(
            "%s must override _get_field_css_selectors" % type(self).__name__)

    def _get_field_element_processors(self):
        """Return the dict of field to element processor; None by default."""
        return None



In [109]:
class ShixisengSiteParser(JobSiteParser):
    """JobSiteParser configured with the Shixiseng selectors and processors."""

    def __init__(self, url):
        # Previously spelled `__init`, which Python name-mangles into an
        # ordinary private method that is never run as the constructor.
        super(ShixisengSiteParser, self).__init__(url=url)

    def _get_field_css_selectors(self):
        """Return the dict of field name -> CSS selector on a detail page."""
        return {
            "job_name": '.jb_det_left .job_name',
            "updated_date": '.jb_det_left .update_time',
            "location": '.jb_det_left .city',
            "deadline": '.closing_date + .date',
            "company": '.jb_det_right .jb_det_right_top > a + p',
            "industry": '.jb_det_right .jb_det_right_top .domain',
        }

    def _list_detail_page_urls(self, browser):
        """Return the `href` of the `.under_ani_jobname` link in each `.jib_inf_inf` element."""
        job_cards = browser.find_elements_by_css_selector('.jib_inf_inf')
        return [
            card.find_element_by_css_selector(".under_ani_jobname").get_attribute("href")
            for card in job_cards
        ]

    def _get_field_element_processors(self):
        """Return the dict of field name -> element processor for this site."""
        return {
            'updated_date': self._update_date_processor,
            'industry': self._industry_processor,
        }

    def _update_date_processor(self, element):
        """Return the element text minus its last two characters.

        Text of two characters or fewer is returned unchanged.
        """
        raw_date = element.text
        return raw_date[:-2] if len(raw_date) > 2 else raw_date

    def _industry_processor(self, element):
        """Split the element text on "," into a list."""
        return element.text.split(",")
    

In [111]:
class QiaobutangSiteParser(JobSiteParser):
    """JobSiteParser configured with the Qiaobutang selectors."""

    # Upper bound on how many detail pages are visited per crawl.
    MAX_DETAIL_URLS = 10

    def __init__(self, url):
        # Previously spelled `__init`, which Python name-mangles into an
        # ordinary private method that is never run as the constructor.
        super(QiaobutangSiteParser, self).__init__(url=url)

    def _get_field_css_selectors(self):
        """Return the dict of field name -> CSS selector on a detail page.

        The empty "deadline" selector marks a field with no selector configured.
        """
        return {
            "job_name": '.job-intro .job-intro__title',
            "updated_date": '.job-intro .job-intro__bottom  .job-intro__bottom_right .job-intro__info_content',
            "location": '.job-info .job-info-addr',
            "deadline": '',
            "company": '.job-intro .job-subtitle',
            "industry": '.job-sidebar .job-sidebar__company_bottom .job-sidebar__slogan .job-require',
        }

    def _list_detail_page_urls(self, browser):
        """Return the `href` of the first MAX_DETAIL_URLS job title links."""
        links = browser.find_elements_by_css_selector(".job__tab.job__tab_top .job__item a.job__title")

        # Trim before reading attributes so no `href` is fetched from the
        # browser only to be thrown away.
        return [link.get_attribute("href") for link in links[:self.MAX_DETAIL_URLS]]

In [112]:
# Alternative target - swap in to crawl Shixiseng instead:
# shixisheng_parser = ShixisengSiteParser("http://www.shixiseng.com")
# crawler = JobCrawler()
# jobs = crawler.start(shixisheng_parser)
# print(jobs)

# Named after the site it actually describes (it used to be called
# `shixisheng_parser` although it holds the Qiaobutang parser).
qiaobutang_parser = QiaobutangSiteParser("http://job.qiaobutang.com/")

crawler = JobCrawler()

jobs = crawler.start(qiaobutang_parser)

print(jobs)

can not find the element with css selector: [.job-sidebar .job-sidebar__company_bottom .job-sidebar__slogan .job-require] in: http://job.qiaobutang.com/1007714
[{'updated_date': u'2016-09-18', 'url': u'http://job.qiaobutang.com/1073591', 'industry': u'\u4e92\u8054\u7f51', 'deadline': None, 'location': u'\u4e0a\u6d77-\u9ec4\u6d66', 'company': u'\u5c0f\u7ea2\u4e66', 'job_name': u'\u7ba1\u7406\u57f9\u8bad\u751f'}, {'updated_date': u'2016-09-13', 'url': u'http://job.qiaobutang.com/1044595', 'industry': u'\u9152\u5e97', 'deadline': None, 'location': u'\u5168\u56fd', 'company': u'\u5982\u5bb6\u9152\u5e97', 'job_name': u'\u8fd0\u8425\u7ba1\u7406\u57f9\u8bad\u751f'}, {'updated_date': u'2016-09-18', 'url': u'http://job.qiaobutang.com/1027071', 'industry': u'\u4e0a\u5e02\u516c\u53f8', 'deadline': None, 'location': u'\u6e56\u5317', 'company': u'\u4e2d\u56fd\u7535\u5efa', 'job_name': u'\u5b9e\u4e60\u751f'}, {'updated_date': u'2016-09-18', 'url': u'http://job.qiaobutang.com/1007714', 'industry': No

In [None]:
import pymongo

In [None]:
from pymongo import MongoClient

# Client for the MongoDB server addressed by this URI.
client = MongoClient(host='mongodb://localhost:27017/')

In [None]:
import datetime

db = client['myTest']
collection = db['mycollection']

# Sample document used to try out a single insert.
post = {"author": "Mike",
        "text": "My first blog post!",
        "tags": ["mongodb", "python", "pymongo"],
        "date": datetime.datetime.utcnow()}

post_id = collection.insert_one(post).inserted_id
# Call form of print: valid on both Python 2 and Python 3.
print(post_id)

In [None]:
# List the collection names in `db`, asking the driver to leave out system
# collections. As the last expression of the cell, the result is displayed.
# NOTE(review): `collection_names` appears to be deprecated in newer PyMongo
# releases in favour of `list_collection_names` - confirm against the
# installed version before changing it.
db.collection_names(include_system_collections=False)

In [None]:
from pymongo import MongoClient

class Repository(object):
    """Small convenience wrapper around one MongoDB collection.

    The wrapped collection is stored in `self.coll`.
    """

    def __init__(self, database, collection, url='mongodb://localhost:27017/'):
        """Bind to `collection` inside `database` on the server at `url`.

        Raises:
          Exception: whatever the driver raised while creating the client or
            resolving the collection. It is reported, then re-raised.
        """
        try:
            conn = MongoClient(url)
            coll = conn[database][collection]
        except Exception as e:
            print("Could not connect to MongoDB: %s" % e)
            # Re-raise instead of falling through: `coll` was never bound, so
            # continuing would only fail with an unrelated NameError.
            raise
        else:
            # NOTE(review): recent PyMongo clients connect lazily, so this
            # may only mean the client object was created - confirm.
            print("Connected successfully!!!")

        self.coll = coll

    def insert(self, doc):
        """Insert one document, or every document of a list/tuple.

        Returns:
          The driver's insert result.
        """
        if isinstance(doc, (list, tuple)):
            return self.coll.insert_many(doc)
        return self.coll.insert_one(doc)

    def find(self, query):
        """Return the cursor the collection produces for `query`."""
        return self.coll.find(query)

    def update(self, query, update_doc):
        """Set the fields of `update_doc` on the first document matching `query`.

        Returns:
          The driver's update result.
        """
        # Must go to the collection (calling self.update recursed forever) and
        # "$set" must be a dict key - `{"$set", doc}` is a set literal.
        return self.coll.update_one(query, {"$set": update_doc})

    def remove(self, query):
        """Delete every document matching `query`.

        Returns:
          The driver's delete result.
        """
        # Must go to the collection; calling self.remove recursed forever.
        return self.coll.delete_many(query)

In [None]:
# Repository bound to the "job" collection of the "jobs1" database.
repo = Repository(database="jobs1", collection="job")

In [None]:
# Store one sample document through the repository.
repo.insert(dict(name="david"))

In [None]:
# `find` hands back a cursor; turn it into a list so the matching documents
# are printed instead of the cursor object itself.
job = list(repo.find({"name" : "david"}))
print(job)