In [1]:
from html.parser import HTMLParser  
from urllib.request import urlopen  
from urllib import parse

# We are going to create a class called LinkParser that inherits some
# methods from HTMLParser which is why it is passed into the definition
class LinkParser(HTMLParser):

    # This is a function that HTMLParser normally has
    # but we are adding some functionality to it
    def handle_starttag(self, tag, attrs):
        # We are looking for the begining of a link. Links normally look
        # like <a href="www.someurl.com"></a>
        if tag == 'a':
            for (key, value) in attrs:
                if key == 'href':
                    # We are grabbing the new URL. We are also adding the
                    # base URL to it. For example:
                    # www.netinstructions.com is the base and
                    # somepage.html is the new URL (a relative URL)
                    #
                    # We combine a relative URL with the base URL to create
                    # an absolute URL like:
                    # www.netinstructions.com/somepage.html
                    newUrl = parse.urljoin(self.baseUrl, value)
                    # And add it to our colection of links:
                    self.links = self.links + [newUrl]

    # This is a new function that we are creating to get links
    # that our spider() function will call
    def getLinks(self, url):
        self.links = []
        # Remember the base URL which will be important when creating
        # absolute URLs
        self.baseUrl = url
        # Use the urlopen function from the standard Python 3 library
        response = urlopen(url)
        # Make sure that we are looking at HTML and not other things that
        # are floating around on the internet (such as
        # JavaScript files, CSS, or .PDFs for example)
        if response.getheader('Content-Type')=='text/html':
            htmlBytes = response.read()
            # Note that feed() handles Strings well, but not bytes
            # (A change from Python 2.x to Python 3.x)
            htmlString = htmlBytes.decode("utf-8")
            self.feed(htmlString)
            return htmlString, self.links
        else:
            return "",[]

# And finally here is our spider. It takes in an URL, a word to find,
# and the number of pages to search through before giving up
def spider(url, word, maxPages):  
    pagesToVisit = [url]
    numberVisited = 0
    foundWord = False
    # The main loop. Create a LinkParser and get all the links on the page.
    # Also search the page for the word or string
    # In our getLinks function we return the web page
    # (this is useful for searching for the word)
    # and we return a set of links from that web page
    # (this is useful for where to go next)
    while numberVisited < maxPages and pagesToVisit != [] and not foundWord:
        numberVisited = numberVisited +1
        # Start from the beginning of our collection of pages to visit:
        url = pagesToVisit[0]
        pagesToVisit = pagesToVisit[1:]
        try:
            print(numberVisited, "Visiting:", url)
            parser = LinkParser()
            data, links = parser.getLinks(url)
            if data.find(word)>-1:
                foundWord = True
                # Add the pages that we visited to the end of our collection
                # of pages to visit:
                pagesToVisit = pagesToVisit + links
                print(" **Success!**")
        except:
            print(" **Failed!**")
    if foundWord:
        print("The word", word, "was found at", url)
    else:
        print("Word never found")

In [12]:
spider("http://www.swissquant.com/en/775/Open-Positions.htm","Specialist",200)

1 Visiting: http://www.swissquant.com/en/775/Open-Positions.htm
Word never found


In [152]:
from bs4 import BeautifulSoup
import requests
import hickle as hkl
import numpy as np
import pandas as pd
import re

In [18]:
url_to_scrape = 'http://www.swissquant.com/en/775/Open-Positions.htm'
r = requests.get(url_to_scrape)
 
soup = BeautifulSoup(r.text, 'lxml')

In [23]:
#print(soup.prettify())

<!DOCTYPE HTML>
<html lang="en">
 <head>
  <meta content="text/html; charset=utf-8" http-equiv="content-type"/>
  <meta content="noodp,noydir" name="robots"/>
  <base href="http://www.swissquant.com/data/"/>
  <!-- this page was produced using the content management system (CMS) PC3 by The Toolpark Corporation AG (www.toolpark.com). License issued to SWISSQUANT -->
  <link href="http://www.swissquant.com/data/styles/en/Hauptstil/reset/all.css?m=1389010974" rel="stylesheet" type="text/css"/>
  <link href="http://www.swissquant.com/data/styles/en/Hauptstil/common/all.css?m=1411022264" rel="stylesheet" type="text/css"/>
  <link href="http://www.swissquant.com/data/styles/en/Hauptstil/design/all.Tablet.css?m=1411022264" rel="stylesheet" type="text/css"/>
  <link href="http://www.swissquant.com/data/styles/en/Hauptstil/navigation/all.css?m=1411022271" rel="stylesheet" type="text/css"/>
  <link href="http://www.swissquant.com/data/styles/en/Hauptstil/utils-lightbox/all.css?m=1411022272" rel=

In [27]:
for link in soup.find_all('a'):
    if 'Career' in link:
        print(link.get('href'))

http://www.swissquant.com/en/689/Career.htm


In [34]:

positions=[]
for name in soup.find_all('h3'):
    positions.append(name.text)

In [35]:
positions

['Stay connected.',
 'Quantitative Research Engineer - Optimization Specialist',
 'Master Thesis',
 'Technical Project Manager',
 'Software Development Internship',
 'Sales and Relationship Manager',
 'Java Software Developer',
 'Quant / Support / DevOps Engineers Maintenance & Services Team',
 'Quant Internship Private Banking Technologies',
 'Quantitative Research Internship',
 'Project Manager / Business Analyst']

In [171]:
url_to_scrape = 'http://www.swissquant.com/en/775/Open-Positions.htm'
r = requests.get(url_to_scrape)
 
soup = BeautifulSoup(r.text, 'lxml')

#testing=pd.read_hdf(r'T:\Job.h5',url_to_scrape.split('/')[2])

positions=dict()
for name in soup.find_all('h3'):
    if url_to_scrape in positions:
        if name.text not in positions[url_to_scrape]:
            positions[url_to_scrape].add(name.text)
                
    else:
        positions[url_to_scrape]=set()
'''        
hdf=pd.HDFStore(r'T:\Job.h5')
save= pd.DataFrame.from_dict(data=positions, orient='index')
#save.columns=range(0,len(positions[url_to_scrape]))
hdf.put(url_to_scrape.split('/')[2], save)
hdf.close() 
'''

"        \nhdf=pd.HDFStore(r'T:\\Job.h5')\nsave= pd.DataFrame.from_dict(data=positions, orient='index')\n#save.columns=range(0,len(positions[url_to_scrape]))\nhdf.put(url_to_scrape.split('/')[2], save)\nhdf.close() \n"

In [170]:
testing.loc[url_to_scrape].str.contains('Master Thesis').sum()


1

In [181]:
positions.keys()

dict_keys(['https://apply.refline.ch/845721/search.html?segment=postDoctoralCandidate', 'http://www.swissquant.com/en/775/Open-Positions.htm'])

In [179]:
url_to_scrape = 'https://apply.refline.ch/845721/search.html?segment=postDoctoralCandidate'
r = requests.get(url_to_scrape)
 
soup = BeautifulSoup(r.text, 'lxml')
for name in soup.find_all('a'):
    if url_to_scrape in positions:
        if name.text not in positions[url_to_scrape]:
            positions[url_to_scrape].add(name.text)
                
    else:
        positions[url_to_scrape]=set()

In [176]:
print(soup.prettify())

<!DOCTYPE html>
<html lang="de">
 <head>
  <meta charset="utf-8"/>
  <title>
   ETH Zürich
  </title>
  <meta content="initial-scale=1.0, maximum-scale=1.0, minimum-scale=1.0, user-scalable=0;" name="viewport"/>
  <meta content="yes" name="apple-mobile-web-app-capable"/>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <meta content="text/html;charset=utf-8" http-equiv="Content-Type"/>
  <meta content="no-cache" http-equiv="cache-control"/>
  <meta content="no-cache" http-equiv="pragma"/>
  <link href="https://cdn.refline.ch/3.0.0/eth/b/browser.min.css" rel="stylesheet"/>
  <script src="https://cdn.refline.ch/3.0.0/sbrowser/jquery.min.js">
  </script>
  <script src="https://cdn.refline.ch/3.0.0/sbrowser/sbrowser.min.js">
  </script>
  <script src="https://cdn.refline.ch/3.0.0/eth/b/browser.js">
  </script>
 </head>
 <body>
  <a name="toplink">
  </a>
  <div class="container searchContainer">
   <div class="header">
    <div class="logo">
     <a class="logoLink" href="http://