Skip to content

Commit

Permalink
Fork of code from ScraperWiki at https://classic.scraperwiki.com/scra…
Browse files Browse the repository at this point in the history
  • Loading branch information
backgroundcheck committed Jun 27, 2016
0 parents commit d0e3e58
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Ignore output of scraper
data.sqlite
39 changes: 39 additions & 0 deletions scraper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#! /usr/bin/env python
# -*- coding: utf-8 -*-
"""Scrape the "basic info" table from LinkedIn company pages.

For each company slug, fetch http://www.linkedin.com/company/<slug>,
pull the <dt>/<dd> pairs out of the "basic-info" section, and save one
row per company (keyed on 'company') via scraperwiki.sql.save.

NOTE(review): LinkedIn aggressively blocks unauthenticated scraping, so
the page may not contain the expected markup at all — the script then
stores a row holding only the company name, matching the original
behaviour.
"""

from bs4 import BeautifulSoup
import random
import sys
import time

import requests

import scraperwiki

# Company page slugs to scrape; extend this list to cover more companies.
COMPANIES = ['google', 'microsoft']

# Seconds before giving up on a request — the original had no timeout and
# could hang indefinitely on a stalled connection.
REQUEST_TIMEOUT = 30


def _extract_basic_info(soup):
    """Return a dict mapping dt-text -> dd-text from the 'basic-info'
    section of a parsed company page, or {} when the section is absent
    (e.g. blocked/redirected page)."""
    grid = soup.find(attrs={"class": "grid-f"})
    if grid is None:
        return {}
    info = grid.find(attrs={"class": "basic-info"})
    if info is None:
        # Layout drift: container present but no basic-info section.
        # The original would have crashed here with AttributeError.
        return {}
    titles = [dt.get_text(strip=True) for dt in info.findAll("dt")]
    values = [dd.get_text(strip=True) for dd in info.findAll("dd")]
    return dict(zip(titles, values))


def main():
    """Fetch every company page, collect its info, and persist all rows."""
    results = {}
    for company in COMPANIES:
        url = "http://www.linkedin.com/company/{}".format(company)
        raw = requests.get(url, timeout=REQUEST_TIMEOUT).content

        # Explicit parser: bare BeautifulSoup(raw) picks whichever parser
        # happens to be installed and warns about it.
        soup = BeautifulSoup(raw, "html.parser")

        record = _extract_basic_info(soup)
        record['company'] = company
        results[company] = record

        # Polite randomised delay so requests are not fired back-to-back.
        time.sleep(random.uniform(5, 10))

    # 'company' is the unique key; one row per scraped company.
    scraperwiki.sql.save(['company'], list(results.values()))


if __name__ == "__main__":
    main()

0 comments on commit d0e3e58

Please sign in to comment.