Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Branch: master
Fetching contributors…

Cannot retrieve contributors at this time

86 lines (77 sloc) 2.841 kB
import util
import time
import json
import requests
from lxml.html import fromstring
from pymongo import Connection
from dateutil.parser import parse as parse_dt
from pprint import pprint
from events import Event
from path import path
from util import collect
# Handle to the `ghsc` database, reached through a pymongo Connection that is
# created with no arguments (i.e. the driver's default settings).
db = Connection().ghsc
# Root directory for static assets, as a `path` object.
STATIC_DIR = path('static')
# Sub-directory of STATIC_DIR used by grab_hackers() for image files.
IMG_DIR = STATIC_DIR.joinpath('imgs')
# URL template for a user's GitHub JSON feed; update() fills in the nick.
github_template = 'http://github.com/%s.json'
# When true, grab_hackers() downloads every image and writes it into IMG_DIR.
GRAB_IMGS = False
def update(n=0, pages=(2, 3, 4)):
    """Fetch GitHub feed pages for the stored hackers and save new events.

    For every document returned by ``db.hackers.find`` the feed URL built
    from ``github_template`` and the hacker's ``nick`` is requested once per
    entry in *pages*.  Every item of the decoded JSON is passed through
    ``Event.classify``; an event for which ``db.events`` holds no document
    with the same ``created_at`` and ``url`` gets its commits fetched and is
    then saved.

    n     -- forwarded as ``limit`` to ``db.hackers.find`` (pymongo treats
             0, the default, as "no limit").
    pages -- iterable of feed page numbers requested for each hacker.  The
             default reproduces the previously hard-coded ``range(2, 5)``,
             which never requests page 1; pass e.g. ``range(1, 5)`` to
             include it.

    Responses that are not valid JSON are printed and skipped.
    """
    # Materialise once so a one-shot iterable can be reused for every hacker.
    pages = tuple(pages)
    for hacker in db.hackers.find(limit=n):
        print(hacker['name'])
        for page in pages:
            stream = github_template % hacker['nick'] + '?page=%s' % page
            print('stream %s' % stream)
            content = requests.get(stream).content
            try:
                stream_json = json.loads(content)
            except ValueError:
                # Body was not JSON: show it for debugging and try the
                # next page.
                print(content)
                continue
            events = [Event.classify(item) for item in stream_json]
            # Pause between successful requests to go easy on the server.
            time.sleep(1)
            print('%s got %s events' % (stream, len(events)))
            for event in events:
                event.created_at = parse_dt(event.created_at)
                already_stored = db.events.find_one(
                    {'created_at': event.created_at, 'url': event.url})
                if already_stored:
                    continue
                # Fetch the commits now; per the original author's note they
                # would otherwise be requested later by `score()` (not
                # visible in this file).
                event.get_commits()
                print(event.__dict__)
                db.events.save(event.__dict__)
@collect
def grab_hackers():
    """Yield one record per hacker found in the saved ``Hacker School.html``.

    The page is parsed with lxml, GitHub profile links and portrait image
    URLs are extracted, and the two lists are paired positionally.  For each
    pair whose GitHub nick is non-empty a dict with the keys ``name``,
    ``nick`` and ``img_path`` is yielded.  When ``GRAB_IMGS`` is true each
    image is also downloaded into ``IMG_DIR``.

    The generator is wrapped by ``collect`` (imported from ``util``, not
    visible here) -- presumably it gathers the yielded dicts into a list;
    confirm against ``util``.
    """
    # Read inside `with` so the file handle is closed as soon as the page
    # has been parsed instead of being left to the garbage collector.
    with open('Hacker School.html') as roster:
        h = fromstring(roster.read())
    h.make_links_absolute('http://hackerschool.com/')
    people = h.xpath('//li[1]//a[contains(@href, "github")]/@href')
    imgs = h.xpath('//li[1]//img[contains(@src, "people")]/@src')
    if not IMG_DIR.exists():
        IMG_DIR.makedirs_p()
    # Drop the 'fleece' portrait (original author's note: "deserter").
    imgs = [img for img in imgs if 'fleece' not in img.lower()]
    pprint(people)
    # Links and images are matched by position only; the cut-off at 57 pairs
    # is the original author's data massage -- NOTE(review): presumably the
    # number of usable entries in the saved page, confirm before changing.
    for github, img in list(zip(people, imgs))[:57]:
        # The nick is whatever follows the last '/' of the profile URL.
        nick = github[github.rfind('/')+1:].strip()
        img_name = img.rsplit('/')[-1]
        img_path = IMG_DIR.joinpath(img_name)
        print(img_path)
        if GRAB_IMGS:
            img_data = requests.get(img).content
            print(len(img_data))
            img_path.write_bytes(img_data)
            time.sleep(0.5)
        if nick:
            # Display name: the part of the URL between its last '/' and its
            # last '_', split on '_' and capitalised word by word.
            raw_name = img[img.rfind('/')+1:img.rfind('_')]
            name = ' '.join(word.capitalize() for word in raw_name.split('_'))
            yield {'name': name,
                   'nick': nick,
                   'img_path': img_path,
                   }
if __name__ == '__main__':
    # Rebuild the hackers collection from scratch on every run, then pull
    # the GitHub events for everyone in it.  The insert is unconditional:
    # the collection has just been dropped, so there is nothing to check.
    db.hackers.drop()
    db.hackers.insert(grab_hackers())
    update()
Jump to Line
Something went wrong with that request. Please try again.