Skip to content

Commit

Permalink
initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
edsu committed Nov 21, 2011
0 parents commit 8655a72
Show file tree
Hide file tree
Showing 16 changed files with 629 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -0,0 +1,2 @@
mboxes/*
*.pyc
Empty file added jobs/__init__.py
Empty file.
Empty file added jobs/management/__init__.py
Empty file.
Empty file.
40 changes: 40 additions & 0 deletions jobs/management/commands/load_mboxes.py
@@ -0,0 +1,40 @@
import os
import re
import urllib
import mailbox

from django.conf import settings
from django.core.management.base import BaseCommand

from jobs.models import JobEmail

mbox_dir = os.path.join(settings.PROJECT_DIR, "mboxes")

class Command(BaseCommand):
    """Management command that loads job postings from the local mbox files."""

    def handle(self, *args, **options):
        """Walk every message of every mbox and store the ones that are jobs.

        The content type of each message is printed as it is visited, and a
        "loaded ..." line is printed for every message that was stored.
        """
        for mbox_path in mboxes():
            for message in mailbox.mbox(mbox_path):
                print(message['content-type'])
                job_email = JobEmail.new_from_msg(message)
                if not job_email:
                    # not a job posting, a duplicate, or no usable body
                    continue
                print("loaded %s" % job_email)

def mboxes():
    """Yield the full path of each mbox file in the mbox directory.

    When the directory does not exist yet it is created and the archives are
    downloaded into it before anything is yielded.
    """
    directory_missing = not os.path.isdir(mbox_dir)
    if directory_missing:
        os.mkdir(mbox_dir)
        download_mboxes()

    for entry in os.listdir(mbox_dir):
        if not entry.endswith("mbox"):
            continue
        yield os.path.join(mbox_dir, entry)

def download_mboxes():
    """Fetch the yearly code4lib mbox archives (2004 through 2011).

    Each archive is saved in the mbox directory as code4lib-<year>.mbox and
    a progress line is printed for every download.
    """
    print("downloading code4lib mboxes")
    url = "http://serials.infomotions.com/code4lib/etc/mboxes/code4lib-%s.mbox"
    fetcher = urllib.URLopener()
    for year in range(2004, 2012):
        source = url % year
        target = os.path.join(mbox_dir, "code4lib-%s.mbox" % year)
        print("saving %s as %s" % (source, target))
        fetcher.retrieve(source, target)

10 changes: 10 additions & 0 deletions jobs/management/commands/nnp.py
@@ -0,0 +1,10 @@
from django.core.management.base import BaseCommand

from jobs.models import JobEmail

class Command(BaseCommand):
    """Management command that prints the proper nouns of all stored emails."""

    def handle(self, *args, **options):
        """Print each proper-noun phrase lowercased and UTF-8 encoded."""
        for job_email in JobEmail.objects.all():
            for phrase in job_email.proper_nouns():
                lowered = phrase.lower()
                print(lowered.encode('utf-8'))
27 changes: 27 additions & 0 deletions jobs/management/commands/pop.py
@@ -0,0 +1,27 @@
import os
import email
import poplib
import logging

from django.conf import settings
from django.core.management.base import BaseCommand

from jobs.models import JobEmail

log = logging.getLogger(__name__)

class Command(BaseCommand):
    """Management command that polls a POP3 mailbox for new job emails."""

    def handle(self, *args, **options):
        """Retrieve every message in the mailbox and store the job postings.

        Connection details come from the POP_SERVER, POP_PORT, POP_USER and
        POP_PASSWORD settings. Messages that JobEmail.new_from_msg() rejects
        are ignored; each stored message is logged.
        """
        log.info("checking for new emails")
        gmail = poplib.POP3_SSL(settings.POP_SERVER, settings.POP_PORT)
        try:
            gmail.user(settings.POP_USER)
            gmail.pass_(settings.POP_PASSWORD)

            num_messages = len(gmail.list()[1])
            # POP3 message numbers start at 1
            for number in range(1, num_messages + 1):
                email_txt = '\n'.join(gmail.retr(number)[1])
                msg = email.message_from_string(email_txt)
                e = JobEmail.new_from_msg(msg)
                if e:
                    log.info("found a new job email: %s", e)
        finally:
            # always sign off so the socket is closed and the server can
            # release the mailbox, even when login or processing fails
            gmail.quit()
103 changes: 103 additions & 0 deletions jobs/models.py
@@ -0,0 +1,103 @@
import re
import time
import codecs
import rfc822
import datetime
import StringIO

import nltk

from django.db import models

class JobEmail(models.Model):
    """An email message that was recognised as a job posting."""

    # display name of the sender, passed through normalize_name()
    from_name = models.CharField(max_length=255)
    # lowercased address of the sender
    from_address = models.CharField(max_length=255)
    # part of from_address after the "@"
    from_domain = models.CharField(max_length=255)
    subject = models.TextField()
    # decoded message text as returned by get_body()
    body = models.TextField()
    # built from the Date header
    sent_time = models.DateTimeField()
    # Message-ID header; used to avoid storing the same message twice
    message_id = models.CharField(max_length=1024)

    def __str__(self):
        return "%s -%s" % (self.from_address, self.subject)

    def proper_nouns(self):
        """Yield each run of consecutive proper nouns in the body.

        A run is made of adjacent tokens that are tagged "NNP" and consist
        only of letters; the tokens of a run are joined with single spaces.
        """
        nouns = []
        for word, tag in self.tags():
            if tag == "NNP" and re.match("^[a-z]+$", word, re.IGNORECASE):
                nouns.append(word)
            elif nouns:
                yield " ".join(nouns)
                nouns = []
        # a run that reaches the end of the body has no following token to
        # trigger the flush above, so emit it here
        if nouns:
            yield " ".join(nouns)

    def tags(self):
        """Return the body as a list of (token, part-of-speech tag) pairs."""
        words = nltk.word_tokenize(self.body)
        return nltk.pos_tag(words)

    @classmethod
    def new_from_msg(klass, msg):
        """Create and save a JobEmail from an email message.

        Returns the saved instance, or None when the message is skipped:
        it is not a job posting, its Message-ID is already stored, the From
        header holds no usable address, the Date header cannot be parsed, or
        no body could be extracted.
        """
        if not is_job(msg):
            return None

        # any existing copy makes this a duplicate, not just exactly one
        if klass.objects.filter(message_id=msg['message-id']).count() > 0:
            return None

        from_name, from_address = rfc822.parseaddr(msg['from'])
        if not from_address or '@' not in from_address:
            # missing or malformed From header: nothing to derive a domain from
            return None

        # parsedate_tz/mktime_tz keep the UTC offset of the Date header, so
        # the stored time is the real instant rather than the sender's
        # wall-clock time reinterpreted in the local zone
        date_tuple = rfc822.parsedate_tz(msg['date'])
        if date_tuple is None:
            return None
        try:
            timestamp = rfc822.mktime_tz(date_tuple)
        except (ValueError, OverflowError):
            # syntactically valid but out-of-range date
            return None

        body = get_body(msg)
        if not body:
            return None

        e = klass()
        e.from_name = normalize_name(from_name or '')
        e.from_address = from_address.lower()
        e.from_domain = e.from_address.split('@')[1]
        e.subject = msg['subject']
        e.message_id = msg['message-id']
        e.body = body
        e.sent_time = datetime.datetime.fromtimestamp(timestamp)

        e.save()
        return e

def normalize_name(name):
    """Turn a comma separated name such as "Last, First" into "First Last".

    The piece after the final comma is moved to the front, every piece is
    stripped of surrounding whitespace and the pieces are joined with single
    spaces. A name without a comma is returned unchanged.
    """
    if ',' not in name:
        return name
    pieces = [piece.strip() for piece in name.split(',')]
    # rotate right by one: the last piece leads, the rest keep their order
    return ' '.join(pieces[-1:] + pieces[:-1])

def is_job(msg):
    """Tell whether a message looks like a job posting, judging by its subject.

    A message qualifies when its subject, compared case-insensitively,
    contains "job" or "position" and does not start with "re:". A message
    with no subject never qualifies.
    """
    raw_subject = msg['subject']
    if not raw_subject:
        return False

    lowered = raw_subject.lower()
    if lowered.startswith('re:'):
        # replies to a posting are not postings themselves
        return False

    return any(keyword in lowered for keyword in ('job', 'position'))

def get_body(msg):
    """Return the decoded text of a message, or None when it has none.

    None is returned when the message declares no charset, declares a
    charset Python has no codec for, or is multipart (there is no single
    payload to decode). Otherwise the payload is first decoded according to
    its Content-Transfer-Encoding (quoted-printable, base64, ...) and then
    from the declared charset. Line endings are kept exactly as they appear
    in the message.
    """
    charset = msg.get_content_charset()
    if not charset:
        return None

    # decode=True undoes the transfer encoding; it yields None for multipart
    payload = msg.get_payload(decode=True)
    if payload is None:
        return None

    try:
        # "replace" keeps a posting with a few stray bytes instead of
        # failing on the whole message
        return payload.decode(charset, "replace")
    except LookupError:
        # unknown charset name
        return None
21 changes: 21 additions & 0 deletions jobs/tests.py
@@ -0,0 +1,21 @@
import email
import unittest

from jobs.models import JobEmail

class JobsTests(unittest.TestCase):
    """Tests for loading and tagging the sample job email."""

    def setUp(self):
        self.loaded = []

    def tearDown(self):
        # new_from_msg() refuses a Message-ID that is already stored, so
        # remove what this test saved or the next test would get None back
        for job_email in self.loaded:
            job_email.delete()

    def load_job_email(self):
        """Parse test-data/job-email and store it through new_from_msg()."""
        with open("test-data/job-email") as f:
            msg = email.message_from_file(f)
        e = JobEmail.new_from_msg(msg)
        if e is not None:
            self.loaded.append(e)
        return e

    def test_email(self):
        e = self.load_job_email()
        self.assertEqual(e.from_address, "cgowing@miami.edu")
        self.assertEqual(e.from_domain, 'miami.edu')
        self.assertEqual(e.from_name, 'Cheryl A. Gowing')
        self.assertEqual(e.subject, '[CODE4LIB] Job Posting: Head of Web & Emerging Technologies, University of Miami - revised')
        self.assertTrue('collaborates' in e.body)
        # assertEqual, not assertTrue: assertTrue would treat the expected
        # value as the failure message and never compare it
        self.assertEqual(e.message_id, '<7933CD19EEFCC94392323A994F6F1EDF01DBB52AE8@MBX03.cgcent.miami.edu>')

    def test_tagging(self):
        e = self.load_job_email()
        tags = e.tags()
        print(tags)
        self.assertTrue(tags)
1 change: 1 addition & 0 deletions jobs/views.py
@@ -0,0 +1 @@
# Create your views here.
Empty file added logs/.keep
Empty file.
14 changes: 14 additions & 0 deletions manage.py
@@ -0,0 +1,14 @@
#!/usr/bin/env python
# Command-line entry point: checks that a settings module is importable
# and, when run as a script, passes it to Django's execute_manager.
from django.core.management import execute_manager
import imp

try:
    # settings.py is assumed to sit in the same directory as this script
    imp.find_module('settings')
except ImportError:
    import sys
    message = (
        "Error: Can't find the file 'settings.py' in the directory "
        "containing %r. It appears you've customized things.\n"
        "You'll have to run django-admin.py, passing it your settings module.\n"
    )
    sys.stderr.write(message % __file__)
    sys.exit(1)

import settings

if __name__ == "__main__":
    execute_manager(settings)
3 changes: 3 additions & 0 deletions requirments.pip
@@ -0,0 +1,3 @@
django
PyYAML
nltk

0 comments on commit 8655a72

Please sign in to comment.