Skip to content

Commit

Permalink
nlp support for eventhorizon
Browse files Browse the repository at this point in the history
  • Loading branch information
vlad73 committed Dec 12, 2010
1 parent 9e55ed5 commit 83b1a0b
Show file tree
Hide file tree
Showing 12 changed files with 252 additions and 0 deletions.
Empty file added eventhorizon2/__init__.py
Empty file.
Binary file added eventhorizon2/__init__.pyc
Binary file not shown.
Empty file.
39 changes: 39 additions & 0 deletions eventhorizon2/learning/models.py
@@ -0,0 +1,39 @@
from django.db import models
from django.utils.translation import ugettext_lazy as _

class Text(models.Model):
    """Model holding a piece of document text in a single character field."""

    text = models.CharField(
        verbose_name=_('text'),
        max_length=10000,
        help_text=_('document text'),
    )

    class Meta:
        verbose_name_plural = _('text')
        verbose_name = _('text')

    def __unicode__(self):
        # The unicode representation is the stored text itself.
        return self.text

class Document(models.Model):
    """Per-discussion document record together with its keyword data.

    ``discussion_id`` is a plain ``IntegerField`` (not a ``ForeignKey``) and
    ``keywords`` is a character field of at most 1000 characters.
    """

    discussion_id = models.IntegerField()
    keywords = models.CharField(_('keywords'), max_length=1000, help_text=_('document keywords'))

    class Meta:
        verbose_name = _('document')
        verbose_name_plural = _('documents')

    def __unicode__(self):
        # The model defines no ``discussion_name`` attribute, so referring to
        # it raised AttributeError; label the row by the id it does store.
        return u'%s' % self.discussion_id

class Term(models.Model):
    """A term string plus a counter of documents (``num_docs``, default 0)."""

    num_docs = models.IntegerField(default=0)
    term = models.CharField(
        verbose_name=_('term'),
        max_length=50,
        help_text=_('term'),
    )

    class Meta:
        verbose_name_plural = _('terms')
        verbose_name = _('term')

    def __unicode__(self):
        # The unicode representation is the term text.
        return self.term


class Document_Term(models.Model):
    """Row pairing a document id with a term id and a float weight ``tf``.

    Both ids are plain integer columns rather than foreign keys.
    """

    document_id = models.IntegerField()
    term_id = models.IntegerField()
    # A default lets a row be created from the two ids alone (as
    # add_document_part does) without violating the non-null column; the
    # real frequency is assigned afterwards.
    tf = models.FloatField(default=0.0)
17 changes: 17 additions & 0 deletions eventhorizon2/learning/nlp.py
@@ -0,0 +1,17 @@
import nltk
from nltk.stem.porter import PorterStemmer

__author__ = 'vlad'


def clean_text(text, language='english'):
    """Tokenize *text*, drop stopwords and return the stemmed tokens.

    Args:
        text: the raw text to process.
        language: name of the NLTK stopword list to filter with.

    Returns:
        A list of lower-cased, Porter-stemmed tokens in their original
        order. Duplicates are kept so callers can count term frequencies.
    """
    # Build the stopword set once for O(1) membership tests.
    stopwords = set(nltk.corpus.stopwords.words(language))
    stemmer = PorterStemmer()

    content = []
    for token in nltk.word_tokenize(text):
        lowered = token.lower()
        # Filter BEFORE stemming: the stopword list holds unstemmed words, so
        # a stemmed form would no longer match its entry.
        if lowered in stopwords:
            continue
        # PorterStemmer.stem works on one word at a time, not on a list.
        content.append(stemmer.stem(lowered))
    return content





23 changes: 23 additions & 0 deletions eventhorizon2/learning/tests.py
@@ -0,0 +1,23 @@
"""
This file demonstrates two different styles of tests (one doctest and one
unittest). These will both pass when you run "manage.py test".
Replace these with more appropriate tests for your application.
"""

from django.test import TestCase

class SimpleTest(TestCase):
    """Placeholder unittest-style test case for the app."""

    def test_basic_addition(self):
        """
        Tests that 1 + 1 always equals 2.
        """
        total = 1 + 1
        self.assertEqual(total, 2)

# Doctest-style counterpart of SimpleTest: a name mapped to text containing
# a ``>>>`` example and its expected output.
__test__ = {"doctest": """
Another way to test that 1 + 1 is equal to 2.
>>> 1 + 1 == 2
True
"""}

1 change: 1 addition & 0 deletions eventhorizon2/learning/views.py
@@ -0,0 +1 @@
# Create your views here.
11 changes: 11 additions & 0 deletions eventhorizon2/manage.py
@@ -0,0 +1,11 @@
#!/usr/bin/env python
# Command-line entry point: passes the project's settings module to Django's
# management runner.
import sys

from django.core.management import execute_manager

try:
    import settings  # Assumed to be in the same directory.
except ImportError:
    # No importable settings module: report the problem and exit non-zero.
    sys.stderr.write("Error: Can't find the file 'settings.py' in the directory containing %r. It appears you've customized things.\nYou'll have to run django-admin.py, passing it your settings module.\n(If the file settings.py does indeed exist, it's causing an ImportError somehow.)\n" % __file__)
    sys.exit(1)

if __name__ == "__main__":
    execute_manager(settings)
49 changes: 49 additions & 0 deletions eventhorizon2/services/add_document_part.py
@@ -0,0 +1,49 @@
from eventhorizon2.learning.models import Document, Term, Document_Term
from eventhorizon2.learning.nlp import clean_text
import math
from operator import itemgetter
import simplejson

__author__ = 'vlad'

# Minimum tf-idf score a term must exceed to be stored as a keyword.
TF_IDF_THRESHOLD = 1.5


def add_document_part(discussion_id, text):
    """Index *text* for a discussion and merge new keywords into its document.

    The text is cleaned into terms. For every distinct term the
    ``Term`` / ``Document_Term`` rows are created or updated, with ``tf`` set
    to the term's share of this text's tokens. Terms whose tf-idf score
    exceeds ``TF_IDF_THRESHOLD`` are merged into the document's JSON keyword
    mapping, overwriting any earlier score for the same term.

    Args:
        discussion_id: integer id used to look up or create the Document.
        text: raw text to index.

    Returns:
        None. The results are persisted through the models.
    """
    # Tolerate a cleaner that yields nothing for this text.
    terms = clean_text(text) or []

    # get_or_create returns an (object, created) pair, not the object alone.
    document, _created = Document.objects.get_or_create(discussion_id=discussion_id)
    number_documents = get_number_documents()

    # Count each term in a single pass instead of list.count() per term.
    occurrences = {}
    for term in terms:
        occurrences[term] = occurrences.get(term, 0) + 1
    total_terms = float(len(terms))

    keywords = {}
    for term, count in occurrences.items():
        tf = count / total_terms
        term_row, _created = Term.objects.get_or_create(term=term)

        links = list(Document_Term.objects.filter(
            document_id=document.id, term_id=term_row.id)[:1])
        if links:
            link = links[0]
            link.tf = tf
            link.save()
        else:
            # First time this term is seen in this document.
            term_row.num_docs += 1
            term_row.save()
            # tf is supplied up front: the column has no default in the model.
            Document_Term.objects.create(
                document_id=document.id, term_id=term_row.id, tf=tf)

        if term_row.num_docs <= 0:
            # Inconsistent counter; skip rather than divide by zero.
            continue
        idf = math.log(float(number_documents) / term_row.num_docs)
        score = idf * tf
        if score > TF_IDF_THRESHOLD:
            keywords[term] = score

    # A freshly created document has an empty keywords string, which is not
    # valid JSON; start from an empty mapping in that case.
    if document.keywords:
        stored = simplejson.loads(document.keywords)
    else:
        stored = {}
    stored.update(keywords)

    document.keywords = simplejson.dumps(stored)
    document.save()



def get_number_documents():
    """Return the number of Document rows."""
    total = Document.objects.count()
    return total
96 changes: 96 additions & 0 deletions eventhorizon2/settings.py
@@ -0,0 +1,96 @@
# Django settings for eventhorizon2 project.

# Debug switches: TEMPLATE_DEBUG mirrors whatever DEBUG is set to.
DEBUG = True
TEMPLATE_DEBUG = DEBUG

# No admins are configured; only the commented-out example entry is present.
ADMINS = (
    # ('Your Name', 'your_email@domain.com'),
)

# MANAGERS reuses the ADMINS tuple.
MANAGERS = ADMINS

# NOTE(review): ENGINE is still the bare 'django.db.backends.' prefix and NAME
# is empty, so a backend from the list below presumably has to be filled in
# before the database can be used -- confirm.
DATABASES = {
    'default': {
        'ENGINE': 'django.db.backends.',  # Add 'postgresql_psycopg2', 'postgresql', 'mysql', 'sqlite3' or 'oracle'.
        'NAME': '',  # Or path to database file if using sqlite3.
        'USER': '',  # Not used with sqlite3.
        'PASSWORD': '',  # Not used with sqlite3.
        'HOST': '',  # Set to empty string for localhost. Not used with sqlite3.
        'PORT': '',  # Set to empty string for default. Not used with sqlite3.
    }
}

# --- Locale, site and media settings ---------------------------------------

# Local time zone for this installation. Choices can be found here:
# http://en.wikipedia.org/wiki/List_of_tz_zones_by_name
# although not all choices may be available on all operating systems.
# On Unix systems, a value of None will cause Django to use the same
# timezone as the operating system.
# If running in a Windows environment this must be set to the same as your
# system time zone.
TIME_ZONE = 'America/Chicago'

# Language code for this installation. All choices can be found here:
# http://www.i18nguy.com/unicode/language-identifiers.html
LANGUAGE_CODE = 'en-us'

SITE_ID = 1

# If you set this to False, Django will make some optimizations so as not
# to load the internationalization machinery.
USE_I18N = True

# If you set this to False, Django will not format dates, numbers and
# calendars according to the current locale.
USE_L10N = True

# Absolute path to the directory that holds media (currently unset).
# Example: "/home/media/media.lawrence.com/"
MEDIA_ROOT = ''

# URL that handles the media served from MEDIA_ROOT (currently unset). Make
# sure to use a trailing slash if there is a path component (optional in
# other cases).
# Examples: "http://media.lawrence.com", "http://example.com/media/"
MEDIA_URL = ''

# URL prefix for admin media -- CSS, JavaScript and images. Make sure to use a
# trailing slash.
# Examples: "http://foo.com/media/", "/media/".
ADMIN_MEDIA_PREFIX = '/media/'

# Make this unique, and don't share it with anybody.
# NOTE(review): the key is hard-coded in a committed file; consider loading it
# from outside the repository and rotating this value.
SECRET_KEY = 'f6t8s7qdlx^2m+6*%&6jg@4eom+4*&6i4r*2q=f1#88dz8+i9='

# List of callables that know how to import templates from various sources.
# The eggs loader is listed but commented out.
TEMPLATE_LOADERS = (
    'django.template.loaders.filesystem.Loader',
    'django.template.loaders.app_directories.Loader',
    # 'django.template.loaders.eggs.Loader',
)

# Middleware classes enabled for the project, in the order listed.
MIDDLEWARE_CLASSES = (
    'django.middleware.common.CommonMiddleware',
    'django.contrib.sessions.middleware.SessionMiddleware',
    'django.middleware.csrf.CsrfViewMiddleware',
    'django.contrib.auth.middleware.AuthenticationMiddleware',
    'django.contrib.messages.middleware.MessageMiddleware',
)

# Dotted path of the project's root URL configuration module.
ROOT_URLCONF = 'eventhorizon2.urls'

# No template directories are configured yet.
TEMPLATE_DIRS = (
    # Put strings here, like "/home/html/django_templates" or "C:/www/django/templates".
    # Always use forward slashes, even on Windows.
    # Don't forget to use absolute paths, not relative paths.
)

# Applications enabled for this project. The project's own ``learning`` app
# must be listed so that its models (Text, Document, Term, Document_Term)
# are installed and get database tables.
INSTALLED_APPS = (
    'django.contrib.auth',
    'django.contrib.contenttypes',
    'django.contrib.sessions',
    'django.contrib.sites',
    'django.contrib.messages',
    # Uncomment the next line to enable the admin:
    # 'django.contrib.admin',
    # Uncomment the next line to enable admin documentation:
    # 'django.contrib.admindocs',
    'eventhorizon2.learning',
)
Binary file added eventhorizon2/settings.pyc
Binary file not shown.
16 changes: 16 additions & 0 deletions eventhorizon2/urls.py
@@ -0,0 +1,16 @@
from django.conf.urls.defaults import *

# Uncomment the next two lines to enable the admin:
# from django.contrib import admin
# admin.autodiscover()

# URL configuration: patterns() is called with an empty prefix and no
# active entries -- every route below is a commented-out example.
urlpatterns = patterns('',
    # Example:
    # (r'^eventhorizon2/', include('eventhorizon2.foo.urls')),

    # Uncomment the admin/doc line below to enable admin documentation:
    # (r'^admin/doc/', include('django.contrib.admindocs.urls')),

    # Uncomment the next line to enable the admin:
    # (r'^admin/', include(admin.site.urls)),
)

0 comments on commit 83b1a0b

Please sign in to comment.