Skip to content

Commit

Permalink
switch to FreqDist
Browse files Browse the repository at this point in the history
  • Loading branch information
cyhex committed Dec 20, 2013
1 parent 3f3ce0e commit b022edc
Show file tree
Hide file tree
Showing 7 changed files with 35 additions and 23 deletions.
16 changes: 10 additions & 6 deletions smm/classifier/textprocessing.py
@@ -1,10 +1,12 @@
from nltk import pos_tag
from nltk import corpus
from nltk import PorterStemmer
from nltk import FreqDist
from smm.classifier import emoticons
from smm.classifier.ngrams import bigrams
import re


stopwords = corpus.stopwords.words('english')


Expand All @@ -26,6 +28,10 @@ def getSearchTokens(cls, text):
def getClassifierTokens(cls, text):
    """Split *text* on whitespace; these tokens feed the classifier features."""
    return text.split()

@classmethod
def getFeatures(cls, text):
    """Return a token -> occurrence-count dict for an NLTK classifier.

    Tokenizes *text* via ``cls.getClassifierTokens`` and counts each
    distinct token with ``FreqDist``.
    """
    # FreqDist is a dict-like mapping of token -> count; a plain dict()
    # copy replaces the redundant (k, v) generator round-trip.
    return dict(FreqDist(cls.getClassifierTokens(text)))

class StopWordsMixin():
@classmethod
Expand Down Expand Up @@ -154,6 +160,10 @@ def getSearchTokens(cls, text):
tokes = cls.remove_stop_words(tokes)
return tokes

@classmethod
def getFeatures(cls, text):
    """Return a token -> occurrence-count dict for an NLTK classifier.

    Tokenizes *text* via ``cls.getClassifierTokens`` and counts each
    distinct token with ``FreqDist``.
    """
    # FreqDist is a dict-like mapping of token -> count; a plain dict()
    # copy replaces the redundant (k, v) generator round-trip.
    return dict(FreqDist(cls.getClassifierTokens(text)))

class StopStemmTwitterProcessor(StopTwitterProcessor):
"""
Expand Down Expand Up @@ -206,9 +216,3 @@ def getClassifierTokens(cls, text):
tokens = StopTwitterProcessor.getClassifierTokens(text)
return pos_tag(tokens)


def feature_extractor(text):
    """Build a binary feature dict: every classifier token maps to 1."""
    # Imported lazily ("poor man's delayed import") to avoid a circular
    # import with smm.config at module load time.
    from smm.config import classifier_tokenizer

    tokens = classifier_tokenizer.getClassifierTokens(text)
    return {token: 1 for token in tokens}
3 changes: 1 addition & 2 deletions smm/classifier/worker.py
Expand Up @@ -8,7 +8,6 @@
from smm.models import RawStreamQueue, ClassifiedStream
from smm import config
from smm.classifier import labels
from smm.classifier.textprocessing import feature_extractor
import logging

class ClassifierWorker(Process):
Expand Down Expand Up @@ -51,7 +50,7 @@ def save(self, raw_data):


def get_polarity(self, text):
features = feature_extractor(text)
features = config.classifier_tokenizer.getFeatures(text)

prob = self.classifier.prob_classify(features)

Expand Down
5 changes: 5 additions & 0 deletions tests/textprocessing/test_simpleProcessor.py
Expand Up @@ -21,4 +21,9 @@ def test_getSearchTokens(self):
def test_getClassifierTokens(self):
    """Cleaned sample text tokenizes into the expected word list."""
    cleaned = SimpleProcessor.clean(self.text)
    tokens = SimpleProcessor.getClassifierTokens(cleaned)
    self.assertEqual("hello my name is timor".split(), tokens)

def test_getFeatures(self):
    """Each token of the cleaned sample appears exactly once in the features."""
    features = SimpleProcessor.getFeatures(SimpleProcessor.clean(self.text))
    expected = {'hello': 1, 'my': 1, 'name': 1, 'is': 1, 'timor': 1}
    self.assertEqual(expected, features)
7 changes: 6 additions & 1 deletion tests/textprocessing/test_stopStemmTwitterProcessor.py
Expand Up @@ -12,4 +12,9 @@ def test_getClean(self):
self.assertEqual("the quick brown fox was jumping over the lazy dog #crazyfox @thedog http://fox.com __h__", StopStemmTwitterProcessor.clean(self.text))

def test_getClassifierTokens(self):
    """Stop-word removal plus stemming yields the expected token list."""
    expected = ['quick', 'brown', 'fox', 'jump', 'lazi', 'dog', '#crazyfox', '__h__']
    self.assertEqual(expected, StopStemmTwitterProcessor.getClassifierTokens(self.text))

def test_getFeatures(self):
    """Every stemmed token maps to a count of 1 for the sample tweet."""
    features = StopStemmTwitterProcessor.getFeatures(self.text)
    expected = {'quick': 1, 'brown': 1, 'fox': 1, 'jump': 1,
                'lazi': 1, 'dog': 1, '#crazyfox': 1, '__h__': 1}
    self.assertEqual(expected, features)
11 changes: 5 additions & 6 deletions toolbox/shell-classifier.py
Expand Up @@ -5,9 +5,8 @@
import argparse
import sys

from smm.classifier.textprocessing import feature_extractor
from smm import models

from smm import config

parser = argparse.ArgumentParser(description='Interact directly with Trained data',
usage='python shell-classifier.py myClassifier')
Expand All @@ -32,13 +31,13 @@
try:
while True:
txt = raw_input('Classify: ')

prob = cls.prob_classify(feature_extractor(txt))
classified_label = cls.classify(feature_extractor(txt))
features = config.classifier_tokenizer.getFeatures(txt)
prob = cls.prob_classify(features)
classified_label = cls.classify(features)
print ''
print "Classification: %s with %0.2f%%" % (classified_label, prob.prob(classified_label) * 100)
print ""
cls.explain(feature_extractor(txt))
cls.explain(features)
print '\n'

except (KeyboardInterrupt, SystemExit, EOFError):
Expand Down
6 changes: 3 additions & 3 deletions toolbox/test-accuracy.py
Expand Up @@ -9,10 +9,9 @@
import io
import nltk

from smm.classifier.textprocessing import feature_extractor
from smm.classifier import labels
from smm import models

from smm import config

parser = argparse.ArgumentParser(description='Test accuracy of Trained data',
usage='python test-accuracy.py myClassifier data/testDataSource.csv')
Expand Down Expand Up @@ -40,7 +39,8 @@
for l in f.readlines():
label, text = l.split('\t')
if label in [labels.negative, labels.positive]:
gold.append(((feature_extractor(text), label)))
features = config.classifier_tokenizer.getFeatures(text)
gold.append(((features, label)))


row = models.TrainedClassifiers.objects(name=args.name).first()
Expand Down
10 changes: 5 additions & 5 deletions toolbox/train-classifier.py
Expand Up @@ -8,11 +8,11 @@
import argcomplete
import sys
import io
import mongoengine

from smm.classifier.textprocessing import feature_extractor, TwitterMixin
from smm.classifier.textprocessing import TwitterMixin
from smm import models
from smm.classifier import labels
from smm.config import classifier_tokenizer as tokenizer
from smm import config

parser = argparse.ArgumentParser(description='Classify collected raw tweets', usage='python train-classifier.py myClassifier 1000')
Expand Down Expand Up @@ -70,15 +70,15 @@ def apply_features(row):
else:
continue

featureset.append(( feature_extractor(gloss), label))
featureset.append((tokenizer.getFeatures(gloss), label))

else:


#featureset = nltk.apply_features(apply_features, models.TrainDataRaw.objects(polarity=1)[0:args.size])
#featureset += nltk.apply_features(apply_features, models.TrainDataRaw.objects(polarity=-1)[0:args.size])
featureset = [(feature_extractor(row.text), labels.positive) for row in models.TrainDataRaw.objects(polarity=1).timeout(False)[0:args.size]]
featureset += [(feature_extractor(row.text), labels.negative) for row in models.TrainDataRaw.objects(polarity=-1).timeout(False)[0:args.size]]
featureset = [(tokenizer.getFeatures(row.text), labels.positive) for row in models.TrainDataRaw.objects(polarity=1).timeout(False)[0:args.size]]
featureset += [(tokenizer.getFeatures(row.text), labels.negative) for row in models.TrainDataRaw.objects(polarity=-1).timeout(False)[0:args.size]]

if args.type == 'maxent':
# Train
Expand Down

0 comments on commit b022edc

Please sign in to comment.