Skip to content

Commit

Permalink
switch to FreqDist
Browse files Browse the repository at this point in the history
  • Loading branch information
cyhex committed Dec 20, 2013
1 parent 3f3ce0e commit b022edc
Show file tree
Hide file tree
Showing 7 changed files with 35 additions and 23 deletions.
16 changes: 10 additions & 6 deletions smm/classifier/textprocessing.py
@@ -1,10 +1,12 @@
from nltk import pos_tag
from nltk import corpus
from nltk import PorterStemmer
from nltk import FreqDist
from smm.classifier import emoticons
from smm.classifier.ngrams import bigrams
import re


stopwords = corpus.stopwords.words('english')


Expand All @@ -26,6 +28,10 @@ def getSearchTokens(cls, text):
def getClassifierTokens(cls, text):
    """Split *text* on whitespace; these tokens feed the classifier features."""
    return text.split()

@classmethod
def getFeatures(cls, text):
    """Return a token -> occurrence-count dict for an NLTK classifier.

    Tokenizes *text* via ``cls.getClassifierTokens`` and counts each
    distinct token with ``FreqDist``.
    """
    # FreqDist is a dict-like mapping of token -> count; a plain dict()
    # copy replaces the redundant (k, v) generator round-trip.
    return dict(FreqDist(cls.getClassifierTokens(text)))

class StopWordsMixin():
@classmethod
Expand Down Expand Up @@ -154,6 +160,10 @@ def getSearchTokens(cls, text):
tokes = cls.remove_stop_words(tokes)
return tokes

@classmethod
def getFeatures(cls, text):
    """Return a token -> occurrence-count dict for an NLTK classifier.

    Tokenizes *text* via ``cls.getClassifierTokens`` and counts each
    distinct token with ``FreqDist``.
    """
    # FreqDist is a dict-like mapping of token -> count; a plain dict()
    # copy replaces the redundant (k, v) generator round-trip.
    return dict(FreqDist(cls.getClassifierTokens(text)))

class StopStemmTwitterProcessor(StopTwitterProcessor):
"""
Expand Down Expand Up @@ -206,9 +216,3 @@ def getClassifierTokens(cls, text):
tokens = StopTwitterProcessor.getClassifierTokens(text)
return pos_tag(tokens)


def feature_extractor(text):
    """Build a binary feature dict: every classifier token maps to 1."""
    # Imported lazily ("poor man's delayed import") to avoid a circular
    # import with smm.config at module load time.
    from smm.config import classifier_tokenizer

    tokens = classifier_tokenizer.getClassifierTokens(text)
    return {token: 1 for token in tokens}
3 changes: 1 addition & 2 deletions smm/classifier/worker.py
Expand Up @@ -8,7 +8,6 @@
from smm.models import RawStreamQueue, ClassifiedStream
from smm import config
from smm.classifier import labels
from smm.classifier.textprocessing import feature_extractor
import logging

class ClassifierWorker(Process):
Expand Down Expand Up @@ -51,7 +50,7 @@ def save(self, raw_data):


def get_polarity(self, text):
features = feature_extractor(text)
features = config.classifier_tokenizer.getFeatures(text)

prob = self.classifier.prob_classify(features)

Expand Down
5 changes: 5 additions & 0 deletions tests/textprocessing/test_simpleProcessor.py
Expand Up @@ -21,4 +21,9 @@ def test_getSearchTokens(self):
def test_getClassifierTokens(self):
    """Cleaned sample text tokenizes into the expected word list."""
    cleaned = SimpleProcessor.clean(self.text)
    tokens = SimpleProcessor.getClassifierTokens(cleaned)
    self.assertEqual("hello my name is timor".split(), tokens)

def test_getFeatures(self):
    """Each token of the cleaned sample appears exactly once in the features."""
    features = SimpleProcessor.getFeatures(SimpleProcessor.clean(self.text))
    expected = {'hello': 1, 'my': 1, 'name': 1, 'is': 1, 'timor': 1}
    self.assertEqual(expected, features)
7 changes: 6 additions & 1 deletion tests/textprocessing/test_stopStemmTwitterProcessor.py
Expand Up @@ -12,4 +12,9 @@ def test_getClean(self):
self.assertEqual("the quick brown fox was jumping over the lazy dog #crazyfox @thedog http://fox.com __h__", StopStemmTwitterProcessor.clean(self.text))

def test_getClassifierTokens(self):
    """Stop-word removal plus stemming yields the expected token list."""
    expected = ['quick', 'brown', 'fox', 'jump', 'lazi', 'dog', '#crazyfox', '__h__']
    self.assertEqual(expected, StopStemmTwitterProcessor.getClassifierTokens(self.text))

def test_getFeatures(self):
    """Every stemmed token maps to a count of 1 for the sample tweet."""
    features = StopStemmTwitterProcessor.getFeatures(self.text)
    expected = {'quick': 1, 'brown': 1, 'fox': 1, 'jump': 1,
                'lazi': 1, 'dog': 1, '#crazyfox': 1, '__h__': 1}
    self.assertEqual(expected, features)
11 changes: 5 additions & 6 deletions toolbox/shell-classifier.py
Expand Up @@ -5,9 +5,8 @@
import argparse
import sys

from smm.classifier.textprocessing import feature_extractor
from smm import models

from smm import config

parser = argparse.ArgumentParser(description='Interact directly with Trained data',
usage='python shell-classifier.py myClassifier')
Expand All @@ -32,13 +31,13 @@
try:
while True:
txt = raw_input('Classify: ')

prob = cls.prob_classify(feature_extractor(txt))
classified_label = cls.classify(feature_extractor(txt))
features = config.classifier_tokenizer.getFeatures(txt)
prob = cls.prob_classify(features)
classified_label = cls.classify(features)
print ''
print "Classification: %s with %0.2f%%" % (classified_label, prob.prob(classified_label) * 100)
print ""
cls.explain(feature_extractor(txt))
cls.explain(features)
print '\n'

except (KeyboardInterrupt, SystemExit, EOFError):
Expand Down
6 changes: 3 additions & 3 deletions toolbox/test-accuracy.py
Expand Up @@ -9,10 +9,9 @@
import io
import nltk

from smm.classifier.textprocessing import feature_extractor
from smm.classifier import labels
from smm import models

from smm import config

parser = argparse.ArgumentParser(description='Test accuracy of Trained data',
usage='python test-accuracy.py myClassifier data/testDataSource.csv')
Expand Down Expand Up @@ -40,7 +39,8 @@
for l in f.readlines():
label, text = l.split('\t')
if label in [labels.negative, labels.positive]:
gold.append(((feature_extractor(text), label)))
features = config.classifier_tokenizer.getFeatures(text)
gold.append(((features, label)))


row = models.TrainedClassifiers.objects(name=args.name).first()
Expand Down
10 changes: 5 additions & 5 deletions toolbox/train-classifier.py
Expand Up @@ -8,11 +8,11 @@
import argcomplete
import sys
import io
import mongoengine

from smm.classifier.textprocessing import feature_extractor, TwitterMixin
from smm.classifier.textprocessing import TwitterMixin
from smm import models
from smm.classifier import labels
from smm.config import classifier_tokenizer as tokenizer
from smm import config

parser = argparse.ArgumentParser(description='Classify collected raw tweets', usage='python train-classifier.py myClassifier 1000')
Expand Down Expand Up @@ -70,15 +70,15 @@ def apply_features(row):
else:
continue

featureset.append(( feature_extractor(gloss), label))
featureset.append((tokenizer.getFeatures(gloss), label))

else:


#featureset = nltk.apply_features(apply_features, models.TrainDataRaw.objects(polarity=1)[0:args.size])
#featureset += nltk.apply_features(apply_features, models.TrainDataRaw.objects(polarity=-1)[0:args.size])
featureset = [(feature_extractor(row.text), labels.positive) for row in models.TrainDataRaw.objects(polarity=1).timeout(False)[0:args.size]]
featureset += [(feature_extractor(row.text), labels.negative) for row in models.TrainDataRaw.objects(polarity=-1).timeout(False)[0:args.size]]
featureset = [(tokenizer.getFeatures(row.text), labels.positive) for row in models.TrainDataRaw.objects(polarity=1).timeout(False)[0:args.size]]
featureset += [(tokenizer.getFeatures(row.text), labels.negative) for row in models.TrainDataRaw.objects(polarity=-1).timeout(False)[0:args.size]]

if args.type == 'maxent':
# Train
Expand Down

0 comments on commit b022edc

Please sign in to comment.