diff --git a/smm/classifier/textprocessing.py b/smm/classifier/textprocessing.py
index a01704d..5c20cd9 100644
--- a/smm/classifier/textprocessing.py
+++ b/smm/classifier/textprocessing.py
@@ -1,10 +1,12 @@
 from nltk import pos_tag
 from nltk import corpus
 from nltk import PorterStemmer
+from nltk import FreqDist
 from smm.classifier import emoticons
 from smm.classifier.ngrams import bigrams
 import re
+
 
 stopwords = corpus.stopwords.words('english')
 
 
@@ -26,6 +28,10 @@ def getSearchTokens(cls, text):
     def getClassifierTokens(cls, text):
         return text.split()
 
+    @classmethod
+    def getFeatures(cls, text):
+        fq = FreqDist(cls.getClassifierTokens(text))
+        return dict((k, v) for k, v in fq.items())
 
 class StopWordsMixin():
     @classmethod
@@ -154,6 +160,10 @@ def getSearchTokens(cls, text):
         tokes = cls.remove_stop_words(tokes)
         return tokes
 
+    @classmethod
+    def getFeatures(cls, text):
+        fq = FreqDist(cls.getClassifierTokens(text))
+        return dict((k, v) for k, v in fq.items())
 
 class StopStemmTwitterProcessor(StopTwitterProcessor):
     """
@@ -206,9 +216,3 @@ def getClassifierTokens(cls, text):
         tokens = StopTwitterProcessor.getClassifierTokens(text)
         return pos_tag(tokens)
 
-
-def feature_extractor(text):
-    # poor's man delayed import :)
-    from smm.config import classifier_tokenizer
-
-    return dict.fromkeys(classifier_tokenizer.getClassifierTokens(text), 1)
diff --git a/smm/classifier/worker.py b/smm/classifier/worker.py
index 52906c8..7af4712 100644
--- a/smm/classifier/worker.py
+++ b/smm/classifier/worker.py
@@ -8,7 +8,6 @@
 from smm.models import RawStreamQueue, ClassifiedStream
 from smm import config
 from smm.classifier import labels
-from smm.classifier.textprocessing import feature_extractor
 import logging
 
 class ClassifierWorker(Process):
@@ -51,7 +50,7 @@ def save(self, raw_data):
 
 
     def get_polarity(self, text):
-        features = feature_extractor(text)
+        features = config.classifier_tokenizer.getFeatures(text)
 
         prob = self.classifier.prob_classify(features)
 
diff --git a/tests/textprocessing/test_simpleProcessor.py b/tests/textprocessing/test_simpleProcessor.py
index 233a9f5..3d3a824 100644
--- a/tests/textprocessing/test_simpleProcessor.py
+++ b/tests/textprocessing/test_simpleProcessor.py
@@ -21,4 +21,9 @@ def test_getSearchTokens(self):
     def test_getClassifierTokens(self):
         result = SimpleProcessor.getClassifierTokens(SimpleProcessor.clean(self.text))
         expect = "hello my name is timor".split()
+        self.assertEqual(expect, result)
+
+    def test_getFeatures(self):
+        result = SimpleProcessor.getFeatures(SimpleProcessor.clean(self.text))
+        expect = {'timor': 1, 'is': 1, 'my': 1, 'hello': 1, 'name': 1}
         self.assertEqual(expect, result)
\ No newline at end of file
diff --git a/tests/textprocessing/test_stopStemmTwitterProcessor.py b/tests/textprocessing/test_stopStemmTwitterProcessor.py
index b6fa165..01020eb 100644
--- a/tests/textprocessing/test_stopStemmTwitterProcessor.py
+++ b/tests/textprocessing/test_stopStemmTwitterProcessor.py
@@ -12,4 +12,9 @@ def test_getClean(self):
         self.assertEqual("the quick brown fox was jumping over the lazy dog #crazyfox @thedog http://fox.com __h__", StopStemmTwitterProcessor.clean(self.text))
 
     def test_getClassifierTokens(self):
-        self.assertEqual(['quick', 'brown', 'fox', 'jump', 'lazi', 'dog', '#crazyfox', '__h__'], StopStemmTwitterProcessor.getClassifierTokens(self.text))
\ No newline at end of file
+        self.assertEqual(['quick', 'brown', 'fox', 'jump', 'lazi', 'dog', '#crazyfox', '__h__'], StopStemmTwitterProcessor.getClassifierTokens(self.text))
+
+    def test_getFeatures(self):
+        result = StopStemmTwitterProcessor.getFeatures(self.text)
+        expect = {'quick': 1, 'brown': 1, 'fox': 1, 'jump': 1, 'lazi': 1, 'dog':1, '#crazyfox':1, '__h__':1}
+        self.assertEqual(expect, result)
\ No newline at end of file
diff --git a/toolbox/shell-classifier.py b/toolbox/shell-classifier.py
index 9badd9e..82a3311 100755
--- a/toolbox/shell-classifier.py
+++ b/toolbox/shell-classifier.py
@@ -5,9 +5,8 @@
 import argparse
 import sys
 
-from smm.classifier.textprocessing import feature_extractor
 from smm import models
-
+from smm import config
 
 
 parser = argparse.ArgumentParser(description='Interact directly with Trained data', usage='python shell-classifier.py myClassifier')
@@ -32,13 +31,13 @@ try:
     while True:
         txt = raw_input('Classify: ')
-
-        prob = cls.prob_classify(feature_extractor(txt))
-        classified_label = cls.classify(feature_extractor(txt))
+        features = config.classifier_tokenizer.getFeatures(txt)
+        prob = cls.prob_classify(features)
+        classified_label = cls.classify(features)
 
         print ''
         print "Classification: %s with %0.2f%%" % (classified_label, prob.prob(classified_label) * 100)
         print ""
-        cls.explain(feature_extractor(txt))
+        cls.explain(features)
         print '\n'
 
 except (KeyboardInterrupt, SystemExit, EOFError):
diff --git a/toolbox/test-accuracy.py b/toolbox/test-accuracy.py
index 5e1abe9..c163174 100755
--- a/toolbox/test-accuracy.py
+++ b/toolbox/test-accuracy.py
@@ -9,10 +9,9 @@
 import io
 import nltk
 
-from smm.classifier.textprocessing import feature_extractor
 from smm.classifier import labels
 from smm import models
-
+from smm import config
 
 
 parser = argparse.ArgumentParser(description='Test accuracy of Trained data', usage='python test-accuracy.py myClassifier data/testDataSource.csv')
@@ -40,7 +39,8 @@
 for l in f.readlines():
     label, text = l.split('\t')
     if label in [labels.negative, labels.positive]:
-        gold.append(((feature_extractor(text), label)))
+        features = config.classifier_tokenizer.getFeatures(text)
+        gold.append(((features, label)))
 
 
 row = models.TrainedClassifiers.objects(name=args.name).first()
diff --git a/toolbox/train-classifier.py b/toolbox/train-classifier.py
index 121f63d..1e9b9f6 100755
--- a/toolbox/train-classifier.py
+++ b/toolbox/train-classifier.py
@@ -8,11 +8,11 @@
 import argcomplete
 import sys
 import io
-import mongoengine
-from smm.classifier.textprocessing import feature_extractor, TwitterMixin
+from smm.classifier.textprocessing import TwitterMixin
 from smm import models
 from smm.classifier import labels
+from smm.config import classifier_tokenizer as tokenizer
 from smm import config
 
 
 parser = argparse.ArgumentParser(description='Classify collected raw tweets', usage='python train-classifier.py myClassifier 1000')
@@ -70,15 +70,15 @@ def apply_features(row):
         else:
             continue
 
-        featureset.append(( feature_extractor(gloss), label))
+        featureset.append((tokenizer.getFeatures(gloss), label))
 
 
 else:
     #featureset = nltk.apply_features(apply_features, models.TrainDataRaw.objects(polarity=1)[0:args.size])
     #featureset += nltk.apply_features(apply_features, models.TrainDataRaw.objects(polarity=-1)[0:args.size])
-    featureset = [(feature_extractor(row.text), labels.positive) for row in models.TrainDataRaw.objects(polarity=1).timeout(False)[0:args.size]]
-    featureset += [(feature_extractor(row.text), labels.negative) for row in models.TrainDataRaw.objects(polarity=-1).timeout(False)[0:args.size]]
+    featureset = [(tokenizer.getFeatures(row.text), labels.positive) for row in models.TrainDataRaw.objects(polarity=1).timeout(False)[0:args.size]]
+    featureset += [(tokenizer.getFeatures(row.text), labels.negative) for row in models.TrainDataRaw.objects(polarity=-1).timeout(False)[0:args.size]]
 
 
 if args.type == 'maxent':
     # Train
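
Note on the behavioural change in this patch: the removed feature_extractor produced presence features (every seen token mapped to 1 via dict.fromkeys), while the new getFeatures classmethods build the feature dict from an NLTK FreqDist, so repeated tokens now carry their counts. A minimal sketch of the difference, using a made-up token list purely for illustration (in the patch the tokens come from a processor's getClassifierTokens):

from nltk import FreqDist

# Made-up tokens, for illustration only; the patch derives them from
# a processor's getClassifierTokens(), e.g. SimpleProcessor.
tokens = ['fox', 'fox', 'dog']

# Old behaviour (removed feature_extractor): presence features, always 1.
old_features = dict.fromkeys(tokens, 1)                             # {'fox': 1, 'dog': 1}

# New behaviour (getFeatures): frequency features built from FreqDist.
new_features = dict((k, v) for k, v in FreqDist(tokens).items())    # {'fox': 2, 'dog': 1}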