Permalink
Browse files

documentation and refactoring

  • Loading branch information...
1 parent 66d72d5 commit 13e85673e47c6d6fc197e6d5b3d5957422671529 @chochkov committed Feb 16, 2012
View
@@ -1,7 +1,7 @@
# Copyright (c) 2011, SoundCloud Ltd., Nikola Chochkov
source "http://rubygems.org"
-gem 'activerecord', '>= 3.2.0'
+gem 'activerecord'
group :development do
gem 'pry'
View
@@ -51,7 +51,7 @@ PLATFORMS
ruby
DEPENDENCIES
- activerecord (>= 3.2.0)
+ activerecord
guard-rspec
pry
pry-doc
View
@@ -2,6 +2,7 @@
# More info at https://github.com/guard/guard#readme
guard 'rspec', :version => 2, :cli => '--color' do
- watch(/^lib\/periods\/(.+)\.rb$/) { |m| "spec/#{m[1]}_spec.rb" }
+ watch(/^lib\/green_midget\/(.+)\.rb$/) { |m| "spec/#{m[1]}_spec.rb" }
+ watch(/^lib\/green_midget\/models\/(.+)\.rb$/) { |m| "spec/#{m[1]}_spec.rb" }
watch(%r{^spec/.+_spec\.rb$})
end
View
@@ -1,14 +1,23 @@
# Copyright (c) 2011, SoundCloud Ltd., Nikola Chochkov
require 'active_record'
+
require 'green_midget/constants'
require 'green_midget/url_detection'
require 'green_midget/logger'
+require 'green_midget/heuristic_checks'
+require 'green_midget/default_features'
require 'green_midget/base'
+
require 'green_midget/models/countable'
require 'green_midget/models/examples'
require 'green_midget/models/features'
require 'green_midget/models/records'
require 'green_midget/models/words'
+
+require 'green_midget/errors/no_text_found'
+require 'green_midget/errors/feature_method_not_implemented'
+require 'green_midget/errors/no_examples_given'
+
require 'green_midget/extensions/classifier'
if classifier = Gem.searcher.find('green_midget')
@@ -1,70 +1,61 @@
# Copyright (c) 2011, SoundCloud Ltd., Nikola Chochkov
module GreenMidget
class Base
- include Logger
-
+ include DefaultFeatures
+ include HeuristicChecks
+
+ # Get classification for unknown messages based on history
+ #
+ # Examples:
+ #
+ # result = GreenMidget::Classifier.new(unknown_text).classify
+ # # result is now in -1, 0, 1 meaning respectively
+ # # no_spam, no_answer, spam
+ #
def classify
- CATEGORIES.each do |category|
- if respond_to?(:"pass_#{category}_heuristics?") && send(:"pass_#{category}_heuristics?")
- classify_as!(category)
- return HYPOTHESES[category]
- end
+ if respond_to?(:heuristic_checks, true) && response = heuristic_checks
+ return response
end
+ # load all relevant records in one go
Records.fetch_all(words)
- register_classification
factor = log_ratio
case
when factor >= ACCEPT_ALTERNATIVE_MIN
- ALTERNATIVE_RESPONSE
+ RESPONSES[ALTERNATIVE]
when factor >= REJECT_ALTERNATIVE_MAX
- DUNNO
+ RESPONSES[:dunno]
else
- NULL_RESPONSE
+ RESPONSES[NULL]
end
end
+ # Public method used to train the classifier with examples
+ # belonging to a known `category`.
+ #
+ # Examples:
+ #
+ # classifier = GreenMidget::Classifier.new(known_good_text)
+ # classifier.classify_as!(:ham)
+ # # increases the chances for similar text to pass the check next time
+ #
+ # classifier = GreenMidget::Classifier.new(known_spam_text)
+ # classifier.classify_as!(:spam)
+ # # increases the chances for similar text to fail the check next time
+ #
def classify_as!(category)
- keys = [ Words.objects(words), Features.objects(present_features), Examples.objects(features, true) ].flatten.map do |object|
- object.record_key(category)
- end
+ keys = [
+ Words.objects(words),
+ Features.objects(present_features),
+ Examples.objects(features, true)
+ ].flatten.map { |object| object.record_key(category) }
- Records.increment(keys)
- register_training
+ !! Records.increment(keys)
end
private
- # ------ Features --------
-
- def features
- FEATURES
- end
-
- def present_features
- features.select { |feature| feature_present?(feature) }
- end
-
- def feature_present?(feature)
- method = :"#{ feature }?"
- if respond_to?(method, true)
- send(method)
- else
- raise("You must implement method #{ method } or remove feature #{ feature }.")
- end
- end
-
- def url_in_text?
- UrlDetection.new(text).any?
- end
-
- def email_in_text?
- text.scan(EMAIL_REGEX).size > 0
- end
-
- # ------ Words --------
-
def words
strip_external_links.scan(WORDS_SPLIT_REGEX).uniq.
map(&:downcase).
@@ -76,11 +67,24 @@ def strip_external_links
end
def text
- @text || raise('You should either implement the text method or provide an instance variable at this point.')
+ @text || raise(NoTextFound)
end
+ # Calculate the log ratio between the scores for both categories.
+ # It takes into account the Examples counts (i.e. how much history
+ # there is for each category), the Words counts (i.e. how much history
+ # there is for each word in each category) and, if any Features are
+ # present, accounts for them as well.
def log_ratio
- Examples.log_ratio + words.map{ |word| Words[word].log_ratio }.sum + present_features.map{ |feature| Features[feature].log_ratio }.sum
+ result = Examples.log_ratio
+
+ result += words.map{ |word| Words[word].log_ratio }.sum
+
+ if respond_to?(:features, true)
+ result += present_features.map{ |feature| Features[feature].log_ratio }.sum
+ end
+
+ result
end
end
end
@@ -5,7 +5,7 @@ module GreenMidget
EMAIL_REGEX = /[a-zA-Z][\w\.-]*[a-zA-Z0-9]@[a-zA-Z0-9][\w\.-]*[a-zA-Z0-9]\.[a-zA-Z][a-zA-Z\.]*[a-zA-Z]/
URL_REGEX = /(?i)\b((?:[a-z][\w-]+:(?:\/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}\/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?]))/
- EXTERNAL_LINK_REGEX = Regexp.new(/(#{ EMAIL_REGEX })|(#{ URL_REGEX })/)
+ EXTERNAL_LINK_REGEX = Regexp.new(/(#{EMAIL_REGEX})|(#{URL_REGEX})/)
STOP_WORDS = %w()
@@ -15,15 +15,19 @@ module GreenMidget
WORDS_SPLIT_REGEX = Regexp.new(/\w{#{ MIN_CHARACTERS_IN_WORD },#{ MAX_CHARACTERS_IN_WORD }}/)
FEATURES = %w(url_in_text email_in_text)
- # Decision making: Log(Pr(alternative | text)) - Log(Pr(null | text)) <=> [ REJECT_ALTERNATIVE_MAX, ACCEPT_ALTERNATIVE_MIN ]
+ # Decision making:
+ # Log(Pr(alternative | text)) - Log(Pr(null | text)) <=>
+ # ( REJECT_ALTERNATIVE_MAX..ACCEPT_ALTERNATIVE_MIN )
+ #
ACCEPT_ALTERNATIVE_MIN = Math::log(3.0)
REJECT_ALTERNATIVE_MAX = 0.0
- ALTERNATIVE_RESPONSE = 1
- DUNNO = 0
- NULL_RESPONSE = -1
-
NULL = :ham
ALTERNATIVE = :spam
CATEGORIES = [ NULL, ALTERNATIVE ]
+ RESPONSES = {
+ NULL => -1,
+ :dunno => 0,
+ ALTERNATIVE => 1,
+ }
end
@@ -0,0 +1,40 @@
+# A mixin that implements feature checks and allows Base subclasses
+# to define their own features for spam/ham detection.
+#
+# By default texts are checked for the presence of external URL or email
+# references. An example of an additional feature would be the presence of
+# particular words or expressions.
+#
+# See the example in `lib/green_midget/extensions/sample.rb`
+#
+module GreenMidget
+ module DefaultFeatures
+
+ private
+
+ def features
+ FEATURES
+ end
+
+ def present_features
+ features.select { |feature| feature_present?(feature) }
+ end
+
+ def feature_present?(feature)
+ method = :"#{feature}?"
+ if respond_to?(method, true)
+ send(method)
+ else
+ raise FeatureMethodNotImplemented.new(feature, method)
+ end
+ end
+
+ def url_in_text?
+ UrlDetection.new(text).any?
+ end
+
+ def email_in_text?
+ text.scan(EMAIL_REGEX).size > 0
+ end
+ end
+end
@@ -0,0 +1,11 @@
+module GreenMidget
+ class FeatureMethodNotImplemented < StandardError
+ def initialize(feature, method_name)
+ super <<-MSG
+Method #{method_name.inspect} not found. Either implement it or
+delete feature #{feature} from your features list.
+MSG
+ end
+ end
+end
+
@@ -0,0 +1,9 @@
+module GreenMidget
+ class NoExamplesGiven < StandardError
+ def initialize
+ super <<-MSG
+Training examples must be provided for all categories before classification.
+MSG
+ end
+ end
+end
@@ -0,0 +1,10 @@
+module GreenMidget
+ class NoTextFound < StandardError
+ def initialize
+ super <<-MSG
+You should either implement the text method or provide an instance variable at this point.
+MSG
+ end
+ end
+end
+
@@ -0,0 +1,23 @@
+# A mixin that implements heuristic checks for both categories.
+# If there are conditions under which a spammable object could
+# directly be classified as one of the classification categories,
+# that logic could be implemented using heuristic checks in your subclasses.
+#
+# See the example in `lib/green_midget/extensions/sample.rb`
+#
+module GreenMidget
+ module HeuristicChecks
+
+ private
+
+ def heuristic_checks
+ CATEGORIES.each do |category|
+ if respond_to?(:"pass_#{category}_heuristics?") && send(:"pass_#{category}_heuristics?")
+ classify_as!(category)
+ return RESPONSES[category]
+ end
+ end
+ return false
+ end
+ end
+end
@@ -1,19 +1,37 @@
# Copyright (c) 2011, SoundCloud Ltd., Nikola Chochkov
+#
+# This is an abstraction from Words, Examples and Features. It provides common
+# methods for building the record keys for individual countables in any
+# category.
+#
+# For example the data record key for the word 'legit' in Spam category would
+# be something like "word::legit::spam_count". The record key for a feature
+# 'url_present' in Ham would be something like "feature::url_present::ham_count"
+# The count of all training examples given for category Spam would be
+# "example::any::spam_count"
+#
+# The example counts for individual features are stored as well. For example for
+# 'url_present' we will have two records: "example::url_present::spam_count" and
+# "example::url_present::ham_count". They will store the information about how
+# much training the GreenMidget received for this feature in each category.
+#
+# This class is the link between countable and the Records data store adapter
+#
module GreenMidget
class Countable
attr_accessor :key
class_attribute :prefix
- def self.[](key)
- new(key)
+ def initialize(key)
+ @key = self.class.prefix + key
end
- def self.objects(keys)
- keys.map { |key| new(key) }
- end
+ class << self
+ alias :[] :new
- def initialize(key)
- @key = self.class.prefix + key
+ def objects(keys)
+ keys.map { |key| new(key) }
+ end
end
def [](category)
Oops, something went wrong.

0 comments on commit 13e8567

Please sign in to comment.