In [1]:
# Toy corpus: three short movie reviews and one binary label per review.
# Label 1 is attached to the texts containing negative wording, 0 otherwise.
X = [
    'this movie was bad',
    'it was not good and really bad',
    'this movie was good',
]
y = [1, 1, 0]

## Simple Rule Based Classifier
This approach takes a list of keywords (words or n-grams) and searches each text for occurrences of them. Keep in mind that a single match is enough to predict the given class, so if a text contains a mix of positive and negative features, this rule-based classifier is not the best choice.

#### Model

In [2]:
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from flashtext import KeywordProcessor

class RuleBasedClassifier(BaseEstimator, ClassifierMixin):
    """Binary text classifier that predicts 1 as soon as any keyword matches.

    The keywords are loaded into a flashtext ``KeywordProcessor``. A text is
    labelled 1 when at least one keyword is extracted from it, otherwise 0.

    Parameters
    ----------
    keywords : iterable of str
        Words or n-grams whose presence triggers class 1.
    random_state : any, default=None
        Stored on the instance only; this classifier does not use it.
    verbose : int, default=0
        When equal to 1, the number of loaded rules is printed.
    """

    def __init__(self, keywords, random_state=None, verbose=0):
        # Every constructor argument is kept as an attribute of the same name;
        # BaseEstimator.get_params() (used by repr() and clone()) reads them
        # back via getattr and fails if one is missing.
        self.keywords     = keywords
        self.random_state = random_state
        self.verbose      = verbose
        self.load_rules(keywords)

    def get_rules_size(self):
        """Return the number of keywords currently loaded."""
        return len(self.rules.get_all_keywords())

    def load_rules(self, keywords):
        """Build a fresh keyword processor from ``keywords``.

        Any previously loaded rules are discarded because ``self.rules`` is
        replaced by a new ``KeywordProcessor``.
        """
        self.rules = KeywordProcessor()
        for keyword in keywords:
            self.rules.add_keyword(keyword)
        if self.verbose == 1:
            print("Rules loaded:", self.get_rules_size())

    def fit(self, X=None, y=None):
        """Do nothing and return ``self``.

        The rules come from the constructor, so there is nothing to learn.
        The method exists so the classifier can be used wherever a
        scikit-learn estimator with ``fit`` is expected.
        """
        return self

    def predict(self, X):
        """Predict 1 for every text in which at least one keyword is found.

        Parameters
        ----------
        X : iterable of str
            Texts to classify.

        Returns
        -------
        numpy.ndarray of int
            One entry per text: 1 if any keyword was extracted, else 0.
        """
        matched = [bool(self.rules.extract_keywords(text)) for text in X]
        # The builtin ``int`` replaces the ``np.int`` alias, which was removed
        # from NumPy (it was only ever an alias for the builtin).
        return np.array(matched, dtype=int)

#### Evaluation

In [3]:
# Phrases whose presence marks a text as class 1.
keywords = [
    'bad',
    'not tasty',
    'not good',
    'crap',
]

# verbose=1 makes the constructor report how many rules were loaded.
rule = RuleBasedClassifier(keywords=keywords, verbose=1)
rule.predict(X)

Rules loaded: 4


array([1, 1, 0])

## Handselected Feature Classifier
This classifier uses predefined features (e.g. words, n-grams) and looks up in which texts these features occur. The result is a bag-of-words representation that serves as input for a classifier to train on, e.g. a RandomForest. It's a mix of the rule-based and machine-learning approaches, which can handle the problem described for the simple rule-based classifier.

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator, ClassifierMixin
from flashtext import KeywordProcessor
import numpy as np

class HandselectedFeatureClassifier(BaseEstimator, ClassifierMixin):
    """Classifier that learns from the presence of hand-selected features.

    Texts are turned into a binary bag-of-words matrix by ``transform`` (one
    column per loaded feature). ``fit`` and ``predict`` forward an already
    vectorized matrix to the wrapped estimator; they do not call
    ``transform`` themselves.

    Parameters
    ----------
    estimator : object
        Wrapped estimator providing ``fit``, ``predict`` and ``get_params``.
        Its ``verbose`` attribute is overwritten with ``verbose``.
    features : iterable of str
        Words or n-grams to look for in the texts.
    random_state : any, default=None
        Stored on the instance only; it is not forwarded to ``estimator``.
    verbose : int, default=0
        When equal to 1, the number of loaded features is printed.
    """

    def __init__(self, estimator, features, random_state=None, verbose=0):
        self.estimator         = estimator
        # Kept as an attribute so BaseEstimator.get_params() (used by repr()
        # and clone()) can read every constructor argument back.
        self.features          = features
        self.random_state      = random_state
        self.verbose           = verbose
        self.estimator.verbose = self.verbose
        self.load_features(features)

    def load_features(self, features):
        """Build a fresh keyword processor from ``features``."""
        self.rules = KeywordProcessor()
        for feature in features:
            self.rules.add_keyword(feature)
        if self.verbose == 1:
            print("Features loaded:", self.get_features_size())

    def transform(self, X):
        """Vectorize texts into a binary bag-of-words matrix.

        Parameters
        ----------
        X : sequence of str
            Texts to vectorize.

        Returns
        -------
        numpy.ndarray of int, shape (len(X), n_features)
            Entry ``[i, j]`` is 1 when feature ``j`` occurs as a substring of
            text ``i`` (ignoring case), else 0. Columns follow the order
            returned by ``self.rules.get_all_keywords()``.
        """
        # Materialize the feature list once instead of once per text.
        feature_names = list(self.rules.get_all_keywords())
        # The builtin ``int`` replaces the ``np.int`` alias removed from NumPy.
        X_vector = np.zeros((len(X), len(feature_names)), dtype=int)
        for idx_entry, text in enumerate(X):
            # The default KeywordProcessor stores its keywords lowercased, so
            # the text is lowercased too; otherwise mixed-case texts or
            # features would never match.
            lowered = text.lower()
            for idx_feature, feature in enumerate(feature_names):
                if feature in lowered:
                    X_vector[idx_entry, idx_feature] = 1
        return X_vector

    def get_features_size(self):
        """Return the number of features currently loaded."""
        return len(self.rules.get_all_keywords())

    def fit(self, X, y):
        """Train the wrapped estimator on vectorized features and targets.

        ``X`` is passed to the estimator unchanged, so callers are expected
        to vectorize texts with ``transform`` first. Returns ``self`` so
        calls can be chained.
        """
        self.estimator.fit(X, y)
        return self

    def predict(self, X):
        """Return the wrapped estimator's predictions for vectorized ``X``."""
        return self.estimator.predict(X)

    def get_estimator_params(self):
        """Return the parameters of the wrapped estimator."""
        return self.estimator.get_params()

In [5]:
handSelected_features = ['bad', 'good', 'great', 'crap', 'not good']

# A fixed random_state makes the forest, and therefore this cell's result,
# reproducible across kernel restarts.
sf = HandselectedFeatureClassifier(
    RandomForestClassifier(n_estimators=10, random_state=0),
    handSelected_features,
    verbose=0,
)

# Vectorize once and reuse the matrix for fitting, display and prediction.
X_bow = sf.transform(X)
sf.fit(X_bow, y)
print("Vectorized (bow):\n%s"%X_bow)
# NOTE: predictions are made on the training data itself; this only shows the
# API and is not an evaluation of the model.
sf.predict(X_bow)

Vectorized (bow):
[[1 0 0 0 0]
 [1 1 0 0 1]
 [0 1 0 0 0]]


array([1, 1, 0])