In [1]:
import pandas as pd

In [2]:
# Labelled training examples: each record pairs a short message with the
# category the classifier should learn to associate with it.
training_data = [
    {"message": text, "category": label}
    for text, label in (
        ("Movies", "fun"),
        ("Movies", "fun"),
        ("Rent",   "boring"),
        ("Fuel",   "boring"),
    )
]

training_df = pd.DataFrame(training_data)

# Show the frame's dimensions followed by a preview of its first rows.
display(training_df.shape, training_df.head())

(4, 2)

Unnamed: 0,category,message
0,fun,Movies
1,fun,Movies
2,boring,Rent
3,boring,Fuel


In [3]:
# Assemble a naive Bayes text-classification pipeline.
# Follows the scikit-learn tutorial:
# https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

text_clf = Pipeline(steps=[
    # Tokenize each message and turn it into a vector of per-word counts.
    ('vect', CountVectorizer()),
    # Re-weight the raw counts as TF-IDF (term frequency scaled by inverse
    # document frequency), so words common to every message carry less weight.
    ('tfidf', TfidfTransformer()),
    # Multinomial naive Bayes classifier trained on the weighted word features.
    ('clf', MultinomialNB()),
])

In [5]:
# Fit the whole pipeline: the message text is the input, the category column
# is the target label. Left as the last expression so the fitted pipeline's
# repr is shown as the cell output.
text_clf.fit(training_df["message"], training_df["category"])

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [12]:
# Fresh, unlabelled messages to classify. They carry only a "message" field;
# the category is what the trained pipeline is asked to guess.
testing_data = [
    {"message": text}
    for text in (
        "Movies - lord of the rings",
        "Rent - back payment",
        "Wages",
        "Speeding fines",
    )
]
testing_df = pd.DataFrame(testing_data)

# Show the frame's dimensions followed by a preview of its first rows.
display(testing_df.shape, testing_df.head())

(4, 1)

Unnamed: 0,message
0,Movies - lord of the rings
1,Rent - back payment
2,Wages
3,Speeding fines


In [13]:
# Predict a category for every test message and attach it as a new column.
guessed_categories = text_clf.predict(testing_df["message"])

# Assign the prediction array directly so values land positionally, one per
# row. Wrapping it in pd.Series would give it a fresh 0..n-1 index, and pandas
# aligns Series on index labels when setting a column: if testing_df were ever
# filtered, sorted or otherwise not on a default RangeIndex, labels would be
# silently misplaced or become NaN.
testing_df["category"] = guessed_categories

# Show the frame's dimensions followed by a preview including the new column.
display(testing_df.shape, testing_df.head())

(4, 2)

Unnamed: 0,message,category
0,Movies - lord of the rings,fun
1,Rent - back payment,boring
2,Wages,boring
3,Speeding fines,boring


# Links

* Practical howto https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html
* Theory https://www.datacamp.com/community/tutorials/naive-bayes-scikit-learn