# Document Classification

This notebook demonstrates an auto-classification algorithm using the hoary old chestnut of distinguishing email ham from spam.

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
import pandas as pd
import numpy
import nltk
import glob
import re

In [2]:
# Load every email under data/emails/<folder>/ into a DataFrame with one row
# per file: the raw text plus its class label.
#
# glob.glob returns matches in arbitrary, filesystem-dependent order; sorting
# the paths makes the row order of df (and everything that splits it by
# position later on) reproducible between runs and machines.
emails = sorted(glob.glob('data/emails/*/*'))
data = {'text': [], 'class': []}
for path in emails:
    # latin-1 maps every byte to a character, so reading never raises a
    # decode error regardless of what the message contains
    with open(path, 'r', encoding="latin-1") as content:
        data['text'].append(content.read())
    # the label is the second-to-last dot-separated field of the path
    # (presumably file names end in ".ham.txt" / ".spam.txt" -- confirm
    # against the data directory)
    data['class'].append(path.split('.')[-2])
df = pd.DataFrame(data)

In [3]:
# preview the first rows to sanity-check the text and class columns
df.head()

Unnamed: 0,class,text
0,ham,Subject: christmas tree farm pictures\n
1,ham,"Subject: vastar resources , inc .\ngary , prod..."
2,ham,Subject: calpine daily gas nomination\n- calpi...
3,ham,Subject: re : issue\nfyi - see note below - al...
4,ham,Subject: meter 7268 nov allocation\nfyi .\n- -...


In [4]:
# define the function to be used for tokenization

# Fetch the NLTK resources once, when this cell runs, instead of on every
# tokenize() call -- the vectorizer invokes the tokenizer once per document.
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
stopwords = nltk.corpus.stopwords.words('english')

# frozenset copy for constant-time membership tests; `stopwords` stays a list
_stopword_set = frozenset(stopwords)
# matches any token that contains at least one ASCII letter
_has_letter = re.compile('[a-zA-Z]')


def tokenize(text):
    """Split ``text`` into lower-cased word tokens suitable as features.

    The text is tokenized by sentence first and then by word, so that
    punctuation is emitted as its own token and can be filtered out.

    A token is kept only if it
      * contains at least one ASCII letter (drops numbers and punctuation),
      * is not an English stop word,
      * is longer than 2 characters, and
      * starts with an alphanumeric character.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        The surviving tokens, in document order (duplicates preserved).
    """
    tokens = (
        word.lower()
        for sent in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sent)
    )
    # the letter check comes first, so token[0] is never read on an empty token
    return [
        token for token in tokens
        if _has_letter.search(token)
        and token not in _stopword_set
        and len(token) > 2
        and token[0].isalnum()
    ]

In [5]:
# extract features from all emails -- both ham and spam
count_vectorizer = CountVectorizer(tokenizer=tokenize)
counts = count_vectorizer.fit_transform(df['text'].values)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when the installed version provides it and fall
# back to the old method otherwise. list() keeps `terms` a plain list either way.
if hasattr(count_vectorizer, 'get_feature_names_out'):
    terms = list(count_vectorizer.get_feature_names_out())
else:
    terms = count_vectorizer.get_feature_names()

# build a dataframe of the feature matrix and display a few rows
# NOTE: toarray() materialises the whole sparse count matrix as a dense array
features_df = pd.DataFrame(counts.toarray(), columns=terms)
features_df.head()

Unnamed: 0,aaa,aaas,aabda,aabvmmq,aac,aachecar,aaer,aafco,aaiabe,aaigrcrb,...,zynve,zyqtaqlt,zyrtec,zyyqywp,zzezrjok,zzn,zzo,zzocb,zzso,zzsyt
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# fit a multinomial naive Bayes model on the term counts of every email,
# using the 'class' column as the target labels
targets = df['class'].values
classifier = MultinomialNB()
classifier.fit(X=counts, y=targets)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [7]:
# test the classifier on a handful of hand-written messages
examples = [
    'Free Viagra call today!',
    'I am going to a concert tonight in Manhattan',
    'I have a bank account in Nigeria',
    'Are you looking for singles?',
    'How are you doing?',
    "Hey! It's me Darren. Do you remember me?",
    'We are a design company specializing in web development'
]
# reuse the already-fitted vectorizer so the examples are mapped onto the
# same vocabulary the classifier was trained on
example_counts = count_vectorizer.transform(examples)
predictions = classifier.predict(example_counts)

# make a dataframe of the examples with their corresponding classifications
predictions_df = pd.DataFrame(
    [
        {'text': text, 'classification': label}
        for text, label in zip(examples, predictions)
    ]
)
predictions_df

Unnamed: 0,classification,text
0,spam,Free Viagra call today!
1,ham,I am going to a concert tonight in Manhattan
2,spam,I have a bank account in Nigeria
3,spam,Are you looking for singles?
4,ham,How are you doing?
5,ham,Hey! It's me Darren. Do you remember me?
6,spam,We are a design company specializing in web de...


In [8]:
# Use scikit-learn pipelining to combine feature extraction and training
# into a single pipeline. This makes the cross-validation code more concise.

# The pipeline also makes it easier to experiment with different classifiers.
# By iterating over different classifiers and collecting cross-validation
# metrics, we can select the model that best fits the data.

pipeline = Pipeline(steps=[
    ('vectorizer', CountVectorizer(tokenizer=tokenize)),
    ('classifier', MultinomialNB()),
])

In [9]:
# cross-validation procedure

def cross_validate(n_splits=3, random_state=0):
    """Run k-fold cross-validation of ``pipeline`` over ``df`` and print a report.

    For each fold the pipeline is re-fitted on the training rows and scored
    on the held-out rows. The per-fold F1 scores (positive label ``'ham'``)
    are averaged, and the per-fold confusion matrices are summed.

    Parameters
    ----------
    n_splits : int, default 3
        Number of folds.
    random_state : int, default 0
        Seed for the fold shuffle, so repeated runs give the same split.

    Returns
    -------
    None
        The total email count, mean F1 score and summed confusion matrix
        are printed.
    """
    # Fixed label order: every per-fold matrix is then 2x2 (rows/columns
    # ham, spam) even if a fold happens to contain only one class. Without
    # it a smaller matrix would be silently broadcast into the accumulator.
    labels = ['ham', 'spam']

    # Shuffle before splitting. KFold otherwise cuts contiguous slices, and
    # the rows of df appear to be grouped by class (they are loaded folder by
    # folder), which would give folds dominated by a single class.
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    scores = []
    confusion = numpy.zeros((len(labels), len(labels)), dtype=numpy.int64)

    for train_indices, test_indices in kf.split(df):
        train_rows = df.iloc[train_indices]
        test_rows = df.iloc[test_indices]
        test_y = test_rows['class'].values

        pipeline.fit(train_rows['text'].values, train_rows['class'].values)
        predictions = pipeline.predict(test_rows['text'].values)

        confusion += confusion_matrix(test_y, predictions, labels=labels)
        scores.append(f1_score(test_y, predictions, pos_label='ham'))

    print('Total emails classified:', len(df))
    print('Score:', sum(scores)/len(scores))
    print('Confusion matrix:')
    print(confusion)
    
#cross_validate() # Uncomment this if you want to run the cross validation step, it's time consuming