In [1]:
import os

# 1. Load data

In [2]:
corpus = []
spam = []

import random

ham_files = [f for f in os.listdir("data/train/ham")]
spam_files = [f for f in os.listdir("data/train/spam")]

for f in ham_files:
    with open("data/train/ham/" + f, 'r', encoding='latin1') as myfile:
        content = myfile.read()
        corpus.append(content)
        spam.append(0)
        
for f in spam_files:
    with open("data/train/spam/" + f, 'r', encoding='latin1') as myfile:
        content = myfile.read()
        corpus.append(content)
        spam.append(1)

# 2. Feature engineering within Transformation Pipeline

In [3]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer

# Transformation 1 - Delete Email header
class HeaderDeletor(BaseEstimator, TransformerMixin):
    def __init__(self, delete_header):
        self.delete_header = delete_header

    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        if self.delete_header:
            def delete_header(x):
                try:
                    return x.split("\n\n")[1]
                except IndexError: # ValueError
                    return ""
                
            return list(map(delete_header, X))
        else:
            return X

In [4]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer

# Transformation 2 - vectorize
class Vectorizor(BaseEstimator, TransformerMixin):
    def __init__(self, lowercase):
        self.vectorizor = CountVectorizer(lowercase=lowercase)

    def fit(self, X, y=None):
        self.vectorizor.fit(X)
        return self
    
    def transform(self, X, y=None):
        return self.vectorizor.transform(X).toarray()

In [5]:
import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin

# Transformation 3 - X + y
class XYConcatenator(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self  # nothing else to do
    
    def transform(self, X, y=None):
        results = np.c_[X, spam]
        return results

In [9]:
# Transformation pipeline
from sklearn.pipeline import Pipeline

lowercase = True
delete_header = False

pipeline = Pipeline([
    
        ('header_deletor', HeaderDeletor(delete_header=delete_header)),
        ('vectorizer', Vectorizor(lowercase=lowercase)),
        # ('concat_xy', XYConcatenator())
    ])

X = pipeline.fit_transform(corpus, spam)
y = spam

# 3. Train model

In [10]:
from sklearn.linear_model import SGDClassifier

sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(X, y)



SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=42, shuffle=True,
       tol=None, verbose=0, warm_start=False)

# 4. Evaluation

In [11]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(sgd_clf, X, y, cv=5, scoring="accuracy")

print(scores)
print(scores.mean() * 100)
print(scores.std() * 100)



[ 0.90517241  0.87931034  0.89565217  0.9122807   0.89473684]
89.7430495279
1.11285576902
