In [2]:
import os
from sklearn.feature_extraction.text import CountVectorizer
import joblib
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from features import Features


## Data source: Enron-Spam dataset, http://www.aueb.gr/users/ion/data/enron-spam

In [5]:
data_directory = "../data/processed"
# Report the file count of every folder named 'spam' or 'ham' under the
# processed-data root, so class balance per Enron subset is visible.
for directories, subdirs, files in os.walk(data_directory):
    if os.path.basename(directories) in ('spam', 'ham'):
        print(directories, subdirs, len(files))

../data/processed/enron1/spam [] 1500
../data/processed/enron1/ham [] 3672
../data/processed/enron6/spam [] 4500
../data/processed/enron6/ham [] 1500
../data/processed/enron5/spam [] 3675
../data/processed/enron5/ham [] 1500
../data/processed/enron2/spam [] 1496
../data/processed/enron2/ham [] 4361
../data/processed/enron3/spam [] 1500
../data/processed/enron3/ham [] 4012
../data/processed/enron4/spam [] 4500
../data/processed/enron4/ham [] 1500


In [10]:
# Settings shared by the file-reading, feature-extraction, training and
# inference cells below.
# Path where the trained classifier is persisted with joblib.
model_file="../models/spam_mnb.pkl"
# Path where the CountVectorizer vocabulary is persisted with joblib.
vocabulary_file="../models/spam_vocabulary.pkl"
# Passed to CountVectorizer(max_features=...) when building the word bag.
max_features=20000
# Spam-probability cutoff: Spam_Check.check_spam returns 1.0 at or above it.
local_model_threshold = 0.5

def load_files():
    """Read every message stored under ``data_directory``.

    Walks the directory tree and reads each file found in a folder named
    exactly ``ham`` or ``spam``; files in any other folder are ignored.
    Files are decoded as latin-1.

    Returns:
        tuple[list[str], list[str]]: ``(ham_list, spam_list)`` with the raw
        text of each message, in the order ``os.walk`` yields them.
    """
    messages = {'ham': [], 'spam': []}
    for folder, _children, names in os.walk(data_directory):
        # The folder's own name decides which class its files belong to.
        bucket = messages.get(os.path.basename(folder))
        if bucket is None:
            continue
        for name in names:
            with open(os.path.join(folder, name), encoding="latin-1") as handle:
                bucket.append(handle.read())

    return messages['ham'], messages['spam']

In [20]:
# Load the corpus and print one raw message of each class as a sanity check.
# (model_file is already set in the configuration cell; it is not re-assigned
# here so that a single definition stays authoritative.)
ham_list, spam_list=load_files()
print("Ham Example :", ham_list[2])
print('---------------------------')
print("Spam Example :", spam_list[2])

Ham Example : Subject: hpl nominations for december 28 , 1999
( see attached file : hpll 228 . xls )
- hpll 228 . xls
---------------------------
Spam Example : Subject: food for thoughts
[
join now - take
a free tour ]
click here to be
removed .



In [13]:
def get_features_by_wordbag():
    """Build bag-of-words count features for the whole ham + spam corpus.

    Loads all messages with ``load_files()``, vectorizes them with a
    ``CountVectorizer`` and labels ham as 0 and spam as 1. If
    ``vocabulary_file`` already exists its vocabulary is reused so feature
    columns stay aligned with a previously trained model; otherwise the
    vocabulary learned during fitting is saved to ``vocabulary_file``.

    Returns:
        tuple: ``(x, y)`` where ``x`` is the dense count matrix (one row per
        message, ham rows first, then spam rows) and ``y`` is the matching
        list of 0/1 labels.
    """
    ham_list, spam_list = load_files()
    x = ham_list + spam_list
    y = [0] * len(ham_list) + [1] * len(spam_list)

    vocabulary = None
    if os.path.exists(vocabulary_file):
        # NOTE: joblib.load unpickles the file; only load vocabulary files
        # produced by this notebook or another trusted source.
        vocabulary = joblib.load(vocabulary_file)

    # vocabulary=None is CountVectorizer's default, so one constructor call
    # covers both the "reuse saved vocabulary" and "learn from data" cases.
    vectorizer = CountVectorizer(
        decode_error='ignore',
        vocabulary=vocabulary,
        strip_accents='ascii',
        max_features=max_features,
        stop_words='english',
        max_df=1.0,
        min_df=1,
    )

    x = vectorizer.fit_transform(x)
    # Densified to keep the return type callers already rely on.
    x = x.toarray()

    if vocabulary is None:
        # The learned term -> column mapping lives in `vocabulary_` (trailing
        # underscore). `vectorizer.vocabulary` is only the constructor
        # argument, which is None here, so saving it would persist an empty
        # vocabulary. This branch also rewrites a stale file that holds None.
        joblib.dump(vectorizer.vocabulary_, vocabulary_file)

    return x, y

In [14]:
def train_nb_spam():
    """Extract word-bag features, split them, and train the classifier.

    Holds out 33% of the samples for evaluation with a fixed random seed
    (42), then delegates fitting, saving and scoring to ``nb_wordbag``.
    """
    features, labels = get_features_by_wordbag()
    # train_test_split returns [X_train, X_test, y_train, y_test], which is
    # exactly the positional order nb_wordbag expects.
    partitions = train_test_split(features, labels, test_size=0.33, random_state=42)
    nb_wordbag(*partitions)
    
def nb_wordbag(X_train, X_test, y_train, y_test):
    """Fit a MultinomialNB on the training split, save it, and print its F1.

    Args:
        X_train: Feature matrix used for fitting.
        X_test: Feature matrix used for evaluation.
        y_train: Labels matching ``X_train``.
        y_test: Labels matching ``X_test``.

    Side effects:
        Writes the fitted classifier to ``model_file`` with joblib and prints
        the estimator plus the F1 score on the test split.
    """
    print("Naive Bays and wordbag")
    classifier = MultinomialNB()
    print(classifier)
    classifier.fit(X_train, y_train)
    # Persist immediately after fitting so inference code can load the model.
    joblib.dump(classifier, model_file)
    predictions = classifier.predict(X_test)
    print("f1_score:")
    print(metrics.f1_score(y_test, predictions))

In [15]:
class Spam_Check(object):
    """Wrapper around the persisted classifier for single-message checks."""

    def __init__(self):
        """Load the classifier stored at ``model_file``."""
        self.name = "Spam_Check"
        # NOTE: joblib.load unpickles the file; only load trusted model files.
        self.clf = joblib.load(model_file)

    def check_spam(self, featurevectors):
        """Classify one feature vector.

        Args:
            featurevectors: Feature vector for a single message; it is
                wrapped in a list to form a one-row batch for the model.

        Returns:
            float: 1.0 if the probability of the last class is at least
            ``local_model_threshold``, otherwise 0.0.
        """
        probabilities = self.clf.predict_proba([featurevectors])
        # Row 0 is the only sample; the last column is the final class.
        spam_probability = probabilities[0, -1]
        return float(spam_probability >= local_model_threshold)

In [17]:
# Train the classifier; this persists the model to model_file (and the
# vocabulary to vocabulary_file when that file does not exist yet).
train_nb_spam()
# Load the just-saved model back from disk.
spam_Check=Spam_Check()
# Features comes from the project-local `features` module (not shown here);
# presumably it vectorizes text with the saved vocabulary — TODO confirm.
features_extract = Features(vocabulary_file)
featurevectors=features_extract.extract("learn to make a fortune with ICO!")
# Last expression of the cell: displays 1.0 when the predicted probability
# reaches local_model_threshold, else 0.0.
spam_Check.check_spam(featurevectors)

Naive Bays and wordbag
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
f1_score:
0.9814601259646945


1.0