In [64]:
import os 
import pandas as pd

In [65]:
data_path = "dataset/enron1"

In [66]:
#Create list of dictionaries to hold data from dataset 
#Format: [{'filename': file, 'content': content, 'label': 'ham/spam'}]
ham = []
spam = []

In [67]:
#Fill the list using OS walk, return filled lists
def list_fill(ham, spam):
    for root, dirs, files in os.walk(data_path):
    #in subdirectory
        for file in files:
            with open(os.path.join(root, file), 'r', encoding='latin1') as f:
                content = f.read()
                #if in ham, append ham 
                if 'ham' in root:
                    ham.append({'filename': file, 'content': content, 'label': 'ham'})
                #if in spam, append spam
                elif 'spam' in root:
                    spam.append({'filename': file, 'content': content, 'label': 'spam'})
    return ham, spam

In [68]:
#Take in 2 lists, create 2 dataframes and then merge. Return merged dataframe
def create_df(ham, spam): 
    ham_df = pd.DataFrame(ham)
    spam_df = pd.DataFrame(spam)
    df = pd.concat([ham_df, spam_df], ignore_index = True)
    return df

In [69]:
#Create df, verifiy contents
ham, spam = list_fill(ham, spam)
df = create_df(ham, spam)
print(df.head)

<bound method NDFrame.head of                             filename  \
0     1365.2000-06-16.farmer.ham.txt   
1     3560.2001-02-09.farmer.ham.txt   
2     0877.2000-04-06.farmer.ham.txt   
3     2937.2000-11-27.farmer.ham.txt   
4     1270.2000-06-07.farmer.ham.txt   
...                              ...   
5167     1505.2004-07-09.GP.spam.txt   
5168     2148.2004-09-13.GP.spam.txt   
5169     2406.2004-10-06.GP.spam.txt   
5170     1459.2004-06-30.GP.spam.txt   
5171     2030.2004-08-30.GP.spam.txt   

                                                content label  
0     Subject: revised sea robin availabilities effe...   ham  
1     Subject: re : january spot purchases - deals n...   ham  
2     Subject: re : buyback / deficiency deals works...   ham  
3     Subject: king ranch processed volumes at tailg...   ham  
4     Subject: confirming requisitions\nconfirming t...   ham  
...                                                 ...   ...  
5167  Subject: what she doesnt know sprig

In [70]:
#Preprocessing phase. Filter text to be all lower case, remove punctuation and newline characters.
#Tokenize text and remove stopwords. Return filtered tokens.
import string 
import nltk
from nltk.corpus import stopwords 
from nltk.stem import PorterStemmer
def preprocess(text): 
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.replace('\n', ' ')
    tokens = nltk.word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    #stemmer = PorterStemmer()
    #stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
    
    return ' '.join(filtered_tokens)

In [71]:
#Assign a new column for preprocessed content
df['preprocessed_content'] = df['content'].apply(preprocess)
print(df.iloc[4000]['preprocessed_content'])


subject 8 dear friend size 1 order confirmation order shipped january via fedex federal express tracking number 45954036 thank registering userid 56075519 learn make fortune ebay complete turnkey system software videos turorials clk information clilings


In [83]:
#Create generic train and predict functions. 
#Train takes in split training and testing values,and a model. returns fitted model. 
#Predict takes in X_test, X_train(for vectorization fitting), y_test values and model. Displays prediction for 
#first 5 iterations. 
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import CountVectorizer

def train(X_train, X_test, y_train, y_test, model):
    vectorizer = CountVectorizer()
    X_train_vectorized = vectorizer.fit_transform(X_train)
    X_test_vectorized = vectorizer.transform(X_test) 
    model.fit(X_train_vectorized, y_train)
    print(f'Model trained. Model score: {model.score(X_test_vectorized, y_test)}')
    return model

def predict(X_test, X_train, y_test, model): 
    vectorizer = CountVectorizer()
    X_train_vectorized = vectorizer.fit_transform(X_train)
    X_test_vectorized = vectorizer.transform(X_test)
    for pred in range(5): 
        prediction = model.predict(X_test_vectorized)
        print(f'Predicted: {prediction[pred]}')
        print(f'Actual: {y_test.iloc[pred]}')
        
def print_model_score(X_train, X_test, y_test, model): 
    vectorizer = CountVectorizer()
    X_train_vectorized = vectorizer.fit_transform(X_train)
    X_test_vectorized = vectorizer.transform(X_test)
    print(f'Model score: {model.score(X_test_vectorized, y_test)}')
    

In [84]:
#Create model (In this case, decision tree)
#Establish X and y values. Split using train_test_split. 
#Assign model to trained model using train method.
#Show predictions for first 5 iterations, comparing predictions to actual values.
model = DecisionTreeClassifier()
X = df['preprocessed_content']
y= df['label']
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = .2, random_state= 42)
model = train(X_train, X_test, y_train, y_test, model)
predict(X_test, X_train, y_test, model)

Model trained. Model score: 0.9536231884057971
Predicted: ham
Actual: ham
Predicted: ham
Actual: ham
Predicted: ham
Actual: ham
Predicted: ham
Actual: ham
Predicted: ham
Actual: ham


In [85]:
#Verify model on new unseen data
datapath_1 = 'dataset/enron2'
ham2 = []
spam2 = []
ham2, spam2 = list_fill(ham1, spam2)
df2 = create_df(ham2,spam2)
df2['preprocessed_features'] = df['content'].apply(preprocess)
X2 = df2['preprocessed_features']
y2 = df2['label']
X_train2, X_test2, y_train_2, y_test2 = train_test_split(X, y, test_size = .2, random_state = 42)

print_model_score(X_train2, X_test2, y_test2, model)
predict(X_test2, X_train2, y_test2, model)

Model score: 0.9536231884057971
Predicted: ham
Actual: ham
Predicted: ham
Actual: ham
Predicted: ham
Actual: ham
Predicted: ham
Actual: ham
Predicted: ham
Actual: ham
