# Linear Regression Email Spam Classifier

### 1. Fetch Spam Assassin Data

HAM and SPAM represent non-spam and spam emails respectively.


In [1]:
import os 
import shutil
from modules.download import DataDownloader

In [2]:
# Download configuration: remote corpus location, archive names, local paths.

# Base URL of the corpus and the two archives fetched from it
DOWNLOAD_ROOT = "http://spamassassin.apache.org/old/publiccorpus/"
HAM_FILE = "20030228_easy_ham.tar.bz2"
SPAM_FILE = "20030228_spam.tar.bz2"

# Full URL of each archive (root followed by the archive name)
HAM_URL = f"{DOWNLOAD_ROOT}{HAM_FILE}"
SPAM_URL = f"{DOWNLOAD_ROOT}{SPAM_FILE}"

# Local dataset directories, built with the platform's path separator.
# NOTE(review): the directory-move cell relocates a folder named "easy_ham",
# so HAM_PATH ("datasets/ham") may not match what is on disk — confirm.
HAM_PATH = os.path.join("datasets", "ham")  # datasets/ham
SPAM_PATH = os.path.join("datasets", "spam")  # datasets/spam

In [19]:
# Build a DataDownloader for the ham and spam archives, then fetch them.
# The recorded output shows each archive being written under datasets/.
archive_names = [HAM_FILE, SPAM_FILE]
data = DataDownloader(DOWNLOAD_ROOT, archive_names)
data.download()

Ctor called
Downloading 20030228_easy_ham.tar.bz2... to datasets/20030228_easy_ham.tar.bz2
Filetype:  bz2
Done!
Downloading 20030228_spam.tar.bz2... to datasets/20030228_spam.tar.bz2
Filetype:  bz2
Done!


In [None]:
# Modify file directory for convenience: lift each extracted corpus folder out
# of its archive-named wrapper directory so it sits directly under datasets/.

sources = ["datasets/20030228_easy_ham.tar.bz2/easy_ham", "datasets/20030228_spam.tar.bz2/spam"]
for source in sources:
    if not os.path.isdir(source):
        # Already relocated by a previous run (or not extracted yet); skipping
        # keeps the cell safe under Restart Kernel -> Run All.
        continue
    # Use shutil.move's default copy function. `copy_function` is the per-file
    # copier handed to copytree when a rename is impossible, so passing
    # shutil.copytree itself would fail on a cross-filesystem move.
    shutil.move(source, "datasets")
    # Remove the wrapper directory the folder was moved out of.
    # os.rmdir only succeeds if that directory is now empty.
    os.rmdir(os.path.dirname(source))

### 2. Parsing Data

In [4]:
import email
import pandas as pd
import numpy as np

In [5]:
# Assemble pd database from parsing

def _extract_body(message):
    """Return the text body of a parsed email message as a ``str``.

    For a multipart message, the first ``text/plain`` part that is not marked
    as an attachment is used. For any other message, its own payload is used.
    The payload is transfer-decoded (base64 / quoted-printable) and converted
    to text with the declared charset, falling back to UTF-8 with replacement
    characters when the charset is missing or unknown.

    Returns an empty string when no suitable part exists, so a message never
    inherits the body of the message parsed before it.
    """
    if message.is_multipart():
        text_part = None
        for part in message.walk():
            content_disposition = str(part.get("Content-Disposition"))
            if part.get_content_type() == "text/plain" and "attachment" not in content_disposition:
                text_part = part
                break
        if text_part is None:
            return ""
    else:
        text_part = message

    raw = text_part.get_payload(decode=True)
    if raw is None:
        return ""
    charset = text_part.get_content_charset() or "utf-8"
    try:
        return raw.decode(charset, errors="replace")
    except (LookupError, ValueError):
        # Unknown or malformed charset label: fall back rather than abort the load.
        return raw.decode("utf-8", errors="replace")


content = []
# Loop through ham and spam. Sorting makes the row order (and therefore any
# seeded train/test split) reproducible, since os.listdir order is unspecified.
for ham_or_spam in sorted(os.listdir("datasets/")):
    label_dir = os.path.join("datasets", ham_or_spam)
    if not os.path.isdir(label_dir):
        # Stray files under datasets/ are not corpora.
        continue
    # Only the directory literally named "spam" is labelled 1; everything else is 0.
    label = 1 if ham_or_spam == "spam" else 0
    # NOTE(review): the recorded output shows spam rows starting at index 2501,
    # which suggests a non-email entry in the ham directory is being parsed as
    # a message — confirm and filter it out if so.
    for file in sorted(os.listdir(label_dir)):
        path_to_email = os.path.join(label_dir, file)
        if not os.path.isfile(path_to_email):
            continue
        with open(path_to_email, "rb") as email_file:
            # Parse the email message using the email package
            message = email.message_from_binary_file(email_file)

        # Row layout must match the DataFrame columns built in the next cell:
        # subject, sender, body, recipient(s), spam label.
        content.append([
            message["Subject"],
            message["From"],
            _extract_body(message),
            message["To"],
            label,
        ])


In [6]:
# Keep the parsed rows as Python objects. Letting NumPy infer the dtype can
# coerce every value (including the 0/1 label) to a fixed-width string when
# no field happens to be None.
data_representation = np.array(content, dtype=object)
df = pd.DataFrame(data_representation, columns = ["Subject", "Sender", "Body", "Recipient", "Spam"])
# Store the label as a real integer column so filters such as df["Spam"] == 1
# and downstream estimators see numeric classes.
df = df.astype({"Spam": int})
df.head()


Unnamed: 0,Subject,Sender,Body,Recipient,Spam
0,Re: New Sequences Window,Robert Elz <kre@munnari.OZ.AU>,"Date: Wed, 21 Aug 2002 10:54:46 -05...",Chris Garrigues <cwg-dated-1030377287.06fa6d@D...,0
1,[zzzzteana] RE: Alexander,Steve Burt <Steve_Burt@cursor-system.com>,"Martin A posted:\nTassos Papadopoulos, the Gre...","""'zzzzteana@yahoogroups.com'"" <zzzzteana@yahoo...",0
2,[zzzzteana] Moscow bomber,"""Tim Chapman"" <timc@2ubh.com>",Man Threatens Explosion In Moscow \n\nThursday...,zzzzteana <zzzzteana@yahoogroups.com>,0
3,[IRR] Klez: The Virus That Won't Die,Monty Solomon <monty@roscom.com>,Klez: The Virus That Won't Die\n \nAlready the...,undisclosed-recipient: ;,0
4,Re: [zzzzteana] Nothing like mama used to make,Stewart Smith <Stewart.Smith@ee.ed.ac.uk>,"> in adding cream to spaghetti carbonara, whi...",zzzzteana@yahoogroups.com,0


### 3. Data Exploration

In [7]:
# Preview the first rows labelled as spam (Spam == 1)
df.loc[df["Spam"] == 1].head()

Unnamed: 0,Subject,Sender,Body,Recipient,Spam
2501,Life Insurance - Why Pay More?,12a1mailbot1@web.de,"<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.0 Tr...",<dcek1a1@netsgo.com>,1
2502,[ILUG] Guaranteed to lose 10-12 lbs in 30 days...,"""Slim Down"" <taylor@s3.serveimage.com>",1) Fight The Risk of Cancer!\nhttp://www.adcli...,<ilug@linux.ie>,1
2503,Guaranteed to lose 10-12 lbs in 30 days ...,"""Slim Down"" <sabrina@mx3.1premio.com>",1) Fight The Risk of Cancer!\nhttp://www.adcli...,<zzzz@spamassassin.taint.org>,1
2504,Re: Fw: User Name & Password to Membership To ...,Account Services <wsup@playful.com>,##############################################...,zzzz@spamassassin.taint.org,1
2505,[ILUG-Social] re: Guaranteed to lose 10-12 lbs...,"""Slim n Trim"" <yenene@mx2.1premio.com>",I thought you might like these:\n1) Slim Down ...,<social@linux.ie>,1


In [9]:
# Show the first email's body with leading/trailing whitespace removed
print(df["Body"].iloc[0].strip())

Date:        Wed, 21 Aug 2002 10:54:46 -0500
    From:        Chris Garrigues <cwg-dated-1030377287.06fa6d@DeepEddy.Com>
    Message-ID:  <1029945287.4797.TMDA@deepeddy.vircio.com>


  | I can't reproduce this error.

For me it is very repeatable... (like every time, without fail).

This is the debug log of the pick happening ...

18:19:03 Pick_It {exec pick +inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace} {4852-4852 -sequence mercury}
18:19:03 exec pick +inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace 4852-4852 -sequence mercury
18:19:04 Ftoc_PickMsgs {{1 hit}}
18:19:04 Marking 1 hits
18:19:04 tkerror: syntax error in expression "int ...

Note, if I run the pick command by hand ...

delta$ pick +inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace  4852-4852 -sequence mercury
1 hit

That's where the "1 hit" comes from (obviously).  The version of nmh I'm
using is ...

delta$ pick -version
pick -- nmh-1.0.4 [compiled on fuchsia.cs.mu.OZ.AU at Sun Mar 17 14:55:56 

In [10]:
# Show the first email's body exactly as stored
print(df["Body"].iloc[0])

    Date:        Wed, 21 Aug 2002 10:54:46 -0500
    From:        Chris Garrigues <cwg-dated-1030377287.06fa6d@DeepEddy.Com>
    Message-ID:  <1029945287.4797.TMDA@deepeddy.vircio.com>


  | I can't reproduce this error.

For me it is very repeatable... (like every time, without fail).

This is the debug log of the pick happening ...

18:19:03 Pick_It {exec pick +inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace} {4852-4852 -sequence mercury}
18:19:03 exec pick +inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace 4852-4852 -sequence mercury
18:19:04 Ftoc_PickMsgs {{1 hit}}
18:19:04 Marking 1 hits
18:19:04 tkerror: syntax error in expression "int ...

Note, if I run the pick command by hand ...

delta$ pick +inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace  4852-4852 -sequence mercury
1 hit

That's where the "1 hit" comes from (obviously).  The version of nmh I'm
using is ...

delta$ pick -version
pick -- nmh-1.0.4 [compiled on fuchsia.cs.mu.OZ.AU at Sun Mar 17 14:55

### 4. Preprocessing

In [27]:
from modules.processing import Process
import nltk

In [28]:
# Fetch the NLTK "stopwords" and "punkt" resources by name. When they are
# already installed the call only reports that they are up to date.
# Presumably these back the stop-word list and tokenizer used by the Process
# helpers — TODO confirm against modules/processing.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\amira\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\amira\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [29]:
# Run every raw body through Process.lem_words and store the result next to
# the original text in a new "Processed" column.
# NOTE(review): the recorded run of this cell raised AttributeError because
# Process has no attribute `lem_words` — confirm the helper's name in
# modules/processing before relying on this cell.
# Process.stop_word_removal is intentionally not applied here (it was disabled).
processed_text = [Process.lem_words(raw_body) for raw_body in df["Body"]]

df["Processed"] = processed_text
df.head()

AttributeError: type object 'Process' has no attribute 'lem_words'

In [18]:
# Raw body of the first email, for comparison with its processed form
print(df.iloc[0]["Body"])


    Date:        Wed, 21 Aug 2002 10:54:46 -0500
    From:        Chris Garrigues <cwg-dated-1030377287.06fa6d@DeepEddy.Com>
    Message-ID:  <1029945287.4797.TMDA@deepeddy.vircio.com>


  | I can't reproduce this error.

For me it is very repeatable... (like every time, without fail).

This is the debug log of the pick happening ...

18:19:03 Pick_It {exec pick +inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace} {4852-4852 -sequence mercury}
18:19:03 exec pick +inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace 4852-4852 -sequence mercury
18:19:04 Ftoc_PickMsgs {{1 hit}}
18:19:04 Marking 1 hits
18:19:04 tkerror: syntax error in expression "int ...

Note, if I run the pick command by hand ...

delta$ pick +inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace  4852-4852 -sequence mercury
1 hit

That's where the "1 hit" comes from (obviously).  The version of nmh I'm
using is ...

delta$ pick -version
pick -- nmh-1.0.4 [compiled on fuchsia.cs.mu.OZ.AU at Sun Mar 17 14:55

In [23]:
# Processed text of the first email
print(df.iloc[0]["Processed"])

date : wed , 21 aug 2002 10:54:46 -0500 from : chri garrigu < cwg-dated-1030377287.06fa6d @ deepeddy.com > message-id : < 1029945287.4797.tmda @ deepeddy.vircio.com > | i ca n't reproduc thi error . for me it is veri repeat ... ( like everi time , without fail ) . thi is the debug log of the pick happen ... 18:19:03 pick_it { exec pick +inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace } { 4852-4852 -sequenc mercuri } 18:19:03 exec pick +inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace 4852-4852 -sequenc mercuri 18:19:04 ftoc_pickmsg { { 1 hit } } 18:19:04 mark 1 hit 18:19:04 tkerror : syntax error in express `` int ... note , if i run the pick command by hand ... delta $ pick +inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace 4852-4852 -sequenc mercuri 1 hit that 's where the `` 1 hit '' come from ( obvious ) . the version of nmh i'm use is ... delta $ pick -version pick -- nmh-1.0.4 [ compil on fuchsia.cs.mu.oz.au at sun mar 17 14:55:56 ict 2002 ] and the relev 

In [45]:
from sklearn.model_selection import train_test_split

In [46]:
# Hold out 20% of the emails for testing; the fixed seed keeps the split repeatable.
# NOTE(review): this splits the raw "Body" column, not "Processed" — confirm
# that training on unprocessed text is intended.
X_train, X_test, Y_train, Y_test = train_test_split(
    df["Body"],
    df["Spam"],
    test_size=0.2,
    random_state=42,
)