In [1]:
import os
import tarfile
import urllib.request

DOWNLOAD_ROOT = "http://spamassassin.apache.org/old/publiccorpus/"
HAM_URL = DOWNLOAD_ROOT + "20030228_easy_ham.tar.bz2"
SPAM_URL = DOWNLOAD_ROOT + "20030228_spam.tar.bz2"
SPAM_PATH = os.path.join("datasets", "spam")

def fetch_spam_data(ham_url=HAM_URL, spam_url=SPAM_URL, spam_path=SPAM_PATH):
    if not os.path.isdir(spam_path):
        os.makedirs(spam_path)
    for filename, url in (("ham.tar.bz2", ham_url), ("spam.tar.bz2", spam_url)):
        path = os.path.join(spam_path, filename)
        if not os.path.isfile(path):
            urllib.request.urlretrieve(url, path)
        tar_bz2_file = tarfile.open(path)
        tar_bz2_file.extractall(path=spam_path)
        tar_bz2_file.close()

In [2]:
fetch_spam_data()

In [3]:
HAM_DIR = os.path.join(SPAM_PATH, "easy_ham")
SPAM_DIR = os.path.join(SPAM_PATH, "spam")
ham_filenames = [name for name in sorted(os.listdir(HAM_DIR)) if len(name) > 20]
spam_filenames = [name for name in sorted(os.listdir(SPAM_DIR)) if len(name) > 20]

In [4]:
import email
import email.policy

def load_email(is_spam, filename, spam_path=SPAM_PATH):
    directory = "spam" if is_spam else "easy_ham"
    with open(os.path.join(spam_path, directory, filename), "rb") as f:
        return email.parser.BytesParser(policy=email.policy.default).parse(f)

ham_emails = [load_email(is_spam=False, filename=name) for name in ham_filenames]
spam_emails = [load_email(is_spam=True, filename=name) for name in spam_filenames]

In [5]:
print(ham_emails[1].get_content().strip())

Martin A posted:
Tassos Papadopoulos, the Greek sculptor behind the plan, judged that the
 limestone of Mount Kerdylio, 70 miles east of Salonika and not far from the
 Mount Athos monastic community, was ideal for the patriotic sculpture. 
 
 As well as Alexander's granite features, 240 ft high and 170 ft wide, a
 museum, a restored amphitheatre and car park for admiring crowds are
planned
---------------------
So is this mountain limestone or granite?
If it's limestone, it'll weather pretty fast.

------------------------ Yahoo! Groups Sponsor ---------------------~-->
4 DVDs Free +s&p Join Now
http://us.click.yahoo.com/pt6YBB/NXiEAA/mG3HAA/7gSolB/TM
---------------------------------------------------------------------~->

To unsubscribe from this group, send an email to:
forteana-unsubscribe@egroups.com

 

Your use of Yahoo! Groups is subject to http://docs.yahoo.com/info/terms/


In [6]:
print(spam_emails[2].get_content().strip())

1) Fight The Risk of Cancer!
http://www.adclick.ws/p.cfm?o=315&s=pk007

2) Slim Down - Guaranteed to lose 10-12 lbs in 30 days
http://www.adclick.ws/p.cfm?o=249&s=pk007

3) Get the Child Support You Deserve - Free Legal Advice
http://www.adclick.ws/p.cfm?o=245&s=pk002

4) Join the Web's Fastest Growing Singles Community
http://www.adclick.ws/p.cfm?o=259&s=pk007

5) Start Your Private Photo Album Online!
http://www.adclick.ws/p.cfm?o=283&s=pk007

Have a Wonderful Day,
Offer Manager
PrizeMama













If you wish to leave this list please use the link below.
http://www.qves.com/trim/?zzzz@spamassassin.taint.org%7C17%7C308417


In [8]:
print(spam_emails[100].get_content().strip())

<html>
<head>
<title>Digital Publishing Tools - Free Software Alert!</title>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
</head>
<body bgcolor="#FFFFFF" text="#000000">
<center>
<table width="582" border="2" cellspacing="0" cellpadding="5" bordercolor="#0077CC">
<tr>
<td colspan="3" width="582" align="center" bgcolor="#0077CC"><!5122qHWL1-032pyeM4045IIgM3-001oYhw0942jQSK5-726UDqG9283lEHR8-145EiGhl64>
<a href="http://3dpageturningebook.com" style="text-decoration:none;">
<b><font face="Verdana, Arial, Helvetica, sans-serif" size="4" color="#FFFFFF">Publish Like a Professional with Digital Publishing Tools</font></b>
</a>
</td>
</tr>

<tr>
<td colspan="1" width="204" valign="top">
<b><font face="Verdana Arial, Helvetica, sans-serif" color="#000066" size="2">Easily Create Professional:</font></b>
<font face="Verdana, Arial, Helvetica, sans-serif" size="1" color="#000066">
<ul>
<li>eBooks</li>
<li>eBrochures</li>
<li>eCatalogs</li>
<li>Resumes</li>
<li>Newslett

In [14]:
print(spam_emails[300].get_content().strip())

*** FREE BONUS OFFER - SEE BELOW ***

We can supply TOP QUALITY, VIRTUALLY IDENTICAL REPLICAS of just about anything - from watches to wallets, from lighters to lingerie, clothing, accessories, even electrical goods. All your favorite designer labels reproduced at a fraction of the price.

All major Credit Cards accepted. Worldwide certified shipping. Quality guaranteed.

We are currently building our catalog so let us know that you want to receive notification when our on-line catalog is published later this month to qualify for a GREAT FREE BONUS OFFER.

*** FREE BONUS: Register NOW for our catalog and receive one pair of designer REPLICA sunglasses (our regular price $15) & check out the quality of our goods FREE OF CHARGE ***

For more information email: replicas@fastnetspain.net

Register NOW to be sure you don't miss out !!!








To remove from future mailings email: noreplicas@fastnetspain.net

Register NOW to be sure you don't miss out !!!














-- 
Irish Linux Users

In [15]:
import numpy as np
from sklearn.model_selection import train_test_split

X = np.array(ham_emails + spam_emails, dtype=object)
y = np.array([0] * len(ham_emails) + [1] * len(spam_emails))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
X_train

array([<email.message.EmailMessage object at 0x7fbfc46ab130>,
       <email.message.EmailMessage object at 0x7fbfc46abf40>,
       <email.message.EmailMessage object at 0x7fbfc43433a0>, ...,
       <email.message.EmailMessage object at 0x7fbfc4b846d0>,
       <email.message.EmailMessage object at 0x7fbfc4caee80>,
       <email.message.EmailMessage object at 0x7fbfc484a640>],
      dtype=object)

In [23]:
from bs4 import BeautifulSoup
import pandas as pd

# Define a function to extract plain text from an EmailMessage object with HTML content
def extract_plain_text(email_message):
    # Extract the email's HTML content
    html_content = email_message.get_body(preferencelist=('html'))
    
    # Use BeautifulSoup to parse and extract plain text
    if html_content:
        soup = BeautifulSoup(html_content.get_content(), 'html.parser')
        plain_text = soup.get_text()
        return plain_text
    else:
        return ""

In [24]:
def detect_email_format(email_message):

    if email_message.is_multipart():
        # If the email is multipart (contains different parts), return 'multipart'
        return 'multipart'
    else:
        # Get the content type of the email
        content_type = email_message.get_content_type()
        return content_type

# Example usage:
for email_message in X_train:
    email_format = detect_email_format(email_message)
    print(f"Email format: {email_format}")


Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/html
Email format: text/plain
Email format: text/plain
Email format: multipart
Email format: text/plain
Email format: text/html
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/html
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/html
Email format: multipart
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email 

Email format: text/plain
Email format: text/plain
Email format: text/html
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/html
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: multipart
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Ema

Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: multipart
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/html
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Em

Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/html
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: multipart
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Em

Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: multipart
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: multipart
Email format: text/plain
Email format: text/plain
Email format: multipart
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/html
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/html
Email format: text/plain
Email format: text/html
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email format: text/plain
Email 

In [25]:
from collections import Counter

# Create a list to store the content types from the dataset
content_types = []

# Iterate through the dataset and extract content types
for email_message in X_train:
    content_type = detect_email_format(email_message)
    content_types.append(content_type)

# Use Counter to count occurrences of each content type
content_type_counts = Counter(content_types)

# Print the content types and their counts
for content_type, count in content_type_counts.items():
    print(f"Content Type: {content_type}, Count: {count}")


Content Type: text/plain, Count: 2090
Content Type: text/html, Count: 150
Content Type: multipart, Count: 159
Content Type: multipart/alternative, Count: 1


In [32]:
for email_message in X_train:
    if detect_email_format(email_message) == "text/html":
        # Only perform the following action if the format is "text/html"
        email_plain_text = extract_plain_text(email_message)
        # Do something with email_plain_text


Decoding error: 'str' object has no attribute 'decode'
Decoding error: 'str' object has no attribute 'decode'
Decoding error: 'str' object has no attribute 'decode'
Decoding error: 'str' object has no attribute 'decode'
Decoding error: 'str' object has no attribute 'decode'
Decoding error: 'str' object has no attribute 'decode'
Decoding error: unknown encoding: DEFAULT_CHARSET
Decoding error: 'str' object has no attribute 'decode'
Decoding error: 'str' object has no attribute 'decode'
Decoding error: 'str' object has no attribute 'decode'
Decoding error: 'str' object has no attribute 'decode'
Decoding error: 'str' object has no attribute 'decode'
Decoding error: 'str' object has no attribute 'decode'
Decoding error: 'str' object has no attribute 'decode'
Decoding error: 'str' object has no attribute 'decode'
Decoding error: 'str' object has no attribute 'decode'
Decoding error: 'str' object has no attribute 'decode'
Decoding error: 'str' object has no attribute 'decode'
Decoding error:

In [42]:
html_spam_emails = [email for email in X_train[y_train==1]
                    if detect_email_format(email) == "text/html"]


In [45]:
sample_html_spam = html_spam_emails[7]
print(sample_html_spam.get_content().strip()[:1000], "...")

<HTML><HEAD><TITLE></TITLE><META http-equiv="Content-Type" content="text/html; charset=windows-1252"><STYLE>A:link {TEX-DECORATION: none}A:active {TEXT-DECORATION: none}A:visited {TEXT-DECORATION: none}A:hover {COLOR: #0033ff; TEXT-DECORATION: underline}</STYLE><META content="MSHTML 6.00.2713.1100" name="GENERATOR"></HEAD>
<BODY text="#000000" vLink="#0033ff" link="#0033ff" bgColor="#CCCC99"><TABLE borderColor="#660000" cellSpacing="0" cellPadding="0" border="0" width="100%"><TR><TD bgColor="#CCCC99" valign="top" colspan="2" height="27">
<font size="6" face="Arial, Helvetica, sans-serif" color="#660000">
<b>OTC</b></font></TD></TR><TR><TD height="2" bgcolor="#6a694f">
<font size="5" face="Times New Roman, Times, serif" color="#FFFFFF">
<b>&nbsp;Newsletter</b></font></TD><TD height="2" bgcolor="#6a694f"><div align="right"><font color="#FFFFFF">
<b>Discover Tomorrow's Winners&nbsp;</b></font></div></TD></TR><TR><TD height="25" colspan="2" bgcolor="#CCCC99"><table width="100%" border="0" 

In [46]:
print(extract_plain_text(sample_html_spam.get_content())[:1000], "...")

AttributeError: 'str' object has no attribute 'get_body'

In [47]:
print(extract_plain_text(sample_html_spam)[:1000], "...")

Decoding error: 'str' object has no attribute 'decode'
 ...


In [48]:
from bs4 import BeautifulSoup

def extract_plain_text(email_message):
    """
    Extract plain text from an EmailMessage object with HTML content.

    Args:
        email_message (email.message.EmailMessage): The email message object to analyze.

    Returns:
        str: The extracted plain text from the email.
    """
    # Extract the email's HTML content
    html_content = email_message.get_body(preferencelist=('html'))
    
    if html_content:
        try:
            # Use BeautifulSoup to parse and extract plain text
            soup = BeautifulSoup(html_content.get_content(), 'html.parser')
            plain_text = soup.get_text()
            return plain_text
        except Exception as e:
            # Handle any parsing errors
            print(f"HTML parsing error: {e}")
            return ""
    else:
        return ""


In [49]:
sample_html_spam = html_spam_emails[7]
print(extract_plain_text.get_content().strip()[:1000], "...")

AttributeError: 'function' object has no attribute 'get_content'

In [50]:
extract_plain_text(html_spam_emails[7])

'\n\n\nOTC\n\n\xa0Newsletter\nDiscover Tomorrow\'s Winners\xa0\n\nFor Immediate Release\n\nCal-Bay (Stock Symbol: CBYI)\nWatch for analyst "Strong Buy Recommendations" and several advisory newsletters picking CBYI.  CBYI has filed to be traded on the OTCBB, share prices historically INCREASE when companies get listed on this larger trading exchange. CBYI is trading around 25 cents and should skyrocket to $2.66 - $3.25 a share in the near future.\nPut CBYI on your watch list, acquire a position TODAY.\n\nREASONS TO INVEST IN CBYI\n\nA profitable company and is on track to beat ALL earnings estimates!\n\nOne of the FASTEST growing distributors in environmental & safety equipment instruments.\n\nExcellent management team, several EXCLUSIVE contracts.  IMPRESSIVE client list including the U.S. Air Force, Anheuser-Busch, Chevron Refining and Mitsubishi Heavy Industries, GE-Energy & Environmental Research.\n\nRAPIDLY GROWING INDUSTRY\nIndustry revenues exceed $900 million, estimates indicate

In [52]:
print(extract_plain_text(sample_html_spam.get_content())[:1000], "...")

AttributeError: 'str' object has no attribute 'get_body'

In [53]:
sample_html_spam = html_spam_emails[7]
print(sample_html_spam.get_content().strip()[:1000], "...")

<HTML><HEAD><TITLE></TITLE><META http-equiv="Content-Type" content="text/html; charset=windows-1252"><STYLE>A:link {TEX-DECORATION: none}A:active {TEXT-DECORATION: none}A:visited {TEXT-DECORATION: none}A:hover {COLOR: #0033ff; TEXT-DECORATION: underline}</STYLE><META content="MSHTML 6.00.2713.1100" name="GENERATOR"></HEAD>
<BODY text="#000000" vLink="#0033ff" link="#0033ff" bgColor="#CCCC99"><TABLE borderColor="#660000" cellSpacing="0" cellPadding="0" border="0" width="100%"><TR><TD bgColor="#CCCC99" valign="top" colspan="2" height="27">
<font size="6" face="Arial, Helvetica, sans-serif" color="#660000">
<b>OTC</b></font></TD></TR><TR><TD height="2" bgcolor="#6a694f">
<font size="5" face="Times New Roman, Times, serif" color="#FFFFFF">
<b>&nbsp;Newsletter</b></font></TD><TD height="2" bgcolor="#6a694f"><div align="right"><font color="#FFFFFF">
<b>Discover Tomorrow's Winners&nbsp;</b></font></div></TD></TR><TR><TD height="25" colspan="2" bgcolor="#CCCC99"><table width="100%" border="0" 

In [55]:
print(extract_plain_text(sample_html_spam))




OTC

 Newsletter
Discover Tomorrow's Winners 

For Immediate Release

Cal-Bay (Stock Symbol: CBYI)
Watch for analyst "Strong Buy Recommendations" and several advisory newsletters picking CBYI.  CBYI has filed to be traded on the OTCBB, share prices historically INCREASE when companies get listed on this larger trading exchange. CBYI is trading around 25 cents and should skyrocket to $2.66 - $3.25 a share in the near future.
Put CBYI on your watch list, acquire a position TODAY.

REASONS TO INVEST IN CBYI

A profitable company and is on track to beat ALL earnings estimates!

One of the FASTEST growing distributors in environmental & safety equipment instruments.

Excellent management team, several EXCLUSIVE contracts.  IMPRESSIVE client list including the U.S. Air Force, Anheuser-Busch, Chevron Refining and Mitsubishi Heavy Industries, GE-Energy & Environmental Research.

RAPIDLY GROWING INDUSTRY
Industry revenues exceed $900 million, estimates indicate that there could be as much as

In [56]:
result = extract_plain_text(sample_html_spam)

# Use type() to check the data type of the result
data_type = type(result)

# Print the data type
print(f"Data Type: {data_type}")
 

Data Type: <class 'str'>


In [59]:
sample_html_spam = html_spam_emails[100]
print(sample_html_spam.get_content().strip()[:1000], "...")
print(extract_plain_text(sample_html_spam))

<html>

<head>
<meta http-equiv="Content-Language" content="en-us">
<meta http-equiv="Content-Type" content="text/html; charset=windows-1252">
<meta name="GENERATOR" content="Microsoft FrontPage 4.0">
<meta name="ProgId" content="FrontPage.Editor.Document">
<title>Does Your Computer Need an Oil Change</title>
</head>

<body>

<table border="0" width="538" height="1">
  <tr>
    <td width="538" height="1" align="center" bgcolor="#000000"><b><font face="Century Gothic" size="5" color="#FFFFFF">Does Your Computer Need an Oil
      Change?</font></b></td>
  </tr>
</table>
<table border="0" width="538" height="151">
  <tr>
    <td width="530" height="145"><b><font face="Tahoma" size="5">Norton</font><font color="#006600" face="Verdana" size="7"><br></font><i><font face="Verdana" color="#CC0000" size="7">SystemWorks
      2002</font></i><font size="4" face="Verdana"><br> </font><font face="Tahoma" size="5">Professional
      Edition</font> </b></td>
  </tr>
</table>
<table border="0" width="

In [64]:
from email.message import EmailMessage
from bs4 import BeautifulSoup

def extract_plain_text_from_email(email):
    """
    Extract plain text content from an email message, regardless of content type.

    Args:
        email (email.message.EmailMessage): The email message object to analyze.

    Returns:
        str: The extracted plain text content.
    """
    plain_text = ""

    # Iterate through email parts
    for part in email.walk():
        content_type = part.get_content_type()

        # Extract plain text content for text/plain or multipart/alternative
        if content_type in ["text/plain", "multipart/alternative"]:
            plain_text += part.get_payload(decode=True).decode("utf-8", errors="ignore")

    # If no plain text was found, try to extract plain text from HTML content
    if not plain_text:
        html_content = email.get_body(preferencelist=("html"))
        if html_content:
            try:
                soup = BeautifulSoup(html_content.get_content(), "html.parser")
                plain_text = soup.get_text()
            except Exception as e:
                print(f"HTML parsing error: {e}")

    return plain_text


Plain Text Content:
This is a plain text email content.



In [66]:
print(extract_plain_text_from_email(sample_html_spam)[:100], "...")







Does Your Computer Need an Oil Change




Does Your Computer Need an Oil
      Change?




Nor ...


In [67]:
$ pip3 install nltk

SyntaxError: invalid syntax (<ipython-input-67-07b6eb305c59>, line 1)

In [68]:
try:
    import nltk

    stemmer = nltk.PorterStemmer()
    for word in ("Computations", "Computation", "Computing", "Computed", "Compute", "Compulsive"):
        print(word, "=>", stemmer.stem(word))
except ImportError:
    print("Error: stemming requires the NLTK module.")
    stemmer = None

Computations => comput
Computation => comput
Computing => comput
Computed => comput
Compute => comput
Compulsive => compuls


In [71]:
%pip install -q -U urlextract

Note: you may need to restart the kernel to use updated packages.


In [72]:
try:
    import urlextract # may require an Internet connection to download root domain names
    
    url_extractor = urlextract.URLExtract()
    print(url_extractor.find_urls("Will it detect github.com and https://youtu.be/7Pq-S557XQU?t=3m32s"))
except ImportError:
    print("Error: replacing URLs requires the urlextract module.")
    url_extractor = None

['github.com', 'https://youtu.be/7Pq-S557XQU?t=3m32s']


In [73]:
from sklearn.base import BaseEstimator, TransformerMixin

class EmailToWordCounterTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, strip_headers=True, lower_case=True, remove_punctuation=True,
                 replace_urls=True, replace_numbers=True, stemming=True):
        self.strip_headers = strip_headers
        self.lower_case = lower_case
        self.remove_punctuation = remove_punctuation
        self.replace_urls = replace_urls
        self.replace_numbers = replace_numbers
        self.stemming = stemming
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X_transformed = []
        for email in X:
            text = email_to_text(email) or ""
            if self.lower_case:
                text = text.lower()
            if self.replace_urls and url_extractor is not None:
                urls = list(set(url_extractor.find_urls(text)))
                urls.sort(key=lambda url: len(url), reverse=True)
                for url in urls:
                    text = text.replace(url, " URL ")
            if self.replace_numbers:
                text = re.sub(r'\d+(?:\.\d*)?(?:[eE][+-]?\d+)?', 'NUMBER', text)
            if self.remove_punctuation:
                text = re.sub(r'\W+', ' ', text, flags=re.M)
            word_counts = Counter(text.split())
            if self.stemming and stemmer is not None:
                stemmed_word_counts = Counter()
                for word, count in word_counts.items():
                    stemmed_word = stemmer.stem(word)
                    stemmed_word_counts[stemmed_word] += count
                word_counts = stemmed_word_counts
            X_transformed.append(word_counts)
        return np.array(X_transformed)

In [74]:
X_few = X_train[:3]
X_few_wordcounts = EmailToWordCounterTransformer().fit_transform(X_few)
X_few_wordcounts

NameError: name 'email_to_text' is not defined

In [75]:
from sklearn.base import BaseEstimator, TransformerMixin

class EmailToWordCounterTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, strip_headers=True, lower_case=True, remove_punctuation=True,
                 replace_urls=True, replace_numbers=True, stemming=True):
        self.strip_headers = strip_headers
        self.lower_case = lower_case
        self.remove_punctuation = remove_punctuation
        self.replace_urls = replace_urls
        self.replace_numbers = replace_numbers
        self.stemming = stemming
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X_transformed = []
        for email in X:
            text = extract_plain_text_from_email(email) or ""
            if self.lower_case:
                text = text.lower()
            if self.replace_urls and url_extractor is not None:
                urls = list(set(url_extractor.find_urls(text)))
                urls.sort(key=lambda url: len(url), reverse=True)
                for url in urls:
                    text = text.replace(url, " URL ")
            if self.replace_numbers:
                text = re.sub(r'\d+(?:\.\d*)?(?:[eE][+-]?\d+)?', 'NUMBER', text)
            if self.remove_punctuation:
                text = re.sub(r'\W+', ' ', text, flags=re.M)
            word_counts = Counter(text.split())
            if self.stemming and stemmer is not None:
                stemmed_word_counts = Counter()
                for word, count in word_counts.items():
                    stemmed_word = stemmer.stem(word)
                    stemmed_word_counts[stemmed_word] += count
                word_counts = stemmed_word_counts
            X_transformed.append(word_counts)
        return np.array(X_transformed)

In [77]:
import re

In [78]:
X_few = X_train[:3]
X_few_wordcounts = EmailToWordCounterTransformer().fit_transform(X_few)
X_few_wordcounts

array([Counter({'chuck': 1, 'murcko': 1, 'wrote': 1, 'stuff': 1, 'yawn': 1, 'r': 1}),
       Counter({'the': 11, 'of': 9, 'and': 8, 'all': 3, 'christian': 3, 'to': 3, 'by': 3, 'jefferson': 2, 'i': 2, 'have': 2, 'superstit': 2, 'one': 2, 'on': 2, 'been': 2, 'ha': 2, 'half': 2, 'rogueri': 2, 'teach': 2, 'jesu': 2, 'some': 1, 'interest': 1, 'quot': 1, 'url': 1, 'thoma': 1, 'examin': 1, 'known': 1, 'word': 1, 'do': 1, 'not': 1, 'find': 1, 'in': 1, 'our': 1, 'particular': 1, 'redeem': 1, 'featur': 1, 'they': 1, 'are': 1, 'alik': 1, 'found': 1, 'fabl': 1, 'mytholog': 1, 'million': 1, 'innoc': 1, 'men': 1, 'women': 1, 'children': 1, 'sinc': 1, 'introduct': 1, 'burnt': 1, 'tortur': 1, 'fine': 1, 'imprison': 1, 'what': 1, 'effect': 1, 'thi': 1, 'coercion': 1, 'make': 1, 'world': 1, 'fool': 1, 'other': 1, 'hypocrit': 1, 'support': 1, 'error': 1, 'over': 1, 'earth': 1, 'six': 1, 'histor': 1, 'american': 1, 'john': 1, 'e': 1, 'remsburg': 1, 'letter': 1, 'william': 1, 'short': 1, 'again': 1, 'becom

In [79]:
from scipy.sparse import csr_matrix

class WordCounterToVectorTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, vocabulary_size=1000):
        self.vocabulary_size = vocabulary_size
    def fit(self, X, y=None):
        total_count = Counter()
        for word_count in X:
            for word, count in word_count.items():
                total_count[word] += min(count, 10)
        most_common = total_count.most_common()[:self.vocabulary_size]
        self.vocabulary_ = {word: index + 1 for index, (word, count) in enumerate(most_common)}
        return self
    def transform(self, X, y=None):
        rows = []
        cols = []
        data = []
        for row, word_count in enumerate(X):
            for word, count in word_count.items():
                rows.append(row)
                cols.append(self.vocabulary_.get(word, 0))
                data.append(count)
        return csr_matrix((data, (rows, cols)), shape=(len(X), self.vocabulary_size + 1))

In [80]:
vocab_transformer = WordCounterToVectorTransformer(vocabulary_size=10)
X_few_vectors = vocab_transformer.fit_transform(X_few_wordcounts)
X_few_vectors

<3x11 sparse matrix of type '<class 'numpy.int64'>'
	with 20 stored elements in Compressed Sparse Row format>

In [81]:
X_few_vectors.toarray()

array([[ 6,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [99, 11,  9,  8,  3,  1,  3,  1,  3,  2,  3],
       [67,  0,  1,  2,  3,  4,  1,  2,  0,  1,  0]])

In [82]:
vocab_transformer.vocabulary_

{'the': 1,
 'of': 2,
 'and': 3,
 'to': 4,
 'url': 5,
 'all': 6,
 'in': 7,
 'christian': 8,
 'on': 9,
 'by': 10}

In [83]:
from sklearn.pipeline import Pipeline

preprocess_pipeline = Pipeline([
    ("email_to_wordcount", EmailToWordCounterTransformer()),
    ("wordcount_to_vector", WordCounterToVectorTransformer()),
])

X_train_transformed = preprocess_pipeline.fit_transform(X_train)

HTML parsing error: unknown encoding: DEFAULT_CHARSET


AttributeError: 'NoneType' object has no attribute 'decode'

In [84]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

log_clf = LogisticRegression(solver="lbfgs", max_iter=1000, random_state=42)
score = cross_val_score(log_clf, X_train_transformed, y_train, cv=3, verbose=3)
score.mean()


NameError: name 'X_train_transformed' is not defined

In [85]:
from sklearn.metrics import precision_score, recall_score

X_test_transformed = preprocess_pipeline.transform(X_test)

log_clf = LogisticRegression(solver="lbfgs", max_iter=1000, random_state=42)
log_clf.fit(X_train_transformed, y_train)

y_pred = log_clf.predict(X_test_transformed)

print("Precision: {:.2f}%".format(100 * precision_score(y_test, y_pred)))
print("Recall: {:.2f}%".format(100 * recall_score(y_test, y_pred)))

AttributeError: 'NoneType' object has no attribute 'decode'