In [1]:
import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style = "whitegrid", 
        color_codes = True,
        font_scale = 1.5)

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import classification_report

import re

In [7]:
# Read the full training set from disk
emails = pd.read_csv(filepath_or_buffer='data/train.csv')

In [8]:
# Split the training data into two smaller files. They are written
# into 'data/' because that is the directory the following cell
# reads 'train_1.csv' and 'train_2.csv' back from.
emails.loc[:3999, :].to_csv('data/train_1.csv', index=False)
emails.loc[4000:, :].to_csv('data/train_2.csv', index=False)

In [9]:
# Read both halves back in and stack them into one frame with a
# fresh 0..n-1 index
emails_1, emails_2 = (
    pd.read_csv(path)
    for path in ('data/train_1.csv', 'data/train_2.csv')
)
emails = pd.concat([emails_1, emails_2], ignore_index=True)
emails.head()

Unnamed: 0,id,subject,email,spam
0,0,Subject: A&L Daily to be auctioned in bankrupt...,URL: http://boingboing.net/#85534171\n Date: N...,0
1,1,"Subject: Wired: ""Stronger ties between ISPs an...",URL: http://scriptingnews.userland.com/backiss...,0
2,2,Subject: It's just too small ...,<HTML>\n <HEAD>\n </HEAD>\n <BODY>\n <FONT SIZ...,1
3,3,Subject: liberal defnitions\n,Depends on how much over spending vs. how much...,0
4,4,Subject: RE: [ILUG] Newbie seeks advice - Suse...,hehe sorry but if you hit caps lock twice the ...,0


In [None]:
# Report the dimensions of the dataset
n_rows, n_cols = emails.shape
print(f'There are {n_rows} rows in the '
      f'dataset and {n_cols} columns.')

In [None]:
# Duplicate check: rows in excess of the number of distinct 'id' values
total_PIDs = len(emails)
# dropna=False so that a missing id counts as one distinct value
unique_PIDs = emails['id'].nunique(dropna=False)
number_of_dupes = total_PIDs - unique_PIDs
print(f'There are {number_of_dupes} duplicates in the dataset.')

In [None]:
# Hold out 10% of the emails as a test set; the fixed seed makes
# the split reproducible
train, test = train_test_split(emails, random_state=1, test_size=0.1)

In [None]:
# Number of missing values in each column of 'train', largest first
missing_per_column = train.isna().sum()
missing_per_column.sort_values(ascending=False)

In [None]:
# Class imbalance? Percentage of training emails carrying each label
class_labels = train['spam'].replace({0: 'Ham', 1: 'Spam'})
class_labels.value_counts() * 100 / len(train)

In [None]:
# Some emails have NaNs for their subjects
def handle_missing_data(data):
    """Returns a copy of DATA in which every missing value has
    been replaced by the empty string."""
    return data.fillna('')

In [None]:
def frac_upper(string):
    """Returns the share of the ASCII letters (a-z, A-Z) in
    STRING that are uppercase, or 0 when STRING contains no
    such letters."""
    num_letters = sum(1 for _ in re.finditer(r'[a-zA-Z]', string))
    # Guard against dividing by zero for letter-free strings
    if num_letters == 0:
        return 0
    num_upper = sum(1 for _ in re.finditer(r'[A-Z]', string))
    return num_upper / num_letters
    
def number_of(regex):
    """Returns a function that, applied to a string, counts
    the number of occurrences of REGEX in that string (for
    use in making new features below)."""
    def count_matches(string):
        return len(re.findall(regex, string))

    return count_matches

In [None]:
def make_new_features(data):
    """Adds text-derived feature columns to DATA and returns DATA.

    Expects string-valued 'subject' and 'email' columns; run
    handle_missing_data first, because the string operations
    below do not cope with NaN. The new columns are assigned
    onto DATA itself, so the dataframe passed in is modified.

    Columns added: 'combined', 'no_punc', 'log_chars',
    'log_words', 'log_new_lines', 'log_angle_brackets',
    'log_exclamations', 'log_punctuation' and 'frac_upper'."""
    # Combine subject and email columns
    data['combined'] = data['subject'] + ' ' + data['email']

    # Make everything in 'combined' lowercase and remove punctuation.
    # regex=True has to be explicit: from pandas 2.0 on, str.replace
    # treats the pattern as a literal string by default, which would
    # leave all punctuation in place.
    data['no_punc'] = (
        data['combined']
        .str.lower()
        .str.replace(pat=r'[^\w\s]', repl=' ', regex=True)
    )
    
    # Count number of characters, words, new line characters,
    # etc. Take logs of these (log1p, so a count of zero maps
    # to zero). Also compute fraction of uppercase letters in email
    data['log_chars'] = np.log1p(data['combined'].apply(len))
    data['log_words'] = np.log1p(data['no_punc'].apply(lambda string: len(string.split())))
    data['log_new_lines'] = np.log1p(data['combined'].apply(number_of(r'[\n]')))
    data['log_angle_brackets'] = np.log1p(data['combined'].apply(number_of(r'[<>]')))
    data['log_exclamations'] = np.log1p(data['combined'].apply(number_of(r'[!]')))
    # Note that this character class also counts newlines
    data['log_punctuation'] = np.log1p(data['combined'].apply(number_of(r'[\n$%<>!?]')))
    data['frac_upper'] = data['combined'].apply(frac_upper)
    
    return data

In [None]:
# The following function runs only on 'train' so as to
# guarantee that, when we run our data through the pipeline,
# the same words are used for both 'train' and 'test'.
def spam_ham_words(min_emails):
    """Ranks the words of the training emails by how strongly
    they are associated with spam.

    Reads the module-level dataframe 'train' and, as a side
    effect, writes the columns 'combined' and 'no_punc' onto it.

    Returns a dataframe indexed by word with a single column
    'spam': the mean spam label over all occurrences of that
    word, sorted from highest to lowest. Only words occurring
    in at least MIN_EMAILS distinct emails are kept."""
    # Combine subject and email columns
    train['combined'] = train['subject'] + ' ' + train['email']
    
    # Make everything in 'combined' lowercase and remove punctuation.
    # regex=True has to be explicit: from pandas 2.0 on, str.replace
    # treats the pattern as a literal string by default.
    train['no_punc'] = (
        train['combined']
        .str.lower()
        .str.replace(pat=r'[^\w\s]', repl=' ', regex=True)
    )
    
    # Put email text into "tidy format", i.e., each word of each
    # email gets put into its own row, indexed by the row label
    # of the email in 'train'
    tidy_format = (
        train['no_punc']
        .str.split(expand=True)
        .stack()
        .reset_index(level=1)
        .rename(columns={'level_1': 'num', 0: 'word'})
        # The following lines drop repeated words in same email
        # Not sure if should keep these
        #.drop('num', axis=1)
        #.reset_index()
        #.drop_duplicates()
        #.set_index('index')
    )
    
    # Find which words are most indicative of a spam email
    # versus a ham email. Limit to words appearing in at
    # least 'min_emails' emails. Because repeated words are
    # not dropped above, an email contributes to a word's
    # mean once per occurrence of that word.
    words = (
        tidy_format
        .groupby('word')
        .filter(lambda x: x.index.nunique() >= min_emails)
        .merge(train[['spam']], how="left", left_index=True, right_index=True)
        .groupby('word')[['spam']]
        .mean()
        .sort_values('spam', ascending=False)
    )
    
    return words

def words_in_texts(words, texts):
    """Returns a dataframe the (i, j)^th entry of which is 1 
    if the i^th element of TEXTS contains the j^th element of
    WORDS as a substring and is 0 otherwise.

    WORDS is an iterable of strings and TEXTS a pandas Series
    of strings. The result has one column per word (in the
    given order, repeats included) and a fresh 0..n-1 index.
    Missing texts count as containing nothing."""
    words = list(words)
    if not words:
        # Nothing to look for: one row per text, no columns
        return pd.DataFrame(index=pd.RangeIndex(len(texts)))

    # regex=False gives the literal substring test the docstring
    # promises; with the default, a word such as '...' would be
    # read as a pattern matching any three characters.
    indicator_array = np.column_stack([
        texts.str.contains(word, regex=False, na=False).to_numpy(dtype=int)
        for word in words
    ])
    return pd.DataFrame(indicator_array, columns=words)

In [None]:
def append_words_cols(data, words):
    """Returns DATA (renumbered 0..n-1) with one extra binary
    column per word in WORDS, marking whether that word is found
    in the email's 'no_punc' text."""
    # pd.concat lines rows up by index label, and the indicator
    # frame comes back with a default integer index, so DATA has
    # to be renumbered to match before the two are joined
    renumbered = data.reset_index(drop=True)
    indicators = words_in_texts(words, renumbered['no_punc'])
    return pd.concat([renumbered, indicators], axis=1)

In [None]:
# Drop unneeded columns
def drop_cols(data, cols):
    """Returns a copy of DATA without the columns named in COLS."""
    return data.drop(columns=cols)

# Columns removed before modeling: the id, the raw text columns,
# and the intermediate text columns built from them
cols_to_drop = [
    'id',
    'subject',
    'email',
    'combined',
    'no_punc',
]

# The graphs for 'log_angle_brackets' and 'log_exclamations'
# are strange. Not clear whether we should include these
# features. We'll try running the model with them and
# try it again without them. To run without them, use:
#cols_to_drop = ['id', 'subject', 'email', 'combined', 'no_punc', 'log_angle_brackets', 'log_exclamations']

In [None]:
def data_process_pipeline(data):
    """Turns a raw email dataframe into a model-ready one: fills
    missing values, adds the numeric text features, appends the
    per-word indicator columns, then drops the unneeded columns.

    Reads the module-level names 'words' and 'cols_to_drop', so
    both must be set before this is called."""
    filled = handle_missing_data(data)
    with_features = make_new_features(filled)
    with_word_cols = append_words_cols(with_features, words)
    return drop_cols(with_word_cols, cols_to_drop)

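#### Consider using LogisticRegressionCV
#### Also, hyperparameters to tweak: 'min_emails' (passed to 'spam_ham_words'), 'num_spam' and 'num_ham' (how many words are taken from the top and the bottom of the table that 'spam_ham_words' returns), as well as the classification threshold applied to the probabilities predicted by the LogisticRegression model object. Also consider dropping the 'log_angle_brackets' and 'log_exclamations' cols.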

In [None]:
# Hyperparameter optimization
#
# Grid search over 'min_emails', 'num_spam' and 'num_ham'. For each
# combination a logistic regression is fit and the classification
# threshold C that maximizes training accuracy is chosen; training
# and test accuracy at that threshold are stored under the key
# (min_emails, num_spam, num_ham, C).
#
# To do this "right", probably want to use KFold and
# split the training set. Also, will probably want to
# optimize recall rather than accuracy; see below
#
# NOTE(review): 'test_scores' is used afterwards to pick the
# hyperparameters, so the test accuracy of the chosen model is an
# optimistic estimate.

import itertools

min_emails_range = range(600, 1201, 100)
num_spam_range = range(30, 101, 15)
num_ham_range = range(30, 101, 15)

# Candidate thresholds on the predicted spam probability. (C here is
# a probability cutoff, not LogisticRegression's regularization C.)
thresholds = np.linspace(0, 1, 100)

training_scores = {}
test_scores = {}

# Use nested 'for' loops here b/c computing 'words' is
# the most expensive part. No sense recomputing 'words'
# multiple times w the same value of 'min_emails'
for min_emails in min_emails_range:
    words_df = spam_ham_words(min_emails)
    for num_spam, num_ham in itertools.product(num_spam_range, num_ham_range):
        # 'words_df' is sorted from most to least spam-associated
        spam_words = words_df.index[:num_spam].tolist()
        ham_words = words_df.index[-num_ham:].tolist()
        # 'data_process_pipeline' reads this module-level name
        words = spam_words + ham_words
        
        # 'spam' is renamed to 'Spam' so the label cannot collide
        # with an indicator column for the word 'spam'
        processed_train = data_process_pipeline(train.copy().rename(columns={'spam': 'Spam'}))
        processed_test = data_process_pipeline(test.copy().rename(columns={'spam': 'Spam'}))
        
        X_train = processed_train.drop('Spam', axis=1)
        y_train = processed_train['Spam']
        X_test = processed_test.drop('Spam', axis=1)
        y_test = processed_test['Spam']
        
        model = LogisticRegression()
        model.fit(X_train, y_train)
        
        # The predicted probabilities do not depend on the threshold,
        # so compute them once rather than once per candidate
        train_proba = model.predict_proba(X_train)[:, 1]
        test_proba = model.predict_proba(X_test)[:, 1]
        train_labels = y_train.to_numpy()
        test_labels = y_test.to_numpy()
        
        # May want to track a different metric here instead of
        # accuracy, maybe precision, recall or f1. Look into this
        scores = {
            C: np.mean((train_proba > C).astype(int) == train_labels)
            for C in thresholds
        }
        
        # First threshold attaining the best training accuracy
        C = max(scores, key=scores.get)
        
        training_accuracy = scores[C]
        test_accuracy = np.mean((test_proba > C).astype(int) == test_labels)
        
        quadruple = (min_emails, num_spam, num_ham, C)
        training_scores[quadruple] = training_accuracy
        test_scores[quadruple] = test_accuracy

In [None]:
# Adopt the configuration with the highest recorded test accuracy
best_config = max(test_scores, key=test_scores.get)
min_emails, num_spam, num_ham, C = best_config

In [None]:
# For comparison: the configuration with the highest training accuracy
max(training_scores.keys(), key=lambda config: training_scores[config])

In [None]:
# Rebuild the word list for the selected hyperparameters. 'words_df'
# is sorted from most to least spam-associated, so its head holds
# the spam words and its tail the ham words.
words_df = spam_ham_words(min_emails)
spam_words = list(words_df.index[:num_spam])
ham_words = list(words_df.index[-num_ham:])
words = [*spam_words, *ham_words]

In [None]:
# Final dataframes before modeling. Rename 'spam' as
# 'Spam' bc one of the words found by 'spam_ham_words'
# is 'spam'
processed_train, processed_test = (
    data_process_pipeline(frame.copy().rename(columns={'spam': 'Spam'}))
    for frame in (train, test)
)

# Separate the features from the label
X_train, y_train = processed_train.drop(columns='Spam'), processed_train['Spam']
X_test, y_test = processed_test.drop(columns='Spam'), processed_test['Spam']

In [None]:
# Fit the final model and score it with the selected threshold C
model = LogisticRegression().fit(X_train, y_train)

train_proba = model.predict_proba(X_train)[:, 1]
y_train_pred = 1*(train_proba > C)
training_accuracy = (y_train_pred == y_train).sum() / len(y_train)

test_proba = model.predict_proba(X_test)[:, 1]
y_test_pred = 1*(test_proba > C)
test_accuracy = (y_test_pred == y_test).sum() / len(y_test)

# model.score(X, y) is not used here because it does not apply C
print('Training Accuracy: ', training_accuracy)
print('Test Accuracy: ', test_accuracy)

In [None]:
# Precision-recall curve on the training data
y_predict = model.predict_proba(X_train)[:, 1]
prec, recall, _ = precision_recall_curve(y_train, y_predict)

fig, ax = plt.subplots()
ax.plot(recall, prec)
ax.set(xlabel="Recall", ylabel="Precision", xlim=[0, 1], ylim=[0, 1])
plt.show()

# NOTE(review): this report is based on model.predict, not on the
# threshold C used for the accuracies above - confirm that is intended
print(classification_report(y_test, model.predict(X_test)))

In [None]:
from datetime import datetime

evaluation = pd.read_csv('data/eval.csv')
evaluation_features = data_process_pipeline(evaluation.copy())

# Classify with the selected threshold C, i.e. the same decision
# rule that produced the reported training and test accuracies.
# model.predict would ignore C.
evaluation_predictions = 1*(model.predict_proba(evaluation_features)[:, 1] > C)

# Construct and save the submission:
submission_df = pd.DataFrame({
    "Id": evaluation['id'], 
    "Class": evaluation_predictions,
}, columns=['Id', 'Class'])
# Timestamp to the second, e.g. 2020-01-31T12:00:00
timestamp = datetime.now().isoformat(timespec='seconds')
submission_df.to_csv("submission_{}.csv".format(timestamp), index=False)

In [None]:
# Each word is listed once, in sorted order
staff_words = {
    '2002', 'align3dcenterfont', 'base64', 'body', 'br', 'business',
    'click', 'contact', 'credit', 'dear', 'div', 'divfont', 'font',
    'form', 'free', 'html', 'im', 'list', 'money', 'mortgage',
    'offer', 'please', 'receive', 'remove', 'removed', 'subscribed',
    'tags', 'tr', 'web', 'width10img', 'wrote',
}

# Lists of words to check (back from when I was working on this in March):
# NOTE(review): 'words_to_check' is assigned again in a later cell
words_to_check = {
    ' hi', '---', '...', '==', '2002',
    'access', 'act', 'address', 'adult', 'align',
    'babe', 'bank', 'body', 'br', 'business', 'buy',
    'cash', 'celeb', 'click', 'contact', 'credit',
    'dear', 'div', 'drug',
    'font', 'form', 'free', 'fwd',
    'gift', 'girl',
    'horny', 'html',
    'invest',
    'large', 'limited', 'list', 'loan', 'login',
    'member', 'memo', 'money', 'mortgage',
    'name', 'niger', 'now',
    'offer', 'original',
    'password', 'penis', 'please', 'porn', 'prescription', 'private',
    're:', 'receive', 'remov', 'remove', 'removed', 'repl',
    'selected', 'sex', 'small', 'spam', 'subscri', 'subscribed',
    'time', 'today', 'tr',
    'url',
    'viagra', 'virus',
    'wealth', 'web', 'weight', 'work', 'wrote',
    'xxx',
    'zimb',
}

In [None]:
# One row per word occurrence, indexed by the row label of the
# email it came from. Relies on 'train' already having a 'no_punc'
# column.
split_words = train['no_punc'].str.split(expand=True).stack()
tidy_format = (
    split_words
    .reset_index(level=1)
    .rename(columns={'level_1': 'num', 0: 'word'})
)

by_word = tidy_format.groupby('word')

# For each word, the number of distinct emails it occurs in
counts = (
    by_word
    .agg(lambda x: x.index.nunique())
    .sort_values('num', ascending=False)
)

# Mean spam label per word, restricted to words found in at least
# 400 emails.
# NOTE(review): this rebinds 'words' to a dataframe, while
# 'data_process_pipeline' reads 'words' as a list of strings
frequent_words = by_word.filter(lambda x: x.index.nunique() >= 400)
labelled_words = frequent_words.merge(
    train[['spam']], how="left", left_index=True, right_index=True
)
words = (
    labelled_words
    .groupby('word')[['spam']]
    .mean()
    .sort_values('spam', ascending=False)
)

In [None]:
# Keep only the staff words that appear in the 'words' table
words_to_check = sorted(staff_words.intersection(words.index))

In [None]:
# Spam rates of the selected words, highest first
selected_rates = words.loc[words_to_check]
selected_rates.sort_values(by='spam', ascending=False)