In [1]:
import os
import math
import pickle
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.datasets import fetch_20newsgroups, load_files
from tensorflow import keras
from tensorflow.keras.layers import TextVectorization, Embedding, Dense, Lambda
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Prepare the Data

Will use the following datasets for training:
1. 20 Newsgroups: Roughly 20k news articles, no ads.  This can be obtained from sklearn.datasets
2. Kaggle Spam Filter dataset: 5.7k emails with a mix of spam and ham.  https://www.kaggle.com/karthickveerakumar/spam-filter
3. Kaggle Text Classified Ads dataset: 97k job and real estate advertisements (only real estate ads are used, as job ads were found to adversely impact model results).  https://www.kaggle.com/overflow012/playing-with-ads
4. Kaggle All the News dataset: 150k news articles.  https://www.kaggle.com/snapcrack/all-the-news
5. Kaggle Enron Emails Spam dataset: Enron emails separated into spam/ham, ~17k spam.  https://www.kaggle.com/wanderfj/enron-spam

Normally ads are a small portion of all documents.  These datasets will be combined.

In [2]:
def clean_text(row):
    return row.decode('unicode_escape').encode('ascii', 'ignore')


def load_enron_spam_emails():
    # load emails from text files and filter to spam only
    x, y = [], []
    for i in range(1,7):
        emails = load_files(f"enron_spam/enron{i}")
        x = np.append(x, emails.data)
        y = np.append(y, emails.target)
    df = pd.DataFrame({"text": x[np.where(y==1)], "ad": 1})
    df['text'] = df.text.apply(clean_text)
    return df


def load_classified_ads():
    # load text classified ads data, filters to real estate ads only, removes duplicates
    df = pd.read_csv('kaggle_text_classified_ads.csv')
    df = df[df['catid'] != 2].drop_duplicates()
    return df

In [3]:
# get datasets: newgroups = all news, ads = all ads, emails = mixed
data_newsgroups = fetch_20newsgroups(subset='all', shuffle=True, remove=('headers', 'footers', 'quotes'))
data_emails = pd.read_csv('kaggle_spam_filter_data.csv')
data_ads = load_classified_ads()
data_news = pd.concat([
    pd.read_csv('kaggle_all_the_news_1.csv', index_col=0), 
    # pd.read_csv('kaggle_all_the_news_2.csv', index_col=0)
])
data_enron = load_enron_spam_emails()

# combine datasets
data_all = pd.concat([
    data_emails.rename(columns={"spam": "ad"}),
    pd.DataFrame({"text": data_ads['value'], "ad": 1}),
    pd.DataFrame({"text": data_newsgroups.data, "ad": 0}),
    pd.DataFrame({"text": data_news.content, "ad": 0}),
    data_enron,
])
assert(len(data_newsgroups.data) + len(data_emails) + len(data_ads) + len(data_news) + len(data_enron) == len(data_all))
data_all = data_all.sample(frac=1.0).reset_index(drop=True)  # shuffle
data_all.to_csv("training_data.csv")
data_all.head()

  return row.decode('unicode_escape').encode('ascii', 'ignore')
  return row.decode('unicode_escape').encode('ascii', 'ignore')


Unnamed: 0,text,ad
0,"SEOUL, South Korea — President Trump assure...",0
1,A Swedish man is facing charges in court after...,0
2,"(CNN) Donald Trump has made one thing clear, ...",0
3,"b'Subject: microcap stock report\r\ngrant ,\r\...",1
4,b'Subject: 30 - vl . . . benz annal\r\nhtml\r\...,1


In [4]:
print(
    "Training Dataset Composition:\n",
    len(data_newsgroups.data), "news articles, all labeled ham\n", 
    len(data_emails), "emails, split between spam and ham\n",
    len(data_ads), "classified ads, all labeled spam\n",
    len(data_news), "news articles, all labeled ham\n",
    len(data_enron), "emails, all labeled spam\n",
    "Total Size:", len(data_all)
)

Training Dataset Composition:
 18846 news articles, all labeled ham
 5728 emails, split between spam and ham
 2596 classified ads, all labeled spam
 50000 news articles, all labeled ham
 17170 emails, all labeled spam
 Total Size: 94340


In [5]:
data_all.ad.value_counts()

0    73206
1    21134
Name: ad, dtype: int64

In [6]:
# data has already been shuffled

# split into train, val, test sets
test_split = 0.1
nbr_val_test_samples = int(test_split * 2 * len(data_all))
nbr_test_samples = int(test_split * len(data_all))
train = data_all.iloc[:-nbr_val_test_samples]
valid = data_all.iloc[-nbr_val_test_samples:-nbr_test_samples]
test = data_all.iloc[-nbr_test_samples:]
assert(len(train) + len(valid) + len(test) == len(data_all))
print("Train:", len(train), "Valid:", len(valid), "Test:", len(test))

Train: 75472 Valid: 9434 Test: 9434


In [7]:
# vectorize the text (give every token a number)
max_sequence_length = 200
vectorizer = TextVectorization(max_tokens=20000, output_sequence_length=max_sequence_length)
text_ds = tf.data.Dataset.from_tensor_slices(data_all.iloc[:-nbr_val_test_samples]['text'].values).batch(128)
vectorizer.adapt(text_ds)

In [8]:
# save the vectorizer's config and weights
pickle.dump(
    {
        'config': vectorizer.get_config(),
         'weights': vectorizer.get_weights()
    }, 
    open("tv_layer.pkl", "wb")
)

In [9]:
# dictionary map of word to index
vocab = vectorizer.get_vocabulary()
word_index = dict(zip(vocab, range(len(vocab))))

In [10]:
# sense checks
print(vectorizer.get_vocabulary()[:5])
output = vectorizer([["the cat sat on the mat"]])
print(output.numpy()[0, :6])

['', '[UNK]', 'the', 'to', 'of']
[    2  6400  2480    11     2 15970]


# Load Pre-Trained Embeddings

Will use GloV3 embeddings.  https://nlp.stanford.edu/projects/glove/

There are different embeddings available.  This model will use the Wikipedia + Gigaword 6B dataset; a corpus of Wikipedia articles and Gigaword (newswire) articles with 6 billion tokens and 400k vocab size.  The dimensionality is 300.

In [11]:
# !wget http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
# !unzip -q glove.6B.zip

In [12]:
embed_dim = 300
path_to_glove_file = f"glove.6B.{embed_dim}d.txt"

# map all words in the embedding vocabulary to their numpy GloVe representation
embeddings_index = {}
with open(path_to_glove_file) as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print(f"Found {len(embeddings_index)} word vectors of {embed_dim} dimensions.")

Found 400000 word vectors of 300 dimensions.


# Embed Each Document

The model will use GloVe embeddings.  A neural network will be built to:
1. vectorize the text
2. embed with GloVe embeddings
3. aggregate up to the document level
4. re-scale the embeddings with a sigmoid function

Step 3 will sum the embeddings so that latent features that are heavily represented by the words in a document will have the largest values.  This will aggregate the embeddings from the word level to the document level.  Step 4 will re-scale the aggregated embeddings with a sigmoid function, which will push latent features with small values towards 0 and latent features with large values towards 1.  

In [13]:
num_tokens = len(vocab) + 2  # vocab size + 2 for padding and OOV token (out of vocabulary)
hits = 0
misses = 0

# Prepare embedding matrix
embedding_matrix = np.zeros((num_tokens, embed_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1
print(f"Converted {hits} words and had {misses} out of vocabulary words")

Converted 17948 words and had 2052 out of vocabulary words


In [14]:
embedding_matrix.shape

(20002, 300)

In [15]:
# create an embedding layer
embedding_layer = Embedding(
    input_dim=num_tokens,
    output_dim=embed_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=False,
)



def sigmoid(x):
    return 1 / (1 + math.e**(-1 * x))


def sum_layer(x):
    return sigmoid(tf.reduce_sum(input_tensor=x, axis=0, keepdims=False))


# build a simple neural network that applies the embedding to the input text and returns an embedding per document
# the network will take the document text strings as input, so doc length does not matter
# the network will not be trained, it just needs to apply the transformation in a single forward pass
string_input = keras.Input(shape=(1,), dtype="string")
vectorized_input = vectorizer(string_input)
x = embedding_layer(vectorized_input)
x = Lambda(sum_layer)(x)  # aggregate from sentence level to document level
model = keras.Model(string_input, x)
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization (TextVec  (None, 200)              0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, 200, 300)          6000600   
                                                                 
 lambda (Lambda)             (200, 300)                0         
                                                                 
Total params: 6,000,600
Trainable params: 0
Non-trainable params: 6,000,600
_________________________________________________________________


In [16]:
model.save('embedding_model')
print("Saved model to disk")

INFO:tensorflow:Assets written to: embedding_model/assets
Saved model to disk


In [17]:
# vectorize the documents in preparation for a forward pass
x_train = [np.array(doc) for doc in train.text]
x_valid = [np.array(doc) for doc in valid.text]
x_test = [np.array(doc) for doc in test.text]

In [18]:
# forward pass to embed every document
x_train = [model(d) for d in x_train]
x_valid = [model(d) for d in x_valid]
x_test = [model(d) for d in x_test]

with open('x_train.pkl', 'wb') as f:
    pickle.dump(x_train, f)
with open('x_valid.pkl', 'wb') as f:
    pickle.dump(x_train, f)
with open('x_test.pkl', 'wb') as f:
    pickle.dump(x_train, f)

# Train Model

The XGBoost model will fit to the document level embeddings, which are 300 latent features that have been aggregated from the word content of the documents.  

In [19]:
y_train = train.ad
y_valid = valid.ad
y_test = test.ad

In [26]:
xg = XGBClassifier(
    booster='gbtree', 
    eta=0.3, 
    gamma=0, 
    max_depth=6, 
    alpha=0, 
    verbosity=0
)
xg.fit(np.stack(x_train), y_train)

model_file_path = "xgb_ad_classifier.pkl"
pickle.dump(xg, open(model_file_path, "wb"))

TypeError: Input data can not be a list.

In [28]:
y_valid_pred = xg.predict(np.stack(x_valid))
y_test_pred = xg.predict(np.stack(x_test))

In [29]:
print(
    "Validation Set Scores:\n", 
    f"Accuracy: {accuracy_score(y_valid, y_valid_pred):2f}", "\n",
    f"Precision: {precision_score(y_valid, y_valid_pred):2f}", "\n",
    f"Recall: {recall_score(y_valid, y_valid_pred):2f}", "\n",
    f"F1: {f1_score(y_valid, y_valid_pred):2f}"
)
print(
    "Test Set Scores:\n", 
    f"Accuracy: {accuracy_score(y_test, y_test_pred):2f}", "\n",
    f"Precision: {precision_score(y_test, y_test_pred):2f}", "\n",
    f"Recall: {recall_score(y_test, y_test_pred):2f}", "\n",
    f"F1: {f1_score(y_test, y_test_pred):2f}"
)

Validation Set Scores:
 Accuracy: 0.986326 
 Precision: 0.973219 
 Recall: 0.965370 
 F1: 0.969278
Test Set Scores:
 Accuracy: 0.984418 
 Precision: 0.968008 
 Recall: 0.961020 
 F1: 0.964501


# Test on Blackwing Data

In [202]:
blackwing_data = pd.read_csv('blackwing_3m_9k.csv')
blackwing_data = blackwing_data[blackwing_data['disposition'].isin(['SELECT', 'IGNORE'])]
blackwing_data = blackwing_data[['text', 'disposition']].drop_duplicates()
# note that ignore DOES NOT mean it is an ad, but it is an indication that it might be
blackwing_y = blackwing_data.disposition.map({"SELECT": 0, "IGNORE": 1})

# vectorize the documents
blackwing_test = [np.array(doc) for doc in blackwing_data.text]
# embed the documents
blackwing_test = [model(d) for d in blackwing_test]
# make predictions
blackwing_pred = xg.predict(blackwing_test)
blackwing_pred_prob = xg.predict(blackwing_test)

In [203]:
# score based on somewhat faulty ground truth
print(
    "Blackwing Set Scores:\n", 
    f"Accuracy: {accuracy_score(blackwing_y, blackwing_pred):2f}", "\n",
    f"Precision: {precision_score(blackwing_y, blackwing_pred):2f}", "\n",
    f"Recall: {recall_score(blackwing_y, blackwing_pred):2f}", "\n",
    f"F1: {f1_score(blackwing_y, blackwing_pred):2f}"
)

Blackwing Set Scores:
 Accuracy: 0.722973 
 Precision: 0.617647 
 Recall: 0.160305 
 F1: 0.254545


In [204]:
blackwing_data['model_pred'] = blackwing_pred
blackwing_data['model_pred_prob'] = blackwing_pred_prob
blackwing_data.loc[blackwing_data.model_pred == 1, ['text', 'disposition', 'model_pred', 'model_pred_prob']].to_csv("blackwing_predicted_ads.csv", index=False)