In [1]:
#-----------------------------------------------------
# This program loads (reads) an LR model (built before 
# by the logistic_regression_builder.py program) and 
# then predicts new emails as "spam" or "non-spam".
#------------------------------------------------------
# Input Parameters:
#    argv[0]: String, is the name of the Python program
#    argv[1]: String, new emails to be classified [Query Data]
#    argv[2]: String, path of the previously saved (built) model to load
#-------------------------------------------------------

In [16]:
from __future__ import print_function

import os
import sys

from pyspark.mllib.classification import LogisticRegressionModel
from pyspark.mllib.feature import HashingTF
from pyspark.sql import SparkSession

In [17]:
#
#===============================================
# Next, we build a simple Python function, `classify()` 
# to classify new emails with the built LR model:
# predict/classify email and return 
#    0 (for non-spam), and 
#    1 (for spam)
#
# Parameters:
#
#    email: a new email to be classified
#    tf: an instance of HashingTF
#    model: an LR model
#
def classify(email, tf, model):
    """Classify a single email with an already-built model.

    Parameters:
        email: the email text; it is split on whitespace into word tokens.
        tf: an object whose ``transform(tokens)`` turns the token list into
            a feature vector (a HashingTF instance in this notebook).
        model: an object whose ``predict(features)`` returns the class
            (the loaded LR model in this notebook).

    Returns:
        Whatever ``model.predict`` returns for the email's features; per
        this notebook's convention that is 1 for "spam" and 0 for
        "non-spam".
    """
    # tokenize -> featurize -> predict, expressed as one pipeline
    return model.predict(tf.transform(email.split()))
#end-def
#===============================================

In [18]:
# Obtain the SparkSession for this notebook (an existing session is reused
# by getOrCreate(), otherwise a new one is created).
spark = (
    SparkSession.builder
    .appName("logistic_regression_predictor")
    .getOrCreate()
)

In [19]:
# Step-1: define input paths
#
# Both locations can be overridden without editing the notebook by setting
# the environment variables LR_SAVED_MODEL_PATH / LR_NEW_EMAILS_PATH before
# the kernel starts; when they are not set, the original paths are used.
saved_model_path = os.environ.get(
    "LR_SAVED_MODEL_PATH", "/user/arnavmoutl12edu/module7/model")
# "/.../model"
new_emails_path = os.environ.get(
    "LR_NEW_EMAILS_PATH", "/user/arnavmoutl12edu/module7/new_emails.txt")
# "/.../new_emails.txt"

In [20]:
# echo both input locations, one per line, so they can be verified
print(
    "saved_model_path: {}".format(saved_model_path),
    "new_emails_path: {}".format(new_emails_path),
    sep="\n",
)

saved_model_path: /user/arnavmoutl12edu/module7/model
new_emails_path: /user/arnavmoutl12edu/module7/new_emails.txt


In [21]:
# Load the previously saved logistic-regression model from saved_model_path.
LR_model = LogisticRegressionModel.load(
    sc=spark.sparkContext,
    path=saved_model_path,
)

In [22]:
# The query emails have to be turned into feature vectors too, so we need
# a HashingTF here as well.
# NOTE(review): 128 presumably has to equal the numFeatures used when the
# model was built -- confirm against the builder program.
NUM_FEATURES = 128
FEATURES_HTF = HashingTF(NUM_FEATURES)

In [23]:
# Finally, we can predict new emails.
#
# Read the query data as an RDD[String]; textFile() yields one element per
# line of the input, and each element is treated as one email.
new_emails = spark.sparkContext.textFile(name=new_emails_path)

In [24]:
# Pair every new email with its predicted class.
# classified is an RDD[(classification, email)], where classification is
# 0 for a non-spam email and 1 for a spam email.
# keyBy(f) builds the tuple (f(email), email) for every element.
classified = new_emails.keyBy(
    lambda email: classify(email, FEATURES_HTF, LR_model)
)

In [25]:
# Tallies for the three possible outcomes, all starting at zero.
spam_count = nospam_count = error_count = 0

# Bring the classified emails back to the driver so they can be examined.
# collect() is only appropriate for debugging/testing with a small number
# of emails.
predictions = classified.collect()

In [27]:
#
# predictions is a list of (classification, email) pairs.
#
# The tallies are reset in this cell, right before the loop, so that
# re-running the cell recounts from zero instead of adding to the totals
# left behind by a previous run.
spam_count = 0
nospam_count = 0
error_count = 0
for prediction, email in predictions:
    if prediction == 0:
        nospam_count += 1
    elif prediction == 1:
        spam_count += 1
    else:
        # any label other than 0/1 is unexpected; count it separately
        error_count += 1
    #
    print("prediction=" + str(prediction) + "\t query email=" + str(email))
#end-for

prediction=1	 query email=this is a year of promotion for Galaxy End of YearPromo You have 1 week remaining to retrieve your won prize for the Samsung Galaxy Xmas Promo 'C' draw category winning prize of Seven Hundred and Fifty Thousand Euros each and a Samsung Galaxy S6 EDGE. Winning Ticket Number:WIN-707-COS.  We advise you to keep this winning notification confidential and away from public notice to avoid double claim/mistransfer or impersonation until after remittance/payment to you.
prediction=1	 query email=you are the lucky one: We've picked out 10 new matches for you. Meet them now and then check out all the singles in your area! you might win a prize too
prediction=0	 query email=Do not miss your chances: Get Viagra real cheap!  Send money right away to ...
prediction=0	 query email=Get real money fast: With my position in the office i assure you with 100% risk free that this transaction is not a childish game play and i want you to indicate your full interest with assurance o

In [28]:
# report the three tallies, one per line, as "<name>=<value>"
print("spam_count=", spam_count, sep="")
print("nospam_count=", nospam_count, sep="")
print("error_count=", error_count, sep="")

spam_count=4
nospam_count=6
error_count=0


In [15]:
# Shut down the Spark session now that the predictions have been reported.
spark.stop()