# Naive-Bayes Spam Classifier


In [27]:
import findspark

findspark.init('/home/guipleite/spark-3.0.2-bin-hadoop3.2')

from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.sql.functions import length
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF, StringIndexer, VectorAssembler
from pyspark.ml.linalg import Vector
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

spark = SparkSession.builder.appName('NLP_exe').getOrCreate()

# https://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection
data = spark.read.csv("./SMSSpamCollection", inferSchema=True, sep='\t')
df = data.withColumnRenamed('_c0', 'class').withColumnRenamed('_c1', 'text')

df.show(5)

+-----+--------------------+
|class|                text|
+-----+--------------------+
|  ham|Go until jurong p...|
|  ham|Ok lar... Joking ...|
| spam|Free entry in 2 a...|
|  ham|U dun say so earl...|
|  ham|Nah I don't think...|
+-----+--------------------+
only showing top 5 rows



In [28]:
df = df.withColumn('length',length(df['text']))
df.show(5)

+-----+--------------------+------+
|class|                text|length|
+-----+--------------------+------+
|  ham|Go until jurong p...|   111|
|  ham|Ok lar... Joking ...|    29|
| spam|Free entry in 2 a...|   155|
|  ham|U dun say so earl...|    49|
|  ham|Nah I don't think...|    61|
+-----+--------------------+------+
only showing top 5 rows



In [29]:
df.groupby('class').mean().show()

+-----+-----------------+
|class|      avg(length)|
+-----+-----------------+
|  ham|71.45431945307645|
| spam|138.6706827309237|
+-----+-----------------+



In [30]:

indexer = StringIndexer(inputCol='class', outputCol='label')

tokenizer = Tokenizer(inputCol="text", outputCol="token_text")

stopremover = StopWordsRemover(inputCol='token_text', outputCol='stop_tokens')

count_vec = CountVectorizer(inputCol='stop_tokens', outputCol='c_vec')

idf = IDF(inputCol="c_vec", outputCol="tf_idf")

clean_data = VectorAssembler(inputCols=['tf_idf','length'],outputCol='features')

In [31]:

prepro = Pipeline(stages=[indexer, tokenizer, stopremover, count_vec, idf, clean_data])
pipe = prepro.fit(df)

final_df = pipe.transform(df)

In [33]:
final_df.show(5)

+-----+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|class|                text|length|label|          token_text|         stop_tokens|               c_vec|              tf_idf|            features|
+-----+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|  ham|Go until jurong p...|   111|  0.0|[go, until, juron...|[go, jurong, poin...|(13423,[7,11,31,6...|(13423,[7,11,31,6...|(13424,[7,11,31,6...|
|  ham|Ok lar... Joking ...|    29|  0.0|[ok, lar..., joki...|[ok, lar..., joki...|(13423,[0,24,297,...|(13423,[0,24,297,...|(13424,[0,24,297,...|
| spam|Free entry in 2 a...|   155|  1.0|[free, entry, in,...|[free, entry, 2, ...|(13423,[2,13,19,3...|(13423,[2,13,19,3...|(13424,[2,13,19,3...|
|  ham|U dun say so earl...|    49|  0.0|[u, dun, say, so,...|[u, dun, say, ear...|(13423,[0,70,80,1...|(13423,[0,70,8

In [39]:
(training, testing) = final_df.randomSplit([0.7,0.3])

nb = NaiveBayes()
nb_model = nb.fit(training)
results = nb_model.transform(testing)

results.select(['label','features','prediction']).show()

+-----+--------------------+----------+
|label|            features|prediction|
+-----+--------------------+----------+
|  0.0|(13424,[3,6,5140,...|       0.0|
|  0.0|(13424,[3,84,114,...|       0.0|
|  0.0|(13424,[0,3,14,18...|       0.0|
|  0.0|(13424,[3,6,278,1...|       0.0|
|  0.0|(13424,[5,78,116,...|       0.0|
|  0.0|(13424,[115,116,2...|       0.0|
|  0.0|(13424,[15,116,30...|       0.0|
|  0.0|(13424,[6,215,245...|       0.0|
|  0.0|(13424,[6,242,278...|       0.0|
|  0.0|(13424,[165,250,6...|       0.0|
|  0.0|(13424,[5,18,71,1...|       0.0|
|  0.0|(13424,[32,66,168...|       0.0|
|  0.0|(13424,[245,1154,...|       0.0|
|  0.0|(13424,[22,3697,1...|       0.0|
|  0.0|(13424,[3,4,6,7,9...|       0.0|
|  0.0|(13424,[3,6,339,7...|       0.0|
|  0.0|(13424,[242,759,9...|       0.0|
|  0.0|(13424,[11,26,79,...|       0.0|
|  0.0|(13424,[0,2,20,10...|       0.0|
|  0.0|(13424,[4,15,65,1...|       0.0|
+-----+--------------------+----------+
only showing top 20 rows



In [40]:
acc_eval = MulticlassClassificationEvaluator()
acc_eval.evaluate(results)


0.9276120876950424

Model shows a realtive good accuracy given it's simply using the word email lenght and a few features to predict wether it is spam or not 