### Install PySpark

In [1]:
# Install the dependencies
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.0.1/spark-3.0.1-bin-hadoop3.2.tgz
!tar xf spark-3.0.1-bin-hadoop3.2.tgz
!pip install -q findspark

In [2]:
# Set the environment variables for running PySpark in the collaboration environmentimport os
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.0.1-bin-hadoop3.2"

In [3]:
# Run the local session to test the installation
import findspark
findspark.init('spark-3.0.1-bin-hadoop3.2')
from pyspark.sql import SparkSession
spark = SparkSession.builder.master('local[*]').getOrCreate()

In [91]:
df = spark.read.csv('drive/MyDrive/dataset/spam.csv',inferSchema=True, sep=',', header=True)

In [92]:
df.show(5)

+----+--------------------+
|  v1|                  v2|
+----+--------------------+
| ham|Go until jurong p...|
| ham|Ok lar... Joking ...|
|spam|Free entry in 2 a...|
| ham|U dun say so earl...|
| ham|Nah I don't think...|
+----+--------------------+
only showing top 5 rows



In [93]:
df = df.withColumnRenamed('v1','class').withColumnRenamed('v2','text')

### Clean Data

In [94]:
from pyspark.sql.functions import length

In [95]:
df= df.withColumn('length', length(df['text']))

In [96]:
df.show(5)

+-----+--------------------+------+
|class|                text|length|
+-----+--------------------+------+
|  ham|Go until jurong p...|   111|
|  ham|Ok lar... Joking ...|    29|
| spam|Free entry in 2 a...|   155|
|  ham|U dun say so earl...|    49|
|  ham|Nah I don't think...|    61|
+-----+--------------------+------+
only showing top 5 rows



In [97]:
df = df.filter((df["class"]=='ham') | (df["class"]=='spam'))
df.groupBy('class').mean().show()

+-----+------------------+
|class|       avg(length)|
+-----+------------------+
|  ham| 71.07626943005181|
| spam|138.89558232931728|
+-----+------------------+



### Feature Transformation

### Creating Assemblers/Features

In [98]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF, StringIndexer

In [99]:
tokenizer = Tokenizer(inputCol='text',outputCol='token_text')
stop_word_remover = StopWordsRemover(inputCol='token_text',outputCol='stop_tokens')
count_vec = CountVectorizer(inputCol='stop_tokens',outputCol='c_vec')
idf = IDF(inputCol='c_vec',outputCol='tf_idf')
ham_spam_to_num = StringIndexer(inputCol='class',outputCol='label')

In [100]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector

In [101]:
cleaned = VectorAssembler(inputCols=['tf_idf','length'],outputCol='features')

### Naive Bayes Model

In [102]:
from pyspark.ml.classification import NaiveBayes

In [103]:
nb = NaiveBayes()

### PipeLine

In [104]:
from pyspark.ml import Pipeline

In [105]:
pipeline = Pipeline(stages=[
                            ham_spam_to_num,
                            tokenizer,
                            stop_word_remover,
                            count_vec,
                            idf,
                            cleaned
])

In [106]:
cleaner = pipeline.fit(df)

In [107]:
clean_df = cleaner.transform(df)

In [108]:
clean_df.show(5)

+-----+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|class|                text|length|label|          token_text|         stop_tokens|               c_vec|              tf_idf|            features|
+-----+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|  ham|Go until jurong p...|   111|  0.0|[go, until, juron...|[go, jurong, poin...|(13378,[7,10,31,6...|(13378,[7,10,31,6...|(13379,[7,10,31,6...|
|  ham|Ok lar... Joking ...|    29|  0.0|[ok, lar..., joki...|[ok, lar..., joki...|(13378,[0,23,293,...|(13378,[0,23,293,...|(13379,[0,23,293,...|
| spam|Free entry in 2 a...|   155|  1.0|[free, entry, in,...|[free, entry, 2, ...|(13378,[2,13,19,2...|(13378,[2,13,19,2...|(13379,[2,13,19,2...|
|  ham|U dun say so earl...|    49|  0.0|[u, dun, say, so,...|[u, dun, say, ear...|(13378,[0,68,78,1...|(13378,[0,68,7

### Train Model and Evaluation

In [109]:
clean_df= clean_df.select(['label','features'])

In [110]:
(train, test) = clean_df.randomSplit([0.7,0.3], seed=42)

In [111]:
pred = nb.fit(train)

In [112]:
res = pred.transform(test)

In [113]:
res.show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(13379,[0,1,2,40,...|[-1067.7843629943...|[1.0,5.8896897748...|       0.0|
|  0.0|(13379,[0,1,5,20,...|[-801.36315624411...|[1.0,2.1424453595...|       0.0|
|  0.0|(13379,[0,1,7,8,1...|[-1157.5027777014...|[1.0,3.9864212656...|       0.0|
|  0.0|(13379,[0,1,7,15,...|[-664.17212341543...|[1.0,6.3371897842...|       0.0|
|  0.0|(13379,[0,1,12,33...|[-445.27469323041...|[1.0,1.0908141025...|       0.0|
|  0.0|(13379,[0,1,14,18...|[-1364.8812547970...|[1.0,2.5170733523...|       0.0|
|  0.0|(13379,[0,1,14,31...|[-216.57472599429...|[1.0,6.4424068775...|       0.0|
|  0.0|(13379,[0,1,18,20...|[-861.65008827040...|[1.0,4.3766041506...|       0.0|
|  0.0|(13379,[0,1,22,62...|[-1336.0401637712...|[1.0,3.5685476076...|       0.0|
|  0.0|(13379,[0

In [114]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [115]:
eval = MulticlassClassificationEvaluator()
acc = eval.evaluate(res)
print(f'Accuracy: {acc * 100}')

Accuracy: 91.85519252666279
