In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.functions import length
from pyspark.ml.feature import Tokenizer,StopWordsRemover,CountVectorizer, IDF,StringIndexer,VectorAssembler
from pyspark.ml.linalg import Vector
from pyspark.ml.classification import NaiveBayes
from pyspark.ml import Pipeline
from pyspark.sql.functions import col, sum
from pyspark.sql.types import StructType, StructField, StringType
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [2]:
spark = SparkSession.builder \
    .appName("Phishing1") \
    .getOrCreate()

24/04/04 14:29:20 WARN Utils: Your hostname, Ozguns-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 10.212.144.182 instead (on interface en0)
24/04/04 14:29:20 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/04/04 14:29:20 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
schema = StructType([
    StructField("_c0", StringType(), True),
    StructField("Email Text", StringType(), True),
    StructField("Email Type", StringType(), True)
])

In [4]:
df = spark.read.csv("Phishing_Email.csv", schema=schema, header=True, multiLine=True, quote='"', sep=",", escape='"')

In [5]:
df.show()

24/04/04 14:29:23 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , Email Text, Email Type
 Schema: _c0, Email Text, Email Type
Expected: _c0 but found: 
CSV file: file:///Users/ozgunozkan/Phishing_Email.csv


+---+--------------------+--------------+
|_c0|          Email Text|    Email Type|
+---+--------------------+--------------+
|  0|re : 6 . 1100 , d...|    Safe Email|
|  1|the other side of...|    Safe Email|
|  2|re : equistar dea...|    Safe Email|
|  3|\nHello I am your...|Phishing Email|
|  4|software at incre...|Phishing Email|
|  5|global risk manag...|    Safe Email|
|  6|On Sun, Aug 11, 2...|    Safe Email|
|  7|entourage , stock...|Phishing Email|
|  8|we owe you lots o...|Phishing Email|
|  9|re : coastal deal...|    Safe Email|
| 10|make her beg you ...|Phishing Email|
| 11|URL: http://www.n...|    Safe Email|
| 12|begin forwarded t...|    Safe Email|
| 13|re : fyi - wellhe...|    Safe Email|
| 14|rmmla / ads * * *...|    Safe Email|
| 15|re : testing ir &...|    Safe Email|
| 16|The academic disc...|    Safe Email|
| 17|re : 3 . 402 quer...|    Safe Email|
| 18|a resume john , t...|    Safe Email|
| 19|EFFector       Vo...|    Safe Email|
+---+--------------------+--------

In [6]:
df = df.na.drop(subset=["Email Text"])

In [7]:
df = df.withColumnRenamed("_c0","emailID")

In [8]:
df = df.withColumn("length", length(df['Email Text']))

In [9]:
df.show()

+-------+--------------------+--------------+------+
|emailID|          Email Text|    Email Type|length|
+-------+--------------------+--------------+------+
|      0|re : 6 . 1100 , d...|    Safe Email|  1030|
|      1|the other side of...|    Safe Email|   479|
|      2|re : equistar dea...|    Safe Email|  1245|
|      3|\nHello I am your...|Phishing Email|   688|
|      4|software at incre...|Phishing Email|   441|
|      5|global risk manag...|    Safe Email|  3295|
|      6|On Sun, Aug 11, 2...|    Safe Email|   908|
|      7|entourage , stock...|Phishing Email|  7653|
|      8|we owe you lots o...|Phishing Email|   613|
|      9|re : coastal deal...|    Safe Email|  1822|
|     10|make her beg you ...|Phishing Email|  1474|
|     11|URL: http://www.n...|    Safe Email|   149|
|     12|begin forwarded t...|    Safe Email|  5900|
|     13|re : fyi - wellhe...|    Safe Email|  1229|
|     14|rmmla / ads * * *...|    Safe Email|  3017|
|     15|re : testing ir &...|    Safe Email| 

24/04/04 14:29:24 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , Email Text, Email Type
 Schema: _c0, Email Text, Email Type
Expected: _c0 but found: 
CSV file: file:///Users/ozgunozkan/Phishing_Email.csv


In [10]:
tokenizer = Tokenizer(inputCol="Email Text", outputCol = "token_email_text")
stop_word_remover = StopWordsRemover(inputCol="token_email_text", outputCol="stop_tokens")
count_vec = CountVectorizer(inputCol="stop_tokens",outputCol="count_vec")
idf = IDF(inputCol="count_vec", outputCol="tf_idf")
safe_phishing_to_num = StringIndexer(inputCol="Email Type", outputCol="label")

In [11]:
cleaned_data = VectorAssembler(inputCols=['tf_idf', 'length'], outputCol="features")

In [12]:
nb = NaiveBayes()

In [13]:
pipeline = Pipeline(stages=[
    safe_phishing_to_num,
    tokenizer,
    stop_word_remover,
    count_vec,
    idf,
    cleaned_data
])

In [14]:
cleaner = pipeline.fit(df.limit(100))

24/04/04 14:29:24 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , Email Text, Email Type
 Schema: _c0, Email Text, Email Type
Expected: _c0 but found: 
CSV file: file:///Users/ozgunozkan/Phishing_Email.csv
24/04/04 14:29:25 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , Email Text, Email Type
 Schema: _c0, Email Text, Email Type
Expected: _c0 but found: 
CSV file: file:///Users/ozgunozkan/Phishing_Email.csv
24/04/04 14:29:26 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , Email Text, Email Type
 Schema: _c0, Email Text, Email Type
Expected: _c0 but found: 
CSV file: file:///Users/ozgunozkan/Phishing_Email.csv
                                                                                

In [15]:
clean_df = cleaner.transform(df.limit(100))

In [16]:
clean_df = clean_df.select(['label', 'features'])
clean_df.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(6012,[0,2,4,5,8,...|
|  0.0|(6012,[0,2,7,13,2...|
|  0.0|(6012,[0,2,3,4,5,...|
|  1.0|(6012,[1,27,44,47...|
|  1.0|(6012,[0,2,8,9,39...|
|  0.0|(6012,[0,2,3,4,5,...|
|  0.0|(6012,[1,26,27,31...|
|  1.0|(6012,[0,2,3,4,5,...|
|  1.0|(6012,[0,2,3,4,5,...|
|  0.0|(6012,[0,2,3,4,5,...|
|  1.0|(6012,[0,2,3,12,1...|
|  0.0|(6012,[618,835,31...|
|  0.0|(6012,[18,20,27,2...|
|  0.0|(6012,[0,2,3,4,5,...|
|  0.0|(6012,[0,2,3,4,5,...|
|  0.0|(6012,[0,2,3,4,22...|
|  0.0|(6012,[66,78,94,1...|
|  0.0|(6012,[0,2,3,4,8,...|
|  0.0|(6012,[0,2,4,13,1...|
|  0.0|(6012,[1,7,14,20,...|
+-----+--------------------+
only showing top 20 rows



24/04/04 14:29:27 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , Email Text, Email Type
 Schema: _c0, Email Text, Email Type
Expected: _c0 but found: 
CSV file: file:///Users/ozgunozkan/Phishing_Email.csv


In [17]:
(train,test) = clean_df.randomSplit([0.7, 0.3], seed=42)

In [18]:
pred = nb.fit(train)

24/04/04 14:29:27 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , Email Text, Email Type
 Schema: _c0, Email Text, Email Type
Expected: _c0 but found: 
CSV file: file:///Users/ozgunozkan/Phishing_Email.csv
24/04/04 14:29:27 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , Email Text, Email Type
 Schema: _c0, Email Text, Email Type
Expected: _c0 but found: 
CSV file: file:///Users/ozgunozkan/Phishing_Email.csv


In [19]:
res = pred.transform(test)
res.show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(6012,[0,2,3,4,5,...|[-11026.073907973...|[1.0,5.2746578448...|       0.0|
|  0.0|(6012,[0,2,3,4,5,...|[-11720.016937427...|[1.02505159741735...|       1.0|
|  0.0|(6012,[0,2,3,4,5,...|[-21844.416692851...|[0.99999977772424...|       0.0|
|  0.0|(6012,[0,2,3,4,5,...|[-4845.6314253368...|[1.0,1.7494267563...|       0.0|
|  0.0|(6012,[0,2,3,4,5,...|[-4475.2039665818...|[1.0,2.2057099798...|       0.0|
|  0.0|(6012,[0,2,3,4,5,...|[-13014.465771948...|           [1.0,0.0]|       0.0|
|  0.0|(6012,[0,2,3,4,5,...|[-4986.2882172483...|[1.0,1.2150945776...|       0.0|
|  0.0|(6012,[0,2,3,4,7,...|[-16230.279765482...|[1.0,1.4072285441...|       0.0|
|  0.0|(6012,[0,2,3,4,8,...|[-9906.9287557877...|[7.98324586644927...|       1.0|
|  0.0|(6012,[0,

24/04/04 14:29:28 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , Email Text, Email Type
 Schema: _c0, Email Text, Email Type
Expected: _c0 but found: 
CSV file: file:///Users/ozgunozkan/Phishing_Email.csv
24/04/04 14:29:28 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS


In [20]:
eval = MulticlassClassificationEvaluator()
acc = eval.evaluate(res)
print(f"Accuracy: {acc*100}")

Accuracy: 65.29376426769193


24/04/04 14:29:28 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , Email Text, Email Type
 Schema: _c0, Email Text, Email Type
Expected: _c0 but found: 
CSV file: file:///Users/ozgunozkan/Phishing_Email.csv
