# Text analytics (Unstructured)

## Spark Env

In [1]:
from urllib.parse import urlparse

# Reuse the Spark session if this kernel already defines one; otherwise
# create it with the init_spark helper.
try:
    spark
except NameError:
    # initialize Spark Session
    import os
    import sys

    # init_spark is imported from the parent of the current working
    # directory, so make that directory importable first.
    top_dir = os.path.abspath(os.path.join(os.getcwd(), "../"))
    if top_dir not in sys.path:
        sys.path.append(top_dir)

    from init_spark import init_spark
    spark = init_spark()

# Report where the Spark UI lives. uiWebUrl may be None (UI disabled) and the
# URL may lack an explicit port, so parse it rather than splitting on ':'.
ui_url = spark.sparkContext.uiWebUrl
ui_port = urlparse(ui_url).port if ui_url else None
if ui_port is not None:
    print('Spark UI running on port ' + str(ui_port))
elif ui_url:
    print('Spark UI running at ' + ui_url)
else:
    print('Spark UI is not available')

# Last expression: display the session summary in the notebook.
spark

Initializing Spark...
Spark found in :  /home/ubuntu/apps/spark
Spark config:
	 spark.app.name=TestApp
	spark.master=local[*]
	executor.memory=2g
	spark.sql.warehouse.dir=/tmp/tmp_y2z4iqp
Spark UI running on port 4040
Spark UI running on port 4040


## Read email data


In [2]:
# Load every file under the emails directory as plain text into a
# DataFrame with a single `value` column.
email_text = spark.read.text("../data/emails/")

# Preview the first 30 rows without truncating long values.
email_text.show(n=30, truncate=False)

+------------------------------------------+
|value                                     |
+------------------------------------------+
|From: "my name" <me@me.com>               |
|To: "your name" <you@you.com>             |
|Sent-From:  4.4.4.4                       |
|Date: 2017-11-01T16:42:15-0500            |
|Subject: team meeting this afternoon @ 2pm|
|                                          |
|Team,                                     |
|let's do a quick meeting today afternoon. |
|Let's discuss the current project.        |
|                                          |
|see you then!                             |
|From: "me" <me@me.com>                    |
|To: "your name" <you@you.com>             |
|Sent-From:  3.3.3.3                       |
|Date: 2017-11-01T16:42:15-0500            |
|Subject: Free Diploma!                    |
|                                          |
|!!!FREE Diploma!!!                        |
|Get your free diploma here                |
|Just clic

In [3]:
# How many lines of text are there in total?
# Counts the rows of the email_text DataFrame loaded above.
email_text.count()

45

## Hmm SPAM!
Let's look for spammy content.  
For simplicity, we are going to classify an email as spam if it contains `!!!`.

In [4]:

# Keep only the rows whose text contains the '!!!' marker.
spam_lines = email_text.where(email_text.value.contains('!!!'))

# Show up to 10 matching rows in full, then display how many matched.
spam_lines.show(n=10, truncate=False)
spam_lines.count()

+------------------------+
|value                   |
+------------------------+
|!!!FREE Diploma!!!      |
|Subject: !!!HOT DEALS!!!|
|!!!! HOT DEALS!!!!      |
|Subject: !!!MEDS Sale!!!|
+------------------------+



4

## Identify Spam Emails
To tell which emails are spam, we need to know the `file_name` of the email each line came from.

In [5]:
from pyspark.sql.functions import input_file_name

# Read the email text again, this time tagging every row with the
# file it was read from.
emails = (
    spark.read
    .text("../data/emails/")
    .withColumn(colName="file_name", col=input_file_name())
)

# Show up to 100 rows without truncating the (long) file paths.
emails.show(n=100, truncate=False)

+------------------------------------------+----------------------------------------------------------------------------+
|value                                     |file_name                                                                   |
+------------------------------------------+----------------------------------------------------------------------------+
|From: "my name" <me@me.com>               |file:///home/ubuntu/dev/work/ElephantScale/spark-workshop/data/emails/e4.txt|
|To: "your name" <you@you.com>             |file:///home/ubuntu/dev/work/ElephantScale/spark-workshop/data/emails/e4.txt|
|Sent-From:  4.4.4.4                       |file:///home/ubuntu/dev/work/ElephantScale/spark-workshop/data/emails/e4.txt|
|Date: 2017-11-01T16:42:15-0500            |file:///home/ubuntu/dev/work/ElephantScale/spark-workshop/data/emails/e4.txt|
|Subject: team meeting this afternoon @ 2pm|file:///home/ubuntu/dev/work/ElephantScale/spark-workshop/data/emails/e4.txt|
|                       

In [6]:
# Find spam: keep the rows containing '!!!'. Each row still carries the
# file_name column, so we can see which file it came from.
spam_lines = emails.where(emails.value.contains('!!!'))
spam_lines.show(n=10, truncate=False)

+------------------------+----------------------------------------------------------------------------+
|value                   |file_name                                                                   |
+------------------------+----------------------------------------------------------------------------+
|!!!FREE Diploma!!!      |file:///home/ubuntu/dev/work/ElephantScale/spark-workshop/data/emails/e3.txt|
|Subject: !!!HOT DEALS!!!|file:///home/ubuntu/dev/work/ElephantScale/spark-workshop/data/emails/e2.txt|
|!!!! HOT DEALS!!!!      |file:///home/ubuntu/dev/work/ElephantScale/spark-workshop/data/emails/e2.txt|
|Subject: !!!MEDS Sale!!!|file:///home/ubuntu/dev/work/ElephantScale/spark-workshop/data/emails/e5.txt|
+------------------------+----------------------------------------------------------------------------+



In [7]:
# Project just the file_name column of the spam rows
# (one row per matching line, so a file can appear more than once).
spam_lines.select(spam_lines.file_name).show(n=10, truncate=False)

+----------------------------------------------------------------------------+
|file_name                                                                   |
+----------------------------------------------------------------------------+
|file:///home/ubuntu/dev/work/ElephantScale/spark-workshop/data/emails/e3.txt|
|file:///home/ubuntu/dev/work/ElephantScale/spark-workshop/data/emails/e2.txt|
|file:///home/ubuntu/dev/work/ElephantScale/spark-workshop/data/emails/e2.txt|
|file:///home/ubuntu/dev/work/ElephantScale/spark-workshop/data/emails/e5.txt|
+----------------------------------------------------------------------------+



In [8]:
# De-duplicate the file names so each file with spam is listed once.
(
    spam_lines
    .select('file_name')
    .distinct()
    .show(n=10, truncate=False)
)

+----------------------------------------------------------------------------+
|file_name                                                                   |
+----------------------------------------------------------------------------+
|file:///home/ubuntu/dev/work/ElephantScale/spark-workshop/data/emails/e3.txt|
|file:///home/ubuntu/dev/work/ElephantScale/spark-workshop/data/emails/e5.txt|
|file:///home/ubuntu/dev/work/ElephantScale/spark-workshop/data/emails/e2.txt|
+----------------------------------------------------------------------------+



In [9]:
# Group the spam rows by file and count how many matching lines each file has.
(
    spam_lines
    .groupBy('file_name')
    .count()
    .show(n=10, truncate=False)
)

+----------------------------------------------------------------------------+-----+
|file_name                                                                   |count|
+----------------------------------------------------------------------------+-----+
|file:///home/ubuntu/dev/work/ElephantScale/spark-workshop/data/emails/e3.txt|1    |
|file:///home/ubuntu/dev/work/ElephantScale/spark-workshop/data/emails/e5.txt|1    |
|file:///home/ubuntu/dev/work/ElephantScale/spark-workshop/data/emails/e2.txt|2    |
+----------------------------------------------------------------------------+-----+

