In [1]:
#Import needed Apache Arrow Python library for flight

from pyarrow import flight
import pyarrow as pa

In [2]:
#Define Dremio Client Authentication Methods

class HttpDremioClientAuthHandler(flight.ClientAuthHandler):

    def __init__(self, username, password):
        super(flight.ClientAuthHandler, self).__init__()
        self.basic_auth = flight.BasicAuth(username, password)
        self.token = None

    def authenticate(self, outgoing, incoming):
        auth = self.basic_auth.serialize()
        outgoing.write(auth)
        self.token = incoming.read()

    def get_token(self):
        return self.token

In [3]:
username = 'george'
password = '<redacted>'
sql = '''select * from "fraud.dremio".SMSSpamCollection'''

In [4]:
#Connect to Dremio with flight connector on port 47470 mentions earlier in the writing

client = flight.FlightClient('grpc+tcp://10.0.2.15:47470')
client.authenticate(HttpDremioClientAuthHandler(username, password)) 

In [5]:
#passing in SQL query statement to Dremio, execute and returns the data in pandas dataframe pdf
info = client.get_flight_info(flight.FlightDescriptor.for_command(sql))
reader = client.do_get(info.endpoints[0].ticket)
batches = []

In [6]:
import pandas as pd
while True: 
    try:
        batch, metadata = reader.read_chunk()
        batches.append(batch)
    except StopIteration:
        break
data = pa.Table.from_batches(batches)
pdf = data.to_pandas()

In [7]:
pdf.head()

Unnamed: 0,A,B
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:

import sys,os,os.path
os.environ['SPARK_HOME']='/opt/spark/'
import matplotlib.pyplot as plt
%matplotlib inline
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext

spark = SparkSession.builder.getOrCreate()
sc=spark.sparkContext

In [9]:
#Convert the pandas dataframe to SparkSQL dataframe
sqlCtx = SQLContext(sc)
df = sqlCtx.createDataFrame(pdf)

In [10]:
df.show(5)

+----+--------------------+
|   A|                   B|
+----+--------------------+
| ham|Go until jurong p...|
| ham|Ok lar... Joking ...|
|spam|Free entry in 2 a...|
| ham|U dun say so earl...|
| ham|Nah I don't think...|
+----+--------------------+
only showing top 5 rows



In [11]:
#Note: Spam is Spam, Han is OK. Rename Column name A as status, B as feature

df = df.withColumnRenamed('A', 'status').withColumnRenamed('B', 'message')
df.show(3, truncate = False)

+------+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
|status|message                                                                                                                                                    |
+------+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
|ham   |Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...                                            |
|ham   |Ok lar... Joking wif u oni...                                                                                                                              |
|spam  |Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's|
+------+--

In [12]:
#Encode status column to numeric: ham to 1.0 and spam to 0. 
#All our fields need to be numeric for machine to learn, also rename the column status to label

df.createOrReplaceTempView('temp')
df = spark.sql('select case status when "ham" then 1.0  else 0 end as label, message from temp')
df.show()

+-----+--------------------+
|label|             message|
+-----+--------------------+
|  1.0|Go until jurong p...|
|  1.0|Ok lar... Joking ...|
|  0.0|Free entry in 2 a...|
|  1.0|U dun say so earl...|
|  1.0|Nah I don't think...|
|  0.0|FreeMsg Hey there...|
|  1.0|Even my brother i...|
|  1.0|As per your reque...|
|  0.0|WINNER!! As a val...|
|  0.0|Had your mobile 1...|
|  1.0|I'm gonna be home...|
|  0.0|SIX chances to wi...|
|  0.0|URGENT! You have ...|
|  1.0|I've been searchi...|
|  1.0|I HAVE A DATE ON ...|
|  0.0|XXXMobileMovieClu...|
|  1.0|Oh k...i'm watchi...|
|  1.0|Eh u remember how...|
|  1.0|Fine if thats th...|
|  0.0|England v Macedon...|
+-----+--------------------+
only showing top 20 rows



In [13]:
#1 is OK, 0 is Spam
#Tokenize the messages Tokenization is the process of taking text (such as a sentence) 
# and breaking it into individual terms (usually words). Let’s tokenize the messages 
#and create a list of words of each message. 


from pyspark.ml.feature import  Tokenizer
tokenizer = Tokenizer(inputCol="message", outputCol="words")
wordsData = tokenizer.transform(df)
wordsData.show(3)

+-----+--------------------+--------------------+
|label|             message|               words|
+-----+--------------------+--------------------+
|  1.0|Go until jurong p...|[go, until, juron...|
|  1.0|Ok lar... Joking ...|[ok, lar..., joki...|
|  0.0|Free entry in 2 a...|[free, entry, in,...|
+-----+--------------------+--------------------+
only showing top 3 rows



In [14]:
#CountVectorizer converts a collection of text documents to vectors of token counts.

from pyspark.ml.feature import CountVectorizer
count = CountVectorizer (inputCol="words", outputCol="rawFeatures")
model = count.fit(wordsData)
featurizedData = model.transform(wordsData)
featurizedData.show(3)

+-----+--------------------+--------------------+--------------------+
|label|             message|               words|         rawFeatures|
+-----+--------------------+--------------------+--------------------+
|  1.0|Go until jurong p...|[go, until, juron...|(13587,[8,43,53,6...|
|  1.0|Ok lar... Joking ...|[ok, lar..., joki...|(13587,[5,76,409,...|
|  0.0|Free entry in 2 a...|[free, entry, in,...|(13587,[0,3,8,22,...|
+-----+--------------------+--------------------+--------------------+
only showing top 3 rows



In [15]:
#IDF reduces the features that often appear in the corpus. When using text as a feature, 
#this usually improves performance because the most common, 
#and therefore less important, words are weighted down

from pyspark.ml.feature import  IDF
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
rescaledData.select("label", "features").show(3)  #Only needed to train

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  1.0|(13587,[8,43,53,6...|
|  1.0|(13587,[5,76,409,...|
|  0.0|(13587,[0,3,8,22,...|
+-----+--------------------+
only showing top 3 rows



In [16]:
#Randomly Split DataFrame into 80% Training (trainDF) and 20 Testing (testDF)

seed = 0  # random seed 0
trainDF, testDF = rescaledData.randomSplit([0.8,0.2],seed)

In [17]:
trainDF.count()

3853

In [18]:
testDF.count()

984

In [19]:
#Logistic regression classifier

#Logistic regression is a common method of predicting classification responses. 
#A special case of a generalized linear model is the probability of predicting a result. 
#In spark.ml, logistic regression can be used to predict binary results 
#by binomial logistic regression, or it can be used to predict multiple types of results by using multiple logistic regression. 
#Use the family parameter to choose between these two algorithms, or leave it unset and Spark will infer the correct variable.

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
import numpy as np
lr = LogisticRegression(maxIter = 100)

model_lr = lr.fit(trainDF)

In [20]:
prediction_lr = model_lr.transform(testDF)

In [21]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
my_eval_lr = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='label', metricName='areaUnderROC')
my_eval_lr.evaluate(prediction_lr)

0.8734030197444833

In [22]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
my_mc_lr = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='label', metricName='f1')
my_mc_lr.evaluate(prediction_lr)

0.9654997463216642

In [23]:
my_mc_lr = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='label', metricName='accuracy')
my_mc_lr.evaluate(prediction_lr)

0.967479674796748

In [24]:
train_fit_lr = prediction_lr.select('label','prediction')
train_fit_lr.groupBy('label','prediction').count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  0.0|       1.0|   31|
|  1.0|       1.0|  860|
|  0.0|       0.0|   92|
|  1.0|       0.0|    1|
+-----+----------+-----+



In [25]:
#Naive Bayes Naive Bayesian classifiers are a class of simple probability classifiers that apply strong (naive) 
#independent assumptions between features based on Bayes' theorem. The spark.ml implementation 
#currently supports polynomial naive Bayes and Bernoulli Naïve Bayes.

from pyspark.ml.classification import NaiveBayes
nb = NaiveBayes()
Model_nb = nb.fit(trainDF)

In [26]:
predictions_nb = Model_nb.transform(testDF)
predictions_nb.select('label', 'prediction').show(5)

+-----+----------+
|label|prediction|
+-----+----------+
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
+-----+----------+
only showing top 5 rows



In [27]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
my_eval_nb = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='label', metricName='areaUnderROC')
my_eval_nb.evaluate(predictions_nb)

0.937862950058072

In [28]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
my_mc_nb = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='label', metricName='f1')
my_mc_nb.evaluate(predictions_nb)

0.933544453535483

In [29]:
my_mc_nb = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='label', metricName='accuracy')
my_mc_nb.evaluate(predictions_nb)

0.9278455284552846