![](images/11_00.jpg)

# 1. Đọc dữ liệu

In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName("Ham_vs_Spam").getOrCreate()

In [3]:
data = spark.read.csv("./data/smsspamcollection/SMSSpamCollection", inferSchema=True, sep="\t")

In [4]:
data = data.withColumnRenamed('_c0', 'class').withColumnRenamed('_c1', 'text')

In [5]:
data.show(3)

+-----+--------------------+
|class|                text|
+-----+--------------------+
|  ham|Go until jurong p...|
|  ham|Ok lar... Joking ...|
| spam|Free entry in 2 a...|
+-----+--------------------+
only showing top 3 rows



# 2. Làm sạch dữ liệu

In [6]:
from pyspark.sql.functions import length

In [7]:
data = data.withColumn('length', length(data['text']))

In [8]:
data.show(5)

+-----+--------------------+------+
|class|                text|length|
+-----+--------------------+------+
|  ham|Go until jurong p...|   111|
|  ham|Ok lar... Joking ...|    29|
| spam|Free entry in 2 a...|   155|
|  ham|U dun say so earl...|    49|
|  ham|Nah I don't think...|    61|
+-----+--------------------+------+
only showing top 5 rows



In [9]:
data.groupBy('class').mean().show()

+-----+-----------------+
|class|      avg(length)|
+-----+-----------------+
|  ham|71.45431945307645|
| spam|138.6706827309237|
+-----+-----------------+



# 3. Feature transformations và prepare dữ liệu

In [10]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF, StringIndexer

In [11]:
tokenizer = Tokenizer(inputCol='text', outputCol='token_text')
stopremove = StopWordsRemover(inputCol='token_text', outputCol='stop_tokens')
count_vec = CountVectorizer(inputCol='stop_tokens', outputCol='c_vec')
idf = IDF(inputCol='c_vec', outputCol='tf_idf')
ham_spam_to_num = StringIndexer(inputCol='class', outputCol='label')

In [12]:
from pyspark.ml.feature import VectorAssembler

In [13]:
clean_up = VectorAssembler(inputCols=['tf_idf', 'length'], outputCol='features')

* Áp dụng pipeline

In [14]:
from pyspark.ml import Pipeline

In [15]:
data_prep_pipe = Pipeline(stages=[ham_spam_to_num, tokenizer, stopremove, count_vec, idf, clean_up])

In [16]:
cleaner = data_prep_pipe.fit(data)

In [17]:
clean_data = cleaner.transform(data)

In [18]:
clean_data.toPandas()

Unnamed: 0,class,text,length,label,token_text,stop_tokens,c_vec,tf_idf,features
0,ham,"Go until jurong point, crazy.. Available only ...",111,0.0,"[go, until, jurong, point,, crazy.., available...","[go, jurong, point,, crazy.., available, bugis...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.11261885...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.11261885..."
1,ham,Ok lar... Joking wif u oni...,29,0.0,"[ok, lar..., joking, wif, u, oni...]","[ok, lar..., joking, wif, u, oni...]","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(2.016698353160939, 0.0, 0.0, 0.0, 0.0, 0.0, 0...","(2.016698353160939, 0.0, 0.0, 0.0, 0.0, 0.0, 0..."
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,1.0,"[free, entry, in, 2, a, wkly, comp, to, win, f...","[free, entry, 2, wkly, comp, win, fa, cup, fin...","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 2.704469176684504, 0.0, 0.0, 0.0, 0...","(0.0, 0.0, 2.704469176684504, 0.0, 0.0, 0.0, 0..."
3,ham,U dun say so early hor... U c already then say...,49,0.0,"[u, dun, say, so, early, hor..., u, c, already...","[u, dun, say, early, hor..., u, c, already, sa...","(2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(4.033396706321878, 0.0, 0.0, 0.0, 0.0, 0.0, 0...","(4.033396706321878, 0.0, 0.0, 0.0, 0.0, 0.0, 0..."
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,0.0,"[nah, i, don't, think, he, goes, to, usf,, he,...","[nah, think, goes, usf,, lives, around, though]","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...,...,...,...,...
5569,spam,This is the 2nd time we have tried 2 contact u...,160,1.0,"[this, is, the, 2nd, time, we, have, tried, 2,...","[2nd, time, tried, 2, contact, u., u, won, £75...","(1.0, 1.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(2.016698353160939, 2.3645559120072774, 5.4089...","(2.016698353160939, 2.3645559120072774, 5.4089..."
5570,ham,Will ü b going to esplanade fr home?,36,0.0,"[will, ü, b, going, to, esplanade, fr, home?]","[ü, b, going, esplanade, fr, home?]","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
5571,ham,"Pity, * was in mood for that. So...any other s...",57,0.0,"[pity,, *, was, in, mood, for, that., so...any...","[pity,, *, mood, that., so...any, suggestions?]","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
5572,ham,The guy did some bitching but I acted like i'd...,125,0.0,"[the, guy, did, some, bitching, but, i, acted,...","[guy, bitching, acted, like, interested, buyin...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [19]:
clean_data.show(3)

+-----+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|class|                text|length|label|          token_text|         stop_tokens|               c_vec|              tf_idf|            features|
+-----+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|  ham|Go until jurong p...|   111|  0.0|[go, until, juron...|[go, jurong, poin...|(13423,[7,11,31,6...|(13423,[7,11,31,6...|(13424,[7,11,31,6...|
|  ham|Ok lar... Joking ...|    29|  0.0|[ok, lar..., joki...|[ok, lar..., joki...|(13423,[0,24,297,...|(13423,[0,24,297,...|(13424,[0,24,297,...|
| spam|Free entry in 2 a...|   155|  1.0|[free, entry, in,...|[free, entry, 2, ...|(13423,[2,13,19,3...|(13423,[2,13,19,3...|(13424,[2,13,19,3...|
+-----+--------------------+------+-----+--------------------+--------------------+--------------------+--------------

In [20]:
clean_data = clean_data.select('label', 'features')

In [21]:
clean_data.show(5)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(13424,[7,11,31,6...|
|  0.0|(13424,[0,24,297,...|
|  1.0|(13424,[2,13,19,3...|
|  0.0|(13424,[0,70,80,1...|
|  0.0|(13424,[36,134,31...|
+-----+--------------------+
only showing top 5 rows



# 4. Tách dữ liệu train test

In [22]:
training, testing = clean_data.randomSplit((.7, .3))

# 5. Build model và dự đoán cho testing data

In [23]:
from pyspark.ml.classification import NaiveBayes

In [24]:
nb = NaiveBayes()

In [25]:
spam_predictor = nb.fit(training)

In [26]:
test_results = spam_predictor.transform(testing)

In [27]:
test_results.show(3)

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(13424,[0,1,3,9,1...|[-578.33703657090...|[0.99999999995724...|       0.0|
|  0.0|(13424,[0,1,9,14,...|[-538.84056804264...|[1.0,2.1775376387...|       0.0|
|  0.0|(13424,[0,1,12,33...|[-443.36662227550...|[1.0,5.4693631751...|       0.0|
+-----+--------------------+--------------------+--------------------+----------+
only showing top 3 rows



In [28]:
test_results.groupBy('label', 'prediction').count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0|  209|
|  0.0|       1.0|  134|
|  1.0|       0.0|    4|
|  0.0|       0.0| 1320|
+-----+----------+-----+



# 6. Đánh giá model

In [29]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [30]:
acc_eval = MulticlassClassificationEvaluator()
acc = acc_eval.evaluate(test_results)

In [31]:
acc

0.9249574996043879

> **Nhận xét**
> Độ chính xác cao.