In [None]:
#Import các thư viện 

import pandas as pd
import numpy as np
import seaborn as sns
import time
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score

from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
#Đọc dữ liệu
df = pd.read_csv("/content/drive/My Drive/Data/SMSSpamCollection", sep ='\t',names=["label", "text"])
df.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
#Shape dữ liệu
df.shape

(5572, 2)

In [None]:
#Số các message spam và ham
df.label.value_counts()

ham     4825
spam     747
Name: label, dtype: int64

# **Xử lý dữ liệu**

In [None]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords.words('english')
import string
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

#Hàm xử lý ngôn ngữ 

def transform_text(text):

    #Chuyển thành chữ không in hoa
    text = text.lower()
    text = nltk.wordpunct_tokenize(text)

    #Bỏ các ký tự đặc biệt
    y = []
    for i in text:
        if i.isalnum():
            y.append(i)
            
    text = y[:]
    y.clear()
    
    for i in text:
        if i not in stopwords.words('english') and i not in string.punctuation:
            y.append(i)       
            
    text = y[:]
    y.clear()
    
    for i in text:
        y.append(ps.stem(i))

    return " ".join(y)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
#Apply hàm xử lý ngôn ngữ vào dataframe
df['text'] = df['text'].apply(transform_text)
df.head()

Unnamed: 0,label,text
0,ham,go jurong point crazi avail bugi n great world...
1,ham,ok lar joke wif u oni
2,spam,free entri 2 wkli comp win fa cup final tkt 21...
3,ham,u dun say earli hor u c alreadi say
4,ham,nah think goe usf live around though


In [None]:
#Chia X, y
X = df.drop(columns=['label'])
y = df.label
print(X)
print(y)

                                                   text
0     go jurong point crazi avail bugi n great world...
1                                 ok lar joke wif u oni
2     free entri 2 wkli comp win fa cup final tkt 21...
3                   u dun say earli hor u c alreadi say
4                  nah think goe usf live around though
...                                                 ...
5567  2nd time tri 2 contact u u 750 pound prize 2 c...
5568                            ü b go esplanad fr home
5569                                  piti mood suggest
5570  guy bitch act like interest buy someth els nex...
5571                                     rofl true name

[5572 rows x 1 columns]
0        ham
1        ham
2       spam
3        ham
4        ham
        ... 
5567    spam
5568     ham
5569     ham
5570     ham
5571     ham
Name: label, Length: 5572, dtype: object


In [None]:
#Chia train, test
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# **Không phân tán**

In [None]:
#Naive Bayes train
def NB_train(X_train, y_train):

  #Thêm nhãn vào tập X_train
  training_set = X_train.copy()
  training_set['label'] = y_train

  spam = training_set[training_set['label'] == 'spam']  #Lọc ra những dòng spam
  P_spam = len(spam)/len(training_set)  #Tỉ lệ spam trong tập train
  ham = training_set[training_set['label'] == 'ham']  #Lọc ra những dòng ham
  P_ham = len(ham)/len(training_set)  #Tỉ lệ ham trong tập train

  #Tạo ma trận
  vectorizer = CountVectorizer()
  vector_X_train = vectorizer.fit_transform(X_train['text'])
  matrix_X_train = vector_X_train.toarray()
  
  #Danh sách các từ
  vocab = vectorizer.get_feature_names_out()

  #Danh sách từ spam trong train
  spam_corpus = []
  for msg in training_set[training_set['label'] == 'spam']['text'].tolist():
    for word in msg.split():
        spam_corpus.append(word)
       
  spam_corpus = set(spam_corpus)
 
  #Danh sách từ ham trong train
  ham_corpus = []
  for msg in training_set[training_set['label'] == 'ham']['text'].tolist():
    for word in msg.split():
        ham_corpus.append(word)
       
  ham_corpus = set(ham_corpus)
  
  count_array_spam = [0]*len(vocab)
  spam_appear = 0
  count_array_ham = [0]*len(vocab)
  ham_appear = 0

  for row in matrix_X_train:
    for index in range(len(row)):
      if row[index] != 0:
        word = vocab[index]
        if word in spam_corpus:
          count_array_spam[index] = count_array_spam[index] + row[index]
          spam_appear = spam_appear + row[index]
        if word in ham_corpus:
          count_array_ham[index] = count_array_ham[index] + row[index]
          ham_appear = ham_appear + row[index]
  
  prob_array_spam = [0]*len(vocab)
  for i in range(len(vocab)):
    prob_array_spam[i] = (count_array_spam[i] + 1)/(1*len(vocab)+spam_appear)

  prob_array_ham = [0]*len(vocab)
  for i in range(len(vocab)):
    prob_array_ham[i] = (count_array_ham[i] + 1)/(1*len(vocab)+ham_appear)

  return [vocab, P_spam, P_ham, prob_array_spam, prob_array_ham]

In [None]:
#Naive Bayes test
def NB_test(model, X_test):

  vocab, P_spam, P_ham, prob_array_spam, prob_array_ham = model

  text_test = []
  for msg in X_test:
    for word in msg.split():
      text_test.append(word)

  count_array_test = [0]*len(vocab)

  for index in range(len(vocab)):
    if vocab[index] in text_test:
      count_array_test[index] = int(text_test.count(vocab[index]))

  result_spam = P_spam
  for index, value in zip(count_array_test, prob_array_spam):
    if index != 0:
      result_spam = result_spam * pow(value, index)

  result_ham = P_ham
  for index, value in zip(count_array_test, prob_array_ham):
    if index != 0:
      result_ham = result_ham * pow(value, index)

  if result_spam > result_ham:
    return 'spam'
  else:
    return 'ham'

In [None]:
def MNB(X_train, y_train, X_test):

  model = NB_train(X_train, y_train)

  results = list()
  
  for i in range(len(X_test)):
    results.append(NB_test(model, X_test.iloc[i]))

  return results

In [None]:
start = time.time()

y_pred = MNB(X_train, y_train, X_test)

stop = time.time()

print('Time: ', stop - start)

Time:  14.153392791748047


In [None]:
accuracy_score(y_pred, y_test)

0.97847533632287

# **Phân tán**

In [None]:
#Cài đặt pyspark
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.2.0.tar.gz (281.3 MB)
[K     |████████████████████████████████| 281.3 MB 32 kB/s 
[?25hCollecting py4j==0.10.9.2
  Downloading py4j-0.10.9.2-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 46.0 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.0-py2.py3-none-any.whl size=281805911 sha256=f6b82cc0a030f8c2e218a375fec2765e8d1e4e3f1f52ac8bb6361f591aaeac6e
  Stored in directory: /root/.cache/pip/wheels/0b/de/d2/9be5d59d7331c6c2a7c1b6d1a4f463ce107332b1ecd4e80718
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.2 pyspark-3.2.0


In [None]:
#Khởi tạo
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

sc = SparkSession.builder.master("local[*]").getOrCreate().sparkContext

In [None]:
#Hàm cộng 2 mảng
def f(x, y):
  return [a + b for a, b in zip(x, y)]

#Tương tự hàm NB nhưng xử lý ma trận bằng rdd
def NB_train_2(X_train, y_train):

  training_set = X_train.copy()
  training_set['label'] = y_train

  spam = training_set[training_set['label'] == 'spam']
  P_spam = len(spam)/len(training_set)
  ham = training_set[training_set['label'] == 'ham']
  P_ham = len(ham)/len(training_set)

  vectorizer = CountVectorizer()
  vector_X_train = vectorizer.fit_transform(X_train['text'])
  matrix_X_train = vector_X_train.toarray()

  #Gán label lên phần tử đầu các dòng ma trận
  matrix_label = np.append(np.swapaxes([y_train.values], 0, 1), matrix_X_train, axis = 1)

  #Tạo rdd
  rdd = sc.parallelize(matrix_label)
  #Chia phần tử thành key - value
  rdd1 = rdd.map(lambda x: (x[0], x[1::]))
  #Sử dụng reduceBykey
  count_array_1, count_array_2 = rdd1.reduceByKey(lambda x, y: f(x, y)).collect()

  #Tìm danh sách spam và ham
  if count_array_1[0] == 'ham':
    count_array_ham = count_array_1[1]
    count_array_spam = count_array_2[1]
  else:
    count_array_ham = count_array_2[1]
    count_array_spam = count_array_1[1]
  
  #Danh sách các từ
  vocab = vectorizer.get_feature_names_out()

  spam_appear = sum(count_array_spam)
  ham_appear = sum(count_array_ham)

  prob_array_spam = [0]*len(vocab)
  for i in range(len(vocab)):
    prob_array_spam[i] = (count_array_spam[i] + 1)/(1*len(vocab)+spam_appear)

  prob_array_ham = [0]*len(vocab)
  for i in range(len(vocab)):
    prob_array_ham[i] = (count_array_ham[i] + 1)/(1*len(vocab)+ham_appear)

  return [vocab, P_spam, P_ham, prob_array_spam, prob_array_ham]

In [None]:
# def MNB_2(X_train, y_train, X_test):

#   model = NB_train_2(X_train, y_train)

#   results = sc.parallelize(X_test['text'])
#   results = results.map(lambda x: [x])
#   results = results.map(lambda x: NB_test(model, x))

#   return results.collect()

# ----- Không nhanh hơn là bao --------

In [None]:
def MNB_2(X_train, y_train, X_test):

  model = NB_train_2(X_train, y_train)

  results = list()
  #reduce
  for i in range(len(X_test)):
    results.append(NB_test(model, X_test.iloc[i]))

  return results

In [None]:
start = time.time()

y_pred = MNB_2(X_train, y_train, X_test)

stop = time.time()

print('Time: ', stop - start)

Time:  8.591895818710327


In [None]:
accuracy_score(y_pred, y_test)

0.9883408071748879