In [1]:

import tensorflow as tf
import os
import io
import pandas as pd
import re

In [2]:
# Display the TensorFlow version this notebook is running against.
tf.__version__

'2.5.0'

* Đọc dữ liệu thô (không có định dạng rõ ràng): mỗi dòng gồm nhãn và nội dung tin nhắn, phân tách nhau bằng ký tự tab.

In [3]:
# Read the raw corpus into a list of lines; each line looks like "<label>\t<text>".
# The context manager guarantees the file handle is closed once the content
# has been read, and the explicit encoding keeps the result independent of
# the platform's default locale.
# NOTE(review): assumes the corpus file is UTF-8 encoded — confirm.
with io.open("./data/smsspamcollection/SMSSpamCollection", encoding="utf-8") as corpus_file:
    lines = corpus_file.read().strip().split('\n')

lines[0]

'ham\tGo until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

* Tách các label và text ra, đồng thời gán giá trị $1$ cho `spam` và $0$ cho `ham`.

In [4]:
# Turn every raw line into an (is_spam, text) pair:
# 1 when the label is `spam`, 0 for any other label (i.e. `ham`).
spam_dataset = []

for line in lines:
    # Split on the FIRST tab only: a message body that itself contains a tab
    # would otherwise yield more than two fields and make the unpacking raise
    # ValueError. A line without any tab still raises, as before.
    label, text = line.split('\t', 1)

    is_spam = 1 if label.strip() == 'spam' else 0
    spam_dataset.append((is_spam, text.strip()))

In [5]:
# Peek at the first parsed (label, text) pair.
spam_dataset[0]

(0,
 'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...')

In [6]:
# Number of parsed messages.
len(spam_dataset)

5574

# Text normalization

In [7]:
# Tabulate the parsed pairs: `Spam` holds the 0/1 label, `Message` the text.
df = pd.DataFrame(spam_dataset, columns=['Spam', 'Message'])

In [8]:
def message_length(x):
    """Return `len(x)`; for a string message this is its number of characters."""
    n_chars = len(x)
    return n_chars

def num_capitals(x):
    """Count the ASCII uppercase letters (A-Z) that occur in the string `x`."""
    # One regex match per capital letter, so the number of matches is the count.
    return len(re.findall(r"[A-Z]", x))

def num_punctuation(x):
    """
    Count the non-word characters in the string `x`.

    A non-word character is anything that is not a letter, a digit or an
    underscore. Note that this includes whitespace, not only punctuation marks.
    """
    return sum(1 for _ in re.finditer(r"\W", x))

In [9]:
# Derive the three hand-crafted numeric features from the message text.
# The columns are added in this order (Capitals, Punctuation, Length), which
# is also the order they appear in when the frame is summarised.
for column, extractor in (
    ('Capitals', num_capitals),
    ('Punctuation', num_punctuation),
    ('Length', message_length),
):
    df[column] = df['Message'].apply(extractor)

df.describe()

Unnamed: 0,Spam,Capitals,Punctuation,Length
count,5574.0,5574.0,5574.0,5574.0
mean,0.134015,5.621636,18.942591,80.443488
std,0.340699,11.683233,14.825994,59.841746
min,0.0,0.0,0.0,2.0
25%,0.0,1.0,8.0,36.0
50%,0.0,2.0,15.0,61.0
75%,0.0,4.0,27.0,122.0
max,1.0,129.0,253.0,910.0


* Chia dữ liệu thành train data và test data với kích thước 80% và 20%.

In [10]:
# Randomly pick 80% of the rows for training (fixed seed, so the split is
# repeatable); the rows that were not sampled form the test set.
train = df.sample(frac=0.8, random_state=42)
test = df.drop(index=train.index)

# Inputs are the three numeric features; labels are kept as one-column frames.
x_train, y_train = train[['Length', 'Capitals', 'Punctuation']], train[['Spam']]
x_test, y_test = test[['Length', 'Capitals', 'Punctuation']], test[['Spam']]

# Modeling normalized data
* Model sẽ sử dụng binary cross-entropy để tính loss và Adam optimizer để huấn luyện.
* Model được xây dựng dựa trên 3 feature là `Length`, `Capitals` và `Punctuation`.

In [11]:
def make_model(input_dims=3, num_units=12):
    """
    Build and compile a small binary classifier.

    The network is a `tf.keras.Sequential` stack made of one ReLU `Dense`
    hidden layer followed by a single sigmoid output unit. It is compiled
    with binary cross-entropy loss, the Adam optimizer and an accuracy metric.

    Args:
        input_dims (int, optional): Number of features in the input.
            Defaults to 3.
        num_units (int, optional): Number of units in the hidden layer.
            Defaults to 12.

    Returns:
        The compiled `tf.keras.Sequential` model.
    """
    # Hidden layer: fully connected, sized by `num_units`, fed `input_dims` features.
    hidden = tf.keras.layers.Dense(num_units,
                                   input_dim=input_dims,
                                   activation='relu')

    # Output layer: one sigmoid unit for the binary decision.
    output = tf.keras.layers.Dense(1, activation='sigmoid')

    model = tf.keras.Sequential([hidden, output])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model

In [12]:
# Fix TensorFlow's global seed before any weights are created so that a
# Restart & Run All does not train a different model on every run.
# NOTE(review): bit-exact repeatability can still depend on hardware and
# TensorFlow build — confirm if exact numbers matter.
tf.random.set_seed(42)

# Baseline model on the three hand-crafted features.
model = make_model()
model.fit(x_train, y_train, epochs=10, batch_size=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fc28808cd90>

In [13]:
# Score the model on the held-out split. With the loss and metric configured
# in make_model, the returned list is [loss, accuracy].
model.evaluate(x_test, y_test)



[0.20615598559379578, 0.9237667918205261]

In [14]:
# `Sequential.predict_classes` is deprecated in TF 2.5 and removed in later
# releases. For a single sigmoid output the equivalent is to threshold the
# predicted probability at 0.5 and cast the result to integer class labels.
y_train_pred = (model.predict(x_train) > 0.5).astype("int32")



In [15]:
# Confusion matrix on the TRAINING split: rows are the true labels,
# columns are the predicted labels.
tf.math.confusion_matrix(
    labels=tf.constant(y_train['Spam']),
    predictions=y_train_pred,
)

<tf.Tensor: shape=(2, 2), dtype=int32, numpy=
array([[3781,   86],
       [ 221,  371]], dtype=int32)>

# Tokenization
* Có thể xem các ngôn ngữ mà `stanfordnlp` hỗ trợ tokenization tại [https://stanfordnlp.github.io/stanfordnlp/models.html](https://stanfordnlp.github.io/stanfordnlp/models.html).

In [17]:
import stanfordnlp as snlp

In [19]:
# Fetch the Vietnamese models. The logged output shows the default `vi_vtb`
# treebank being selected and an interactive confirmation prompt.
# NOTE(review): `vi` is rebound to a Pipeline in a later cell, so the value
# stored here looks unused — confirm what download() returns.
vi = snlp.download('vi')

Using the default treebank "vi_vtb" for language "vi".
Would you like to download the models for: vi_vtb now? (Y/n)

Default download directory: /home/manhcuong/stanfordnlp_resources
Hit enter to continue or type an alternate directory.

Downloading models for: vi_vtb
Download location: /home/manhcuong/stanfordnlp_resources/vi_vtb_models.zip


100%|██████████| 217M/217M [00:52<00:00, 4.13MB/s]



Download complete.  Models saved to: /home/manhcuong/stanfordnlp_resources/vi_vtb_models.zip
Extracting models file for: vi_vtb
Cleaning up...Done.


In [20]:
# Fetch the English models. The logged output shows the default `en_ewt`
# treebank being selected and an interactive confirmation prompt.
# NOTE(review): `en` is rebound to a Pipeline in a later cell, so the value
# stored here looks unused — confirm what download() returns.
en = snlp.download('en')

Using the default treebank "en_ewt" for language "en".
Would you like to download the models for: en_ewt now? (Y/n)

Default download directory: /home/manhcuong/stanfordnlp_resources
Hit enter to continue or type an alternate directory.

Downloading models for: en_ewt
Download location: /home/manhcuong/stanfordnlp_resources/en_ewt_models.zip


100%|██████████| 235M/235M [00:57<00:00, 4.09MB/s]



Download complete.  Models saved to: /home/manhcuong/stanfordnlp_resources/en_ewt_models.zip
Extracting models file for: en_ewt
Cleaning up...Done.


In [21]:
# Vietnamese pipeline restricted to the tokenizer processor.
vi = snlp.Pipeline(lang='vi', processors='tokenize')

Use device: cpu
---
Loading: tokenize
With settings: 
{'model_path': '/home/manhcuong/stanfordnlp_resources/vi_vtb_models/vi_vtb_tokenizer.pt', 'lang': 'vi', 'shorthand': 'vi_vtb', 'mode': 'predict'}
Done loading processors!
---


In [27]:
# Sample Vietnamese text, run through the Vietnamese tokenizer pipeline.
sentence = "Có con đường nào bước qua ta đến mang em món quà hẹn hò yêu thương ta say đến già. Nắng mưa là chuyện nắng mưa, ai biết yêu thương đã vừa..."
tokenized = vi(sentence)

In [28]:
# Number of sentences the tokenizer split the text into.
len(tokenized.sentences)

2

> * `sentence` gồm 2 câu: câu thứ nhất từ "Có con đường..." đến "...say đến già.", câu thứ hai là "Nắng mưa... đã vừa...".

In [29]:
# Print each token on its own line, with a marker line after every sentence.
for sent in tokenized.sentences:
    for token in sent.tokens:
        print(token.text)

    print("<End of Sentence>")

Có
con
đường
nào
bước
qua ta
đến
mang
em
món
quà
hẹn
hò
yêu
thương
ta
say
đến
già
.
<End of Sentence>
Nắng
mưa
là
chuyện
nắng mưa
,
ai
biết
yêu thương
đã
vừa
...
<End of Sentence>


In [37]:
# English pipeline restricted to the tokenizer processor. It is used as the
# default `pipeline` argument of word_counts, so it must exist before that
# function is defined.
en = snlp.Pipeline(lang='en', processors='tokenize')

def word_counts(x, pipeline=en):
    """
    Count the tokens in `x`.

    `x` is run through `pipeline` and the tokens of every resulting sentence
    are added up.

    Args:
        x: Text to tokenize.
        pipeline: Callable that returns a document exposing `sentences`, each
            of which exposes `tokens`. Defaults to the module-level `en`
            pipeline (bound when this function is defined).

    Returns:
        int: Total number of tokens over all sentences.
    """
    total = 0
    for sent in pipeline(x).sentences:
        total += len(sent.tokens)

    return total

Use device: cpu
---
Loading: tokenize
With settings: 
{'model_path': '/home/manhcuong/stanfordnlp_resources/en_ewt_models/en_ewt_tokenizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
Done loading processors!
---


In [38]:
# Add the token count as a fourth feature on both splits.
for split in (train, test):
    split['Words'] = split['Message'].apply(word_counts)

# Rebuild inputs and labels with the extended feature set.
feature_columns = ['Words', 'Capitals', 'Punctuation', 'Length']
x_train, y_train = train[feature_columns], train[['Spam']]
x_test, y_test = test[feature_columns], test[['Spam']]

# Fresh model sized for the four input features.
model = make_model(input_dims=len(feature_columns))

In [39]:
# Train the four-feature model (features now include the token count).
model.fit(x_train, y_train, epochs=10, batch_size=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fc2194e8d90>

In [40]:
# Summary statistics of the training rows labelled spam (Spam == 1).
train.loc[train.Spam == 1].describe()

Unnamed: 0,Spam,Capitals,Punctuation,Length,Words
count,592.0,592.0,592.0,592.0,592.0
mean,1.0,15.320946,29.086149,138.856419,29.511824
std,0.0,11.635105,7.083572,28.07998,7.474256
min,1.0,0.0,2.0,13.0,3.0
25%,1.0,7.0,26.0,132.0,26.0
50%,1.0,14.0,30.0,149.0,30.0
75%,1.0,21.0,34.0,157.0,35.0
max,1.0,128.0,49.0,197.0,49.0


In [41]:
# Summary statistics of the training rows labelled ham (Spam == 0).
train.loc[train.Spam == 0].describe()

Unnamed: 0,Spam,Capitals,Punctuation,Length,Words
count,3867.0,3867.0,3867.0,3867.0,3867.0
mean,0.0,4.018878,17.325058,71.354538,17.344194
std,0.0,10.599291,14.826644,57.755351,13.811278
min,0.0,0.0,0.0,2.0,1.0
25%,0.0,1.0,8.0,33.0,8.0
50%,0.0,2.0,13.0,53.0,13.0
75%,0.0,3.0,23.0,92.0,22.0
max,0.0,129.0,253.0,910.0,209.0


# Stopwords

In [42]:
import stopwordsiso as stopwords

In [45]:
# Vietnamese stop-word collection from stopwordsiso.
vi_sw = stopwords.stopwords('vi')

In [48]:
# Number of Vietnamese stop-word entries.
len(vi_sw)

645

In [49]:
# NOTE(review): this displays the entire collection (several hundred entries);
# consider showing only a slice, e.g. of sorted(vi_sw), to keep the output short.
vi_sw

{'a ha',
 'a-lô',
 'ai',
 'ai ai',
 'ai nấy',
 'alô',
 'amen',
 'anh',
 'bao giờ',
 'bao lâu',
 'bao nhiêu',
 'bao nả',
 'bay biến',
 'biết',
 'biết bao',
 'biết bao nhiêu',
 'biết chừng nào',
 'biết mấy',
 'biết đâu',
 'biết đâu chừng',
 'biết đâu đấy',
 'bà',
 'bài',
 'bác',
 'bây bẩy',
 'bây chừ',
 'bây giờ',
 'bây nhiêu',
 'bèn',
 'béng',
 'bông',
 'bạn',
 'bản',
 'bất chợt',
 'bất cứ',
 'bất giác',
 'bất kì',
 'bất kể',
 'bất kỳ',
 'bất luận',
 'bất nhược',
 'bất quá',
 'bất thình lình',
 'bất tử',
 'bất đồ',
 'bấy',
 'bấy chầy',
 'bấy chừ',
 'bấy giờ',
 'bấy lâu',
 'bấy lâu nay',
 'bấy nay',
 'bấy nhiêu',
 'bập bà bập bõm',
 'bập bõm',
 'bắt đầu từ',
 'bằng',
 'bằng không',
 'bằng nấy',
 'bằng ấy',
 'bển',
 'bệt',
 'bị',
 'bỏ mẹ',
 'bỗng',
 'bỗng chốc',
 'bỗng dưng',
 'bỗng không',
 'bỗng nhiên',
 'bỗng đâu',
 'bộ',
 'bội phần',
 'bớ',
 'bởi',
 'bởi chưng',
 'bởi nhưng',
 'bởi thế',
 'bởi vì',
 'bởi vậy',
 'bức',
 'cao',
 'cha',
 'cha chả',
 'chao ôi',
 'chiếc',
 'cho',
 'cho nên

* Cập nhật lại hàm `word_counts` thêm chức năng loại bỏ stopword như sau:

In [50]:
# English stop-word collection; word_counts looks tokens up in it.
en_sw = stopwords.stopwords('en')
def word_counts(x, pipeline=en):
    """
    Count the tokens in `x` that are not English stop words.

    This replaces the earlier definition of the same name: `x` is tokenized
    with `pipeline`, and a token is counted only when its lower-cased text is
    not in the module-level `en_sw` collection.

    Args:
        x: Text to tokenize.
        pipeline: Callable that returns a document exposing `sentences`, each
            of which exposes `tokens`. Defaults to the module-level `en`
            pipeline (bound when this function is defined).

    Returns:
        int: Number of tokens that are not stop words.
    """
    doc = pipeline(x)
    return sum(
        1
        for sent in doc.sentences
        for tok in sent.tokens
        if tok.text.lower() not in en_sw
    )

In [51]:
# Recompute `Words` on both splits with the stop-word-aware word_counts,
# overwriting the column produced by the previous definition.
for split in (train, test):
    split['Words'] = split['Message'].apply(word_counts)

# Rebuild inputs and labels so they pick up the refreshed column.
feature_columns = ['Words', 'Capitals', 'Punctuation', 'Length']
x_train, y_train = train[feature_columns], train[['Spam']]
x_test, y_test = test[feature_columns], test[['Spam']]

# Fresh model sized for the four input features.
model = make_model(input_dims=len(feature_columns))

In [52]:
# Train the four-feature model on the stop-word-filtered token counts.
model.fit(x_train, y_train, epochs=10, batch_size=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fc27aebea30>

In [53]:
# Summary statistics of the spam training rows; `Words` now excludes stop words.
train.loc[train.Spam == 1].describe()

Unnamed: 0,Spam,Capitals,Punctuation,Length,Words
count,592.0,592.0,592.0,592.0,592.0
mean,1.0,15.320946,29.086149,138.856419,18.464527
std,0.0,11.635105,7.083572,28.07998,6.100852
min,1.0,0.0,2.0,13.0,2.0
25%,1.0,7.0,26.0,132.0,14.0
50%,1.0,14.0,30.0,149.0,19.0
75%,1.0,21.0,34.0,157.0,23.0
max,1.0,128.0,49.0,197.0,33.0


In [54]:
# Summary statistics of the ham training rows; `Words` now excludes stop words.
train.loc[train.Spam == 0].describe()

Unnamed: 0,Spam,Capitals,Punctuation,Length,Words
count,3867.0,3867.0,3867.0,3867.0,3867.0
mean,0.0,4.018878,17.325058,71.354538,7.911042
std,0.0,10.599291,14.826644,57.755351,7.32639
min,0.0,0.0,0.0,2.0,0.0
25%,0.0,1.0,8.0,33.0,4.0
50%,0.0,2.0,13.0,53.0,6.0
75%,0.0,3.0,23.0,92.0,10.0
max,0.0,129.0,253.0,910.0,147.0


In [55]:
# Score the four-feature model on the held-out split. With the loss and metric
# configured in make_model, the returned list is [loss, accuracy].
model.evaluate(x_test, y_test)



[0.20706488192081451, 0.9300448298454285]