# 使用深度学习来对文本进行分类

In [156]:
import pandas as pd 
import numpy as np
import re
from torch.utils.data import Dataset,DataLoader
import torch
from torch import nn

In [157]:
df = pd.read_csv("SPAM_text.csv", encoding='ISO-8859-1')

# Encode the target as integers: "ham" -> 0, "spam" (junk mail) -> 1.
# Mapping the single column explicitly (instead of a frame-wide DataFrame.replace)
# avoids the string->int downcasting FutureWarning on recent pandas and guarantees
# an integer label column for the tensor conversion further down.
label_codes = {"ham": 0, "spam": 1}
encoded = df["Category"].map(label_codes)
if encoded.isna().any():
    # Fail loudly rather than silently training on NaN labels.
    raise ValueError("Category contains values other than 'ham'/'spam'")
df = (
    df.assign(Category=encoded.astype("int64"))
      .rename(columns={"Category": "label"})
)

In [158]:
# Peek at the first rows to confirm the 0/1 label encoding and the renamed column.
df.head()


Unnamed: 0,label,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [159]:
# 80/10/10 train/validation/test split.
# Shuffle once with a fixed seed so the split (and every metric below) is
# reproducible after a kernel restart, then slice by position. Slicing with
# .iloc also avoids np.split on a DataFrame, which goes through a deprecated
# pandas code path and emits a FutureWarning on recent versions.
shuffled = df.sample(frac=1, random_state=42)
train_end = int(0.8 * len(shuffled))
val_end = int(0.9 * len(shuffled))
train = shuffled.iloc[:train_end]
val = shuffled.iloc[train_end:val_end]
test = shuffled.iloc[val_end:]

In [162]:
import tensorflow as tf 
def df_to_dataset(dataframe, shuffle=True, batch_size=512,
                  text_col="Message", label_col="label"):
    """Build a batched ``tf.data.Dataset`` of ``(text, label)`` pairs.

    Args:
        dataframe: DataFrame holding at least ``text_col`` and ``label_col``.
            It is only read, never modified.
        shuffle: If True, shuffle the examples with a buffer as large as the
            frame.
        batch_size: Number of examples per batch.
        text_col: Name of the column with the raw message text.
        label_col: Name of the column with the target labels.

    Returns:
        A ``tf.data.Dataset`` yielding ``(text_batch, label_batch)`` tuples,
        with prefetching enabled.

    Raises:
        KeyError: If ``text_col`` or ``label_col`` is missing from the frame.
    """
    # Select the two columns directly: nothing is mutated, so no defensive
    # copy of the whole frame is needed, and the notebook-level `df` is not
    # shadowed by a local of the same name.
    texts = dataframe[text_col]
    labels = dataframe[label_col]
    ds = tf.data.Dataset.from_tensor_slices((texts, labels))
    if shuffle:
        # Buffer covers every row, so the shuffle is over the full dataset.
        ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size)
    # Let tf.data choose how many batches to prepare ahead of the consumer.
    return ds.prefetch(tf.data.AUTOTUNE)


In [163]:
# Only the training split needs shuffling. Keeping validation/test in frame
# order skips pointless shuffle work and keeps predictions aligned with the
# rows of `val` / `test` for later error analysis.
train_data = df_to_dataset(train)
valid_data = df_to_dataset(val, shuffle=False)
test_data = df_to_dataset(test, shuffle=False)

## LSTM 模型

In [164]:
# Text -> integer-id layer with a vocabulary capped at 2000 tokens.
encoder = tf.keras.layers.TextVectorization(max_tokens=2000)

# Learn the vocabulary from the training messages only (labels are dropped).
train_text = train_data.map(lambda text, label: text)
encoder.adapt(train_text)


In [165]:
# Materialise the learned vocabulary and show its first 20 entries.
learned_tokens = encoder.get_vocabulary()
vocab = np.array(learned_tokens)
vocab[:20]

array(['', '[UNK]', 'i', 'to', 'you', 'a', 'the', 'u', 'and', 'is', 'in',
       'me', 'my', 'for', 'your', 'it', 'of', 'call', 'on', 'have'],
      dtype='<U36')

In [166]:
# Layer stack: raw string -> token ids -> 32-d embeddings -> LSTM -> dense head.
vocab_size = len(encoder.get_vocabulary())

layer_stack = [
    encoder,
    # mask_zero=True makes downstream layers skip positions holding id 0.
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=32, mask_zero=True),
    tf.keras.layers.LSTM(32),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dropout(0.4),
    # Single sigmoid unit: output is a score in (0, 1) for label 1 (spam).
    tf.keras.layers.Dense(1, activation='sigmoid'),
]

model = tf.keras.Sequential(layer_stack)
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_1 (TextV  (None, None)             0         
 ectorization)                                                   
                                                                 
 embedding_1 (Embedding)     (None, None, 32)          64000     
                                                                 
 lstm_1 (LSTM)               (None, 32)                8320      
                                                                 
 dense_2 (Dense)             (None, 32)                1056      
                                                                 
 dropout_1 (Dropout)         (None, 32)                0         
                                                                 
 dense_3 (Dense)             (None, 1)                 33        
                                                      

In [167]:
# Binary classification setup: Adam at lr=1e-3, binary cross-entropy on the
# sigmoid output, accuracy reported as the metric.
adam = tf.keras.optimizers.Adam(learning_rate=0.001)
bce = tf.keras.losses.BinaryCrossentropy()
model.compile(optimizer=adam, loss=bce, metrics=['accuracy'])

In [168]:
# Optional sanity check (left disabled): score the compiled but not-yet-fitted
# model on the train/validation splits before training starts.
# model.evaluate(train_data)
# model.evaluate(valid_data)

In [169]:
# Train for 5 epochs, scoring the validation split at the end of each one.
history = model.fit(
    train_data,
    validation_data=valid_data,
    epochs=5,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [184]:
# Score the fitted model on the training split, then on the validation split.
train_scores = model.evaluate(train_data)
valid_scores = model.evaluate(valid_data)
valid_scores  # displayed cell result: [loss, accuracy] on the validation split



[0.32495930790901184, 0.8420107960700989]

In [185]:
# Final check on the held-out test split; result is [loss, accuracy].
test_scores = model.evaluate(test_data)
test_scores



[0.2952617108821869, 0.8673835396766663]

In [188]:
# Score one raw message; the output is the sigmoid score for label 1 (spam).
# The text is wrapped in a string tensor because newer Keras versions reject a
# plain Python list of strings in predict(), while a tf.constant batch of
# shape (1,) is accepted by old and new versions alike.
sample_messages = tf.constant(["Got c... I lazy to type... I forgot ü in lect... I saw a pouch but like not v nice..."])
model.predict(sample_messages)

array([[0.06010267]], dtype=float32)