### 文本数据建模流程范例

1. 准备数据
- `imdb`数据集的目标是根据电影评论的文本内容预测评论的情感标签
- `tensorflow`中完成文本数据预处理的常用方案有两种：
  1. 利用`tf.keras.preprocessing`中的`Tokenizer`词典构建工具和`tf.keras.utils.Sequence`构建文本数据生成器管道（较为复杂）
  2. 使用`tf.data.Dataset`搭配`tf.keras.layers.experimental.preprocessing.TextVectorization`预处理层（TensorFlow原生方式，相对简单）

In [1]:
import numpy as np 
import pandas as pd 
from matplotlib import pyplot as plt
import tensorflow as tf
from tensorflow.keras import models,layers,preprocessing,optimizers,losses,metrics
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
import re,string

2023-08-18 10:43:02.070328: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-08-18 10:43:02.135062: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-08-18 10:43:02.136348: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Locations of the IMDB training and test files, relative to the notebook.
# NOTE(review): the files are named .csv but are parsed as tab-separated lines
# elsewhere in this notebook -- confirm the on-disk format.
train_data_path = "../data/imdb/train.csv"
test_data_path =  "../data/imdb/test.csv"

In [3]:
MAX_WORDS = 10000  # only consider the 10000 most frequent words
MAX_LEN = 200  # keep a length of 200 tokens per sample
BATCH_SIZE = 20  # number of samples per batch in the tf.data pipelines

In [5]:
# 使用第二种方法构建管道
def split_line(line):
    """Parse one tab-separated line into a ``(text, label)`` pair.

    The first field is converted to a number and cast to int32 (the label);
    the second field is kept as the raw text. Both get a new leading axis.
    """
    fields = tf.strings.split(line,"\t")
    raw_label, raw_text = fields[0], fields[1]
    label_id = tf.cast(tf.strings.to_number(raw_label),tf.int32)
    # tf.expand_dims inserts a new axis at the given position (axis 0 here),
    # so each parsed element carries an explicit leading dimension.
    return (tf.expand_dims(raw_text,axis = 0), tf.expand_dims(label_id,axis = 0))

In [6]:
# Training pipeline: read the file line by line, parse each line with
# split_line, shuffle with a 1000-element buffer, batch, and prefetch.
ds_train_raw =  tf.data.TextLineDataset(filenames = [train_data_path]) \
   .map(split_line,num_parallel_calls = tf.data.experimental.AUTOTUNE) \
   .shuffle(buffer_size = 1000).batch(BATCH_SIZE) \
   .prefetch(tf.data.experimental.AUTOTUNE)

2023-08-18 10:47:58.254353: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-08-18 10:47:58.297488: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1960] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [7]:
# Test pipeline: same parsing and batching as the training pipeline, but
# without shuffling, so the batch order follows the file order.
ds_test_raw = tf.data.TextLineDataset(filenames = [test_data_path]) \
   .map(split_line,num_parallel_calls = tf.data.experimental.AUTOTUNE) \
   .batch(BATCH_SIZE) \
   .prefetch(tf.data.experimental.AUTOTUNE)

In [8]:
#构建词典
def clean_text(text):
    """Standardize raw review text before tokenization.

    Lower-cases the input, replaces every ``<br />`` with a single space, and
    then deletes every character listed in ``string.punctuation``.
    """
    # Character class matching any single ASCII punctuation character.
    punctuation_pattern = '[%s]' % re.escape(string.punctuation)

    without_breaks = tf.strings.regex_replace(
        tf.strings.lower(text), '<br />', ' ')
    return tf.strings.regex_replace(without_breaks, punctuation_pattern, '')

In [9]:
# Text-vectorization layer: standardizes with clean_text, splits on
# whitespace, maps tokens to integer ids, and yields sequences of MAX_LEN ids.
vectorize_layer = TextVectorization(
    standardize=clean_text,
    split = 'whitespace',
    max_tokens=MAX_WORDS-1, # one slot is reserved for the placeholder
    output_mode='int',
    output_sequence_length=MAX_LEN)

# Fit the vocabulary on the training text only (labels are dropped), then
# print the first 100 vocabulary entries as a sanity check.
ds_text = ds_train_raw.map(lambda text,label: text)
vectorize_layer.adapt(ds_text)
print(vectorize_layer.get_vocabulary()[0:100])

['', '[UNK]', 'the', 'and', 'a', 'of', 'to', 'is', 'in', 'it', 'i', 'this', 'that', 'was', 'as', 'for', 'with', 'movie', 'but', 'film', 'on', 'not', 'you', 'his', 'are', 'have', 'be', 'he', 'one', 'its', 'at', 'all', 'by', 'an', 'they', 'from', 'who', 'so', 'like', 'her', 'just', 'or', 'about', 'has', 'if', 'out', 'some', 'there', 'what', 'good', 'more', 'when', 'very', 'she', 'even', 'my', 'no', 'would', 'up', 'time', 'only', 'which', 'story', 'really', 'their', 'were', 'had', 'see', 'can', 'me', 'than', 'we', 'much', 'well', 'get', 'been', 'will', 'into', 'people', 'also', 'other', 'do', 'bad', 'because', 'great', 'first', 'how', 'him', 'most', 'dont', 'made', 'then', 'them', 'films', 'movies', 'way', 'make', 'could', 'too', 'any']


In [10]:
# Encode words: replace the raw text of every batch with its integer token
# ids via vectorize_layer; labels pass through unchanged.
ds_train = ds_train_raw.map(lambda text,label:(vectorize_layer(text),label)) \
    .prefetch(tf.data.experimental.AUTOTUNE)
ds_test = ds_test_raw.map(lambda text,label:(vectorize_layer(text),label)) \
    .prefetch(tf.data.experimental.AUTOTUNE)

2. 定义模型

In [11]:
# Build a custom model by subclassing the Model base class.
# Clear Keras' global state first so a fresh model is built in this cell run.
tf.keras.backend.clear_session()

In [12]:
class CnnModel(models.Model):
    """1D-CNN binary classifier over integer-encoded token sequences.

    Pipeline: Embedding(MAX_WORDS, 7) -> Conv1D(16, k=5) -> MaxPool1D ->
    Conv1D(128, k=2) -> MaxPool1D -> Flatten -> Dense(1, sigmoid).
    The sub-layers are created in ``build``, so ``build`` must be called
    (directly or by the first forward pass) before ``call`` is used.
    """

    def __init__(self):
        super(CnnModel, self).__init__()

    def build(self,input_shape):
        """Create the sub-layers, then let the base class finish building."""
        self.embedding = layers.Embedding(MAX_WORDS,7,input_length=MAX_LEN)
        self.conv_1 = layers.Conv1D(16, kernel_size= 5,name = "conv_1",activation = "relu")
        self.pool_1 = layers.MaxPool1D(name = "pool_1")
        self.conv_2 = layers.Conv1D(128, kernel_size=2,name = "conv_2",activation = "relu")
        self.pool_2 = layers.MaxPool1D(name = "pool_2")
        self.flatten = layers.Flatten()
        self.dense = layers.Dense(1,activation = "sigmoid")
        super(CnnModel,self).build(input_shape)

    def call(self, x):
        """Run the forward pass and return the sigmoid output of the Dense layer."""
        x = self.embedding(x)
        x = self.conv_1(x)
        x = self.pool_1(x)
        x = self.conv_2(x)
        x = self.pool_2(x)
        x = self.flatten(x)
        x = self.dense(x)
        return(x)

    # Used to display Output Shape: a subclassed model does not report
    # per-layer output shapes, so wrap the same layers in a functional model.
    def summary(self, *args, **kwargs):
        """Print a per-layer summary including output shapes.

        Any positional/keyword arguments (e.g. ``line_length``, ``print_fn``)
        are forwarded to ``tf.keras.Model.summary`` of the wrapper model.
        """
        # ``shape`` is documented as a tuple excluding the batch axis; a bare
        # int is not accepted by every Keras version.
        x_input = layers.Input(shape = (MAX_LEN,))
        output = self.call(x_input)
        model = tf.keras.Model(inputs = x_input,outputs = output)
        model.summary(*args, **kwargs)

# Instantiate the model, create its layers for (batch, MAX_LEN) inputs, and
# print the layer-by-layer summary.
model = CnnModel()
model.build(input_shape =(None,MAX_LEN))
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 200)]             0         
                                                                 
 embedding (Embedding)       (None, 200, 7)            70000     
                                                                 
 conv_1 (Conv1D)             (None, 196, 16)           576       
                                                                 
 pool_1 (MaxPooling1D)       (None, 98, 16)            0         
                                                                 
 conv_2 (Conv1D)             (None, 97, 128)           4224      
                                                                 
 pool_2 (MaxPooling1D)       (None, 48, 128)           0         
                                                                 
 flatten (Flatten)           (None, 6144)              0     

3. 训练模型

In [13]:
# 自定义训练循环训练模型
#打印时间分割线
@tf.function
def printbar():
    """Print a line of 80 '=' characters followed by the time as HH:MM:SS."""
    seconds_today = tf.timestamp()%(24*60*60)

    # The +8 shifts the hour by eight before wrapping into 0-23
    # (presumably converting UTC to UTC+8 local time -- confirm).
    hour = tf.cast(seconds_today//3600+8,tf.int32)%tf.constant(24)
    minute = tf.cast((seconds_today%3600)//60,tf.int32)
    second = tf.cast(tf.floor(seconds_today%60),tf.int32)

    def two_digits(value):
        # Left-pad single-digit values with a zero.
        as_text = tf.strings.format("{}",value)
        if tf.strings.length(as_text)==1:
            return(tf.strings.format("0{}",value))
        else:
            return(as_text)

    parts = [two_digits(hour),two_digits(minute),two_digits(second)]
    timestring = tf.strings.join(parts,separator = ":")
    tf.print("=========="*8+timestring)

In [14]:
# Module-level training state used by the custom training loop below.
optimizer = optimizers.Nadam()
loss_func = losses.BinaryCrossentropy()

# Running averages / accuracies accumulated over the training batches.
train_loss = metrics.Mean(name='train_loss')
train_metric = metrics.BinaryAccuracy(name='train_accuracy')

# Same accumulators for the validation batches.
valid_loss = metrics.Mean(name='valid_loss')
valid_metric = metrics.BinaryAccuracy(name='valid_accuracy')

In [15]:
@tf.function
def train_step(model, features, labels):
    """Run one optimization step on a single batch.

    Computes the loss under a gradient tape, applies the gradients to the
    model's trainable variables with the module-level ``optimizer``, and
    updates the module-level ``train_loss`` / ``train_metric`` accumulators.
    """
    with tf.GradientTape() as grad_tape:
        batch_preds = model(features,training = True)
        batch_loss = loss_func(labels, batch_preds)

    weights = model.trainable_variables
    grads = grad_tape.gradient(batch_loss, weights)
    optimizer.apply_gradients(zip(grads, weights))

    train_loss.update_state(batch_loss)
    train_metric.update_state(labels, batch_preds)


@tf.function
def valid_step(model, features, labels):
    """Evaluate one batch in inference mode without updating any weights.

    Feeds the batch loss and predictions into the module-level ``valid_loss``
    and ``valid_metric`` accumulators.
    """
    batch_preds = model(features,training = False)
    valid_loss.update_state(loss_func(labels, batch_preds))
    valid_metric.update_state(labels, batch_preds)


def train_model(model,ds_train,ds_valid,epochs):
    """Train ``model`` for ``epochs`` passes over ``ds_train``.

    After each pass the model is evaluated on ``ds_valid``, a timestamped log
    line with the epoch's losses and accuracies is printed, and all four
    module-level metric accumulators are reset for the next epoch.
    """
    # The log template must be adapted to whichever metrics are tracked.
    log_template = 'Epoch={},Loss:{},Accuracy:{},Valid Loss:{},Valid Accuracy:{}'
    log_every = 1  # print after every epoch
    tracked_metrics = (train_loss, valid_loss, train_metric, valid_metric)

    for epoch in tf.range(1,epochs+1):

        for batch_x, batch_y in ds_train:
            train_step(model,batch_x,batch_y)

        for batch_x, batch_y in ds_valid:
            valid_step(model,batch_x,batch_y)

        if epoch%log_every==0:
            printbar()
            epoch_stats = (epoch,train_loss.result(),train_metric.result(),
                           valid_loss.result(),valid_metric.result())
            tf.print(tf.strings.format(log_template,epoch_stats))
            tf.print("")

        # Start the next epoch with empty accumulators.
        for tracked in tracked_metrics:
            tracked.reset_states()

In [16]:
train_model(model,ds_train,ds_test,epochs = 6)

Epoch=1,Loss:0.433327377,Accuracy:0.7744,Valid Loss:0.317821354,Valid Accuracy:0.863

Epoch=2,Loss:0.221809298,Accuracy:0.9127,Valid Loss:0.346467316,Valid Accuracy:0.8644

Epoch=3,Loss:0.143342242,Accuracy:0.94595,Valid Loss:0.421649843,Valid Accuracy:0.8606

Epoch=4,Loss:0.0819885433,Accuracy:0.9717,Valid Loss:0.59040308,Valid Accuracy:0.8522

Epoch=5,Loss:0.0368908308,Accuracy:0.9878,Valid Loss:0.829206944,Valid Accuracy:0.8526

Epoch=6,Loss:0.022493124,Accuracy:0.9925,Valid Loss:1.05547059,Valid Accuracy:0.845



4. 评估模型

In [17]:
# 通过自定义训练循环训练的模型没有经过编译，无法直接使用model.evaluate(ds_valid)方法
def evaluate_model(model,ds_valid):
    """Evaluate ``model`` on ``ds_valid`` and print the loss and accuracy.

    Uses the module-level ``valid_loss`` / ``valid_metric`` accumulators via
    ``valid_step``. They are cleared both before and after the pass, so the
    printed numbers reflect only this dataset and no state is left behind.
    Training metrics are not touched.
    """
    # Clear any state left by an earlier, possibly interrupted, run.
    valid_loss.reset_states()
    valid_metric.reset_states()

    for features, labels in ds_valid:
        valid_step(model,features,labels)
    logs = 'Valid Loss:{},Valid Accuracy:{}'
    tf.print(tf.strings.format(logs,(valid_loss.result(),valid_metric.result())))

    valid_loss.reset_states()
    valid_metric.reset_states()

In [18]:
evaluate_model(model,ds_test)

Valid Loss:1.05547059,Valid Accuracy:0.845


5. 使用模型预测

In [20]:
# Use the model.predict(ds_test) method to get predictions for the whole test set.
model.predict(ds_test)



array([[0.9066615 ],
       [0.9999972 ],
       [0.9999982 ],
       ...,
       [0.9999804 ],
       [0.22427195],
       [1.        ]], dtype=float32)

In [21]:
# Predict on the first test batch only by calling the model directly.
for x_test,_ in ds_test.take(1):
    print(model(x_test))
    # The following calls are equivalent:
    #print(model.call(x_test))
    #print(model.predict_on_batch(x_test))

tf.Tensor(
[[9.0666139e-01]
 [9.9999720e-01]
 [9.9999821e-01]
 [4.3126863e-07]
 [9.5012975e-01]
 [5.7610436e-05]
 [7.9664034e-08]
 [2.2415623e-01]
 [9.9990994e-01]
 [9.9996918e-01]
 [1.0000000e+00]
 [7.4551636e-01]
 [1.4705578e-10]
 [9.9930972e-01]
 [2.4761547e-07]
 [9.9256420e-01]
 [3.5520742e-04]
 [1.1825867e-01]
 [1.6452361e-05]
 [9.9550885e-01]], shape=(20, 1), dtype=float32)


6. 保存模型

In [22]:
# Export the trained model with save_format="tf".
model.save('../data/imdb/tf_model_savedmodel', save_format="tf")
print('export saved model.')

# Reload the exported model and predict on the test set with the loaded copy.
model_loaded = tf.keras.models.load_model('../data/imdb/tf_model_savedmodel')
model_loaded.predict(ds_test)

INFO:tensorflow:Assets written to: ../data/imdb/tf_model_savedmodel/assets


INFO:tensorflow:Assets written to: ../data/imdb/tf_model_savedmodel/assets


export saved model.






array([[0.9066615 ],
       [0.9999972 ],
       [0.9999982 ],
       ...,
       [0.9999804 ],
       [0.22427195],
       [1.        ]], dtype=float32)