In [5]:
#coding=utf8
import sys
# Display the interpreter's default string encoding (the recorded output is 'utf-8').
sys.getdefaultencoding()

'utf-8'

## 标准的人工智能项目按照如下步骤进行

1. 准备数据
2. 准备模型
3. 训练模型
4. 评估模型
5. 使用模型
6. 保存模型

In [7]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import tensorflow as tf
from tensorflow.keras import models,layers,preprocessing,optimizers,losses,metrics
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

import re,string

# 1 Prepare the samples
train_data_path="./datasets/imdb/train.csv"
test_data_path="././datasets/imdb/test.csv"
# Vocabulary budget: keep the 10,000 most frequent words
Max_Words =10000
# Sequence length kept per sample: 200 words
Max_Len=200
# Number of lines per batch in the tf.data pipelines below
Batch_Size=20

# 构建管道

def split_line(line):
    """Parse one raw dataset line into a ``(text, label)`` pair.

    Each line is expected to hold the numeric label, a tab character, and
    then the review text.

    Args:
        line: scalar string tensor containing one line of the data file.

    Returns:
        A tuple ``(text, label)`` where ``text`` is a string tensor of shape
        ``(1,)`` and ``label`` is an ``int32`` tensor of shape ``(1,)``.
    """
    # Split on the FIRST tab only: with an unlimited split, a review that
    # itself contains a tab would be silently truncated at that tab.
    arr = tf.strings.split(line, "\t", maxsplit=1)
    # The label is parsed as a number first, then cast to int32.
    label = tf.expand_dims(tf.cast(tf.strings.to_number(arr[0]), tf.int32), axis=0)
    text = tf.expand_dims(arr[1], axis=0)
    return (text, label)

# Training pipeline: parse lines in parallel, shuffle, batch, then prefetch.
ds_train_raw = (
    tf.data.TextLineDataset(filenames=[train_data_path])
    .map(split_line, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .shuffle(buffer_size=1000)
    .batch(Batch_Size)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

# Test pipeline: same parsing and batching, but without shuffling.
ds_test_raw = (
    tf.data.TextLineDataset(filenames=[test_data_path])
    .map(split_line, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .batch(Batch_Size)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

## 清理文本
def clean_text(text):
    """Standardize raw review text before tokenization.

    Lower-cases the input, replaces every ``<br />`` with a single space, and
    deletes every character listed in ``string.punctuation``.

    Args:
        text: string tensor of raw text.

    Returns:
        String tensor with the cleaned text.
    """
    # Character class matching any single punctuation character.
    punctuation_pattern = '[%s]' % re.escape(string.punctuation)
    without_breaks = tf.strings.regex_replace(tf.strings.lower(text), '<br />', ' ')
    return tf.strings.regex_replace(without_breaks, punctuation_pattern, '')

# Build the text-vectorization layer
vectorize_layer = TextVectorization(
    max_tokens=Max_Words - 1,
    standardize=clean_text,
    split='whitespace',
    output_mode='int',
    output_sequence_length=Max_Len,
)

# Fit the vocabulary on the training text only (labels are dropped).
ds_text = ds_train_raw.map(lambda text, label: text)
vectorize_layer.adapt(ds_text)
print(vectorize_layer.get_vocabulary()[:100])

# Replace the raw text of every batch with its integer token sequence.
ds_train = (
    ds_train_raw
    .map(lambda text, label: (vectorize_layer(text), label))
    .prefetch(tf.data.experimental.AUTOTUNE)
)
ds_test = (
    ds_test_raw
    .map(lambda text, label: (vectorize_layer(text), label))
    .prefetch(tf.data.experimental.AUTOTUNE)
)


[b'the', b'and', b'a', b'of', b'to', b'is', b'in', b'it', b'i', b'this', b'that', b'was', b'as', b'for', b'with', b'movie', b'but', b'film', b'on', b'not', b'you', b'his', b'are', b'have', b'be', b'he', b'one', b'its', b'at', b'all', b'by', b'an', b'they', b'from', b'who', b'so', b'like', b'her', b'just', b'or', b'about', b'has', b'if', b'out', b'some', b'there', b'what', b'good', b'more', b'when', b'very', b'she', b'even', b'my', b'no', b'would', b'up', b'time', b'only', b'which', b'story', b'really', b'their', b'were', b'had', b'see', b'can', b'me', b'than', b'we', b'much', b'well', b'get', b'been', b'will', b'into', b'people', b'also', b'other', b'do', b'bad', b'because', b'great', b'first', b'how', b'him', b'most', b'dont', b'made', b'then', b'them', b'films', b'movies', b'way', b'make', b'could', b'too', b'any', b'after', b'characters']


In [14]:
## 2 Define the model
# Clear the Keras backend session before the model is defined.
tf.keras.backend.clear_session()

class CnnModel(models.Model):
    """Small 1-D convolutional binary classifier over integer token sequences.

    Pipeline: embedding -> conv_1 -> max-pool -> conv_2 -> max-pool ->
    flatten -> single sigmoid unit.
    """

    def __init__(self):
        super().__init__()

    def build(self, input_shape):
        """Create the sub-layers, then mark the model as built."""
        # 7-dimensional embeddings over a Max_Words-sized index space.
        self.embedding = layers.Embedding(Max_Words, 7, input_length=Max_Len)
        self.conv_1 = layers.Conv1D(16, kernel_size=5, name="conv_1", activation="relu")
        # One pooling layer instance, applied after each convolution in call().
        self.pool = layers.MaxPool1D()
        self.conv_2 = layers.Conv1D(128, kernel_size=2, name="conv_2", activation="relu")
        self.flatten = layers.Flatten()
        self.dense = layers.Dense(1, activation="sigmoid")
        super().build(input_shape)

    def call(self, x):
        """Map a batch of token-id sequences to sigmoid outputs."""
        embedded = self.embedding(x)
        features = self.pool(self.conv_1(embedded))
        features = self.pool(self.conv_2(features))
        return self.dense(self.flatten(features))

# Instantiate the model, build it for batches of length-Max_Len sequences,
# and print the layer/parameter summary.
model = CnnModel()
model.build(input_shape =(None,Max_Len))
model.summary()

Model: "cnn_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  70000     
_________________________________________________________________
conv_1 (Conv1D)              multiple                  576       
_________________________________________________________________
max_pooling1d (MaxPooling1D) multiple                  0         
_________________________________________________________________
conv_2 (Conv1D)              multiple                  4224      
_________________________________________________________________
flatten (Flatten)            multiple                  0         
_________________________________________________________________
dense (Dense)                multiple                  6145      
Total params: 80,945
Trainable params: 80,945
Non-trainable params: 0
_____________________________________________________

In [18]:
# 训练模型
## 装饰器 提前调用计算图

# 打印出来 时间
@tf.function
def printbar():
    """Print a separator bar followed by the current time as HH:MM:SS.

    Takes no arguments and returns nothing; the output is produced with
    ``tf.print``.
    """
    ts =tf.timestamp()
    # Keep only the part of the timestamp that falls within the current day.
    today_ts = ts%(24*60*60)
    
    # The hour is shifted by +8 before wrapping at 24.
    # NOTE(review): presumably a UTC+8 local-time offset — confirm.
    hour = tf.cast(today_ts//3600+8,tf.int32)%tf.constant(24)
    minute = tf.cast((today_ts%3600)//60,tf.int32)
    second =tf.cast(tf.floor(today_ts%60),tf.int32)
    
    def timeformat(m):
        # Left-pad values that format to a single character with "0".
        if tf.strings.length(tf.strings.format("{}",m))==1:
            return (tf.strings.format("0{}",m))
        else:
            return (tf.strings.format("{}",m))
    timesstring = tf.strings.join([timeformat(hour),timeformat(minute),timeformat(second)],separator=":")
    # The bar is printed without a trailing newline so the time follows on the same line.
    tf.print("========="*8,end="")
    tf.print(timesstring)
    
# Define the optimizer, loss function, and metric accumulators used by the
# training and validation steps.
optimizer = optimizers.Nadam()

loss_func = losses.BinaryCrossentropy()

# Running mean of the per-batch training loss.
train_loss=metrics.Mean(name='train_loss')

train_metric =metrics.BinaryAccuracy(name="train_accuracy")

# Running mean of the per-batch validation loss.
valid_loss =metrics.Mean(name='valid_loss')

valid_metric =metrics.BinaryAccuracy(name='valid_accuracy')


#定义训练步骤

@tf.function
def train_step(model,features,labels):
    """Run one optimization step on a single batch.

    Computes the loss under a gradient tape, applies the gradients with the
    module-level ``optimizer``, and feeds the batch loss and predictions into
    ``train_loss`` / ``train_metric``.

    Args:
        model: model called as ``model(features, training=True)``.
        features: input batch.
        labels: target batch passed to the loss and the metric.
    """
    with tf.GradientTape() as grad_tape:
        batch_predictions = model(features, training=True)
        batch_loss = loss_func(labels, batch_predictions)

    trainable_vars = model.trainable_variables
    batch_gradients = grad_tape.gradient(batch_loss, trainable_vars)
    optimizer.apply_gradients(zip(batch_gradients, trainable_vars))

    train_loss.update_state(batch_loss)
    train_metric.update_state(labels, batch_predictions)

# 定义验证
@tf.function
def valid_step(model,features,labels):
    """Evaluate one batch and accumulate the validation metrics.

    Runs the model with ``training=False`` and feeds the batch loss and
    predictions into ``valid_loss`` / ``valid_metric``. No weights are updated.

    Args:
        model: model called as ``model(features, training=False)``.
        features: input batch.
        labels: target batch passed to the loss and the metric.
    """
    outputs = model(features, training=False)
    valid_loss.update_state(loss_func(labels, outputs))
    valid_metric.update_state(labels, outputs)
    
    
# 定义训练模型

def train_model(model,ds_train,ds_valid,epochs):
    """Train ``model`` for ``epochs`` epochs, validating after each epoch.

    Args:
        model: model forwarded to ``train_step`` and ``valid_step``.
        ds_train: iterable of ``(features, labels)`` training batches.
        ds_valid: iterable of ``(features, labels)`` validation batches.
        epochs: number of passes over ``ds_train``.

    Side effects:
        Prints a timestamp bar and one log line per epoch, and resets the
        module-level loss/accuracy accumulators at the end of every epoch.
    """
    # Loop-invariant log template.
    logs ='Epoch={},Loss:{},Accuracy:{},Valid Loss:{},Valid Accuracy{}'

    for epoch in tf.range(1,epochs+1):
        for features,labels in ds_train:
            train_step(model,features,labels)

        # Validate on the dataset supplied by the caller. Previously this loop
        # iterated the global ``ds_test`` and silently ignored ``ds_valid``.
        for features,labels in ds_valid:
            valid_step(model,features,labels)

        # Logging interval: a modulus of 1 logs every epoch.
        if epoch%1==0:
            printbar()
            tf.print(tf.strings.format(logs,
                    (epoch,train_loss.result(),train_metric.result(),valid_loss.result(),valid_metric.result())))
            tf.print("")

        # Start the next epoch with empty accumulators.
        train_loss.reset_states()
        valid_loss.reset_states()
        train_metric.reset_states()
        valid_metric.reset_states()
        
# Train for 6 epochs, using the test split for the per-epoch validation pass.
train_model(model,ds_train,ds_test,epochs=6)

Epoch=1,Loss:0.244055256,Accuracy:0.9032,Valid Loss:0.308990479,Valid Accuracy0.8768

Epoch=2,Loss:0.173470512,Accuracy:0.9345,Valid Loss:0.345653564,Valid Accuracy0.8728

Epoch=3,Loss:0.115960017,Accuracy:0.95715,Valid Loss:0.500443101,Valid Accuracy0.848

Epoch=4,Loss:0.0687751696,Accuracy:0.97625,Valid Loss:0.56670624,Valid Accuracy0.8602

Epoch=5,Loss:0.0359254554,Accuracy:0.9884,Valid Loss:0.772485256,Valid Accuracy0.8528

Epoch=6,Loss:0.017658487,Accuracy:0.9951,Valid Loss:1.0004921,Valid Accuracy0.8546



In [19]:
#评估模型
def evluate_model(model,ds_valid):
    """Evaluate ``model`` on ``ds_valid`` and print the mean loss and accuracy.

    Args:
        model: model forwarded to ``valid_step``.
        ds_valid: iterable of ``(features, labels)`` batches to evaluate on.

    Side effects:
        Prints one log line and leaves ``valid_loss`` / ``valid_metric`` reset.
        The training accumulators are not touched.
    """
    # Start from empty accumulators so state left over from an earlier
    # (for example interrupted) run cannot leak into this result.
    valid_loss.reset_states()
    valid_metric.reset_states()

    for features,labels in ds_valid:
        valid_step(model,features,labels)

    logs="Valid Loss:{},Valid Accuracy{}"
    tf.print(tf.strings.format(logs,(valid_loss.result(),valid_metric.result())))

    # Only the validation accumulators are used here; the original also reset
    # ``train_metric``, which evaluation never updates.
    valid_loss.reset_states()
    valid_metric.reset_states()
    
# Evaluate the trained model on the vectorized test split.
evluate_model(model,ds_test)




Valid Loss:1.0004921,Valid Accuracy0.8546


In [20]:
# Use the model: predict on every batch of the test split.
model.predict(ds_test)

array([[0.7925186],
       [0.9999998],
       [0.9999987],
       ...,
       [0.9631697],
       [0.484992 ],
       [1.       ]], dtype=float32)

In [22]:
# Call the model directly on the first two test batches; labels are ignored.
for x_test,_ in ds_test.take(2):
    print(model(x_test))

tf.Tensor(
[[7.9251921e-01]
 [9.9999988e-01]
 [9.9999857e-01]
 [1.5074768e-10]
 [1.6863579e-01]
 [2.3242046e-06]
 [1.4598606e-07]
 [1.9180906e-09]
 [9.9995601e-01]
 [9.9995553e-01]
 [9.9999988e-01]
 [9.8068851e-01]
 [1.4484363e-09]
 [9.9951124e-01]
 [1.4443553e-10]
 [2.1784449e-01]
 [2.5157800e-05]
 [7.5942245e-03]
 [5.3736067e-06]
 [9.9998558e-01]], shape=(20, 1), dtype=float32)
tf.Tensor(
[[4.6119094e-04]
 [1.0000000e+00]
 [9.9983931e-01]
 [3.1419568e-06]
 [9.9999958e-01]
 [9.9954975e-01]
 [9.0569365e-01]
 [1.0739243e-01]
 [9.9835253e-01]
 [9.9999964e-01]
 [5.5930644e-01]
 [9.9998367e-01]
 [9.9950373e-01]
 [9.9999940e-01]
 [1.0000000e+00]
 [9.6882761e-14]
 [9.9349147e-01]
 [6.3158069e-08]
 [3.8814569e-06]
 [7.7965548e-03]], shape=(20, 1), dtype=float32)


In [23]:
# Save the model (TensorFlow format, as selected by save_format='tf')
model.save('./datasets/keras_models/0-3',save_format='tf')
print('saved model')

# Reload the saved model from disk.
model_loaded = tf.keras.models.load_model('././datasets/keras_models/0-3')

# Predict with the reloaded model on the same test split.
model_loaded.predict(ds_test)


Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: ./datasets/keras_models/0-3\assets
saved model


array([[0.7925186],
       [0.9999998],
       [0.9999987],
       ...,
       [0.9631697],
       [0.484992 ],
       [1.       ]], dtype=float32)