In [1]:
import math, re, os
import numpy as np
import tensorflow as tf
import pandas as pd
from matplotlib import pyplot as plt   # matplotlib进行画图
from tensorflow.keras import layers
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras.optimizers import SGD
from kaggle_datasets import KaggleDatasets # Kaggle数据集
import efficientnet.tfkeras as efn    # 导入efficientnet模型
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix 
print("Tensorflow version " + tf.__version__)

Tensorflow version 2.5.0


In [None]:
AUTO = tf.data.experimental.AUTOTUNE  # let tf.data choose the degree of parallelism automatically
# Create the TPU deployment.
# No arguments are needed when the TPU_NAME environment variable has already been set.
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(tpu)  # connect this runtime to the TPU cluster
tf.tpu.experimental.initialize_tpu_system(tpu)  # initialize the TPU system
# Use the stable endpoint: tf.distribute.experimental.TPUStrategy is deprecated
# in favour of tf.distribute.TPUStrategy.
strategy = tf.distribute.TPUStrategy(tpu)
GCS_DS_PATH = KaggleDatasets().get_gcs_path('tpu-getting-started')


In [None]:
# Height and width that every decoded image is reshaped to (see decode_image).
IMAGE_SIZE = [512, 512]
# Number of training epochs (not used within the cells shown here).
EPOCHS = 12
# Sub-folder of the competition bucket holding the 512x512 JPEG TFRecords.
GCS_PATH = GCS_DS_PATH + '/tfrecords-jpeg-512x512'
# NOTE(review): AUTO is already assigned in the TPU setup cell; this re-assignment is redundant.
AUTO = tf.data.experimental.AUTOTUNE

# TFRecord shards for each split, discovered by glob on the bucket.
TRAINING_FILENAMES = tf.io.gfile.glob(GCS_PATH + '/train/*.tfrec')
VALIDATION_FILENAMES = tf.io.gfile.glob(GCS_PATH + '/val/*.tfrec')
TEST_FILENAMES = tf.io.gfile.glob(GCS_PATH + '/test/*.tfrec') 
# Flower class names; the plotting helpers index this list with the integer class label / prediction.
CLASSES = ['pink primrose', 'hard-leaved pocket orchid', 'canterbury bells', 'sweet pea', 'wild geranium', 'tiger lily', 'moon orchid', 'bird of paradise', 'monkshood', 'globe thistle', 'snapdragon', "colt's foot", 'king protea', 'spear thistle', 'yellow iris', 'globe-flower', 'purple coneflower', 'peruvian lily', 'balloon flower', 'giant white arum lily', 'fire lily', 'pincushion flower', 'fritillary', 'red ginger', 'grape hyacinth', 'corn poppy', 'prince of wales feathers', 'stemless gentian', 'artichoke', 'sweet william', 'carnation', 'garden phlox', 'love in the mist', 'cosmos', 'alpine sea holly', 'ruby-lipped cattleya', 'cape flower', 'great masterwort', 'siam tulip', 'lenten rose', 'barberton daisy', 'daffodil', 'sword lily', 'poinsettia', 'bolero deep blue', 'wallflower', 'marigold', 'buttercup', 'daisy', 'common dandelion', 'petunia', 'wild pansy', 'primula', 'sunflower', 'lilac hibiscus', 'bishop of llandaff', 'gaura', 'geranium', 'orange dahlia', 'pink-yellow dahlia', 'cautleya spicata', 'japanese anemone', 'black-eyed susan', 'silverbush', 'californian poppy', 'osteospermum', 'spring crocus', 'iris', 'windflower', 'tree poppy', 'gazania', 'azalea', 'water lily', 'rose', 'thorn apple', 'morning glory', 'passion flower', 'lotus', 'toad lily', 'anthurium', 'frangipani', 'clematis', 'hibiscus', 'columbine', 'desert-rose', 'tree mallow', 'magnolia', 'cyclamen ', 'watercress', 'canna lily', 'hippeastrum ', 'bee balm', 'pink quill', 'foxglove', 'bougainvillea', 'camellia', 'mallow', 'mexican petunia', 'bromelia', 'blanket flower', 'trumpet creeper', 'blackberry lily', 'common tulip', 'wild rose']

def decode_image(image_data):
    """Decode JPEG bytes into a float32 RGB tensor shaped [*IMAGE_SIZE, 3].

    Pixel values are divided by 255 so they fall in the [0, 1] range.
    """
    pixels = tf.image.decode_jpeg(image_data, channels=3)
    scaled = tf.cast(pixels, tf.float32) / 255.0
    # The TPU needs a statically known shape, hence the explicit reshape.
    return tf.reshape(scaled, [*IMAGE_SIZE, 3])

def read_labeled_tfrecord(example):
    """Parse one serialized labeled example and return an (image, label) pair.

    The image is decoded with decode_image; the label is the record's
    "class" feature cast to int32.
    """
    feature_spec = {
        "image": tf.io.FixedLenFeature([], tf.string),  # raw JPEG bytes
        "class": tf.io.FixedLenFeature([], tf.int64),   # scalar class id
    }
    parsed = tf.io.parse_single_example(example, feature_spec)
    return decode_image(parsed['image']), tf.cast(parsed['class'], tf.int32)

def read_unlabeled_tfrecord(example):
    """Parse one serialized unlabeled example and return an (image, id) pair.

    Test records carry no "class" feature; the string "id" identifies the image.
    """
    feature_spec = {
        "image": tf.io.FixedLenFeature([], tf.string),  # raw JPEG bytes
        "id": tf.io.FixedLenFeature([], tf.string),     # scalar image id
    }
    parsed = tf.io.parse_single_example(example, feature_spec)
    return decode_image(parsed['image']), parsed['id']

def load_dataset(filenames, labeled=True, ordered=False):
    """Build a tf.data pipeline that reads and parses the given TFRecord files.

    Args:
        filenames: TFRecord paths to read.
        labeled: when True, records are parsed into (image, label) pairs;
            otherwise into (image, id) pairs.
        ordered: when False, deterministic ordering is switched off so that
            records are consumed as soon as they stream in.

    Returns:
        The parsed (unbatched) dataset.
    """
    options = tf.data.Options()
    if not ordered:
        options.experimental_deterministic = False  # trade ordering for throughput

    parse_fn = read_labeled_tfrecord if labeled else read_unlabeled_tfrecord
    return (
        tf.data.TFRecordDataset(filenames, num_parallel_reads=AUTO)  # interleaved reads across files
        .with_options(options)
        .map(parse_fn, num_parallel_calls=AUTO)
    )

In [None]:
def data_augment(image, label, seed=2020):
    """Apply a random horizontal flip to the image and pass the label through.

    Args:
        image: image tensor, flipped along its width dimension at random.
        label: returned unchanged.
        seed: integer seed forwarded to the random flip op.

    Returns:
        The (possibly flipped) image and the original label.
    """
    flipped = tf.image.random_flip_left_right(image, seed=seed)
    # Other augmentations that were tried and are currently disabled:
    #     image = tf.image.random_flip_up_down(image, seed=seed)
    #     image = tf.image.random_brightness(image, 0.1, seed=seed)
    #     image = tf.image.random_jpeg_quality(image, 85, 100, seed=seed)
    #     image = tf.image.resize(image, [530, 530])
    #     image = tf.image.random_crop(image, [512, 512], seed=seed)
    #     image = tf.image.random_saturation(image, 0, 2)
    return flipped, label

# Global batch size: 16 examples for each replica the strategy keeps in sync.
BATCH_SIZE = 16 * strategy.num_replicas_in_sync

def get_training_dataset():
    """Return the augmented, endlessly repeating, shuffled and batched training set."""
    return (
        load_dataset(TRAINING_FILENAMES, labeled=True)
        .map(data_augment, num_parallel_calls=AUTO)  # augment in parallel
        .repeat()            # no count given: repeat indefinitely across epochs
        .shuffle(2048)       # shuffle buffer of 2048 elements
        .batch(BATCH_SIZE)
        .prefetch(AUTO)      # prepare the next batch while the current one trains
    )

def get_validation_dataset(ordered=False):
    """Return the batched, cached and prefetched validation set.

    Args:
        ordered: forwarded to load_dataset; True keeps deterministic ordering.
    """
    return (
        load_dataset(VALIDATION_FILENAMES, labeled=True, ordered=ordered)
        .batch(BATCH_SIZE)   # the final batch may hold fewer than BATCH_SIZE items
        .cache()             # keep the batched data cached after the first pass
        .prefetch(AUTO)
    )

def get_train_valid_datasets():
    """Return a training pipeline built from the validation and training files combined.

    Same processing as the plain training pipeline: augment, repeat
    indefinitely, shuffle, batch, prefetch.
    """
    combined_files = VALIDATION_FILENAMES+TRAINING_FILENAMES
    return (
        load_dataset(combined_files, labeled=True)
        .map(data_augment, num_parallel_calls=AUTO)
        .repeat()            # no count given: repeat indefinitely across epochs
        .shuffle(2048)       # shuffle buffer of 2048 elements
        .batch(BATCH_SIZE)
        .prefetch(AUTO)      # prepare the next batch while the current one trains
    )

def get_test_dataset(ordered=False):
    """Return the batched and prefetched unlabeled test set of (image, id) pairs.

    Args:
        ordered: forwarded to load_dataset; True keeps deterministic ordering.
    """
    return (
        load_dataset(TEST_FILENAMES, labeled=False, ordered=ordered)
        .batch(BATCH_SIZE)
        .prefetch(AUTO)
    )

def count_data_items(filenames):
    """Sum the item counts encoded in TFRecord file names.

    Each file name carries its item count between the last dash and the
    extension, e.g. ``flowers00-230.tfrec`` holds 230 items.
    """
    count_pattern = re.compile(r"-([0-9]*)\.")
    per_file_counts = [int(count_pattern.search(name).group(1)) for name in filenames]
    return np.sum(per_file_counts)

# Split sizes, recovered from the counts embedded in the TFRecord file names.
NUM_TRAINING_IMAGES = count_data_items(TRAINING_FILENAMES)
NUM_VALIDATION_IMAGES = count_data_items(VALIDATION_FILENAMES)
NUM_TEST_IMAGES = count_data_items(TEST_FILENAMES)
print('Dataset: {} training images, {} validation images, {} unlabeled test images'.format(NUM_TRAINING_IMAGES, NUM_VALIDATION_IMAGES, NUM_TEST_IMAGES))

# Instantiate the three pipelines used by the rest of the notebook.
ds_train = get_training_dataset()
ds_valid = get_validation_dataset()
ds_test = get_test_dataset()

In [None]:
def display_one_flower(image, title, subplot, red=False, titlesize=16):
    """Draw one image in the given subplot slot and return the next slot.

    Args:
        image: image data handed to ``plt.imshow``.
        title: caption; nothing is drawn when it is empty.
        subplot: ``(rows, cols, index)`` triple selecting the slot.
        red: when truthy the caption is red and slightly smaller.
        titlesize: base font size for the caption.

    Returns:
        ``(rows, cols, index + 1)``, the slot for the next image.
    """
    plt.subplot(*subplot)
    plt.axis('off')
    plt.imshow(image)
    if len(title) > 0:
        if red:
            caption_size, caption_color = int(titlesize/1.2), 'red'
        else:
            caption_size, caption_color = int(titlesize), 'black'
        plt.title(
            title,
            fontsize=caption_size,
            color=caption_color,
            fontdict={'verticalalignment':'center'},
            pad=int(titlesize/1.5),
        )
    n_rows, n_cols, slot = subplot[0], subplot[1], subplot[2]
    return (n_rows, n_cols, slot + 1)

def display_training_curves(training, validation, title, subplot):
    """Plot a training curve and a validation curve in one subplot.

    Args:
        training: per-epoch values for the training set.
        validation: per-epoch values for the validation set.
        title: metric name, used for the panel title and the y-axis label.
        subplot: integer subplot code such as 211; when its last digit is 1
            a new 10x10 figure is created first.
    """
    is_first_panel = subplot%10==1
    if is_first_panel:
        # First call: create the figure that later panels are added to.
        plt.subplots(figsize=(10,10), facecolor='#F0F0F0')
        plt.tight_layout()
    panel = plt.subplot(subplot)
    panel.set_facecolor('#F8F8F8')
    for curve in (training, validation):
        panel.plot(curve)
    panel.set_title('model '+ title)
    panel.set_ylabel(title)
    #panel.set_ylim(0.28,1.05)
    panel.set_xlabel('epoch')
    panel.legend(['train', 'valid.'])

def batch_to_numpy_images_and_labels(data):
    """Convert an (images, labels) batch of tensors to numpy.

    When the second element has object dtype (byte strings, i.e. image ids
    rather than class labels, as in the test data) the labels are replaced by
    a list of ``None`` with one entry per image.

    Returns:
        ``(numpy_images, numpy_labels)``.
    """
    image_batch, label_batch = data
    numpy_images = image_batch.numpy()
    numpy_labels = label_batch.numpy()
    if numpy_labels.dtype == object:
        # Ids are not labels: report "no label" for every image.
        numpy_labels = [None] * len(numpy_images)
    return numpy_images, numpy_labels

# Build the caption shown above an image when comparing a prediction with the
# true class (used after predicting on the validation set).
# label: the class index whose name leads the caption (the caller passes the prediction)
# correct_label: the class index to compare against (the caller passes the true label), or None
def title_from_label_and_target(label, correct_label):
    """Return ``(caption, is_correct)`` for one image.

    With no ``correct_label`` the caption is just the class name and the
    result counts as correct. Otherwise the caption is the class name of
    ``label`` followed by ``[OK]`` on a match, or by ``[NO→<expected name>]``
    on a mismatch.
    """
    if correct_label is None:
        return CLASSES[label], True
    correct = (label == correct_label)
    if correct:
        verdict, arrow, expected_name = 'OK', '', ''
    else:
        verdict, arrow, expected_name = 'NO', u"\u2192", CLASSES[correct_label]
    caption = "{} [{}{}{}]".format(CLASSES[label], verdict, arrow, expected_name)
    return caption, correct

# Display a small batch of images; the notebook typically shows 20 at a time.
def display_batch_of_images(databatch, predictions=None):
    """Plot a batch of images in a near-square grid with optional captions.

    Supported calls:
        display_batch_of_images((images, ids))                  # images only (test set)
        display_batch_of_images((images, ids), predictions)     # images + predicted class (test set)
        display_batch_of_images((images, labels))               # images + true class (training set)
        display_batch_of_images((images, labels), predictions)  # images + true and predicted class (validation set)

    Args:
        databatch: a pair of tensors ``(images, labels_or_ids)``. A bare image
            tensor is not accepted because the batch is unpacked as a pair.
        predictions: optional sequence of predicted class indices, one per image.

    Images that do not fit the rows x cols grid are dropped. An empty batch
    draws nothing.
    """
    images, labels = batch_to_numpy_images_and_labels(databatch)
    if labels is None:
        labels = [None for _ in enumerate(images)]

    # An empty batch would give rows == 0 and divide by zero below.
    if len(images) == 0:
        return

    # Auto-squaring: images that do not fit the grid are dropped.
    rows = int(math.sqrt(len(images)))
    cols = len(images)//rows  # floor division
    shown_images = images[:rows*cols]
    shown_labels = labels[:rows*cols]

    # Size and spacing.
    FIGSIZE = 13.0
    SPACING = 0.1
    subplot=(rows,cols,1)
    if rows < cols:
        # Fewer rows than columns: fix the width and shrink the height.
        plt.figure(figsize=(FIGSIZE,FIGSIZE/cols*rows))
    else:
        plt.figure(figsize=(FIGSIZE/rows*cols,FIGSIZE))

    # Font size that works for grids from 1x1 up to 10x10 (empirical formula).
    dynamic_titlesize = FIGSIZE*SPACING/max(rows,cols)*40+3

    # Display.
    for i, (image, label) in enumerate(zip(shown_images, shown_labels)):
        title = '' if label is None else CLASSES[label]
        correct = True
        if predictions is not None:
            title, correct = title_from_label_and_target(predictions[i], label)
        subplot = display_one_flower(image, title, subplot, not correct, titlesize=dynamic_titlesize)

    # Layout: pack images tightly when nothing is captioned. Computed from the
    # labels themselves instead of relying on the loop variable leaking out.
    has_labels = any(lbl is not None for lbl in shown_labels)
    plt.tight_layout()
    if not has_labels and predictions is None:
        plt.subplots_adjust(wspace=0, hspace=0)
    else:
        plt.subplots_adjust(wspace=SPACING, hspace=SPACING)
    plt.show()


In [None]:
def display_confusion_matrix(cmat, score, precision, recall):
    """Draw a confusion matrix with class-name ticks and an optional metrics caption.

    Args:
        cmat: the confusion matrix handed to ``matshow``.
        score: F1 score to print, or None to omit it.
        precision: precision to print, or None to omit it.
        recall: recall to print, or None to omit it.
    """
    plt.figure(figsize=(15,15))
    axes = plt.gca()
    axes.matshow(cmat, cmap='Reds')

    # Class names on both axes, small and rotated so they stay readable.
    tick_positions = range(len(CLASSES))
    axes.set_xticks(tick_positions)
    axes.set_xticklabels(CLASSES, fontdict={'fontsize': 7})
    plt.setp(axes.get_xticklabels(), rotation=45, ha="left", rotation_mode="anchor")
    axes.set_yticks(tick_positions)
    axes.set_yticklabels(CLASSES, fontdict={'fontsize': 7})
    plt.setp(axes.get_yticklabels(), rotation=45, ha="right", rotation_mode="anchor")

    # Assemble the caption from whichever metrics were supplied.
    caption_parts = []
    if score is not None:
        caption_parts.append('f1 = {:.3f} '.format(score))
    if precision is not None:
        caption_parts.append('\nprecision = {:.3f} '.format(precision))
    if recall is not None:
        caption_parts.append('\nrecall = {:.3f} '.format(recall))
    titlestring = "".join(caption_parts)

    if len(titlestring) > 0:
        axes.text(101, 1, titlestring, fontdict={'fontsize': 18, 'horizontalalignment':'right', 'verticalalignment':'top', 'color':'#804040'})
    plt.show()


In [None]:
# Keep printed arrays short: summarize beyond 15 elements, wrap at 80 columns.
np.set_printoptions(threshold=15, linewidth=80)
print("Training data shapes:")
# Peek at the first three training batches.
for image, label in ds_train.take(3):
    print(image.numpy().shape, label.numpy().shape)
# `label` still holds the last batch taken by the loop above.
print("Training data label examples:", label.numpy())

In [None]:
print("Test data shapes:")
# Peek at the first three test batches.
for image, idnum in ds_test.take(3):
    print(image.numpy().shape, idnum.numpy().shape)
# `idnum` still holds the last batch taken by the loop above.
print("Test data IDs:", idnum.numpy().astype('U')) # U=unicode string

In [None]:
# Re-batch the training data into groups of 20 and keep an iterator over them.
ds_iter = iter(ds_train.unbatch().batch(20))

In [None]:
# Build and compile the model inside the strategy scope.
with strategy.scope():
    # VGG16 convolutional base with ImageNet weights and no classification head.
    # The keyword is `input_shape`; the previous `include_shape` is not a VGG16
    # argument and raised a TypeError.
    # NOTE(review): decode_image scales pixels to [0, 1]; confirm this matches the
    # preprocessing the ImageNet VGG16 weights expect.
    pretrained_model = tf.keras.applications.VGG16(
        weights='imagenet',
        include_top=False,
        input_shape=[*IMAGE_SIZE, 3]
    )
    pretrained_model.trainable = False  # freeze the base; only the new head is trained
    model = tf.keras.Sequential([
        pretrained_model,
        layers.GlobalAvgPool2D(),
        # Softmax head: one probability per class, each in (0, 1) and summing to 1.
        # The class with the highest probability is the prediction, which suits
        # single-label multi-class classification.
        tf.keras.layers.Dense(len(CLASSES), activation='softmax'),
    ])
    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',  # labels are integer class ids
        metrics=['sparse_categorical_accuracy'],
    )
    model.summary()


