In [1]:
#!pip install --quiet efficientnet
#!pip install cleanlab

In [2]:
import math, os, re, warnings, random, time
from functools import partial
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from PIL import Image
import cv2
from sklearn.model_selection import StratifiedKFold
import tensorflow as tf

def seed_everything(seed=0):
    """Seed the Python, NumPy and TensorFlow random generators with `seed`.

    Also writes PYTHONHASHSEED (set to `seed`) and TF_DETERMINISTIC_OPS ('1')
    into this process's environment.

    Args:
        seed: Integer seed passed to every generator. Defaults to 0.
    """
    os.environ.update({
        'PYTHONHASHSEED': str(seed),
        'TF_DETERMINISTIC_OPS': '1',
    })
    # One seeding call per library, all fed the same value.
    for apply_seed in (random.seed, np.random.seed, tf.random.set_seed):
        apply_seed(seed)

# Seed every RNG once, up front, then silence all Python warnings for the rest
# of the notebook.
SEED = 0
seed_everything(seed=SEED)
warnings.filterwarnings(action='ignore')

### Hardware configuration

Note that we have `32` cores because a `TPU v2 Pod` has more cores than a single `TPU v3`, which has `8` cores.

# Model parameters

In [3]:
# --- Optimisation ---
BATCH_SIZE = 8
LEARNING_RATE = 1e-5
EPOCHS = 10       # default 10
ES_PATIENCE = 5   # default 5

# --- Image geometry ---
HEIGHT = 512
WIDTH = 512
HEIGHT_RS = 512
WIDTH_RS = 512
CHANNELS = 3

# --- Dataset layout ---
N_CLASSES = 5
N_FILES = 15      # number of TFRecord shards the clean data is split into
FOLDS_USED = 5

# --- Loss / augmentation ---
T1 = 0.75         # tempered-loss t1
T2 = 2.07         # tempered-loss t2
CUTMIX_PROB = 0.2

# Load data

In [4]:
def count_data_items(filenames):
    """Sum the sample counts embedded in TFRecord file names.

    Each name is expected to contain `-<digits>.` (for example
    `Id_train00-1615.tfrec` contributes 1615); the digits of the first such
    match are read as that file's sample count.

    Args:
        filenames: Iterable of file-name strings.

    Returns:
        Integer total of all per-file counts; an integer 0 for an empty input.

    Raises:
        AttributeError: If a name contains no `-<digits>.` pattern.
    """
    # Compile once instead of once per file name.
    count_pattern = re.compile(r'-([0-9]*)\.')
    counts = [int(count_pattern.search(filename).group(1)) for filename in filenames]
    # Pin the dtype so an empty list yields integer 0 rather than float 0.0.
    return np.sum(counts, dtype=np.int64)


#database_base_path = '../input/noisy-label-eda-with-cleanlab'
# Read the label table; the captured output shows it carries an `is_noisy`
# column alongside `image_id` and `label`.
label_df = pd.read_csv('../input/noisy-label-eda-with-cleanlab/train_noisy_info.csv')
print(f'Train samples: {len(label_df)}')

Train samples: 26337


In [5]:
display(label_df.head())

# Human-readable class names, indexed by the integer `label` column.
CLASSES = [
    'Cassava Bacterial Blight',
    'Cassava Brown Streak Disease',
    'Cassava Green Mottle',
    'Cassava Mosaic Disease',
    'Healthy',
]
NUM_TRAINING_IMAGES = len(label_df)

Unnamed: 0,image_id,label,source,fold,is_noisy
0,1000015157.jpg,0,2020,3,False
1,1000201771.jpg,3,2020,2,False
2,100042118.jpg,1,2020,4,True
3,1000723321.jpg,1,2020,1,False
4,1000812911.jpg,3,2020,1,False


In [6]:
# Attach the readable class name that corresponds to each integer label.
label_df['class'] = [CLASSES[label] for label in label_df['label']]
label_df.head()

Unnamed: 0,image_id,label,source,fold,is_noisy,class
0,1000015157.jpg,0,2020,3,False,Cassava Bacterial Blight
1,1000201771.jpg,3,2020,2,False,Cassava Mosaic Disease
2,100042118.jpg,1,2020,4,True,Cassava Brown Streak Disease
3,1000723321.jpg,1,2020,1,False,Cassava Brown Streak Disease
4,1000812911.jpg,3,2020,1,False,Cassava Mosaic Disease


In [7]:
# Split the table on the `is_noisy` flag: rows flagged False go to `clean_df`,
# rows flagged True go to `noise_df`. Both get a fresh 0..n-1 index.
noisy_flag = label_df['is_noisy']
clean_df = label_df.loc[noisy_flag.eq(False)].reset_index(drop=True)
noise_df = label_df.loc[noisy_flag.eq(True)].reset_index(drop=True)
print(clean_df.shape, noise_df.shape)

(24225, 6) (2112, 6)


In [8]:
# make clean TFRecords

In [9]:
# tfrec fold
# Assign every clean sample to one of N_FILES TFRecord shards, stratified by
# label so each shard has the same class mix.
folds = StratifiedKFold(n_splits=N_FILES, shuffle=True, random_state=SEED)
clean_df['file'] = -1

for fold_n, (train_idx, val_idx) in enumerate(folds.split(clean_df, clean_df['label'])):
    print('File: %s has %s samples' % (fold_n+1, len(val_idx)))
    # Write through a single .loc[rows, column] call. The previous chained form
    # (clean_df['file'].loc[...] = ...) may assign into a temporary copy, and the
    # resulting warning is hidden because warnings are globally ignored above.
    # val_idx holds positions, which equal labels here because clean_df was
    # built with reset_index(drop=True).
    clean_df.loc[val_idx, 'file'] = fold_n

display(clean_df.head())

File: 1 has 1615 samples
File: 2 has 1615 samples
File: 3 has 1615 samples
File: 4 has 1615 samples
File: 5 has 1615 samples
File: 6 has 1615 samples
File: 7 has 1615 samples
File: 8 has 1615 samples
File: 9 has 1615 samples
File: 10 has 1615 samples
File: 11 has 1615 samples
File: 12 has 1615 samples
File: 13 has 1615 samples
File: 14 has 1615 samples
File: 15 has 1615 samples


Unnamed: 0,image_id,label,source,fold,is_noisy,class,file
0,1000015157.jpg,0,2020,3,False,Cassava Bacterial Blight,13
1,1000201771.jpg,3,2020,2,False,Cassava Mosaic Disease,14
2,1000723321.jpg,1,2020,1,False,Cassava Brown Streak Disease,3
3,1000812911.jpg,3,2020,1,False,Cassava Mosaic Disease,11
4,1000837476.jpg,3,2020,0,False,Cassava Mosaic Disease,11


In [10]:
def decode_image(image_data):
    """Decode JPEG bytes into a float32 tensor of shape [HEIGHT, WIDTH, 3].

    The decoded 3-channel image is cast to float32 and divided by 255.0, then
    resized to the module-level HEIGHT x WIDTH and reshaped to that static
    shape.

    Args:
        image_data: Encoded JPEG content (a scalar string tensor).

    Returns:
        The resized float32 image tensor.
    """
    pixels = tf.image.decode_jpeg(image_data, channels=3)
    scaled = tf.cast(pixels, tf.float32) / 255.0
    resized = tf.image.resize(scaled, [HEIGHT, WIDTH])
    # Pin the static shape explicitly.
    return tf.reshape(resized, [HEIGHT, WIDTH, 3])

def read_tfrecord(example):
    """Parse one serialized tf.train.Example into (image, target, name).

    The feature keys match what `serialize_example` in this notebook writes:
    'image' (JPEG bytes), 'label' (int64) and 'image_id' (bytes). The previous
    keys 'target' / 'image_name' do not exist in those records, so parsing
    them failed on a missing required feature.

    Args:
        example: A scalar string tensor holding one serialized Example.

    Returns:
        Tuple of (decoded image tensor, int64 label tensor, image-id string tensor).
    """
    TFREC_FORMAT = {
        'image': tf.io.FixedLenFeature([], tf.string),
        'label': tf.io.FixedLenFeature([], tf.int64),
        'image_id': tf.io.FixedLenFeature([], tf.string),
    }
    example = tf.io.parse_single_example(example, TFREC_FORMAT)
    image = decode_image(example['image'])
    target = example['label']
    name = example['image_id']
    return image, target, name

def load_dataset(filenames, HEIGHT, WIDTH, CHANNELS=3):
    """Build a tf.data pipeline that yields parsed samples from TFRecord files.

    Args:
        filenames: TFRecord path(s) passed to tf.data.TFRecordDataset.
        HEIGHT, WIDTH, CHANNELS: Kept so existing callers keep working; they
            are not used in this function body.

    Returns:
        A dataset in which every element is the tuple returned by
        `read_tfrecord`.
    """
    dataset = tf.data.TFRecordDataset(filenames)
    # `AUTO` was never defined in this notebook, so the old call raised
    # NameError on a fresh kernel; use TensorFlow's own autotune constant.
    dataset = dataset.map(read_tfrecord, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    return dataset

def display_samples(ds, row, col):
    """Plot the first `row * col` elements of `ds` on a `row` x `col` grid.

    Every element is unpacked as (image, label, name) and index 0 of each is
    shown, so `ds` is presumably batched — TODO confirm against callers.

    Args:
        ds: Iterable dataset yielding (image, label, name) triples.
        row: Number of grid rows.
        col: Number of grid columns.
    """
    n_panels = row * col
    plt.figure(figsize=(15, int(15*row/col)))
    elements = iter(ds)
    for panel in range(1, n_panels + 1):
        image, label, name = next(elements)
        plt.subplot(row, col, panel)
        plt.axis('off')
        plt.imshow(image[0])
        plt.title(f"{label[0]}: {name[0].numpy().decode('utf-8')}", fontsize=12)
    plt.show()

def count_data_items(filenames):
    """Sum the sample counts embedded in TFRecord file names.

    NOTE(review): this repeats the `count_data_items` defined in the
    "Load data" cell; being later in the notebook, this definition is the one
    in effect from here on. Consider keeping only one.

    Each name is expected to contain `-<digits>.` (for example
    `Id_train00-1615.tfrec` contributes 1615); the digits of the first such
    match are read as that file's sample count.

    Args:
        filenames: Iterable of file-name strings.

    Returns:
        Integer total of all per-file counts; an integer 0 for an empty input.

    Raises:
        AttributeError: If a name contains no `-<digits>.` pattern.
    """
    # Compile once instead of once per file name.
    count_pattern = re.compile(r"-([0-9]*)\.")
    counts = [int(count_pattern.search(filename).group(1)) for filename in filenames]
    # Pin the dtype so an empty list yields integer 0 rather than float 0.0.
    return np.sum(counts, dtype=np.int64)


# Create TF Records
def _bytes_feature(value):
  """Wrap a string / byte value in a tf.train.Feature holding a BytesList."""
  eager_tensor_type = type(tf.constant(0))
  if isinstance(value, eager_tensor_type):
    # BytesList won't unpack a string from an EagerTensor, so extract it first.
    value = value.numpy()
  bytes_list = tf.train.BytesList(value=[value])
  return tf.train.Feature(bytes_list=bytes_list)

def _int64_feature(value):
  """Wrap a bool / enum / int / uint value in a tf.train.Feature holding an Int64List."""
  int64_list = tf.train.Int64List(value=[value])
  return tf.train.Feature(int64_list=int64_list)

def serialize_example(image, target, image_name):
  """Serialize one sample into a tf.train.Example byte string.

  Args:
    image: Value stored under the 'image' key as a bytes feature.
    target: Value stored under the 'label' key as an int64 feature.
    image_name: Value stored under the 'image_id' key as a bytes feature.

  Returns:
    The serialized Example, as returned by SerializeToString().
  """
  features = tf.train.Features(feature={
      'image': _bytes_feature(image),
      'label': _int64_feature(target),
      'image_id': _bytes_feature(image_name),
  })
  return tf.train.Example(features=features).SerializeToString()

In [11]:
PATH='../input/cassava-leaf-disease-merged/train/'
IMG_QUALITY = 100
# Write one TFRecord shard per value of clean_df['file']; the sample count is
# embedded in the file name so count_data_items() can recover it later.
for tfrec_num in range(N_FILES):
    print('\nWriting TFRecord %i of %i...'%(tfrec_num, N_FILES))
    samples = clean_df[clean_df['file'] == tfrec_num]
    n_samples = len(samples)
    print(f'{n_samples} samples')
    with tf.io.TFRecordWriter('Id_train%.2i-%i.tfrec'%(tfrec_num, n_samples)) as writer:
        for row in samples.itertuples():
            label = row.label
            image_name = row.image_id
            img_path = f'{PATH}{image_name}'

            img = cv2.imread(img_path)
            # cv2.imread signals a missing/unreadable file by returning None;
            # fail with the offending path instead of an opaque resize error.
            if img is None:
                raise FileNotFoundError(f'Could not read image: {img_path}')
            # cv2.resize takes dsize as (width, height).
            img = cv2.resize(img, (WIDTH, HEIGHT))
            encoded_ok, encoded = cv2.imencode('.jpg', img, (cv2.IMWRITE_JPEG_QUALITY, IMG_QUALITY))
            if not encoded_ok:
                raise RuntimeError(f'JPEG encoding failed for: {img_path}')
            # ndarray.tostring() is deprecated; tobytes() returns the same bytes.
            img = encoded.tobytes()

            example = serialize_example(img, label, str.encode(image_name))
            writer.write(example)

clean_df.to_csv('clean_data.csv', index=False)


Writing TFRecord 0 of 15...
1615 samples

Writing TFRecord 1 of 15...
1615 samples

Writing TFRecord 2 of 15...
1615 samples

Writing TFRecord 3 of 15...
1615 samples

Writing TFRecord 4 of 15...
1615 samples

Writing TFRecord 5 of 15...
1615 samples

Writing TFRecord 6 of 15...
1615 samples

Writing TFRecord 7 of 15...
1615 samples

Writing TFRecord 8 of 15...
1615 samples

Writing TFRecord 9 of 15...
1615 samples

Writing TFRecord 10 of 15...
1615 samples

Writing TFRecord 11 of 15...
1615 samples

Writing TFRecord 12 of 15...
1615 samples

Writing TFRecord 13 of 15...
1615 samples

Writing TFRecord 14 of 15...
1615 samples


In [12]:
# Write every noisy-labelled sample into a single TFRecord file; the sample
# count is embedded in the file name. Relies on PATH and IMG_QUALITY from the
# previous cell.
samples = noise_df
n_samples = len(samples)
print(f'{n_samples} samples')
with tf.io.TFRecordWriter(f'Id_noisy00-{n_samples}.tfrec') as writer:
    for row in samples.itertuples():
        label = row.label
        image_name = row.image_id
        img_path = f'{PATH}{image_name}'

        img = cv2.imread(img_path)
        # cv2.imread signals a missing/unreadable file by returning None;
        # fail with the offending path instead of an opaque resize error.
        if img is None:
            raise FileNotFoundError(f'Could not read image: {img_path}')
        # cv2.resize takes dsize as (width, height).
        img = cv2.resize(img, (WIDTH, HEIGHT))
        encoded_ok, encoded = cv2.imencode('.jpg', img, (cv2.IMWRITE_JPEG_QUALITY, IMG_QUALITY))
        if not encoded_ok:
            raise RuntimeError(f'JPEG encoding failed for: {img_path}')
        # ndarray.tostring() is deprecated; tobytes() returns the same bytes.
        img = encoded.tobytes()

        example = serialize_example(img, label, str.encode(image_name))
        writer.write(example)

noise_df.to_csv('noise_data.csv', index=False)

2112 samples
