# Setup

In [0]:
#@title Enter Kaggle Credentials
# Export the Kaggle API credentials as environment variables; the `kaggle`
# command-line tool is invoked at the end of the notebook.
import os
from getpass import getpass
# The username is exposed as a Colab form field through the #@param marker.
os.environ['KAGGLE_USERNAME'] = 'ranik40' #@param {type:"string"}
print('Enter your kaggle key')
# getpass() reads the key interactively, so it is never stored in the cell
# source.
os.environ['KAGGLE_KEY'] = getpass() 

In [0]:
# Remove the preinstalled Kaggle client and install the latest release, then
# install the remaining packages (all quietly, versions unpinned).
# NOTE(review): gcsfs is presumably needed for pandas to read gs:// paths -
# confirm it is still required.
!pip uninstall -y kaggle
!pip install -q -U kaggle
!pip install -q tensorflow-model-optimization
!pip install -q gcsfs

In [0]:
pip install -qq focal-loss

In [0]:
import os
# Look the variable up with .get(): indexing os.environ directly raises a bare
# KeyError when no TPU is attached, so the helpful assertion message below
# would never be shown.
assert os.environ.get('COLAB_TPU_ADDR'), 'Make sure to select TPU from Edit > Notebook settings > Hardware accelerator'

In [0]:
# gRPC address of the Colab TPU; assigned to the trainer's `tpu` flag later on.
TPU_NAME = 'grpc://{}'.format(os.environ['COLAB_TPU_ADDR'])

In [0]:
# Echo the TPU address as the cell output for a quick sanity check.
TPU_NAME

In [0]:
import sys

# Only authenticate when the google.colab module has been loaded, i.e. when the
# notebook is running inside Colab.
if 'google.colab' in sys.modules:
    from google.colab import auth
    # Interactive Google sign-in.
    # NOTE(review): presumably required so the runtime can access the gs://
    # paths used further down - confirm.
    auth.authenticate_user()

# tensorflow/models Image Classification

In [0]:
# Start from a clean checkout of tensorflow/models at the v2.2.0 branch/tag so
# the `official` package imported below comes from a known revision.
!rm -rf models
!git clone https://github.com/tensorflow/models.git -b v2.2.0

# Pipeline

In [0]:
import sys
# Make the cloned repository importable (`official.*`). Guarded so that
# re-running this cell does not keep appending duplicate entries to sys.path.
if "models" not in sys.path:
  sys.path.append("models")

In [0]:
#@title Dataset
import os
# import numpy as np
from typing import Any, List, Optional, Tuple, Mapping, Union
from absl import logging
from dataclasses import dataclass
import tensorflow as tf
import tensorflow_datasets as tfds


from official.modeling.hyperparams import base_config
from official.vision.image_classification import augment
from official.vision.image_classification import preprocessing
from official.vision.image_classification import dataset_factory


class DatasetBuilder(dataset_factory.DatasetBuilder):
  """Dataset builder for TFRecords with `image`, `target` and `image_name`.

  Overrides record loading, the input pipeline and record parsing of
  `dataset_factory.DatasetBuilder`. With the `records` builder, the `train` and
  `validation` splits yield `(image, label)` pairs, while every other split
  (e.g. `test`) yields `(image, image_name)` pairs so that predictions can be
  matched back to their images.
  """

  def load_records(self) -> tf.data.Dataset:
    """Return a dataset of TFRecord file names for the configured split.

    Returns:
      A dataset of file names: the files under `config.data_dir` whose names
      start with `config.split`, or `config.filenames` when that is set.

    Raises:
      ValueError: if neither `config.filenames` nor `config.data_dir` is set.
    """
    logging.info('Using TFRecords to load data.')

    if self.config.filenames is None:
      if self.config.data_dir is None:
        raise ValueError('Dataset must specify a path for the data files.')

      file_pattern = os.path.join(self.config.data_dir,
                                  '{}*'.format(self.config.split))

      # File order is shuffled only for the 'train' and 'validation' splits;
      # any other split keeps the order produced by list_files(shuffle=False).
      shuffle = self.config.split in ['train', 'validation']

      dataset = tf.data.Dataset.list_files(file_pattern, shuffle=shuffle)
    else:
      dataset = tf.data.Dataset.from_tensor_slices(self.config.filenames)
      if self.is_training:
        # Shuffle the input files. `Dataset.shuffle` returns a new dataset, so
        # the result must be re-assigned for the shuffle to take effect.
        dataset = dataset.shuffle(
            buffer_size=self.config.file_shuffle_buffer_size)

    return dataset

  def pipeline(self,
               dataset: tf.data.Dataset,
               input_context: tf.distribute.InputContext = None
              ) -> tf.data.Dataset:
    """Build a pipeline fetching, shuffling, and preprocessing the dataset.

    Args:
      dataset: for the `records` builder, a dataset of TFRecord file names;
        otherwise a dataset of examples accepted by `self.preprocess`.
      input_context: optional distribution context; when it reports more than
        one input pipeline the dataset is sharded across them.

    Returns:
      The batched and prefetched dataset.
    """
    if input_context and input_context.num_input_pipelines > 1:
      dataset = dataset.shard(input_context.num_input_pipelines,
                              input_context.input_pipeline_id)

    if self.is_training and not self.config.cache:
      dataset = dataset.repeat()

    if self.config.builder == 'records':
      # Read the data from disk in parallel
      buffer_size = 8 * 1024 * 1024  # Use 8 MiB per file
      dataset = dataset.interleave(
          lambda name: tf.data.TFRecordDataset(name, buffer_size=buffer_size),
          cycle_length=16,
          num_parallel_calls=tf.data.experimental.AUTOTUNE)

    dataset = dataset.prefetch(self.global_batch_size)

    if self.config.cache:
      dataset = dataset.cache()

    if self.is_training:
      dataset = dataset.shuffle(self.config.shuffle_buffer_size)
      dataset = dataset.repeat()

    # Parse, pre-process, and batch the data in parallel
    if self.config.builder == 'records':
      if self.config.split in ['train', 'validation']:
        preprocess = self.parse_record
      else:
        # Other splits need the image name instead of the label.
        preprocess = self.parse_test_record
    else:
      preprocess = self.preprocess

    dataset = dataset.map(preprocess,
                          num_parallel_calls=tf.data.experimental.AUTOTUNE)

    # Note: class re-balancing with tf.data.experimental.rejection_resample is
    # not applied in this pipeline.
    dataset = dataset.batch(self.batch_size, drop_remainder=self.is_training)

    if self.config.split in ['test']:
      options = tf.data.Options()
      options.experimental_optimization.parallel_batch = True
      options.experimental_optimization.map_fusion = True
      # Note: Disabled map vectorization for balanced sampling
      options.experimental_optimization.map_parallelization = True
      dataset = dataset.with_options(options)

    elif self.is_training and self.config.deterministic_train is not None:
      options = tf.data.Options()
      options.experimental_deterministic = self.config.deterministic_train
      options.experimental_slack = self.config.use_slack
      options.experimental_optimization.parallel_batch = True
      options.experimental_optimization.map_fusion = True
      # Note: Disabled map vectorization for balanced sampling
      options.experimental_optimization.map_parallelization = True
      dataset = dataset.with_options(options)

    # Prefetch overlaps in-feed with training
    # Note: autotune here is not recommended, as this can lead to memory leaks.
    # Instead, use a constant prefetch size like the number of devices.
    dataset = dataset.prefetch(self.config.num_devices)

    return dataset

  @tf.function
  def parse_record(self, record: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
    """Parse a serialized example into a preprocessed `(image, label)` pair.

    Args:
      record: scalar string tensor holding one serialized example with an
        `image` (encoded bytes) and a `target` (int64) feature.

    Returns:
      The `(image, label)` pair returned by `self.preprocess`; the label is
      cast to int32 before preprocessing.
    """
    keys_to_features = {
        'image':
            tf.io.FixedLenFeature((), tf.string, ''),
        # A missing target falls back to -1.
        'target':
            tf.io.FixedLenFeature([], tf.int64, -1)
    }

    parsed = tf.io.parse_single_example(record, keys_to_features)

    label = tf.cast(parsed['target'], dtype=tf.int32)
    image_bytes = parsed['image']
    image, label = self.preprocess(image_bytes, label)

    return image, label

  def parse_test_record(self, record: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
    """Parse a serialized example into a `(image, image_name)` pair.

    Args:
      record: scalar string tensor holding one serialized example with an
        `image` (encoded bytes) and an `image_name` (string) feature; `target`
        is optional and defaults to -1.

    Returns:
      The preprocessed image and the image name. The label is only passed
      through `self.preprocess` and then discarded.
    """
    keys_to_features = {
        'image':
            tf.io.FixedLenFeature((), tf.string, ''),
        # A missing target falls back to -1.
        'target':
            tf.io.FixedLenFeature([], tf.int64, -1),
        # No default value: every record must provide an image name.
        "image_name":
            tf.io.FixedLenFeature((), tf.string)
    }

    parsed = tf.io.parse_single_example(record, keys_to_features)

    label = tf.cast(parsed['target'], dtype=tf.int32)
    image_bytes = parsed['image']
    image, _ = self.preprocess(image_bytes, label)

    image_name = parsed['image_name']

    return image, image_name

# Monkey-patch the library module so that code resolving
# `dataset_factory.DatasetBuilder` at call time picks up the subclass defined
# in this cell.
dataset_factory.DatasetBuilder = DatasetBuilder

In [0]:
#@title EfficientNet Model
import math
import os
from typing import Any, Dict, Optional, Text, Tuple

from absl import logging
import tensorflow as tf

from official.vision.image_classification.efficientnet import efficientnet_model

class ModelConfig(efficientnet_model.ModelConfig):
  """EfficientNet model config declaring `num_classes` as 2.

  NOTE(review): this subclass is not decorated with `@dataclass`. If the parent
  is a dataclass, its generated `__init__` may still assign the parent's own
  `num_classes` default on every instance - confirm that this override
  actually takes effect.
  """
  num_classes: int = 2


class EfficientNet(tf.keras.Model):
  """Wrapper class for an EfficientNet Keras model.

  Contains helper methods to build, manage, and save metadata about the model.
  `from_name` builds the variant used in this notebook: the network without its
  final two layers, followed by a single-unit dense layer with a sigmoid
  activation.
  """
  def __init__(self,
               config: ModelConfig = None,
               overrides: Dict[Text, Any] = None):
    """Create an EfficientNet model.

    Args:
      config: (optional) the main model parameters to create the model;
        defaults to `ModelConfig()`.
      overrides: (optional) a dict containing keys that can override
                 config
    """
    overrides = overrides or {}
    config = config or ModelConfig()

    # The effective configuration is the base config with the overrides applied.
    self.config = config.replace(**overrides)

    input_channels = self.config.input_channels
    model_name = self.config.model_name
    input_shape = (None, None, input_channels)  # Should handle any size image

    image_input = tf.keras.layers.Input(shape=input_shape)

    # Build the network graph on top of the symbolic input.
    output = efficientnet_model.efficientnet(image_input, self.config)

    logging.info('Building model %s with params %s',
                 model_name,
                 self.config)

    # Initialise the Keras model from the input and output tensors.
    super(EfficientNet, self).__init__(
        inputs=image_input, 
        outputs=output, 
        name=model_name)

  @classmethod
  def from_name(cls,
                model_name: Text,
                model_weights_path: Text = None,
                copy_to_local: bool = False,
                overrides: Dict[Text, Any] = None):
    """Construct an EfficientNet model from a predefined model name.

    Args:
      model_name: key into `efficientnet_model.MODEL_CONFIGS`, or into the
        custom configs passed as `overrides['model_config']`.
      model_weights_path: (optional) path of a saved Keras model whose weights
        are copied into the truncated network. When `copy_to_local` is true it
        is treated as a directory containing `model.h5`.
      copy_to_local: when true, copy `model.h5` to `/tmp` before loading it.
      overrides: (optional) config overrides forwarded to the constructor; the
        `model_config` key, if present, is removed first and merged into the
        known model configs.

    Returns:
      A `tf.keras.Model` whose output is a single sigmoid unit.

    Raises:
      ValueError: if `model_name` is not a known model config.
    """
    model_configs = dict(efficientnet_model.MODEL_CONFIGS)
    # Copy so the caller's dict is not mutated by the pop() below.
    overrides = dict(overrides) if overrides else {}

    # One can define their own custom models if necessary
    model_configs.update(overrides.pop('model_config', {}))

    if model_name not in model_configs:
      raise ValueError('Unknown model name {}'.format(model_name))

    config = model_configs[model_name]

    model = cls(config=config, overrides=overrides)

    # Pop the classification layer
    # The third-to-last layer's output becomes the new model output, which
    # drops the final two layers.
    # NOTE(review): presumably those two are the logits and softmax layers -
    # confirm against efficientnet_model.efficientnet.
    model = tf.keras.Model(model.inputs, model.layers[-3].output)

    if model_weights_path:
      if copy_to_local:
        # Copy the weights file to local disk before loading it.
        tmp_file = os.path.join('/tmp', model_name + '.h5')
        model_weights_file = os.path.join(model_weights_path, 'model.h5')
        tf.io.gfile.copy(model_weights_file, tmp_file, overwrite=True)
        model_weights_path = tmp_file

      # The saved model's weights must match the truncated network's weights in
      # number and shape for set_weights() to succeed.
      loaded_model = tf.keras.models.load_model(model_weights_path, compile=False)
      model.set_weights(loaded_model.get_weights())
    
    # NOTE(review): magic constant used as the initial bias of the output unit;
    # presumably derived from the class balance of the training data - confirm.
    initial_bias = -2.3498501
    output_bias = tf.keras.initializers.Constant(initial_bias)

    output = model.output

    # Cast to float32 in case we have a different model dtype
    output = tf.cast(output, tf.float32)

    # Single-unit head: one sigmoid probability instead of a per-class output.
    x = tf.keras.layers.Dense(
      # config.num_classes,
      1,
      kernel_initializer=efficientnet_model.DENSE_KERNEL_INITIALIZER,
      bias_initializer=output_bias,
      # kernel_regularizer=tf.keras.regularizers.l2(config.weight_decay),
      # bias_regularizer=tf.keras.regularizers.l2(config.weight_decay),
      name='logits')(output)
    x = tf.keras.layers.Activation('sigmoid', name='probs', dtype='float32')(x)
    model = tf.keras.Model(inputs=model.inputs, outputs=x)
    return model

# Monkey-patch the library module so that code resolving these names through
# `efficientnet_model` at call time gets the classes defined in this cell.
efficientnet_model.EfficientNet = EfficientNet
efficientnet_model.ModelConfig = ModelConfig

# Test

In [0]:
#@title Submission
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os

import pprint
from typing import Any, Tuple, Text, Optional, Mapping

from absl import app
from absl import flags
from absl import logging
import tensorflow as tf
import pandas as pd
import numpy as np
from focal_loss import BinaryFocalLoss

from official.vision.image_classification.classifier_trainer import *
from official.vision.image_classification.classifier_trainer import (
    _get_params_from_flags
)
from official.vision.image_classification import classifier_trainer

import tensorflow.keras.backend as K


def _get_dataset_builders(params: base_configs.ExperimentConfig,
                          strategy: tf.distribute.Strategy,
                          one_hot: bool
                         ) -> Optional[Any]:
  """Create the dataset builder for `params.validation_dataset`.

  Only the validation dataset config is consulted, so a single builder is
  returned rather than a (train, validation) pair.

  Args:
    params: experiment configuration; `params.validation_dataset` describes the
      dataset to build.
    strategy: distribution strategy, used for the replica count; a falsy value
      means a single device.
    one_hot: whether labels should be one-hot encoded.

  Returns:
    A `dataset_factory.DatasetBuilder`, or None when the validation dataset
    config is missing or reports that it has no data.
  """
  if one_hot:
    logging.warning('label_smoothing > 0, so datasets will be one hot encoded.')
  else:
    logging.warning('label_smoothing not applied, so datasets will not be one '
                    'hot encoded.')

  num_devices = strategy.num_replicas_in_sync if strategy else 1

  image_size = get_image_size_from_model(params)

  config = params.validation_dataset
  if config is None or not config.has_data:
    return None

  # The model's image size takes precedence over the dataset config's.
  return dataset_factory.DatasetBuilder(
      config,
      image_size=image_size or config.image_size,
      num_devices=num_devices,
      one_hot=one_hot)


def resume_from_checkpoint(model: tf.keras.Model,
                           model_dir: str) -> Optional[int]:
  """Load the latest checkpoint found in `model_dir` into `model`.

  Args:
    model: model whose weights are restored in place.
    model_dir: directory searched with `tf.train.latest_checkpoint`.

  Returns:
    0 when no checkpoint was found (nothing is loaded), otherwise None.
  """
  logging.info('Load from checkpoint is enabled.')
  latest_checkpoint = tf.train.latest_checkpoint(model_dir)
  logging.info('latest_checkpoint: %s', latest_checkpoint)
  if not latest_checkpoint:
    # The model keeps whatever weights it already has, so predictions made
    # afterwards do not come from a trained checkpoint. Log it as a warning so
    # it is not missed among the info messages.
    logging.warning('No checkpoint detected.')
    return 0

  logging.info('Checkpoint file %s found and restoring from '
               'checkpoint', latest_checkpoint)
  model.load_weights(latest_checkpoint)
  logging.info('Completed loading from checkpoint.')
  return None


def train_and_eval(
    params: base_configs.ExperimentConfig,
    strategy_override: tf.distribute.Strategy) -> Mapping[str, Any]:
  """Predict on the configured dataset and write a submission CSV.

  Despite its name this function does no training: it is installed over
  `classifier_trainer.train_and_eval`, so the trainer's 'train_and_eval' mode
  ends up running inference on `params.validation_dataset` instead.

  Args:
    params: experiment configuration (dataset, model and runtime settings).
    strategy_override: distribution strategy to use; when falsy, one is created
      from `params.runtime`.

  Returns:
    A mapping with the path of the written CSV (`submission_file`) and the
    number of rows in it (`num_predictions`).

  Raises:
    ValueError: if no dataset builder could be created from
      `params.validation_dataset`.
  """
  logging.info('Running train and eval.')

  # Note: for TPUs, strategy and scope should be created before the dataset
  strategy = strategy_override or distribution_utils.get_distribution_strategy(
      distribution_strategy=params.runtime.distribution_strategy,
      all_reduce_alg=params.runtime.all_reduce_alg,
      num_gpus=params.runtime.num_gpus,
      tpu_address=params.runtime.tpu)

  strategy_scope = distribution_utils.get_strategy_scope(strategy)

  logging.info('Detected %d devices.',
               strategy.num_replicas_in_sync if strategy else 1)

  label_smoothing = params.model.loss.label_smoothing
  one_hot = label_smoothing and label_smoothing > 0

  builder = _get_dataset_builders(params, strategy, one_hot)
  if builder is None:
    # Fail with a clear message instead of an AttributeError on `None.build()`.
    raise ValueError(
        'validation_dataset must be configured with a data source.')
  dataset = builder.build()

  initialize(params, builder)

  logging.info('Global batch size: %d', builder.global_batch_size)

  with strategy_scope:
    model_params = params.model.model_params.as_dict()
    model = get_models()[params.model.name](**model_params)
    if params.train.resume_checkpoint:
      resume_from_checkpoint(model=model, model_dir=params.model_dir)

  serialize_config(params=params, model_dir=params.model_dir)

  # Generate submission.
  # The dataset yields (image, image_name) batches. Collect every id batch
  # rather than assuming a fixed number of test images. Both passes over the
  # dataset (ids here, images below) must yield examples in the same order for
  # ids and predictions to line up.
  id_batches = [ids.numpy()
                for ids in dataset.map(lambda image, idnum: idnum)]
  test_ids = np.concatenate(id_batches).astype('U')

  images_only = dataset.map(lambda image, idnum: image)

  probabilities = model.predict(images_only)
  # Concatenating along the first axis flattens the per-example rows.
  # NOTE(review): assumes predict() returns shape (num_examples, 1) - confirm.
  probabilities = np.concatenate(probabilities)

  print('Generating submission.csv file...')
  print(test_ids)
  print(probabilities)

  pred_df = pd.DataFrame({'image_name': test_ids,
                          'target': probabilities})

  SUBMISSION_FILE = '/content/submission.csv'
  pred_df.to_csv(SUBMISSION_FILE, index=False)

  return {
      'submission_file': SUBMISSION_FILE,
      'num_predictions': len(pred_df),
  }

# Monkey-patch the trainer module so that its own code, which resolves these
# names through the module at call time, runs the replacements defined in this
# cell.
classifier_trainer._get_dataset_builders = _get_dataset_builders
classifier_trainer.train_and_eval = train_and_eval
classifier_trainer.resume_from_checkpoint = resume_from_checkpoint

In [0]:
# absl raises DuplicateFlagError when a flag is defined a second time, so only
# define the trainer flags if they are not registered yet. This keeps the cell
# safe to re-run within one kernel session. `model_type` is one of the flags
# assigned further down in the notebook.
if 'model_type' not in flags.FLAGS:
  define_classifier_flags()

In [0]:
%%writefile config.yaml

# Inference configuration for EfficientNet-B5 on TPUs: builds the 'test' split and restores the latest checkpoint.
runtime:
  model_dir: null
  mode: 'train_and_eval'
  distribution_strategy: 'tpu'
  run_eagerly: False
  enable_xla: True
validation_dataset:
  name: 'imagenet2012'
  data_dir: null
  builder: 'records'
  split: 'test'
  one_hot: False
  num_classes: 2
  num_examples: 6625
  image_size: 456
  batch_size: 64
  use_per_replica_batch_size: True
  dtype: 'bfloat16'
model:
  model_params:
    model_name: 'efficientnet-b5'
    overrides:
      num_classes: 2
      batch_norm: 'tpu'
      dtype: 'bfloat16'
  loss:
    label_smoothing: 0.0
  num_classes: 2
train:
  resume_checkpoint: True

In [0]:
#@title Test
logging.set_verbosity(logging.INFO)

if __name__ == '__main__':
  if '-f' in sys.argv:
    sys.argv.remove('-f')
  flags.FLAGS.mode = 'train_and_eval' 
  flags.FLAGS.model_type = 'efficientnet' 
  flags.FLAGS.dataset = 'imagenet' 
  flags.FLAGS.tpu = TPU_NAME 
  flags.FLAGS.model_dir = 'gs://recursion-kaggle/melanoma/models/model_b5_456' #@param {type:"string"}
  flags.FLAGS.data_dir = 'gs://recursion-kaggle/melanoma/stratified_ex/test' #@param {type:"string"}
  flags.FLAGS.config_file = 'config.yaml' #@param {type:"string"}

  app.run(main)

In [0]:
import pandas as pd
import matplotlib.pyplot as plt

sub = pd.read_csv('submission.csv')

# Histogram of the predicted target values. The y-axis is clipped to [0, 100],
# so bins taller than that are cut off.
fig, ax = plt.subplots()
ax.hist(sub.target, bins=100)
ax.set_ylim((0, 100))
ax.set(title='Distribution of predicted target values',
       xlabel='target',
       ylabel='number of images (clipped at 100)')
plt.show()

In [0]:
# Show the first ten rows of the submission as the cell output.
sub.head(10)

In [0]:
# Download the submission file from the Colab runtime through the browser.
from google.colab import files
files.download('submission.csv')

In [0]:
# Submit the file to the competition with the Kaggle CLI, which reads the
# credentials exported at the top of the notebook.
!kaggle competitions submit -c siim-isic-melanoma-classification -f submission.csv -m "test b5"