# Data and Imports

In [None]:



import pandas as pd
import numpy as np
import jsonlines
import seaborn as sns
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import torch.nn as nn
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
import torch_optimizer as optim


import warnings
from importlib import reload

import torchvision
from IPython.core.interactiveshell import InteractiveShell

# Echo every bare expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

# Pandas display limits and number formatting for the result tables below.
pd.options.display.max_rows = 500
pd.options.display.float_format = '{:0.3f}'.format
pd.options.display.max_columns = 500
pd.options.display.width = 1000
# The value set on the previous line is replaced immediately; 0 is the effective width.
pd.options.display.width = 0

# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')

import os

from facebook_hateful_memes_detector.utils.globals import set_global, get_global

# Package-wide settings. They are registered before the model/preprocessing
# imports below, keeping the original statement order.
# The two directories default to the paths used when this notebook was written,
# but can be redirected without editing the notebook by exporting
# HATEFUL_MEMES_CACHE_DIR / HATEFUL_MEMES_MODELS_DIR before starting the kernel.
set_global("cache_dir", os.environ.get("HATEFUL_MEMES_CACHE_DIR", "/home/ahemf/cache/cache"))
set_global("dataloader_workers", 4)
set_global("use_autocast", True)
set_global("models_dir", os.environ.get("HATEFUL_MEMES_MODELS_DIR", "/home/ahemf/cache/"))

from facebook_hateful_memes_detector.utils import read_json_lines_into_df, in_notebook, set_device
get_global("cache_dir")  # echoed so the active cache location is visible in the output
from facebook_hateful_memes_detector.models import Fasttext1DCNNModel, MultiImageMultiTextAttentionEarlyFusionModel, LangFeaturesModel, AlbertClassifer
from facebook_hateful_memes_detector.preprocessing import TextImageDataset, my_collate, get_datasets, get_image2torchvision_transforms, TextAugment
from facebook_hateful_memes_detector.preprocessing import DefinedRotation, QuadrantCut, ImageAugment
from facebook_hateful_memes_detector.training import *
import facebook_hateful_memes_detector
# NOTE(review): importlib.reload re-executes only the top-level package module;
# the names already bound by the `from ... import ...` lines above are not rebound.
reload(facebook_hateful_memes_detector)

# Use the GPU when one is visible, otherwise the CPU, and register the choice
# with the package.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
set_device(device)


In [None]:
# Resolve the compute device (CUDA when available, CPU otherwise), hand it to
# the package via set_device, and echo it as the cell output.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
set_device(device)
device


In [None]:
# Per-augmentation numbers keyed by augmentation name.
# NOTE(review): presumably measured throughput for each text augmentation; the
# unit is not recorded here and 0.0 looks like "not measured" - confirm.
aug_speeds = dict(
    keyboard=117,
    char_substitute=109,
    char_insert=109,
    char_swap=114,
    ocr=114,
    char_delete=108,
    word_insert=0.0,
    word_substitute=0.0,
    text_rotate=32,
    stopword_insert=34,
    word_join=32,
    word_cutout=36,
    w2v_insert=0.0,
    w2v_substitute=0.0,
    fasttext=137,
    glove_twitter=88,
    glove_wiki=82,
    word2vec=137,
    synonym=522,
    split=110,
    sentence_shuffle=67,
    one_third_cut=0.0,
    half_cut=0.0,
)


- Which augmentations are useful?
- Which text models perform best?

In [None]:


# Per-augmentation values handed to TextAugment; entries at 0.0 are kept so the
# full set of augmentation names stays visible.
choice_probas = dict(
    keyboard=0.1, char_substitute=0.0, char_insert=0.1, char_swap=0.1, ocr=0.0, char_delete=0.1,
    fasttext=0.0, glove_twitter=0.0, glove_wiki=0.0, word2vec=0.0, split=0.1,
    stopword_insert=0.3, word_join=0.1, word_cutout=0.8,
    text_rotate=0.5, sentence_shuffle=0.5, one_third_cut=0.3, half_cut=0.1,
)

# The meaning of the six leading weights is defined by TextAugment (not visible
# in this notebook).
preprocess_text = TextAugment(
    [0.05, 0.05, 0.05, 0.35, 0.3, 0.2],
    choice_probas,
    fasttext_file="wiki-news-300d-1M-subword.bin",
)

# Text-only setup: images are disabled and only the training text is augmented.
data = get_datasets(
    data_dir="../data/",
    train_text_transform=preprocess_text,
    train_image_transform=None,
    test_text_transform=None,
    test_image_transform=None,
    cache_images=True,
    use_images=False,
    dev=False,
    test_dev=True,
    keep_original_text=False,
    keep_original_image=False,
    keep_processed_image=True,
    keep_torchvision_image=False,
)

# images = list(data["train"].img) + list(data["test"].img)
# pd.DataFrame({"img":images}).to_csv("image.csv", header=None, index=None)

# ImageAugment([0.2, 0.5, 0.3])


In [None]:
# data["train"].label.value_counts()
# train = data["train"]

# ones = train[train["label"] == 1]
# zeros = train[train["label"] == 0].sample(n=len(ones), replace=False)
# data["train"] = pd.concat((ones, zeros)).sample(frac=1.0)
# data["train"].label.value_counts()

# len(set(data["train"]["id"])) == data["train"].shape[0]


https://discuss.pytorch.org/t/how-to-implement-torch-optim-lr-scheduler-cosineannealinglr/28797/11

# Params

In [None]:
# Catalogue of optimizer classes, each paired with the keyword arguments used to
# construct it. A (class, params) pair is selected further down and passed to
# model_builder as optimiser_class / optimiser_params.

sgd = torch.optim.SGD
sgd_params = dict(lr=2e-2, momentum=0.9, dampening=0, weight_decay=0, nesterov=False)

rangerQH = optim.RangerQH
rangerQHparams = dict(
    lr=1e-3,
    betas=(0.9, 0.999),
    nus=(.7, 1.0),
    weight_decay=0.0,
    k=6,
    alpha=.5,
    decouple_weight_decay=True,
    eps=1e-8,
)

adam = torch.optim.Adam
# Fixed: this was `adam_params = params=dict(...)`, a chained assignment that
# also created an unintended global called `params`.
adam_params = dict(lr=1e-3, weight_decay=1e-7)

adamw = torch.optim.AdamW
adamw_params = dict(lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=1e-2)

novograd = optim.NovoGrad
novograd_params = dict(
    lr=1e-3,
    betas=(0.9, 0.999),
    eps=1e-8,
    weight_decay=0,
    grad_averaging=False,
    amsgrad=False,
)

qhadam = optim.QHAdam
qhadam_params = dict(
    lr=1e-3,
    betas=(0.9, 0.999),
    nus=(1.0, 1.0),
    weight_decay=0,
    decouple_weight_decay=False,
    eps=1e-8,
)

radam = optim.RAdam
radam_params = dict(
    lr=1e-3,
    betas=(0.9, 0.999),
    eps=1e-8,
    weight_decay=0,
)

yogi = optim.Yogi
yogi_params = dict(
    lr=1e-2,
    betas=(0.9, 0.999),
    eps=1e-3,
    initial_accumulator=1e-6,
    weight_decay=0,
)




In [None]:
# Defaults shared by the experiment cells below. Each experiment cell sets its
# own `epochs`, and the later BERT cells also replace `optimizer` /
# `optimizer_params`.
batch_size = 256
epochs = 10
optimizer = adam
optimizer_params = adam_params

# Learning-rate scheduler factory handed to the training helpers.
# Previously a multistep schedule was assigned first and then overwritten on the
# very next line, so it never took effect. Alternatives that were tried:
#   get_multistep_lr([11, 13], gamma=0.25)
#   get_cosine_with_hard_restarts_schedule_with_warmup
scheduler_init_fn = get_cosine_schedule_with_warmup()

# Passed to the training helpers as model_call_back.
reg_sched = get_regularizer_scheduler()



# Fasttext 1D CNN

In [None]:
# Train/validate the Fasttext 1D CNN text model with the shared optimizer,
# scheduler and regularizer callback, then echo the result tables.
epochs = 15
model_fn = model_builder(
    Fasttext1DCNNModel,
    {
        "classifier_dims": 256,
        "num_classes": 2,
        "n_layers": 2,
        "final_layer_builder": fb_1d_loss_builder,
        "gaussian_noise": 0.15,
        "dropout": 0.2,
        "embedding_dims": 256,
        "internal_dims": 512,
        "fasttext_file": "crawl-300d-2M-subword.bin",
        "featurizer": "transformer",
        "loss": "focal",
        "dice_loss_coef": 0.0,
        "auc_loss_coef": 0.0,
    },
    optimiser_class=optimizer,
    optimiser_params=optimizer_params,
)

kfold = False
results, prfs = train_validate_ntimes(
    model_fn, data, batch_size, epochs,
    kfold=kfold,
    model_call_back=reg_sched,
    scheduler_init_fn=scheduler_init_fn,
    show_model_stats=False,
    sampling_policy="without_replacement",
    validation_epochs=[4, 7, 9, 11, 14, 17, 19],
)

# Keep a handle on this run's outputs under dedicated names.
r1 = results
p1 = prfs
results
prfs

# 0.738	0.734
# 0.730	0.715
# 0.730	0.715
# 0.734	0.731
# 0.746	0.712

# Lang Features Model

In [None]:
# Train/validate LangFeaturesModel on a chosen set of language-feature
# "capabilities", then echo the result tables.
epochs = 15
batch_size = 256
# fasttext_crawl 1.9s
# spacy 1.8s
# full_view 1.4s 0.659	0.651
# gensim 7.5s # 0.718	0.737
# nltk 3s 0.609	0.585
# "spacy", "key_phrases" 4.2s 0.688	0.670
# "fasttext_crawl", "spacy", "key_phrases", "gensim" 20s 0.763	0.729 2h 49m
# "fasttext_crawl", "gensim" 11s 0.749	0.733 1h 47m
# gensim 8s 0.751	0.733 1h 20m

# Capability set actually used. `all_caps` used to be assigned four times in a
# row with only the final value taking effect; the earlier candidates are kept
# here as comments so they can be swapped in deliberately:
#   ["fasttext_crawl", "spacy", "full_view", "key_phrases", "nltk", "gensim"]
#   ["full_view"]
#   ["fasttext_crawl", "spacy", "key_phrases", "nltk"]
# Other capability names noted alongside the original list:
#   "snlp", "ibm_max", "tmoji", "key_phrases", "full_view", "spacy", "nltk", "fasttext_crawl"
all_caps = ["fasttext_crawl", "gensim"]

model_fn = model_builder(
    LangFeaturesModel,
    dict(
        classifier_dims=256,
        num_classes=2,
        gaussian_noise=0.2,
        dropout=0.2,
        embedding_dims=256,
        internal_dims=512,
        capabilities=all_caps,
        featurizer="transformer",
        loss="focal",
        dice_loss_coef=0.0,
        auc_loss_coef=0.0,
        n_layers=2,
        final_layer_builder=fb_1d_loss_builder,
    ),
    optimiser_class=optimizer,
    optimiser_params=optimizer_params,
)

kfold = False
results, prfs = train_validate_ntimes(
    model_fn,
    data,
    batch_size,
    epochs,
    kfold=kfold,
    scheduler_init_fn=scheduler_init_fn,
    model_call_back=reg_sched,
    show_model_stats=False,
    sampling_policy="without_replacement",
    validation_epochs=[1, 4, 7, 9, 11, 14, 17, 19],
)
# NOTE(review): r1/p1 are also assigned by the Fasttext 1D CNN cell, so running
# this cell replaces those saved results.
r1, p1 = results, prfs
results
prfs

# BERT Models

## activebus/BERT_Review

In [None]:
# Train/validate AlbertClassifer on top of the "activebus/BERT_Review"
# checkpoint, then echo the result tables.
epochs = 24
batch_size = 256

# Per-parameter options handed to model_builder via per_param_opts_fn.
# Encoder layers "10" and "11" are marked finetune=True at lr / 1e2; layer "9"
# and the "model" default are marked finetune=False at lr / 1000.
lr_strategy = dict(
    model=dict(
        lr=optimizer_params["lr"] / 1000,
        finetune=False,
        encoder=dict(
            layer={
                "9": dict(lr=optimizer_params["lr"] / 1e3, finetune=False),
                "10": dict(lr=optimizer_params["lr"] / 1e2, finetune=True),
                "11": dict(lr=optimizer_params["lr"] / 1e2, finetune=True),
            },
        ),
    ),
)

model_fn = model_builder(
    AlbertClassifer,
    {
        "classifier_dims": 256,
        "num_classes": 2,
        "embedding_dims": 768,
        "gaussian_noise": 0.0,
        "dropout": 0.1,
        "word_masking_proba": 0.15,
        "internal_dims": 512,
        "final_layer_builder": fb_1d_loss_builder,
        "n_layers": 2,
        "n_encoders": 2,
        "n_decoders": 2,
        "n_tokens_in": 96,
        "n_tokens_out": 16,
        "featurizer": "transformer",
        "model": "activebus/BERT_Review",
        "loss": "focal",
        "dice_loss_coef": 0.0,
        "auc_loss_coef": 0.0,
    },
    per_param_opts_fn=lr_strategy,
    optimiser_class=optimizer,
    optimiser_params=optimizer_params,
)

kfold = False
results, prfs = train_validate_ntimes(
    model_fn, data, batch_size, epochs,
    kfold=kfold,
    scheduler_init_fn=scheduler_init_fn,
    model_call_back=reg_sched,
    validation_epochs=[4, 7, 9, 11, 14, 17, 19, 23, 27, 31, 34, 37, 41, 44, 47, 51, 54],
    show_model_stats=False,
    sampling_policy="without_replacement",
)
r2 = results
p2 = prfs
results
prfs

# 0.761	0.749 (0.703	0.691)


## NSP Style Finetuned

### Non DETR

In [None]:
# Train/validate AlbertClassifer on the local './distilbert-nsp' checkpoint
# with whichever optimizer settings are currently selected, then echo the
# result tables.
epochs = 24
batch_size = 256

# Per-parameter options handed to model_builder via per_param_opts_fn.
# Encoder layers "4" and "5" are marked finetune=True at lr / 1e2; layers "2",
# "3" and the "model" default are marked finetune=False at lr / 1000.
lr_strategy = dict(
    model=dict(
        lr=optimizer_params["lr"] / 1000,
        finetune=False,
        encoder=dict(
            layer={
                "2": dict(lr=optimizer_params["lr"] / 1e3, finetune=False),
                "3": dict(lr=optimizer_params["lr"] / 1e3, finetune=False),
                "4": dict(lr=optimizer_params["lr"] / 1e2, finetune=True),
                "5": dict(lr=optimizer_params["lr"] / 1e2, finetune=True),
            },
        ),
    ),
)

model_fn = model_builder(
    AlbertClassifer,
    {
        "classifier_dims": 256,
        "num_classes": 2,
        "embedding_dims": 768,
        "gaussian_noise": 0.75,
        "dropout": 0.1,
        "word_masking_proba": 0.15,
        "internal_dims": 512,
        "final_layer_builder": fb_1d_loss_builder,
        "n_layers": 2,
        "n_encoders": 3,
        "n_decoders": 3,
        "n_tokens_in": 96,
        "n_tokens_out": 16,
        "featurizer": "transformer",
        "model": './distilbert-nsp',
        "loss": "focal",
        "classification_head": "decoder_ensemble",
        "dice_loss_coef": 0.0,
        "auc_loss_coef": 0.0,  # fasttext_vector_config=dict(n_decoders=2, gru_layers=2)
        "finetune": False,
    },
    per_param_opts_fn=lr_strategy,
    optimiser_class=optimizer,
    optimiser_params=optimizer_params,
)

kfold = False
results, prfs = train_validate_ntimes(
    model_fn, data, batch_size, epochs,
    kfold=kfold,
    scheduler_init_fn=scheduler_init_fn,
    model_call_back=reg_sched,
    validation_epochs=[4, 7, 9, 11, 14, 17, 19, 23, 27, 31, 34, 37, 41, 44, 47, 51, 54],
    show_model_stats=False,
    sampling_policy="without_replacement",
)
r2 = results
p2 = prfs
results
prfs

# Try with_replacement
# 0.810	0.661 (0.724	0.600) (gaussian_noise=0.75, dropout=0.1, word_masking_proba=0.2,)


In [None]:
# Switch the notebook-wide optimizer selection to AdamW (lr 1e-4) and
# train/validate AlbertClassifer on './distilbert-nsp' with an AUC loss term.
# The optimizer globals set here stay in effect for later cells.
adamw = torch.optim.AdamW
adamw_params = {"lr": 1e-4, "betas": (0.9, 0.98), "eps": 1e-08, "weight_decay": 1e-2}
optimizer = adamw
optimizer_params = adamw_params

epochs = 24
batch_size = 256

# Per-parameter options handed to model_builder via per_param_opts_fn.
# Encoder layers "3"-"5" are marked finetune=True at the full lr; layer "2" and
# the "model" default are marked finetune=False at lr / 1000.
lr_strategy = dict(
    model=dict(
        lr=optimizer_params["lr"] / 1000,
        finetune=False,
        encoder=dict(
            layer={
                "2": dict(lr=optimizer_params["lr"] / 1e3, finetune=False),
                "3": dict(lr=optimizer_params["lr"], finetune=True),
                "4": dict(lr=optimizer_params["lr"], finetune=True),
                "5": dict(lr=optimizer_params["lr"], finetune=True),
            },
        ),
    ),
)

model_fn = model_builder(
    AlbertClassifer,
    {
        "classifier_dims": 256,
        "num_classes": 2,
        "gaussian_noise": 0.2,
        "dropout": 0.2,
        "word_masking_proba": 0.25,
        "internal_dims": 512,
        "final_layer_builder": fb_1d_loss_builder,
        "n_layers": 2,
        "n_encoders": 2,
        "n_decoders": 2,
        "n_tokens_in": 96,
        "n_tokens_out": 16,
        "featurizer": "transformer",
        "model": './distilbert-nsp',
        "loss": "focal",
        "classification_head": "decoder_ensemble",  # head_ensemble
        "dice_loss_coef": 0.0,
        "auc_loss_coef": 0.5,
        "attention_drop_proba": 0.2,
        "finetune": False,
    },
    per_param_opts_fn=lr_strategy,
    optimiser_class=optimizer,
    optimiser_params=optimizer_params,
)

kfold = False
results, prfs = train_validate_ntimes(
    model_fn, data, batch_size, epochs,
    kfold=kfold,
    scheduler_init_fn=scheduler_init_fn,
    model_call_back=reg_sched,  # reg_sched
    validation_epochs=[4, 7, 9, 11, 14, 17, 19, 23, 27, 31, 34, 37, 41, 44, 47, 51, 54],
    show_model_stats=False,
    sampling_policy="without_replacement",
)
r2 = results
p2 = prfs
results
prfs

# auc_loss_coef=0.5
# 0.853 0.661 (0.757 0.596) gaussian_noise=0.1, dropout=0.2, word_masking_proba=0.2,
# 0.852	0.658 (0.748	0.604) gaussian_noise=0.1, dropout=0.15, word_masking_proba=0.2,
# 0.857	0.661 (0.761	0.590) gaussian_noise=0.1, dropout=0.15, word_masking_proba=0.15,
# 0.845	0.657 (0.757	0.592) gaussian_noise=0.1, dropout=0.25, word_masking_proba=0.25,
# 0.841	0.644 (0.753	0.578) gaussian_noise=0.1, dropout=0.25, word_masking_proba=0.2,
# 0.861	0.647 (0.759	0.594) gaussian_noise=0.1, dropout=0.15, word_masking_proba=0.15,
# 0.857	0.652 (0.756	0.576) gaussian_noise=0.1, dropout=0.15, word_masking_proba=0.15,
# 0.848	0.661 (0.751	0.592) gaussian_noise=0.1, dropout=0.15, word_masking_proba=0.15,
# 0.853	0.657 (0.755	0.588) gaussian_noise=0.1, dropout=0.2, word_masking_proba=0.2,
# 0.853	0.661 (0.750	0.602) gaussian_noise=0.1, dropout=0.2, word_masking_proba=0.2,
# 0.852	0.649 (0.757	0.578) gaussian_noise=0.1, dropout=0.2, word_masking_proba=0.2,

# No reg_sched testing
# 0.848	0.652 (0.754	0.590) gaussian_noise=0.1, dropout=0.2, word_masking_proba=0.2,

# Cnn1D head


In [None]:
# Same recipe as the AdamW NSP run, but on the 'distilbert-cor' checkpoint with
# weight_decay 1e-3; results are saved under r3/p3.
# The optimizer globals set here stay in effect for later cells.
adamw = torch.optim.AdamW
adamw_params = {"lr": 1e-4, "betas": (0.9, 0.98), "eps": 1e-08, "weight_decay": 1e-3}
optimizer = adamw
optimizer_params = adamw_params

epochs = 24
batch_size = 256

# Per-parameter options handed to model_builder via per_param_opts_fn.
# Encoder layers "3"-"5" are marked finetune=True at the full lr; layer "2" and
# the "model" default are marked finetune=False at lr / 1000.
lr_strategy = dict(
    model=dict(
        lr=optimizer_params["lr"] / 1000,
        finetune=False,
        encoder=dict(
            layer={
                "2": dict(lr=optimizer_params["lr"] / 1e3, finetune=False),
                "3": dict(lr=optimizer_params["lr"], finetune=True),
                "4": dict(lr=optimizer_params["lr"], finetune=True),
                "5": dict(lr=optimizer_params["lr"], finetune=True),
            },
        ),
    ),
)

model_fn = model_builder(
    AlbertClassifer,
    {
        "classifier_dims": 256,
        "num_classes": 2,
        "gaussian_noise": 0.1,
        "dropout": 0.25,
        "word_masking_proba": 0.25,
        "internal_dims": 512,
        "final_layer_builder": fb_1d_loss_builder,
        "n_layers": 2,
        "n_encoders": 2,
        "n_decoders": 2,
        "n_tokens_in": 96,
        "n_tokens_out": 16,
        "featurizer": "transformer",
        "model": 'distilbert-cor',
        "loss": "focal",
        "classification_head": "decoder_ensemble",  # decoder_ensemble
        "dice_loss_coef": 0.0,
        "auc_loss_coef": 0.5,
        "attention_drop_proba": 0.2,
        "finetune": False,
    },
    per_param_opts_fn=lr_strategy,
    optimiser_class=optimizer,
    optimiser_params=optimizer_params,
)

kfold = False
results, prfs = train_validate_ntimes(
    model_fn, data, batch_size, epochs,
    kfold=kfold,
    scheduler_init_fn=scheduler_init_fn,
    model_call_back=reg_sched,  # reg_sched
    validation_epochs=[4, 7, 9, 11, 14, 17, 19, 23, 27, 31, 34, 37, 41, 44, 47, 51, 54],
    show_model_stats=False,
    sampling_policy="without_replacement",
)
r3 = results
p3 = prfs
results
prfs

# auc_loss_coef=0.5
# 0.853 0.661 (0.757 0.596) gaussian_noise=0.1, dropout=0.2, word_masking_proba=0.2,
# 0.852	0.658 (0.748	0.604) gaussian_noise=0.1, dropout=0.15, word_masking_proba=0.2,
# 0.857	0.661 (0.761	0.590) gaussian_noise=0.1, dropout=0.15, word_masking_proba=0.15,
# 0.845	0.657 (0.757	0.592) gaussian_noise=0.1, dropout=0.25, word_masking_proba=0.25,
# 0.841	0.644 (0.753	0.578) gaussian_noise=0.1, dropout=0.25, word_masking_proba=0.2,
# 0.861	0.647 (0.759	0.594) gaussian_noise=0.1, dropout=0.15, word_masking_proba=0.15,
# 0.857	0.652 (0.756	0.576) gaussian_noise=0.1, dropout=0.15, word_masking_proba=0.15,
# 0.848	0.661 (0.751	0.592) gaussian_noise=0.1, dropout=0.15, word_masking_proba=0.15,
# 0.853	0.657 (0.755	0.588) gaussian_noise=0.1, dropout=0.2, word_masking_proba=0.2,
# 0.853	0.661 (0.750	0.602) gaussian_noise=0.1, dropout=0.2, word_masking_proba=0.2,
# 0.852	0.649 (0.757	0.578) gaussian_noise=0.1, dropout=0.2, word_masking_proba=0.2,

# No reg_sched testing
# 0.848	0.652 (0.754	0.590) gaussian_noise=0.1, dropout=0.2, word_masking_proba=0.2,

# Cnn1D head


### DETR Style

In [None]:
from facebook_hateful_memes_detector.utils import in_notebook, CNNHead, MultiLayerTransformerDecoderHead, AveragedLinearHead, OneTokenPositionLinearHead, MultiTaskForward, CNN2DHead
def fb_detr_loss_builder(n_dims, n_tokens, n_out, dropout, **kwargs):
    """Build the transformer-decoder head used as the final layer in the DETR-style runs.

    Args:
        n_dims, n_tokens, n_out: forwarded positionally, in that order, to
            MultiLayerTransformerDecoderHead.
        dropout: accepted but NOT forwarded; the head is always built with
            dropout=0.4.
        **kwargs: only ``loss`` is consulted (default ``"classification"``);
            every other keyword is ignored.

    Returns:
        A MultiLayerTransformerDecoderHead configured with dropout=0.4,
        gaussian_noise=0.75 and n_layers=3.
    """
    return MultiLayerTransformerDecoderHead(
        n_dims,
        n_tokens,
        n_out,
        dropout=0.4,
        gaussian_noise=0.75,
        n_layers=3,
        loss=kwargs.pop("loss", "classification"),
    )



In [None]:
# DETR-style run: AlbertClassifer on './distilbert-nsp' with no decoders and
# fb_detr_loss_builder as the final layer, trained with AdamW at lr 5e-5.
# The optimizer globals set here stay in effect for later cells.
epochs = 24
batch_size = 256

adamw = torch.optim.AdamW
adamw_params = {"lr": 5e-5, "betas": (0.9, 0.98), "eps": 1e-08, "weight_decay": 1e-2}
optimizer = adamw
optimizer_params = adamw_params

# Per-parameter options handed to model_builder via per_param_opts_fn.
# Encoder layers "3"-"5" are marked finetune=True at the full lr; the "model"
# default is marked finetune=False at lr / 1000.
lr_strategy = dict(
    model=dict(
        lr=optimizer_params["lr"] / 1000,
        finetune=False,
        encoder=dict(
            layer={
                "3": dict(lr=optimizer_params["lr"], finetune=True),
                "4": dict(lr=optimizer_params["lr"], finetune=True),
                "5": dict(lr=optimizer_params["lr"], finetune=True),
            },
        ),
    ),
)

model_fn = model_builder(
    AlbertClassifer,
    {
        "classifier_dims": 256,
        "num_classes": 2,
        "gaussian_noise": 0.15,
        "dropout": 0.25,
        "word_masking_proba": 0.25,
        "internal_dims": 512,
        "final_layer_builder": fb_detr_loss_builder,
        "n_layers": 2,
        "n_encoders": 2,
        "n_decoders": 0,
        "n_tokens_in": 96,
        "n_tokens_out": 96,
        "featurizer": "transformer",
        "model": './distilbert-nsp',
        "classification_head": "decoder_ensemble",
        "loss": "focal",
        "dice_loss_coef": 0.0,
        "auc_loss_coef": 0.5,
        "attention_drop_proba": 0.2,
        "finetune": False,
    },
    per_param_opts_fn=lr_strategy,
    optimiser_class=optimizer,
    optimiser_params=optimizer_params,
)

kfold = False
results, prfs = train_validate_ntimes(
    model_fn, data, batch_size, epochs,
    kfold=kfold,
    scheduler_init_fn=scheduler_init_fn,
    model_call_back=reg_sched,
    validation_epochs=[4, 7, 9, 11, 14, 17, 19, 23, 27, 31, 34, 37, 41, 44, 47, 51, 54],
    show_model_stats=False,
    sampling_policy="without_replacement",
)
r2 = results
p2 = prfs
results
prfs

# 0.824	0.649 (0.748	0.576) (gaussian_noise=0.75, dropout=0.25, word_masking_proba=0.25,)
# 0.829	0.654 (0.744	0.584) (gaussian_noise=0.5, dropout=0.25, word_masking_proba=0.25,)
# 0.811	0.635 (0.740	0.578) (gaussian_noise=0.5, dropout=0.25, word_masking_proba=0.25,)
# 0.834	0.630 (0.755	0.566) (gaussian_noise=0.5, dropout=0.2, word_masking_proba=0.25,)
# 


# Predict

## Normal Head

In [None]:
# Configuration for the submission run with the regular head: AdamW at lr 1e-4
# and AlbertClassifer on './distilbert-nsp'. Only model_fn is built here; the
# training/prediction call lives in the next cell.
adamw = torch.optim.AdamW
adamw_params = {"lr": 1e-4, "betas": (0.9, 0.98), "eps": 1e-08, "weight_decay": 1e-2}
optimizer = adamw
optimizer_params = adamw_params

epochs = 24
batch_size = 256

# Per-parameter options handed to model_builder via per_param_opts_fn.
# Layers "4" and "5" use the full lr, layer "3" uses lr / 10 (all three marked
# finetune=True); layer "2" and the "model" default are marked finetune=False
# at lr / 1000.
lr_strategy = dict(
    model=dict(
        lr=optimizer_params["lr"] / 1000,
        finetune=False,
        encoder=dict(
            layer={
                "2": dict(lr=optimizer_params["lr"] / 1e3, finetune=False),
                "3": dict(lr=optimizer_params["lr"] / 1e1, finetune=True),
                "4": dict(lr=optimizer_params["lr"], finetune=True),
                "5": dict(lr=optimizer_params["lr"], finetune=True),
            },
        ),
    ),
)

model_fn = model_builder(
    AlbertClassifer,
    {
        "classifier_dims": 256,
        "num_classes": 2,
        "gaussian_noise": 0.1,
        "dropout": 0.2,
        "word_masking_proba": 0.2,
        "internal_dims": 512,
        "final_layer_builder": fb_1d_loss_builder,
        "n_layers": 2,
        "n_encoders": 3,
        "n_decoders": 3,
        "n_tokens_in": 96,
        "n_tokens_out": 32,
        "featurizer": "transformer",
        "model": './distilbert-nsp',
        "loss": "focal",
        "classification_head": "decoder_ensemble",  # decoder_ensemble
        "dice_loss_coef": 0.0,
        "auc_loss_coef": 0.5,
        "finetune": False,
    },
    per_param_opts_fn=lr_strategy,
    optimiser_class=optimizer,
    optimiser_params=optimizer_params,
)


In [None]:
# Train with the model_fn configured in the previous cell, write the
# predictions to disk and echo five random rows.
# NOTE(review): the DETR-style prediction cell further down writes to the same
# "submission.csv" path, so whichever cell runs last determines the file.
submission, text_model = train_and_predict(
    model_fn,
    data,
    batch_size,
    epochs,
    scheduler_init_fn=scheduler_init_fn,
    model_call_back=reg_sched,
    sampling_policy="without_replacement",
    validation_epochs=[15, 31, 34, 42],
)
submission.to_csv("submission.csv", index=False)
submission.sample(5)


## DETR Style head

In [None]:
# Submission run with the DETR-style head: build the model with
# fb_detr_loss_builder as the final layer, train, write "submission.csv" and
# echo ten random rows. Uses whichever optimizer / optimizer_params are
# currently selected.
epochs = 24
batch_size = 256

# Per-parameter options handed to model_builder via per_param_opts_fn.
# Layer "5" uses lr / 10 and layers "3" and "4" use lr / 100 (all three marked
# finetune=True); the "model" default is marked finetune=False at lr / 1000.
lr_strategy = dict(
    model=dict(
        lr=optimizer_params["lr"] / 1000,
        finetune=False,
        encoder=dict(
            layer={
                "3": dict(lr=optimizer_params["lr"] / 1e2, finetune=True),
                "4": dict(lr=optimizer_params["lr"] / 1e2, finetune=True),
                "5": dict(lr=optimizer_params["lr"] / 1e1, finetune=True),
            },
        ),
    ),
)

model_fn = model_builder(
    AlbertClassifer,
    {
        "classifier_dims": 256,
        "num_classes": 2,
        "embedding_dims": 768,
        "gaussian_noise": 0.75,
        "dropout": 0.25,
        "word_masking_proba": 0.25,
        "internal_dims": 512,
        "final_layer_builder": fb_detr_loss_builder,
        "n_layers": 2,
        "n_encoders": 2,
        "n_decoders": 0,
        "n_tokens_in": 96,
        "n_tokens_out": 16,
        "featurizer": "transformer",
        "model": './distilbert-nsp',
        "loss": "focal",
        "dice_loss_coef": 0.0,
        "auc_loss_coef": 0.0,
        "finetune": False,
    },
    per_param_opts_fn=lr_strategy,
    optimiser_class=optimizer,
    optimiser_params=optimizer_params,
)

submission, text_model = train_and_predict(
    model_fn,
    data,
    batch_size,
    epochs,
    scheduler_init_fn=scheduler_init_fn,
    model_call_back=reg_sched,
    validation_epochs=[2, 5, 7],
    sampling_policy="without_replacement",
)
submission.to_csv("submission.csv", index=False)
submission.sample(10)

# 0.6723


In [None]:
# What train AUC do we achieve if all examples have the same score?
# What train AUC do we achieve if all examples have random scores?

In [None]:
# Spot-check the most recently produced `submission` object by sampling 10 entries
# (presumably rows of a pandas DataFrame - it is also written with to_csv above).
submission.sample(10)