In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Not connected to a GPU


In [None]:
need_to_install = False
if need_to_install:
  !pip install wandb
  !pip install transformers
  !pip install sentencepiece
  !pip install tokenizer
  !pip install iterative-stratification

In [None]:
import sys
# Google Drive folder with the project code (presumably where FB_utils_window lives).
code_path = '/content/drive/MyDrive/colab/FB_KAGL/code/'

# Put the project code first on the import path. Existing entries are dropped
# first so that re-running this cell does not pile up duplicates in sys.path.
sys.path[:] = [entry for entry in sys.path if entry != code_path]
sys.path.insert(0, code_path)

In [None]:
import os
import warnings
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings("ignore")

import wandb
import torch
import pandas as pd
# Raise pandas' display limits so wide/long frames are shown with less truncation.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

# Project-local helpers (presumably resolved through the code path added to sys.path).
from FB_utils_window.pipeline import train_loop
from FB_utils_window.models import get_tokenizer
from FB_utils_window.utils import class2dict, get_logger, define_max_len, get_result

# Sets the TOKENIZERS_PARALLELISM environment variable for this kernel.
%env TOKENIZERS_PARALLELISM=true

env: TOKENIZERS_PARALLELISM=true


# Config

In [None]:
# ====================================================
# Config
# ====================================================
class CFG:
    # Experiment settings, kept as plain class attributes.
    # No docstring on purpose and the attribute order is significant: the
    # class namespace is iterated and written to CFG.txt in definition order.

    # ---------- main ----------
    wandb = True
    wandb_project = 'FeedBack_kaggle_window'
    competition = 'FeedBack_3'
    wb_group = 'multi'
    exp_name = 'window_large_v3_bs96'
    base_path = '/content/drive/MyDrive/colab/FB_KAGL/'

    seed = 3337
    train = True
    debug = False
    # Use the GPU when torch can see one, otherwise fall back to the CPU.
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    # ---------- data ----------
    target_cols = [
        'cohesion',
        'syntax',
        'vocabulary',
        'phraseology',
        'grammar',
        'conventions',
    ]

    normlen = False
    num_workers = 12
    train_bs = 96  # alt: 3
    valid_bs = 32  # alt: 3
    max_len = 512

    n_fold = 8
    # Every fold index 0..n_fold-1 is trained.
    trn_fold = list(range(n_fold))

    # ---------- model ----------
    model = "microsoft/deberta-v3-large"  # alt: microsoft/deberta-v2-xxlarge
    gradient_checkpointing = True
    num_classes = 6

    # ---------- train ----------
    apex = True
    use_restart = True
    use_restart_step2 = True
    valid_pnts = []  # alt: [1200, 1300, 3300]

    # ---------- loss ----------
    loss = 'l1'  # options: 'l1', 'double', 'rmse'
    w_mse = 0.25
    w_l1 = 0.75
    beta_L1 = 0.1  # alt: 0.125
    delta_Huber = 0.07

    # ---------- step 1: scheduler ----------
    scheduler = 'linear'  # options: 'linear', 'cosine', 'cosine_restart'
    num_cycles = 0.5  # alt: 3.5
    num_warmup_steps = 3

    # ---------- step 1: loop ----------
    epochs = 12  # alt: 6
    rest_thr = 0.006  # alt: 0.005, 0.012
    iter4eval = 100000

    # ---------- step 1: learning rate / optimizer ----------
    encoder_lr = 6e-5  # alt: 1.4e-5, 2e-5
    decoder_lr = 6e-5  # alt: 1.4e-5, 2e-5
    min_lr = 0.01e-6  # alt: 1e-6
    eps = 1e-6
    betas = (0.9, 0.999)
    weight_decay = 0.0001
    gradient_accumulation_steps = 1  # alt: 2
    max_grad_norm = 1000
    optimizer = 'AdamW'

    # ---------- step 2 ----------
    step2 = True

    # step 2: scheduler
    scheduler_step2 = 'cosine_restart'
    num_cycles_step2 = 2

    # step 2: loop
    epochs_step2 = 6
    rest_thr_step2 = 0.0015  # alt: 0.002
    iter4eval_step2 = 12

    # step 2: learning rate / optimizer
    lr_step2 = 1e-5  # alt: 2.8e-6
    weight_decay_step2 = 0.00001
    eps_step2 = 1e-6
    betas_step2 = (0.99, 0.999)

    # ---------- windowing and misc ----------
    # Window length is max_len minus two positions.
    window_size = max_len - 2
    window_step = 128
    use_fgm = True

    use_mean_valid = True
    pooling = 'mean'
    use_meta = False

    use_pretrain = True
    pretrain_check = ''


# Create the experiment's output tree. makedirs also builds the intermediate
# results/<exp_name>/ directories, so one call covers all three levels.
results_dir = CFG.base_path + 'results/' + CFG.exp_name
CFG.save_path = results_dir + '/checkpoints/'
os.makedirs(CFG.save_path, exist_ok=True)

# Dump the configuration next to the results as `name:value` lines.
# Dunder entries of the class namespace (__module__, __dict__, __weakref__,
# __doc__) are Python internals rather than settings, so they are skipped.
with open(results_dir + '/CFG.txt', 'w', encoding='utf-8') as f:
    for key, value in vars(CFG).items():
        if key.startswith('__'):
            continue
        f.write('%s:%s\n' % (key, value))

# Logging

In [None]:
# Open a Weights & Biases run for this experiment when tracking is switched on.
if CFG.wandb:
    wandb_settings = dict(
        project=CFG.wandb_project,
        name=CFG.exp_name,
        config=class2dict(CFG),
        group=CFG.wb_group,
        job_type="train",
        dir=CFG.base_path,
    )
    wandb.init(**wandb_settings)

# Project logger; it is handed the path prefix results/<exp_name>/train.
log_prefix = CFG.base_path + 'results/' + CFG.exp_name + '/train'
LOGGER = get_logger(log_prefix)

ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mforrato[0m. Use [1m`wandb login --relogin`[0m to force relogin


## Data and tokenizer preparation

In [None]:
# Read the three competition CSVs from the data folder under the project root.
data_dir = f'{CFG.base_path}/feedback-prize-english-language-learning'
train = pd.read_csv(f'{data_dir}/train.csv')
test = pd.read_csv(f'{data_dir}/test.csv')
submission = pd.read_csv(f'{data_dir}/sample_submission.csv')

for label, frame in (('train', train), ('test', test), ('submission', submission)):
    print(f"{label}.shape: {frame.shape}")

# Attach the tokenizer to the config and keep the lengths reported by
# define_max_len as a column. CFG.max_len is not overwritten with max_len.
CFG.tokenizer = get_tokenizer(CFG)
max_len, lengths = define_max_len(train, CFG.tokenizer)
train['length'] = lengths

# Assign every row to the fold in which it is held out, stratified over all targets.
Fold = MultilabelStratifiedKFold(n_splits=CFG.n_fold, shuffle=True, random_state=49)
fold_splits = Fold.split(train, train[CFG.target_cols])
for fold_id, (_fit_rows, holdout_rows) in enumerate(fold_splits):
    train.loc[holdout_rows, 'fold'] = fold_id
train['fold'] = train['fold'].astype(int)
display(train.groupby('fold').size())


if CFG.debug:
    # Debug runs use a 150-row sample; fold sizes are shown before and after sampling.
    display(train.groupby('fold').size())
    train = train.sample(n=150, random_state=0).reset_index(drop=True)
    display(train.groupby('fold').size())

train.shape: (3911, 8)
test.shape: (3, 2)
submission.shape: (3, 7)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


fold
0    489
1    489
2    489
3    489
4    489
5    489
6    489
7    488
dtype: int64

## Training Pipeline

In [None]:
def _train_fold(fold):
    """Run train_loop for one fold and return that fold's out-of-fold frame.

    When CFG.num_classes == 1, train_loop is called once per target column
    (curclass=<column>) and the returned frames are merged on the id, text,
    length, fold and target columns. Otherwise a single call with
    curclass=None covers the fold.
    """
    if CFG.num_classes != 1:
        return train_loop(CFG=CFG, folds=train, fold=fold, LOGGER=LOGGER, curclass=None)

    merge_keys = ['text_id', 'full_text', 'length', 'fold'] + CFG.target_cols
    fold_df = None
    for curclass in CFG.target_cols:
        class_df = train_loop(CFG=CFG, folds=train, fold=fold, LOGGER=LOGGER, curclass=curclass)
        if fold_df is None:
            fold_df = class_df.copy()
        else:
            fold_df = pd.merge(fold_df, class_df, on=merge_keys)
    return fold_df


if CFG.train:
    # Collect the per-fold frames and concatenate once at the end instead of
    # repeatedly concatenating onto a DataFrame that starts out empty.
    fold_frames = []
    for fold in range(CFG.n_fold):
        if fold not in CFG.trn_fold:
            continue
        fold_df = _train_fold(fold)
        fold_frames.append(fold_df)
        LOGGER.info(f"========== fold: {fold} result ==========")
        get_result(fold_df, CFG, LOGGER)

    # Fall back to an empty frame when no fold was selected, as before.
    if fold_frames:
        oof_df = pd.concat(fold_frames).reset_index(drop=True)
    else:
        oof_df = pd.DataFrame()
    LOGGER.info("========== CV ==========")
    get_result(oof_df, CFG, LOGGER)
    oof_df.to_pickle(CFG.save_path + 'oof_df.pkl')

# Close the Weights & Biases run (whether or not training was enabled).
if CFG.wandb:
    wandb.finish()




> SEEDING DONE
Size of train dataset: 3422
Size of eval dataset: 489


Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['mask_predictions.LayerNorm.bias', 'mask_predictions.classifier.bias', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Validation points: [524]
Epoch 1/12 | Fold 0 | Class None


Ep.1 Train :   0%|          | 0/35 [00:00<?, ?it/s]

Ep.1 Valid :   0%|          | 0/16 [00:00<?, ?it/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fas

Epoch 2/12 | Fold 0 | Class None


Ep.2 Train :   0%|          | 0/35 [00:00<?, ?it/s]

Ep.2 Valid :   0%|          | 0/16 [00:00<?, ?it/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fas

Epoch 3/12 | Fold 0 | Class None


Ep.3 Train :   0%|          | 0/35 [00:00<?, ?it/s]

Ep.3 Valid :   0%|          | 0/16 [00:00<?, ?it/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fas

Epoch 4/12 | Fold 0 | Class None


Ep.4 Train :   0%|          | 0/35 [00:00<?, ?it/s]

Ep.4 Valid :   0%|          | 0/16 [00:00<?, ?it/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fas

Epoch 5/12 | Fold 0 | Class None


Ep.5 Train :   0%|          | 0/35 [00:00<?, ?it/s]

Ep.5 Valid :   0%|          | 0/16 [00:00<?, ?it/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fas

Epoch 6/12 | Fold 0 | Class None


Ep.6 Train :   0%|          | 0/35 [00:00<?, ?it/s]

Ep.6 Valid :   0%|          | 0/16 [00:00<?, ?it/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fas

Epoch 7/12 | Fold 0 | Class None


Ep.7 Train :   0%|          | 0/35 [00:00<?, ?it/s]

Ep.7 Valid :   0%|          | 0/16 [00:00<?, ?it/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fas

Epoch 8/12 | Fold 0 | Class None


Ep.8 Train :   0%|          | 0/35 [00:00<?, ?it/s]

Ep.8 Valid :   0%|          | 0/16 [00:00<?, ?it/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fas

Epoch 9/12 | Fold 0 | Class None


Ep.9 Train :   0%|          | 0/35 [00:00<?, ?it/s]

Ep.9 Valid :   0%|          | 0/16 [00:00<?, ?it/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fas

Epoch 10/12 | Fold 0 | Class None


Ep.10 Train :   0%|          | 0/35 [00:00<?, ?it/s]

KeyboardInterrupt: ignored

# Export data to Kaggle

In [None]:
!pip install kaggle

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))
  
# Then move kaggle.json into the folder where the API expects to find it.
!mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json
User uploaded file "kaggle.json" with length 67 bytes


In [None]:
import kaggle
import shutil
import os
import glob

In [None]:
# Name of the local staging folder under /content/ that is packaged and uploaded.
EXP_NAME = 'large-v3_pseudo_0.02'
# Name of the experiment folder under results/ on Google Drive that artefacts are copied from.
EXP_NAME_DRIVE = 'large-v3_pseudo_v1'

In [None]:
# Local staging tree for the upload. exist_ok=True lets this cell be re-run
# without FileExistsError; makedirs also creates /content/<EXP_NAME>/ itself.
os.makedirs(f'/content/{EXP_NAME}/checkpoints', exist_ok=True)
os.makedirs(f'/content/{EXP_NAME}/tokenizer', exist_ok=True)

In [None]:
# Source (Google Drive results) and destination (local staging) roots.
drive_results = f'/content/drive/MyDrive/colab/FB_KAGL/results/{EXP_NAME_DRIVE}'
staging_dir = f'/content/{EXP_NAME}'

# copy tokenizer
for src in glob.glob(f'{drive_results}/tokenizer/*'):
  shutil.copy(src, f'{staging_dir}/tokenizer/{os.path.basename(src)}')

# copy config (the first match is stored under the fixed name config.pth)
cfg_item = glob.glob(f'{drive_results}/checkpoints/config*')
print(cfg_item)
if not cfg_item:
  # Fail with a clear message instead of an IndexError on cfg_item[0].
  raise FileNotFoundError(f'No config* file found in {drive_results}/checkpoints/')
shutil.copy(cfg_item[0], f'{staging_dir}/checkpoints/config.pth')

# copy checkpoints (only files whose name contains "final_best")
items = glob.glob(f'{drive_results}/checkpoints/*final_best*')
for src in items:
  shutil.copy(src, f'{staging_dir}/checkpoints/{os.path.basename(src)}')

['/content/drive/MyDrive/colab/FB_KAGL/results/large-v3_pseudo_v1/checkpoints/config.pth']


In [None]:
!kaggle datasets init -p /content/$EXP_NAME

Data package template written to: /content/large-v3_pseudo_0.02/dataset-metadata.json


In [None]:
!kaggle datasets create --dir-mode zip -p /content/$EXP_NAME

Starting upload for file tokenizer.zip
100% 3.04M/3.04M [00:04<00:00, 695kB/s]
Upload successful: tokenizer.zip (3MB)
Starting upload for file checkpoints.zip
100% 6.96G/6.96G [02:24<00:00, 51.8MB/s]
Upload successful: checkpoints.zip (7GB)
Your private Dataset is being created. Please check progress at https://www.kaggle.com/datasets/kolyaforrat/large-v3-pseudo-002
