In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Sat Nov 12 13:35:49 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   67C    P8    11W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Runtime bootstrap: set need_to_install to False to skip the installs when the
# runtime already has these packages. None of the versions are pinned.
need_to_install = True
if need_to_install:
  !pip install wandb
  !pip install transformers
  !pip install sentencepiece
  # NOTE(review): the Hugging Face tokenizers library is published on PyPI as
  # `tokenizers` (plural) - confirm that `tokenizer` is the package intended here.
  !pip install tokenizer
  # Provides the `iterstrat` module imported later for MultilabelStratifiedKFold.
  !pip install iterative-stratification

In [None]:
import sys
# Folder on the mounted Drive that is added to the import path; presumably it
# holds the FB_utils package imported in the next cell (verify the location).
# The commented-out line is an earlier location kept for reference.
#code_path = '/content/drive/MyDrive/FB_KAGL/code/FB_utils/'
code_path = '/content/drive/MyDrive/colab/FB_KAGL/code/'

# Insert at position 0 so this folder is searched before any other sys.path entry.
sys.path.insert(0, code_path)

In [None]:
import os
import warnings
# Silences every Python warning for the rest of the session (this also hides
# deprecation notices from the libraries below).
warnings.filterwarnings("ignore")

import wandb
import torch
import pandas as pd
# Raise pandas' display limits so wide/long frames are not truncated in cell output.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

# Project helpers; they are not part of this notebook, so their behaviour has to
# be checked in the FB_utils package itself.
from FB_utils.pipeline import train_loop
from FB_utils.models import get_tokenizer
from FB_utils.utils import class2dict, get_logger, define_max_len, get_result

# Sets the TOKENIZERS_PARALLELISM environment variable for this kernel.
%env TOKENIZERS_PARALLELISM=true

env: TOKENIZERS_PARALLELISM=true


## Config

In [None]:
# ====================================================
# Config
# ====================================================
# Single namespace holding every experiment setting. The notebook reads the
# attributes straight off the class (CFG.x) and passes CFG itself to the
# FB_utils helpers (get_tokenizer, train_loop, get_result, class2dict). Those
# helpers are not shown here, so settings without a comment below are consumed
# inside FB_utils and their exact meaning must be checked there.
# Documented with '#' comments rather than a docstring because CFG.__dict__ is
# dumped to CFG.txt right after the class definition.
class CFG:
    ####################
    # MAIN
    ####################
    # wandb: enables wandb.init() / wandb.finish() in the logging and training cells.
    wandb = True
    # Passed to wandb.init as project / group / name respectively (wb_group, exp_name below).
    wandb_project = 'FeedBack_kaggle_reborn'
    competition = 'FeedBack_3'
    wb_group = 'single'
    # exp_name also names the output folder: <base_path>results/<exp_name>/
    exp_name = 'single_v3base_7fold'
    # Root folder for both the competition data and the results tree; the code
    # concatenates onto it directly, so it must end with '/'.
    base_path = '/content/drive/MyDrive/colab/FB_KAGL/'

    seed = 137
    # train: gates the whole training cell.
    train = True
    # debug: when True the data cell keeps only a 150-row sample of train.
    debug = False
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    ####################
    # DATA
    ####################
    # Label columns: used to stratify the folds and, when num_classes == 1,
    # trained one column at a time by the training cell.
    target_cols = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']

    normlen = False
    num_workers = 12
    train_bs = 16 # 3
    valid_bs = 16 # 3
    # Overwritten in the data cell with the value returned by define_max_len().
    max_len = 512

    # n_fold: number of splits for MultilabelStratifiedKFold.
    n_fold = 7
    # trn_fold: fold indices that are actually trained; other folds are skipped.
    trn_fold = [4, 5, 6] # 5,6]

    ####################
    # MODEL
    ####################
    model = "microsoft/deberta-v3-base"  # microsoft/deberta-v2-xxlarge
    gradient_checkpointing = True
    # num_classes == 1 makes the training cell call train_loop once per target
    # column and merge the per-column results; any other value trains once per fold.
    num_classes = 1

    ####################
    # TRAIN
    ####################
    apex = True
    use_restart = True
    use_restart_step2 = True
    valid_pnts = [] # [1200, 1300, 3300]

    ####################
    # LOSS
    ####################
    loss = 'l1'  # ['l1', 'double', 'rmse']
    w_mse = 0.25
    w_l1 = 0.75
    beta_L1 = 0.05 #0.125

    # Scheduler step 1

    scheduler = 'linear'  # ['linear', 'cosine', 'cosine_restart']
    num_cycles = 0.5  # 3.5
    num_warmup_steps = 25

    # Loop step 1

    epochs = 5 # 6
    rest_thr = 0.01 #0.005  # 0.012
    iter4eval = 100000

    # LR, optimizer step 1

    encoder_lr = 1.46e-5  # 1.4e-5 # 2e-5
    decoder_lr = 1.66e-5  # 1.4e-5 # 2e-5
    min_lr = 0.01e-6  # 1e-6
    eps = 1e-6 #1e-6
    betas = (0.9, 0.999)
    weight_decay = 0.0001
    gradient_accumulation_steps = 1 # 2
    max_grad_norm = 1000
    optimizer = 'AdamW'

    ####################
    # STEP 2
    ####################
    # NOTE(review): the *_step2 settings presumably configure a second
    # fine-tuning stage inside train_loop - confirm in FB_utils.pipeline.
    step2 = True

    # Scheduler step 2

    scheduler_step2 = 'cosine_restart'
    num_cycles_step2 = 1
    # Loop step 2

    epochs_step2 = 2
    rest_thr_step2 = 0.002  # 0.002
    iter4eval_step2 = 69  # 163

    # LR 2
    lr_step2 = 0.2e-5  # 2.8e-6
    weight_decay_step2 = 0.00001
    eps_step2 = 1e-6
    betas_step2 = (0.99, 0.999)


# Create results/, results/<exp_name> and results/<exp_name>/checkpoints under
# base_path; exist_ok keeps the cell re-runnable.
for _rel_dir in ('results/',
                 'results/' + CFG.exp_name,
                 'results/' + CFG.exp_name + '/checkpoints'):
    os.makedirs(CFG.base_path + _rel_dir, exist_ok=True)

# Checkpoint folder, stored on CFG before the dump below so it is part of it.
CFG.save_path = CFG.base_path + 'results/' + CFG.exp_name + '/checkpoints/'

# Write every entry of the class namespace (dunder entries included) as
# "key:value" lines next to the results.
with open(CFG.base_path + 'results/' + CFG.exp_name + '/CFG.txt', 'w') as f:
    for key, value in vars(CFG).items():
        f.write(f'{key}:{value}\n')

## Logging

In [None]:
# Start the Weights & Biases run only when enabled in the config; the run
# config is whatever class2dict(CFG) returns.
if CFG.wandb:
    wandb.init(
        project=CFG.wandb_project,
        group=CFG.wb_group,
        name=CFG.exp_name,
        job_type="train",
        config=class2dict(CFG),
        dir=CFG.base_path,
    )

# Logger used by the training cells; get_logger receives the path
# <base_path>results/<exp_name>/train.
LOGGER = get_logger(f'{CFG.base_path}results/{CFG.exp_name}/train')

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


## Data and tokenizer preparation



In [None]:
# Load the three competition CSVs from the data folder under base_path.
data_dir = f'{CFG.base_path}/feedback-prize-english-language-learning'
train = pd.read_csv(f'{data_dir}/train.csv')
test = pd.read_csv(f'{data_dir}/test.csv')
submission = pd.read_csv(f'{data_dir}/sample_submission.csv')

for frame_name, frame in (('train', train), ('test', test), ('submission', submission)):
    print(f"{frame_name}.shape: {frame.shape}")

# The tokenizer is stored on CFG. define_max_len returns a maximum length,
# which replaces CFG.max_len, and per-row lengths, kept in a 'length' column.
CFG.tokenizer = get_tokenizer(CFG)
max_len, lengths = define_max_len(train, CFG.tokenizer)
train['length'] = lengths
CFG.max_len = max_len

# Label every row with the index of the split in which it is a validation row.
Fold = MultilabelStratifiedKFold(n_splits=CFG.n_fold, shuffle=True, random_state=66)
fold_splits = Fold.split(train, train[CFG.target_cols])
for fold_id, (trn_idx, val_idx) in enumerate(fold_splits):
    train.loc[val_idx, 'fold'] = int(fold_id)
train['fold'] = train['fold'].astype(int)
display(train.groupby('fold').size())


# Debug mode: show fold sizes before and after shrinking train to 150 sampled rows.
if CFG.debug:
    display(train.groupby('fold').size())
    train = train.sample(n=150, random_state=0).reset_index(drop=True)
    display(train.groupby('fold').size())

train.shape: (3911, 8)
test.shape: (3, 2)
submission.shape: (3, 7)


Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/579 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


fold
0    559
1    559
2    559
3    558
4    559
5    558
6    559
dtype: int64


## Training Pipeline

In [None]:
# Train the folds listed in CFG.trn_fold, collect the frames returned by
# train_loop, report per-fold and overall results, and pickle the combined frame.
if CFG.train:
    oof_df = pd.DataFrame()
    # Columns shared by every per-target frame; used as merge keys.
    merge_keys = ['text_id', 'full_text', 'length', 'fold'] + CFG.target_cols
    for fold in range(CFG.n_fold):
        if fold not in CFG.trn_fold:
            continue

        if CFG.num_classes == 1:
            # One train_loop call per target column, merged into one frame per fold.
            fold_df = None
            for curclass in CFG.target_cols:
                _oof_df = train_loop(CFG=CFG, folds=train, fold=fold, LOGGER=LOGGER, curclass=curclass)
                if fold_df is None:
                    fold_df = _oof_df.copy()
                else:
                    fold_df = pd.merge(fold_df, _oof_df, on=merge_keys)
        else:
            # A single train_loop call per fold.
            fold_df = train_loop(CFG=CFG, folds=train, fold=fold, LOGGER=LOGGER, curclass=None)

        oof_df = pd.concat([oof_df, fold_df])
        LOGGER.info(f"========== fold: {fold} result ==========")
        get_result(fold_df, CFG, LOGGER)

    oof_df = oof_df.reset_index(drop=True)
    LOGGER.info("========== CV ==========")
    get_result(oof_df, CFG, LOGGER)
    oof_df.to_pickle(CFG.save_path + 'oof_df.pkl')

# Close the W&B run whether or not training was executed.
if CFG.wandb:
    wandb.finish()


# Upload to Kaggle

In [None]:
# Installs the Kaggle package (unpinned); it supplies both the `kaggle` module
# imported below and the `kaggle` command used in the upload cells.
!pip install kaggle

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))
  
# Then move kaggle.json into the folder where the API expects to find it.
!mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json
User uploaded file "kaggle.json" with length 67 bytes


In [None]:
import kaggle
import shutil
import os
import glob

In [None]:
# Local staging tree for the Kaggle dataset upload.
# exist_ok=True keeps the cell re-runnable: without it os.makedirs raises
# FileExistsError as soon as the directories exist from a previous run.
for staging_dir in ('/content/reborn_single_base/',
                    '/content/reborn_single_base/checkpoints',
                    '/content/reborn_single_base/tokenizer'):
    os.makedirs(staging_dir, exist_ok=True)

In [None]:
# Copy every entry of the experiment's tokenizer folder into the staging tree,
# keeping the original file names.
tkn = glob.glob('/content/drive/MyDrive/colab/FB_KAGL/results/single_v3base_7fold/tokenizer/*')
for tk in tkn:
  hz = os.path.basename(tk)
  shutil.copy(tk, '/content/reborn_single_base/tokenizer/' + hz)

In [None]:
# Find the saved config file(s) in the checkpoints folder and print the matches.
cfg_item = glob.glob('/content/drive/MyDrive/colab/FB_KAGL/results/single_v3base_7fold/checkpoints/config*')
print(cfg_item)
# Only the first match is staged, always under the name config.pth; cfg_item[0]
# raises IndexError when nothing matched. As the last expression of the cell,
# the destination path returned by shutil.copy is shown as the cell output.
shutil.copy(cfg_item[0], f'/content/reborn_single_base/checkpoints/config.pth')

['/content/drive/MyDrive/colab/FB_KAGL/results/single_v3base_7fold/checkpoints/config.pth']


'/content/reborn_single_base/checkpoints/config.pth'

In [None]:
# Stage one specific checkpoint, selected by its file-name prefix.
nm = 'microsoft-deberta-v3-base_fold5_grammar_final_best_0.4657'

# The first glob match is used; [0] raises IndexError when no file has this prefix.
nm_item = glob.glob(f'/content/drive/MyDrive/colab/FB_KAGL/results/single_v3base_7fold/checkpoints/{nm}*')[0]
print(nm_item)
hz = nm_item.split('/')[-1]
# The copy source must be the matched checkpoint (nm_item). Using a loop
# variable left over from another cell would write an unrelated file under the
# checkpoint's name.
shutil.copy(nm_item, f'/content/reborn_single_base/checkpoints/{hz}')

/content/drive/MyDrive/colab/FB_KAGL/results/single_v3base_7fold/checkpoints/microsoft-deberta-v3-base_fold5_grammar_final_best_0.4657.pth


'/content/reborn_single_base/checkpoints/microsoft-deberta-v3-base_fold5_grammar_final_best_0.4657.pth'

In [None]:
# Stage every checkpoint whose file name contains 'final_best', keeping its name.
items = glob.glob('/content/drive/MyDrive/colab/FB_KAGL/results/single_v3base_7fold/checkpoints/*final_best*')
for ckpt_path in items:
  ckpt_name = os.path.basename(ckpt_path)
  shutil.copy(ckpt_path, '/content/reborn_single_base/checkpoints/' + ckpt_name)

In [None]:
# Writes a dataset-metadata.json template into the staging folder (see the cell output).
# NOTE(review): the template presumably needs its title/id filled in before
# `kaggle datasets create` is run - confirm against the Kaggle API docs.
!kaggle datasets init -p /content/reborn_single_base

Data package template written to: /content/reborn_single_base/dataset-metadata.json


In [None]:
# Creates the Kaggle dataset from the staging folder. With --dir-mode zip each
# sub-directory is uploaded as a zip archive (checkpoints.zip and tokenizer.zip
# in the recorded output).
!kaggle datasets create --dir-mode zip -p /content/reborn_single_base

Starting upload for file checkpoints.zip
100% 18.6G/18.6G [04:03<00:00, 82.1MB/s]
Upload successful: checkpoints.zip (19GB)
Starting upload for file tokenizer.zip
100% 3.04M/3.04M [00:01<00:00, 2.23MB/s]
Upload successful: tokenizer.zip (3MB)
Your private Dataset is being created. Please check progress at https://www.kaggle.com/datasets/kolyaforrat/single-base-reb2


In [None]:
# Target column names used by the score-summary cell below. This repeats
# CFG.target_cols, so the summary cells do not need the config cell to have run;
# keep the two lists in sync.
target_cols = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']

In [None]:
from collections import defaultdict
import os
import numpy as np

# Summarise the validation scores embedded in the checkpoint file names
# (e.g. "..._grammar_final_best_0.4657.pth"): one mean per target column, then
# the mean of those per-column means.
items = glob.glob('/content/drive/MyDrive/colab/FB_KAGL/results/single_v3base_7fold/checkpoints/*final_best*')

# target column -> scores parsed from the names of its checkpoints
myd = defaultdict(list)
for col in target_cols:
    for tk in items:
        # Match on the file name only, so a column name that happens to appear
        # in a parent directory cannot produce a false hit.
        fname = os.path.basename(tk)
        if col in fname:
            # The score is the text after the last '_' with the '.pt*' extension removed.
            st = float(fname.split('_')[-1].split('.pt')[0])
            myd[col].append(st)

full = 0
for col in target_cols:
    # A column with no checkpoint yields nan here (np.mean of an empty list).
    col_mean = np.mean(myd[col])
    print(f'{col}: {col_mean}')
    full += col_mean

# Divide by the actual number of target columns instead of a hard-coded 6.
print(f'full CV: {full/len(target_cols)}')


cohesion: 0.48439000000000004
syntax: 0.44482166666666667
vocabulary: 0.41056
phraseology: 0.45977333333333337
grammar: 0.4701683333333333
conventions: 0.44359399999999993
full CV: 0.452217888888889


In [None]:
import numpy as np

# Plain mean over the scores of all 'final_best' checkpoints, whatever their
# target. The score is the text after the last '_' with the '.pt*' extension removed.
items = glob.glob('/content/drive/MyDrive/colab/FB_KAGL/results/single_v3base_7fold/checkpoints/*final_best*')
arr = [float(ckpt.split('_')[-1].split('.pt')[0]) for ckpt in items]
print(np.mean(arr))

0.45246428571428565
