# Preparations

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Sun Nov 27 07:32:30 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  A100-SXM4-40GB      Off  | 00000000:00:04.0 Off |                    0 |
| N/A   28C    P0    42W / 400W |      0MiB / 40536MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Flip to False to skip the installs below (e.g. when re-running on a runtime that already has them).
need_to_install = True
if need_to_install:
  # All installs are unpinned, so each fresh runtime gets whatever versions the index serves that day.
  !pip install wandb
  !pip install transformers
  !pip install sentencepiece
  # NOTE(review): this installs the PyPI project named `tokenizer` (singular); if the Hugging Face
  # `tokenizers` library was meant, the name is wrong — confirm which one is intended.
  !pip install tokenizer
  # Presumably provides the `iterstrat` module imported further down — verify.
  !pip install iterative-stratification

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting wandb
  Downloading wandb-0.13.5-py2.py3-none-any.whl (1.9 MB)
[K     |████████████████████████████████| 1.9 MB 4.5 MB/s 
Collecting pathtools
  Downloading pathtools-0.1.2.tar.gz (11 kB)
Collecting setproctitle
  Downloading setproctitle-1.3.2-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30 kB)
Collecting shortuuid>=0.5.0
  Downloading shortuuid-1.0.11-py3-none-any.whl (10 kB)
Collecting docker-pycreds>=0.4.0
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting GitPython>=1.0.0
  Downloading GitPython-3.1.29-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 85.0 MB/s 
Collecting sentry-sdk>=1.0.0
  Downloading sentry_sdk-1.11.1-py2.py3-none-any.whl (168 kB)
[K     |████████████████████████████████| 168 kB 88.1 MB/s 
Collecting gitdb<5,>=4.0.1
  Downloading gitdb-4.0.10-py3-n

# Imports

In [None]:
import sys

# Folder on the mounted Drive that is put at the front of the import search path.
# Presumably it holds the FB_utils_pseudo package imported in the next cell — verify.
code_path = '/content/drive/MyDrive/colab/FB_KAGL/code/'

# Prepend (index 0) so modules found here take precedence over installed ones.
sys.path[:0] = [code_path]

In [None]:
import os
import gc
import warnings
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings("ignore")

import wandb
import torch
import numpy as np
import pandas as pd
# Raise pandas' display limits so wide/long frames are shown with less truncation.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

# Project-local helpers; presumably resolved through the code_path entry added to sys.path above.
from FB_utils_pseudo.pipeline import train_loop, train_loop_pseudo
from FB_utils_pseudo.core_pseudo import valid_fn_pseudo
from FB_utils_pseudo.models import get_tokenizer
# NOTE: `define_max_len` imported here is later shadowed by a function of the same name
# defined in a notebook cell further down (that one returns four values).
from FB_utils_pseudo.utils import class2dict, get_logger, define_max_len, get_result, get_score

# Set the TOKENIZERS_PARALLELISM environment variable for this kernel.
%env TOKENIZERS_PARALLELISM=true

env: TOKENIZERS_PARALLELISM=true


# Config

In [None]:
# ====================================================
# Config
# ====================================================
class CFG:
    # Experiment configuration: a plain namespace of class attributes that is passed to the
    # FB_utils_pseudo helpers and read directly by the notebook cells.
    # Documented with `#` comments on purpose: the code after this class writes every entry of
    # CFG.__dict__ to CFG.txt, so a class docstring would end up in that dump as well.
    # Attributes without a note here are not read by any visible notebook code; they are
    # presumably consumed inside FB_utils_pseudo — verify there.

    ####################
    # MAIN
    ####################
    # When True the notebook calls wandb.init(...) before training and wandb.finish() afterwards.
    wandb = True
    wandb_project = 'FeedBack_kaggle_metasplit'
    competition = 'FeedBack_3'
    wb_group = 'multi'
    # Used as the wandb run name and as the sub-folder of <base_path>/results/ for this run.
    exp_name = 'large-v3_metasplit_V1'
    # Root folder for both the input CSVs and the results/ tree (note the trailing slash).
    base_path = '/content/drive/MyDrive/colab/FB_KAGL/'

    seed = 333
    # Gates the "Base training Pipeline" cell.
    train = True
    # When True the data cell down-samples `train` to 150 rows.
    debug = False
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    ####################
    # DATA
    ####################
    # Target columns; also used to build the `pred_<name>` prediction columns.
    target_cols = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']

    normlen = False
    num_workers = 12
    train_bs = 16 # 3
    valid_bs = 16 # 3
    # Initial value only: the data cell overwrites CFG.max_len with the value from define_max_len.
    max_len = 512

    # Number of splits passed to MultilabelStratifiedKFold.
    n_fold = 7
    # Only these fold indices are actually trained by the base training cell.
    trn_fold = [0, 1, 4, 6]

    ####################
    # MODEL
    ####################
    # Hugging Face model id; the loop cell derives checkpoint file names from it ('/' -> '-').
    model = "microsoft/deberta-v3-large"  # microsoft/deberta-v2-xxlarge
    gradient_checkpointing = True
    # When this is 1 the training cell trains one model per target column and merges the
    # per-target out-of-fold frames; any other value trains a single model per fold.
    num_classes = 6

    ####################
    # TRAIN
    ####################
    apex = True
    use_restart = True
    use_restart_step2 = True
    valid_pnts = [] # [1200, 1300, 3300]

    ####################
    # LOSS
    ####################
    loss = 'l1'  # ['l1', 'double', 'rmse']
    w_mse = 0.25
    w_l1 = 0.75
    beta_L1 = 0.125 #0.125
    delta_Huber = 0.07

    # Scheduler step 1

    scheduler = 'linear'  # ['linear', 'cosine', 'cosine_restart']
    num_cycles = 0.5  # 3.5
    num_warmup_steps = 3

    # Loop step 1

    # Also used by the pseudo-loop cells to name the checkpoint to load: `..._epoch{epochs-1}.pth`.
    epochs = 4
    rest_thr = 0.006
    iter4eval = 100000

    # LR, optimizer step 1

    encoder_lr = 1.8e-5  # 1.4e-5 # 2e-5
    decoder_lr = 1.8e-5  # 1.4e-5 # 2e-5
    min_lr = 0.01e-6  # 1e-6
    eps = 1e-6 #1e-6
    betas = (0.9, 0.999)
    weight_decay = 0.001
    gradient_accumulation_steps = 1 # 2
    max_grad_norm = 1000
    optimizer = 'AdamW'

    ####################
    # STEP 2
    ####################
    step2 = True

    # Scheduler step 2

    scheduler_step2 = 'cosine_restart'
    num_cycles_step2 = 1
    # Loop step 2

    epochs_step2 = 3
    rest_thr_step2 = 0.0012
    iter4eval_step2 = 53

    # LR 2
    lr_step2 = 0.4e-5  # 2.8e-6
    weight_decay_step2 = 0.0001
    eps_step2 = 1e-6
    betas_step2 = (0.99, 0.999)

    use_fgm = True

    pooling = 'mean'
    use_meta = False

    # Value written into the `weight` column of every pseudo-labelled row (real rows get 1).
    pseudo_weight = 0.2
    pseudo_training = False




# Output layout for this experiment:
#   <base_path>/results/<exp_name>/checkpoints/   model checkpoints (CFG.save_path)
#   <base_path>/results/<exp_name>/CFG.txt        dump of the configuration
results_root = CFG.base_path + 'results/'
exp_dir = results_root + CFG.exp_name
for directory in (results_root, exp_dir, exp_dir + '/checkpoints'):
    os.makedirs(directory, exist_ok=True)
CFG.save_path = exp_dir + '/checkpoints/'

# Write one "key:value" line per entry of the class namespace (save_path included,
# because it was attached just above).
with open(exp_dir + '/CFG.txt', 'w') as cfg_file:
    cfg_file.writelines('%s:%s\n' % entry for entry in CFG.__dict__.items())

# Logging

In [None]:
# Everything for this run is logged under <base_path>/results/<exp_name>/.
experiment_dir = CFG.base_path + 'results/' + CFG.exp_name

# Optional Weights & Biases tracking, driven entirely by CFG.
if CFG.wandb:
    run_options = {
        'project': CFG.wandb_project,
        'name': CFG.exp_name,
        'config': class2dict(CFG),
        'group': CFG.wb_group,
        'job_type': "train",
        'dir': CFG.base_path,
    }
    wandb.init(**run_options)

# Logger shared by all training / validation helpers below.
LOGGER = get_logger(experiment_dir + '/train')

ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


# Data and tokenizer preparation
# And defining the number of loops

In [None]:
import string

def define_max_len(df, tokenizer):
    """Collect per-text token statistics and the maximum sequence length.

    Args:
        df: frame with a ``full_text`` column; missing values are treated as "".
        tokenizer: callable invoked as ``tokenizer(text, add_special_tokens=False)``
            that returns a mapping containing ``'input_ids'``.

    Returns:
        Tuple ``(max_len, lengths, uni, punc)`` where
        ``lengths`` holds the token count of each text,
        ``uni`` the number of distinct token ids in each text,
        ``punc`` the number of punctuation characters in each text, and
        ``max_len`` is the largest token count plus 2 (room for cls & sep).
        For an empty frame the lists are empty and ``max_len`` is 2.
    """
    lengths, uni, punc = [], [], []
    for text in df['full_text'].fillna("").values:
        token_ids = tokenizer(text, add_special_tokens=False)['input_ids']
        lengths.append(len(token_ids))
        uni.append(len(set(token_ids)))
        punc.append(count_punct(text))
    # default=0 keeps an empty frame from raising "max() arg is an empty sequence".
    max_len = max(lengths, default=0) + 2  # cls & sep
    return max_len, lengths, uni, punc


def count_punct(text):
    """Return how many characters of ``text`` are in ``string.punctuation``.

    Empty or whitespace-only input yields 0, since whitespace is not punctuation.
    """
    return sum(char in string.punctuation for char in text)

In [None]:
# ---------------------------------------------------------------------------
# 1. Read the competition files and the previously produced pseudo-labels
# ---------------------------------------------------------------------------
train = pd.read_csv(f'{CFG.base_path}/feedback-prize-english-language-learning/train.csv')
test = pd.read_csv(f'{CFG.base_path}/feedback-prize-english-language-learning/test.csv')
submission = pd.read_csv(f'{CFG.base_path}/feedback-prize-english-language-learning/sample_submission.csv')
pseudo = pd.read_csv(f'{CFG.base_path}/feedback-prize-english-language-learning/df_only_pseudo_after1.csv')

# Provenance flag and sample weight: pseudo rows -> (1, CFG.pseudo_weight), real rows -> (0, 1).
pseudo['weight'] = [CFG.pseudo_weight] * len(pseudo)
pseudo['pseudo'] = [1] * len(pseudo)
train['pseudo'] = [0] * len(train)
train['weight'] = [1] * len(train)

print(f"train.shape: {train.shape}")
print(f'pseudo.shape: {pseudo.shape}')
print(f"test.shape: {test.shape}")
print(f"submission.shape: {submission.shape}")

# ---------------------------------------------------------------------------
# 2. Tokenizer + per-text statistics (these also feed the stratification)
# ---------------------------------------------------------------------------
CFG.tokenizer = get_tokenizer(CFG)
max_len, lengths, uni, punc = define_max_len(train, CFG.tokenizer)
CFG.max_len = max_len  # sequence length is taken from the real training texts only
train = train.assign(
    length=lengths,
    uni=uni,
    punc=punc,
    uni_per=lambda frame: frame['uni'] / frame['length'],
    punc_per=lambda frame: frame['punc'] / frame['length'],
)

# ---------------------------------------------------------------------------
# 3. Fold assignment, stratified on the targets plus the text statistics
# ---------------------------------------------------------------------------
stratify_cols = CFG.target_cols + ['length', 'uni', 'punc', 'uni_per', 'punc_per']
Fold = MultilabelStratifiedKFold(n_splits=CFG.n_fold, shuffle=True, random_state=49)
for fold_id, (_fit_idx, holdout_idx) in enumerate(Fold.split(train, train[stratify_cols])):
    train.loc[holdout_idx, 'fold'] = int(fold_id)
train['fold'] = train['fold'].astype(int)
display(train.groupby('fold').size())

# Pseudo rows get the sentinel fold 999; they are not merged into `train` in this cell.
pseudo['fold'] = [999] * len(pseudo)

print('Final train shape:', train.shape)

# Debug mode: work on a 150-row sample and show the fold sizes before/after sampling.
if CFG.debug:
    display(train.groupby('fold').size())
    train = train.sample(n=150, random_state=0).reset_index(drop=True)
    display(train.groupby('fold').size())

display(train.head(3))
display(train.tail(3))

train.shape: (3911, 10)
pseudo.shape: (13695, 10)
test.shape: (3, 2)
submission.shape: (3, 7)


Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/580 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


fold
0    559
1    559
2    558
3    559
4    558
5    559
6    559
dtype: int64

Final train shape: (3911, 16)


Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions,pseudo,weight,length,uni,punc,uni_per,punc_per,fold
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0,0,1,286,136,21,0.475524,0.073427,2
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5,0,1,560,147,21,0.2625,0.0375,5
2,00299B378633,"Dear, Principal\n\nIf u change the school poli...",3.0,3.5,3.0,3.0,3.0,2.5,0,1,372,126,36,0.33871,0.096774,5


Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions,pseudo,weight,length,uni,punc,uni_per,punc_per,fold
3908,FFDC4011AC9C,"""A problem is a chance for you to do your best...",2.5,3.0,3.0,3.0,3.5,3.0,0,1,294,139,21,0.472789,0.071429,6
3909,FFE16D704B16,Many people disagree with Albert Schweitzer's ...,4.0,4.5,4.5,4.0,4.5,4.5,0,1,570,189,50,0.331579,0.087719,0
3910,FFED00D6E0BD,Do you think that failure is the main thing fo...,3.5,2.5,3.5,3.0,3.0,3.5,0,1,697,189,46,0.271162,0.065997,5


# Base training Pipeline

In [None]:
def _train_single_fold(fold):
    """Train `fold` and return its out-of-fold frame.

    With CFG.num_classes == 1 one model is trained per target column and the
    per-target frames are merged; otherwise a single train_loop call is made.
    """
    if CFG.num_classes != 1:
        return train_loop(CFG=CFG, folds=train, fold=fold, LOGGER=LOGGER, curclass=None)

    merge_keys = ['text_id', 'full_text', 'length', 'fold'] + CFG.target_cols
    for position, curclass in enumerate(CFG.target_cols):
        class_oof = train_loop(CFG=CFG, folds=train, fold=fold, LOGGER=LOGGER, curclass=curclass)
        if position == 0:
            fold_df = class_oof.copy()
        else:
            fold_df = pd.merge(fold_df, class_oof, on=merge_keys)
    return fold_df


if CFG.train:
    oof_df = pd.DataFrame()
    for fold in range(CFG.n_fold):
        # Only the folds listed in CFG.trn_fold are trained.
        if fold not in CFG.trn_fold:
            continue
        fold_oof = _train_single_fold(fold)
        oof_df = pd.concat([oof_df, fold_oof])
        LOGGER.info(f"========== fold: {fold} result ==========")
        get_result(fold_oof, CFG, LOGGER)

    # Overall CV over every trained fold, then persist the out-of-fold predictions.
    oof_df = oof_df.reset_index(drop=True)
    LOGGER.info(f"========== CV ==========")
    get_result(oof_df, CFG, LOGGER)
    oof_df.to_pickle(CFG.save_path + 'oof_df.pkl')

# Close the wandb run whether or not training was executed.
if CFG.wandb:
    wandb.finish()




> SEEDING DONE
Size of train dataset: 3352
Size of eval dataset: 559


Downloading:   0%|          | 0.00/874M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.dense.bias', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Validation points: []
Epoch 1/4 | Fold 0 | Class None


Ep.1 Train :   0%|          | 0/209 [00:00<?, ?it/s]

Ep.1 Valid :   0%|          | 0/35 [00:00<?, ?it/s]

Epoch 1 - avg_train_loss: 0.4532  avg_val_loss: 0.3166  time: 1216s
INFO:FB_utils_pseudo.utils:Epoch 1 - avg_train_loss: 0.4532  avg_val_loss: 0.3166  time: 1216s
Epoch 1 - Score: 0.4693  Scores: [0.5115012223255612, 0.44360979376780135, 0.4491287411280718, 0.46218696170427687, 0.4791240157025057, 0.47014233033494063]
INFO:FB_utils_pseudo.utils:Epoch 1 - Score: 0.4693  Scores: [0.5115012223255612, 0.44360979376780135, 0.4491287411280718, 0.46218696170427687, 0.4791240157025057, 0.47014233033494063]
Best Score Updated inf -->> 0.4693 | Model Saved
INFO:FB_utils_pseudo.utils:Best Score Updated inf -->> 0.4693 | Model Saved
Epoch 1 - Save Best Score: 0.4693 Model
INFO:FB_utils_pseudo.utils:Epoch 1 - Save Best Score: 0.4693 Model


Epoch 2/4 | Fold 0 | Class None


Ep.2 Train :   0%|          | 0/209 [00:00<?, ?it/s]

KeyboardInterrupt: ignored

# Train and validate the pseudo-labeling loop automatically (in a **loop**)

In [None]:
N_LOOPS = 5 # define number of loops
# Loops 2..N_LOOPS run here; each one reads `df_pseudo_after_loop{LOOP-1}.csv`, so the
# file for loop 1 must already exist before this cell is executed.
LOOPS = list(range(2, N_LOOPS + 1))

DROP_THRESHOLD = 0.25  # Threshold for dropping texts. Mean L1 errors
RELABEL = True        # Relabel sample. False for just not drop
DECREASING_THRESHOLD = 0.91  # Decrease drop threshold. If you don't need it set to 1

PATH_TO_PSEUDO = f'{CFG.base_path}/feedback-prize-english-language-learning/'

# Columns carried from one loop to the next.
KEEP_COLS = ['text_id', 'full_text', 'cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar',
             'conventions', 'pseudo', 'weight', 'length', 'fold']

# `real_train` (rows with their original labels) is re-attached after every loop, but no cell
# in this notebook defined it, so a fresh kernel failed with NameError.
# NOTE(review): reconstructed here as the pseudo == 0 rows of the current `train` frame;
# an already existing `real_train` is left untouched — confirm this matches the intent.
if 'real_train' not in globals():
    real_train = train.loc[train['pseudo'] == 0, KEEP_COLS].reset_index(drop=True)

for LOOP in LOOPS:
    print(f'='*50)
    print(f'STARTING LOOP {LOOP} FROM {N_LOOPS}')
    print(f'='*50)
    if LOOP > 1:

        # Read and prepare pseudo labeled data
        train = pd.read_csv(f'{PATH_TO_PSEUDO}/df_pseudo_after_loop{LOOP-1}.csv')
        print(f"train.shape: {train.shape}")

        CFG.tokenizer = get_tokenizer(CFG)
        # The notebook's define_max_len returns (max_len, lengths, uni, punc); only the first
        # two are needed here. Slicing also works if a 2-tuple variant is in scope.
        max_len, lengths = define_max_len(train, CFG.tokenizer)[:2]
        CFG.max_len = max_len
        print(f'Max length set: {max_len}')
        train['length'] = lengths

    # Train loop
    train_loop_pseudo(CFG, train, LOOP, LOGGER)

    # Load checkpoint and validate
    checkpoint = CFG.save_path + f"{CFG.model.replace('/', '-')}_pseudoloop_{LOOP}_epoch{CFG.epochs-1}.pth"

    predictions, score, scores = valid_fn_pseudo(CFG, train, LOOP, LOGGER, checkpoint)

    print(f'Score: {score}')
    print(f'Scores: {scores}')
    pred_cols = [f'pred_{x}' for x in CFG.target_cols]
    train[pred_cols] = predictions

    # Save predictions, then work on the copy read back from disk
    preds_path = CFG.base_path + 'results/' + CFG.exp_name + f'/loop{LOOP}_preds.csv'
    train.to_csv(preds_path, index=False)
    df = pd.read_csv(preds_path)

    # Keep a row only if its mean absolute error over all targets is below the threshold.
    # A boolean mask stays valid even when no row (or every row) passes.
    row_error = np.abs(
        df[CFG.target_cols].to_numpy(dtype=float) - df[pred_cols].to_numpy(dtype=float)
    ).mean(axis=1)
    keep_mask = row_error < DROP_THRESHOLD
    n_good = int(keep_mask.sum())

    print(f'Good: {n_good} from {len(df)}')
    print(f'Drop: {len(df) - n_good} from {len(df)}')

    df = df[keep_mask].reset_index(drop=True)

    if RELABEL:
        # Replace the targets of the surviving rows with the model's predictions.
        df[CFG.target_cols] = df[pred_cols].values

    df = df[KEEP_COLS]

    # Rows with weight == 1 are discarded and replaced by real_train, so the next loop
    # always sees the real rows with the labels stored in real_train.
    df = df[df.weight != 1].reset_index(drop=True)
    df = pd.concat([real_train, df]).reset_index(drop=True)

    display(df.head(3))
    df.to_csv(f'{PATH_TO_PSEUDO}/df_pseudo_after_loop{LOOP}.csv', index=False)

    # Tighten the acceptance threshold for the next loop and release GPU / Python memory.
    DROP_THRESHOLD *= DECREASING_THRESHOLD
    torch.cuda.empty_cache()
    gc.collect()

# Train and validate the pseudo-labeling loop manually

In [None]:
# Intentional stop: this name is not defined anywhere in the notebook, so evaluating it
# raises NameError. Presumably this is meant to halt "Run all" before the manual cells below.
kvdjigdjig
# breaking row for extract all

In [None]:
# ---- Settings for the manual pseudo-label cells below (edit before running them) ----
RELABEL = True          # True: overwrite the targets of kept rows with the predictions
DROP_THRESHOLD = 0.5    # rows are kept when their mean absolute error is below this value
LOOP = 1                # loop number; used in checkpoint and CSV file names
# --------------------------------------------------------------------------------------

In [None]:
# Train the model for the loop selected by LOOP.
train_loop_pseudo(CFG, train, LOOP, LOGGER)

# Checkpoint name is derived from CFG.model ('/' -> '-') instead of being hard-coded, so it
# keeps pointing at the right file if the model is changed. For the current
# "microsoft/deberta-v3-large" the resulting path is unchanged.
checkpoint = CFG.save_path + f"{CFG.model.replace('/', '-')}_pseudoloop_{LOOP}_epoch{CFG.epochs-1}.pth"

# Validate with that checkpoint and attach one `pred_<target>` column per target.
predictions, score, scores = valid_fn_pseudo(CFG, train, LOOP, LOGGER, checkpoint)

print(f'Score: {score}')
print(f'Scores: {scores}')
train[[f'pred_{x}' for x in CFG.target_cols]] = predictions

# Persist the predictions; the drop/relabel cell reads this file back.
train.to_csv(CFG.base_path + 'results/' + CFG.exp_name + f'/loop{LOOP}_preds.csv', index=False)
train.head(3)

## Drop and relabel

In [None]:
df = pd.read_csv(CFG.base_path + 'results/' + CFG.exp_name + f'/loop{LOOP}_preds.csv')

pred_cols = [f'pred_{x}' for x in CFG.target_cols]

# Mean absolute error per row across all targets, computed in one vectorised step.
row_error = np.abs(
    df[CFG.target_cols].to_numpy(dtype=float) - df[pred_cols].to_numpy(dtype=float)
).mean(axis=1)

# Keep every real row (pseudo == 0) plus every row whose error is under the threshold.
# The mask preserves the original row order and stays a valid indexer even when no row
# is under the threshold (the previous positional index turned into a float array then).
keep_mask = (df['pseudo'] == 0).to_numpy() | (row_error < DROP_THRESHOLD)
n_good = int(keep_mask.sum())

print(f'Good: {n_good} from {len(df)}')
print(f'Drop: {len(df) - n_good} from {len(df)}')

df = df[keep_mask].reset_index(drop=True)

if RELABEL:
    # NOTE(review): this overwrites the targets of the real (pseudo == 0) rows as well —
    # confirm that relabelling real data is intended.
    df[CFG.target_cols] = df[pred_cols].values

df = df[['text_id', 'full_text', 'cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar',
         'conventions', 'pseudo', 'weight', 'length', 'fold']]

display(df.head(3))
df.to_csv(CFG.base_path + f'feedback-prize-english-language-learning/df_pseudo_after_loop{LOOP}.csv', index=False)