# INSTALL REQUIREMENTS

In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 5.4 MB/s 
[?25hCollecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.12.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 52.0 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 5.0 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 52.8 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 72.7 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Fo

In [2]:
# for XLNet
!pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[?25l[K     |▎                               | 10 kB 31.1 MB/s eta 0:00:01[K     |▌                               | 20 kB 12.3 MB/s eta 0:00:01[K     |▉                               | 30 kB 9.5 MB/s eta 0:00:01[K     |█                               | 40 kB 8.8 MB/s eta 0:00:01[K     |█▍                              | 51 kB 4.6 MB/s eta 0:00:01[K     |█▋                              | 61 kB 5.4 MB/s eta 0:00:01[K     |██                              | 71 kB 5.5 MB/s eta 0:00:01[K     |██▏                             | 81 kB 4.3 MB/s eta 0:00:01[K     |██▍                             | 92 kB 4.8 MB/s eta 0:00:01[K     |██▊                             | 102 kB 5.2 MB/s eta 0:00:01[K     |███                             | 112 kB 5.2 MB/s eta 0:00:01[K     |███▎                            | 122 kB 5.2 MB/s eta 0:00:01[K     |███▌        

In [3]:
!pip install wandb

Collecting wandb
  Downloading wandb-0.12.11-py2.py3-none-any.whl (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 5.1 MB/s 
[?25hCollecting pathtools
  Downloading pathtools-0.1.2.tar.gz (11 kB)
Collecting GitPython>=1.0.0
  Downloading GitPython-3.1.27-py3-none-any.whl (181 kB)
[K     |████████████████████████████████| 181 kB 44.2 MB/s 
Collecting docker-pycreds>=0.4.0
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting setproctitle
  Downloading setproctitle-1.2.2-cp37-cp37m-manylinux1_x86_64.whl (36 kB)
Collecting sentry-sdk>=1.0.0
  Downloading sentry_sdk-1.5.8-py2.py3-none-any.whl (144 kB)
[K     |████████████████████████████████| 144 kB 75.0 MB/s 
[?25hCollecting yaspin>=1.0.0
  Downloading yaspin-2.1.0-py3-none-any.whl (18 kB)
Collecting shortuuid>=0.5.0
  Downloading shortuuid-1.0.8-py3-none-any.whl (9.5 kB)
Collecting gitdb<5,>=4.0.1
  Downloading gitdb-4.0.9-py3-none-any.whl (63 kB)
[K     |████████████████████████████████| 63 kB 1.9 MB

# GOOGLE MOUNT

In [4]:
from google.colab import drive
drive.mount('/content/drive')
PATH = './drive/MyDrive/datasets/'

Mounted at /content/drive


In [5]:
%cd /content/drive/MyDrive/groom_project1/

/content/drive/MyDrive/groom_project1


In [6]:
%pwd

'/content/drive/MyDrive/groom_project1'

# IMPORT REQUIREMENTS

In [7]:
import os
import sys
import random
import pickle

import numpy as np
from tqdm import tqdm

import torch
from torch.nn.utils.rnn import pad_sequence

from transformers import (
    AdamW
)

import wandb

# import costumized modules
from compute import compute_acc
from visualize_score import plot_graph
from dump_datasets import mk_dataset, mk_dataset_xlnet
from dump_models import load_model, load_model_xlnet
from evaluate import test_model
from data_processing import regular

#CREATE FOLDER

In [8]:
# create required folders if not exists
def createFolder(directory):
    try:
        if not os.path.exists(directory):
            os.makedirs(directory)
    except OSError:
        print ('Error: Creating directory. ' +  directory)
 
createFolder('./best_models')
createFolder('./dump_datasets')
createFolder('./dump_models_tokenizer')
createFolder('./scores')
createFolder('./submissions')

# FIX SEED

In [9]:
def seed_everything(seed:int = 1004):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)  # type: ignore
    torch.backends.cudnn.deterministic = True  # type: ignore
    torch.backends.cudnn.benchmark = True  # type: ignore

seed_everything(42)

# MODEL

In [10]:
# test various models
MODEL_NAME = 'bert-base-uncased'
# MODEL_NAME = 'bert-large-uncased'
# MODEL_NAME = 'xlnet-base-cased'

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
assert str(device) == 'cuda'

# by using pickle, load dump file if exist, else make dump file and load
# load for bert(uncased) or xlnet model(cased)
try:
    with open('./dump_models_tokenizer/' + MODEL_NAME + '.p', 'rb') as f:
        model = pickle.load(f)
        tokenizer = pickle.load(f)
        print('./dump_models_tokenizer/' + MODEL_NAME + '.p')
    print('model exists => just load model')
except:
    print('exeption occur => download model')
    if MODEL_NAME == 'bert-base-uncased':
        model, tokenizer = load_model(MODEL_NAME)
    elif MODEL_NAME == 'xlnet-base-cased':
        model, tokenizer = load_model_xlnet(MODEL_NAME)

model.to(device)

./dump_models_tokenizer/bert-base-uncased.p
model exists => just load model


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

# HYPER PARAMETERS

In [11]:
TRAIN_BATCH_SIZE=256
EVAL_BATCH_SIZE=256

LEARNING_RATE = 5e-5
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
TRAIN_EPOCH = 2



# WANDB

In [12]:
!wandb login

[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [13]:
wandb.init(project="groomProject1", entity="chohs1221")

[34m[1mwandb[0m: Currently logged in as: [33mchohs1221[0m (use `wandb login --relogin` to force relogin)


In [14]:
wandb.run.name = 'bert2_lr' + str(LEARNING_RATE)

In [15]:
wandb.config.learning_rate = LEARNING_RATE
wandb.config.epochs = TRAIN_EPOCH
wandb.config.batch_size = TRAIN_BATCH_SIZE

# LOAD DATASETS

In [16]:
# by using pickle, load dump file if exist, else make dump file and load
# load for bert(uncased) or xlnet model(cased)
try:
    if  MODEL_NAME == 'bert-base-uncased':
        with open('./dump_datasets/train_dev_dumps.p', 'rb') as f:
            train_pos = pickle.load(f)
            train_neg = pickle.load(f)
            dev_pos = pickle.load(f)
            dev_neg = pickle.load(f)
        print('dataset exists => just load datasets')
    elif  MODEL_NAME == 'xlnet-base-cased':
        with open('./dump_datasets/train_dev_dumps_xlnet.p', 'rb') as f:
            train_pos = pickle.load(f)
            train_neg = pickle.load(f)
            dev_pos = pickle.load(f)
            dev_neg = pickle.load(f)
        print('dataset exists => just load datasets')
except:
    print('exeption occur => make datasets')
    train_pos, train_neg, dev_pos, dev_neg = mk_dataset()
    if MODEL_NAME == 'bert-base-uncased':
        train_pos, train_neg, dev_pos, dev_neg = mk_dataset()
    elif MODEL_NAME == 'xlnet-base-cased':
        train_pos, train_neg, dev_pos, dev_neg = mk_dataset_xlnet()

dataset exists => just load datasets


# DATA PREPROCESSING

In [17]:
# Remove '_num_', !@#$ ... from datasets
# train_pos = regular(train_pos)
# train_neg = regular(train_pos)
# dev_pos = regular(train_pos)
# dev_neg = regular(train_pos)

# TOKENIZE

In [18]:
# seperate encoding to preprocess data before encoding
train_pos = [tokenizer.encode(line) for line in train_pos]
train_neg = [tokenizer.encode(line) for line in train_neg]
dev_pos = [tokenizer.encode(line) for line in dev_pos]
dev_neg = [tokenizer.encode(line) for line in dev_neg]

# MAKE DATASETS

In [19]:
# concatenate pos, neg dataset, costomize magic mathod
class SentimentDataset(object):
    def __init__(self, pos, neg):
        self.data = [pos_sent for pos_sent in pos] + [neg_sent for neg_sent in neg]
        self.label = [[1] for _ in range(len(pos))] + [[0] for _ in range(len(neg))]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        sample = self.data[index]
        return np.array(sample), np.array(self.label[index])

train_dataset = SentimentDataset(train_pos, train_neg)
dev_dataset = SentimentDataset(dev_pos, dev_neg)

# DATA LOADER

In [20]:
# argsort deleted <- no reason to sort
# attention masking for padding token
def collate_fn_style(samples):
    input_ids, labels = zip(*samples)
    max_len = max(len(input_id) for input_id in input_ids)

    attention_mask = torch.tensor([[1] * len(input_id) + [0] * (max_len - len(input_id)) for input_id in input_ids])
    input_ids = pad_sequence([torch.tensor(input_id) for input_id in input_ids], batch_first=True)
    token_type_ids = torch.tensor([[0] * len(input_id) for input_id in input_ids])
    position_ids = torch.tensor([list(range(len(input_id))) for input_id in input_ids])
    labels = torch.tensor(np.stack(labels, axis=0))

    return input_ids, attention_mask, token_type_ids, position_ids, labels

train_loader = torch.utils.data.DataLoader(train_dataset,
                                           batch_size=TRAIN_BATCH_SIZE,
                                           shuffle=True, 
                                           collate_fn=collate_fn_style,
                                           pin_memory=True, num_workers=2)

dev_loader = torch.utils.data.DataLoader(dev_dataset, 
                                         batch_size=EVAL_BATCH_SIZE,
                                         shuffle=False, 
                                         collate_fn=collate_fn_style,
                                         num_workers=2)

# TRAIN

In [None]:
# record datas for highest accuracy & lowest loss
lowest_valid_loss = 9999.
highest_valid_acc = 0.
train_acc = []
train_loss = []
valid_acc = []
valid_loss = []

temp_train_acc = []
temp_train_loss = []

#train model
model.train()
for epoch in range(TRAIN_EPOCH):
    with tqdm(train_loader, unit="batch") as tepoch:
        for iteration, (input_ids, attention_mask, token_type_ids, position_ids, labels) in enumerate(tepoch):

            tepoch.set_description(f"Epoch {epoch}")

            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            token_type_ids = token_type_ids.to(device)
            position_ids = position_ids.to(device)
            labels = labels.to(device, dtype=torch.long)

            output = model(input_ids=input_ids,
                           attention_mask=attention_mask,
                           token_type_ids=token_type_ids,
                           position_ids=position_ids,
                           labels=labels)

            loss = output.loss
            
            logits = output.logits
            batch_predictions = [0 if example[0] > example[1] else 1 for example in logits]
            batch_labels = [int(example) for example in labels]
            
            acc = compute_acc(batch_predictions, batch_labels)
            
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()


            tepoch.set_postfix(acc=acc, loss=loss.item())
            
            temp_train_acc.append(acc)
            temp_train_loss.append(loss.item())
            if iteration != 0 and iteration % int(len(train_loader) / 100) == 0:

                # evaluate model
                model.eval()
                with torch.no_grad():
                    val_acc = []
                    val_loss = []
                    for input_ids, attention_mask, token_type_ids, position_ids, labels in dev_loader:
                        input_ids = input_ids.to(device)
                        attention_mask = attention_mask.to(device)
                        token_type_ids = token_type_ids.to(device)
                        position_ids = position_ids.to(device)
                        labels = labels.to(device, dtype=torch.long)

                        output = model(input_ids=input_ids,
                                    attention_mask=attention_mask,
                                    token_type_ids=token_type_ids,
                                    position_ids=position_ids,
                                    labels=labels)

                        logits = output.logits
                        batch_predictions = [0 if example[0] > example[1] else 1 for example in logits]
                        batch_labels = [int(example) for example in labels]

                        val_acc.append(compute_acc(batch_predictions, batch_labels))
                        val_loss.append(output.loss)

                # compute accuracy, loss for train, valid datasets and record
                mean_train_acc = sum(temp_train_acc) / len(temp_train_acc)
                mean_train_loss = sum(temp_train_loss) / len(temp_train_loss)
                mean_val_acc = sum(val_acc) / len(val_acc)
                mean_val_loss = sum(val_loss) / len(val_loss)

                train_acc.append(mean_train_acc)
                train_loss.append(mean_train_loss)
                valid_acc.append(mean_val_acc)
                valid_loss.append(mean_val_loss)

                temp_train_acc = []
                temp_train_loss = []
                
                # WANDB
                wandb.log({"train_loss": mean_train_loss,
                           'train_acc': mean_train_acc,
                           'valid_loss': mean_val_loss,
                           'valid_acc': mean_val_acc})

                # save best models
                if highest_valid_acc < mean_val_acc:
                    highest_valid_acc = mean_val_acc
                    print('ACCURACY for highest valid acc: ', mean_val_acc)
                    print('LOSS for lowest valid acc: ', mean_val_loss)
                    # model.save_pretrained('./best_models/model' + str(int(mean_val_acc*100)) + str(int(mean_val_loss*1000)))

                elif lowest_valid_loss > mean_val_loss:
                    lowest_valid_loss = mean_val_loss
                    print('ACCURACY for lowest valid loss: ', mean_val_acc)
                    print('LOSS for lowest valid loss: ', mean_val_loss)
                    # model.save_pretrained('./best_models/model' + str(int(mean_val_acc*100)) + str(int(mean_val_loss*1000)))
                                        
                model.train()
model.save_pretrained('./best_models/model' + str(int(mean_val_acc*100)) + str(int(mean_val_loss*1000)))

Epoch 0:   1%|          | 18/1732 [00:13<41:17,  1.45s/batch, acc=0.965, loss=0.102]

ACCURACY for highest valid acc:  0.956884765625
LOSS for lowest valid acc:  tensor(0.1374, device='cuda:0')


Epoch 0:   2%|▏         | 35/1732 [00:25<40:20,  1.43s/batch, acc=0.945, loss=0.157]

ACCURACY for highest valid acc:  0.959423828125
LOSS for lowest valid acc:  tensor(0.1136, device='cuda:0')


Epoch 0:   3%|▎         | 52/1732 [00:37<39:55,  1.43s/batch, acc=0.984, loss=0.0618]

ACCURACY for highest valid acc:  0.964111328125
LOSS for lowest valid acc:  tensor(0.1033, device='cuda:0')


Epoch 0:   4%|▍         | 69/1732 [00:49<39:13,  1.42s/batch, acc=0.949, loss=0.126]

ACCURACY for lowest valid loss:  0.95908203125
LOSS for lowest valid loss:  tensor(0.1058, device='cuda:0')


Epoch 0:   5%|▍         | 86/1732 [01:01<39:06,  1.43s/batch, acc=0.965, loss=0.0759]

ACCURACY for lowest valid loss:  0.95966796875
LOSS for lowest valid loss:  tensor(0.1043, device='cuda:0')


Epoch 0:   6%|▌         | 103/1732 [01:13<39:24,  1.45s/batch, acc=0.973, loss=0.0841]

ACCURACY for highest valid acc:  0.9701171875
LOSS for lowest valid acc:  tensor(0.0853, device='cuda:0')


Epoch 0:   7%|▋         | 120/1732 [01:25<38:24,  1.43s/batch, acc=0.965, loss=0.104]

ACCURACY for lowest valid loss:  0.967041015625
LOSS for lowest valid loss:  tensor(0.0936, device='cuda:0')


Epoch 0:   8%|▊         | 137/1732 [01:36<37:31,  1.41s/batch, acc=0.977, loss=0.0623]

ACCURACY for highest valid acc:  0.970751953125
LOSS for lowest valid acc:  tensor(0.0796, device='cuda:0')


Epoch 0:  10%|▉         | 171/1732 [02:00<36:50,  1.42s/batch, acc=0.953, loss=0.111]

ACCURACY for highest valid acc:  0.9732421875
LOSS for lowest valid acc:  tensor(0.0789, device='cuda:0')


Epoch 0:  11%|█         | 188/1732 [02:12<36:48,  1.43s/batch, acc=0.957, loss=0.109]

ACCURACY for lowest valid loss:  0.969873046875
LOSS for lowest valid loss:  tensor(0.0839, device='cuda:0')


Epoch 0:  12%|█▏        | 205/1732 [02:24<36:12,  1.42s/batch, acc=0.988, loss=0.0484]

ACCURACY for highest valid acc:  0.975537109375
LOSS for lowest valid acc:  tensor(0.0745, device='cuda:0')


Epoch 0:  13%|█▎        | 222/1732 [02:36<35:49,  1.42s/batch, acc=0.965, loss=0.0722]

ACCURACY for lowest valid loss:  0.97255859375
LOSS for lowest valid loss:  tensor(0.0777, device='cuda:0')


Epoch 0:  17%|█▋        | 290/1732 [03:24<34:19,  1.43s/batch, acc=0.969, loss=0.0635]

ACCURACY for lowest valid loss:  0.973193359375
LOSS for lowest valid loss:  tensor(0.0774, device='cuda:0')


Epoch 0:  18%|█▊        | 307/1732 [03:37<34:25,  1.45s/batch, acc=0.965, loss=0.0904]

ACCURACY for lowest valid loss:  0.972119140625
LOSS for lowest valid loss:  tensor(0.0731, device='cuda:0')


Epoch 0:  20%|█▉        | 341/1732 [04:01<33:13,  1.43s/batch, acc=0.98, loss=0.0482]

ACCURACY for highest valid acc:  0.97666015625
LOSS for lowest valid acc:  tensor(0.0658, device='cuda:0')


Epoch 0:  21%|██        | 358/1732 [04:13<32:49,  1.43s/batch, acc=0.977, loss=0.0567]

ACCURACY for lowest valid loss:  0.975927734375
LOSS for lowest valid loss:  tensor(0.0665, device='cuda:0')


Epoch 0:  24%|██▎       | 409/1732 [04:49<31:39,  1.44s/batch, acc=0.977, loss=0.0764]

ACCURACY for highest valid acc:  0.9771484375
LOSS for lowest valid acc:  tensor(0.0651, device='cuda:0')


Epoch 0:  30%|███       | 528/1732 [06:13<28:50,  1.44s/batch, acc=0.98, loss=0.0721]

ACCURACY for highest valid acc:  0.978857421875
LOSS for lowest valid acc:  tensor(0.0649, device='cuda:0')


Epoch 0:  34%|███▍      | 596/1732 [07:01<26:57,  1.42s/batch, acc=0.98, loss=0.0692]

ACCURACY for lowest valid loss:  0.975927734375
LOSS for lowest valid loss:  tensor(0.0620, device='cuda:0')


Epoch 0:  38%|███▊      | 664/1732 [07:49<25:29,  1.43s/batch, acc=0.98, loss=0.0666]

ACCURACY for lowest valid loss:  0.976025390625
LOSS for lowest valid loss:  tensor(0.0610, device='cuda:0')


Epoch 0:  40%|████      | 698/1732 [08:13<25:10,  1.46s/batch, acc=0.961, loss=0.0883]

ACCURACY for lowest valid loss:  0.978125
LOSS for lowest valid loss:  tensor(0.0576, device='cuda:0')


Epoch 0:  41%|████      | 714/1732 [08:21<09:06,  1.86batch/s, acc=0.973, loss=0.0541]

# SAVE SCORES

In [None]:
# using pickle, save dump accuracy, loss file
accloss_filename = 'accloss' + str(int(mean_val_acc*100)) + str(int(mean_val_loss*1000)) + '.p'
with open('./scores/' + accloss_filename,'wb') as f:
    pickle.dump(train_acc, f)
    pickle.dump(train_loss, f)
    pickle.dump(valid_acc, f)
    pickle.dump(valid_loss, f)

# TEST

In [None]:
# test model
test_model(model, tokenizer, mean_val_acc, mean_val_loss, file_name = 'test_no_label', device='cuda')

# SCORE VISUALIZE

In [None]:
# plot accuracy, loss graph for train, valid datasets
plot_graph(accloss_filename)