In [1]:
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
device = "cuda" if torch.cuda.is_available() else "cpu"

from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.metrics import f1_score

from tqdm import tqdm
import wandb
wandb.login()

import os, gc
import random
import warnings
warnings.filterwarnings('ignore')

import re, string
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

random_seed = 42
torch.manual_seed(random_seed)
torch.cuda.manual_seed(random_seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = True
np.random.seed(random_seed)
random.seed(random_seed)
os.environ["PYTHONHASHSEED"] = str(random_seed)

[34m[1mwandb[0m: Currently logged in as: [33mchanmuzi[0m. Use [1m`wandb login --relogin`[0m to force relogin
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/chanmuzi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/chanmuzi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
sweep_config = {
    'name':'sweep-test',
    'method':'random',
    'metric':{
        'name':'valid_loss',
        'goal':'minimize'
    },
    'parameters':{
        'learning_rate':{
            'min':1e-6,
            'max':1e-2
        },
        'epochs':{
            'values':[3,4,5,6,7]
        },
        'batch_size':{
            'values':[2,4,8,16,32]
        },
        'lr':{
            'min':1e-7,
            'max':1e-2
        },
        'eps':{
            'min':1e-9,
            'max':1e-2
        }
    }
}

In [3]:
train = pd.read_csv('Data/train.csv')
test = pd.read_csv('Data/test.csv')
train_len = len(train) # we should record it to split all_data into train,test again

all_data = pd.concat([train,test]) # for preprocessing and tokenizing

stop = set(stopwords.words('english'))
def remove_tag(text):
    tag = re.compile(r'@\S+')
    return tag.sub(r'',text)

def remove_URL(text):
    # http:... / https:... / www... 
    url = re.compile(r'https?://\S+|www\.\S+')
    return re.sub(url,'',text)

def remove_html(text):
    # < > / ( )
    html = re.compile(r'<[^>]+>|\([^)]+\)')
    return html.sub(r'',text)

def remove_punct(text):
    # ['!','"','$','%','&',"'",'(',')','*',
    # '+',',','-','.','/',':',';','<','=',
    # '>','?','@','[','\\',']','^','_','`',
    # '{','|','}','~']
    punctuations = list(string.punctuation)
    table = str.maketrans('', '', ''.join(punctuations))
    return text.translate(table)

all_data['cleaned'] = all_data['text'].apply(lambda x:remove_tag(x))
all_data['cleaned'] = all_data['cleaned'].apply(lambda x: remove_URL(x))
all_data['cleaned'] = all_data['cleaned'].apply(lambda x: remove_html(x))
all_data['cleaned'] = all_data['cleaned'].apply(lambda x: remove_punct(x))
all_data['cleaned'] = all_data['cleaned'].apply(lambda x: x.lower()) # lowering
all_data['cleaned'] = all_data['cleaned'].apply(lambda x: word_tokenize(x)) # split sentence into words list
# exclude stop words and make them a sentence again
all_data['cleaned'] = all_data['cleaned'].apply(lambda x: ' '.join([word for word in x if word not in stop]))

train_data,test_data = all_data[:train_len],all_data[train_len:]


class TweetsDataset(Dataset):
    def __init__(self,df,label,tokenizer):
        self.df = df # Pandas.DataFrame
        self.label = label # True: train,valid / False: test
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.df) # number of samples

    def __getitem__(self,idx):
        text = self.df.loc[idx]['text'] # extracting text from each row

        encoded_dict = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            padding='max_length',
            truncation=True,
            max_length=84, # given to the max_length of tokenized text
            return_tensors='pt', # PyTorch
            return_attention_mask=True, # We should put it into the model
        )
        '''
        The model BERT has two positional(mandatory) arguments,
        "input_ids" and "attention_mask".

        In the process of train/valid, we already have labels(answers),
        but in case of test, we don't.
        '''
        if self.label:
            labels = self.df.loc[idx]['target']
            # [batch,1,max_len(84)] -> [batch,max_len]
            return {'input_ids':encoded_dict['input_ids'].squeeze(),
                    'attention_mask':encoded_dict['attention_mask'].squeeze(),
                    # Our loss_fn wants it to be a "long tensor", so it will be changed
                    'labels':torch.tensor(labels,dtype=torch.int).unsqueeze(dim=0)}
        else:
            # [batch,1,max_len(84)] -> [batch,max_len]
            return {'input_ids':encoded_dict['input_ids'].squeeze(),
                    'attention_mask':encoded_dict['attention_mask'].squeeze()}

model_name = 'bert-base-uncased' # If possible, use "bert-large-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)

In [4]:
def TweetsLoader(train_data,batch_size):
    train_data_demo = train_data[:200]
    train_dataset_demo = TweetsDataset(train_data_demo,True,tokenizer)
    train_demo_size = int(0.8 * len(train_dataset_demo))
    valid_demo_size = len(train_dataset_demo) - train_demo_size

    train_dataset_demo, valid_dataset_demo = random_split(train_dataset_demo,[train_demo_size,valid_demo_size])
    print(f'{len(train_dataset_demo)} train demo samples')
    print(f'{len(valid_dataset_demo)} valid demo samples')

    train_dataloader_demo = DataLoader(train_dataset_demo,batch_size=batch_size,shuffle=True,pin_memory=True)
    valid_dataloader_demo = DataLoader(valid_dataset_demo,batch_size=batch_size,shuffle=False,pin_memory=True)
    
    return train_dataloader_demo,valid_dataloader_demo

In [5]:
class TweetsModel(nn.Module):
    def __init__(self,model_name):
        super().__init__()
        self.model = BertForSequenceClassification.from_pretrained(model_name)

    def forward(self,input_ids,attention_mask):
        output = self.model(input_ids=input_ids,attention_mask=attention_mask)
        logits = output.logits
        return logits

In [6]:
def train(model,device,train_dataloader,valid_dataloader,epochs,criterion,optimizer,metric):
    wandb.watch(model,criterion,log='all',log_freq=10)

    best_model_epoch, valid_loss_values = [],[] 
    valid_loss_min = [1] # arbitrary loss I set here
    for epoch in range(epochs):
        gc.collect() # memory cleaning
        model.train()

        train_loss = 0
        train_step = 0
        pbar = tqdm(train_dataloader,desc='Training..')
        for idx,batch in enumerate(pbar): # you can also write like "for batch in tqdm(train_dataloader)"
            optimizer.zero_grad() # initialize
            train_step += 1

            train_input_ids = batch['input_ids'].to(device)
            train_attention_mask = batch['attention_mask'].to(device)
            train_labels = batch['labels'].squeeze().to(device).long()
            
            # You can refer to the class "TweetsModel" for understand 
            # what would be logits
            logits = model(train_input_ids, train_attention_mask).to(device)
            predictions = torch.argmax(logits, dim=1) # get an index from larger one
            detached_predictions = predictions.detach().cpu().numpy()
            
            loss = criterion(logits, train_labels)
            loss.backward() 
            optimizer.step()
            model.zero_grad()

            train_loss += loss.detach().cpu().numpy().item()

            pbar.set_postfix({'train_loss':train_loss/train_step})
            wandb.log({
                'epoch':epoch,
                'train_loss':train_loss/train_step
            })

        print(f'Epoch [{epoch+1}/{epochs}] Train_loss: {train_loss/train_step}')
        pbar.close()

        with torch.no_grad():
            model.eval()

            valid_loss = 0
            valid_step = 0
            total_valid_score = 0

            y_pred = [] # for getting f1_score that is a metric of the competition
            y_true = []

            pbar = tqdm(valid_dataloader,desc='Validating...')
            for idx,batch in enumerate(pbar):
                valid_step += 1

                valid_input_ids = batch['input_ids'].to(device)
                valid_attention_mask = batch['attention_mask'].to(device)
                valid_labels = batch['labels'].squeeze().to(device).long()

                logits = model(valid_input_ids, valid_attention_mask).to(device)
                predictions = torch.argmax(logits, dim=1)
                detached_predictions = predictions.detach().cpu().numpy()
                
                loss = criterion(logits, valid_labels)
                valid_loss += loss.detach().cpu().numpy().item()

                y_pred.extend(predictions.cpu().numpy())
                y_true.extend(valid_labels.cpu().numpy())

            wandb.log({
                'epoch':epoch,
                'valid_loss':valid_loss/valid_step
            })                

            valid_loss /= valid_step
            f1 = f1_score(y_true,y_pred)

            print(f'Epoch [{epoch+1}/{epochs}] Score: {f1}')
            print(f'Epoch [{epoch+1}/{epochs}] Valid_loss: {valid_loss}')

            if valid_loss < min(valid_loss_min):
                print('model improved!')
            else:
                print('model not improved')
    
            # torch.save(model.state_dict(), f'save/epoch:{epoch+1}_model.pt')
            # print('save checkpoint!')
            valid_loss_min.append(valid_loss)
            print(f'valid_loss_min:{min(valid_loss_min)}')
        
        # # Double check your directory
        # best_model_epoch.append(f'save/bert-base/epoch:{epoch+1}_model.pt')
        # valid_loss_values.append(valid_loss)
        print('='*100)

    # select_best_model() # refer to below function
    print('Train/Valid Completed!!')
    # wandb.finish()
    del train_dataloader, valid_dataloader # memory cleaning
    gc.collect()

# def select_best_model():
#     best_model = best_model_epoch[np.array(valid_loss_values).argmin()]
#     os.rename(best_model, best_model.split('.pt')[0] + '_best.pt')

In [11]:
def run_sweep(config=None):
    with wandb.init(config=config) as run:
        run.name = 'For Test'

        w_config = wandb.config

        criterion = nn.CrossEntropyLoss()
        train_loader,valid_loader = TweetsLoader(train_data,w_config.batch_size)
        model = TweetsModel('bert-base-uncased').to(device)
        optimizer = AdamW(model.parameters(),lr=w_config.lr,eps=w_config.eps,no_deprecation_warning=True)
        metric = f1_score

        train(model,device,train_loader,valid_loader,w_config.epochs,criterion,optimizer,metric)
    

In [12]:
sweep_id = wandb.sweep(sweep_config,project="sweep_tutorial",entity="chanmuzi")
wandb.agent(sweep_id,run_sweep,count=10)

Create sweep with ID: bnyj35ht
Sweep URL: https://wandb.ai/chanmuzi/sweep_tutorial/sweeps/bnyj35ht


[34m[1mwandb[0m: Agent Starting Run: fsqlv9uy with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 6
[34m[1mwandb[0m: 	eps: 0.005864192167837217
[34m[1mwandb[0m: 	learning_rate: 0.0001522159404665369
[34m[1mwandb[0m: 	lr: 0.008984468016499101


160 train demo samples
40 valid demo samples


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch [1/6] Train_loss: 1.0519946217536926


Validating...: 100%|██████████| 2/2 [00:01<00:00,  1.43it/s]


Epoch [1/6] Score: 0.29629629629629634
Epoch [1/6] Valid_loss: 0.6408710479736328
model improved!
valid_loss_min:0.6408710479736328


Training..: 100%|██████████| 5/5 [00:17<00:00,  3.53s/it, train_loss=0.99] 


Epoch [2/6] Train_loss: 0.9896473526954651


Validating...: 100%|██████████| 2/2 [00:01<00:00,  1.36it/s]


Epoch [2/6] Score: 0.6666666666666666
Epoch [2/6] Valid_loss: 1.0145443975925446
model not improved
valid_loss_min:0.6408710479736328


Training..: 100%|██████████| 5/5 [00:17<00:00,  3.44s/it, train_loss=0.954]


Epoch [3/6] Train_loss: 0.9539640545845032


Validating...: 100%|██████████| 2/2 [00:01<00:00,  1.14it/s]


Epoch [3/6] Score: 0.0
Epoch [3/6] Valid_loss: 0.7058576047420502
model not improved
valid_loss_min:0.6408710479736328


Training..: 100%|██████████| 5/5 [00:17<00:00,  3.51s/it, train_loss=0.744]


Epoch [4/6] Train_loss: 0.7436479926109314


Validating...: 100%|██████████| 2/2 [00:01<00:00,  1.43it/s]


Epoch [4/6] Score: 0.0
Epoch [4/6] Valid_loss: 0.7516243159770966
model not improved
valid_loss_min:0.6408710479736328


Training..: 100%|██████████| 5/5 [00:17<00:00,  3.48s/it, train_loss=0.845]


Epoch [5/6] Train_loss: 0.8445168495178222


Validating...: 100%|██████████| 2/2 [00:01<00:00,  1.46it/s]


Epoch [5/6] Score: 0.0
Epoch [5/6] Valid_loss: 1.031533032655716
model not improved
valid_loss_min:0.6408710479736328


Training..: 100%|██████████| 5/5 [00:17<00:00,  3.56s/it, train_loss=0.785]


Epoch [6/6] Train_loss: 0.7846502661705017


Validating...: 100%|██████████| 2/2 [00:01<00:00,  1.36it/s]

Epoch [6/6] Score: 0.0
Epoch [6/6] Valid_loss: 0.8860965669155121
model not improved
valid_loss_min:0.6408710479736328
Train/Valid Completed!!





VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁▁▁▁▁▁▂▂▂▂▂▂▄▄▄▄▄▄▅▅▅▅▅▅▇▇▇▇▇▇██████
train_loss,▃▂▂▆▆▁▁▄▅▅▇█▆▅▅▂▂▃▃▃▃▃▃▃▄▄▃▄▃▃
valid_loss,▁█▂▃█▅

0,1
epoch,5.0
train_loss,0.78465
valid_loss,0.8861


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 7hg91bbg with config:
[34m[1mwandb[0m: 	batch_size: 2
[34m[1mwandb[0m: 	epochs: 7
[34m[1mwandb[0m: 	eps: 0.007926774296926587
[34m[1mwandb[0m: 	learning_rate: 0.006169208785035573
[34m[1mwandb[0m: 	lr: 0.002189506440332318


160 train demo samples
40 valid demo samples


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch [1/7] Train_loss: 0.8697851020842791


Validating...: 100%|██████████| 20/20 [00:02<00:00,  7.49it/s]


Epoch [1/7] Score: 0.5964912280701754
Epoch [1/7] Valid_loss: 0.7740284085273743
model improved!
valid_loss_min:0.7740284085273743


Training..: 100%|██████████| 80/80 [00:49<00:00,  1.60it/s, train_loss=0.76] 


Epoch [2/7] Train_loss: 0.7598379334434867


Validating...: 100%|██████████| 20/20 [00:02<00:00,  7.56it/s]


Epoch [2/7] Score: 0.0
Epoch [2/7] Valid_loss: 0.6877380043268204
model improved!
valid_loss_min:0.6877380043268204


Training..: 100%|██████████| 80/80 [00:49<00:00,  1.61it/s, train_loss=0.76] 


Epoch [3/7] Train_loss: 0.7595183599740267


Validating...: 100%|██████████| 20/20 [00:02<00:00,  7.33it/s]


Epoch [3/7] Score: 0.0
Epoch [3/7] Valid_loss: 0.6818618655204773
model improved!
valid_loss_min:0.6818618655204773


Training..:  91%|█████████▏| 73/80 [00:48<00:05,  1.39it/s, train_loss=0.735][34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.
