# Import Modules and Libraries

In [1]:
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import Dataset,DataLoader,random_split
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'device is {device}')

from sklearn.metrics import f1_score
from tqdm import tqdm
import wandb
wandb.login()

from transformers import AutoTokenizer,AutoModel,AdamW

import os,gc,warnings
gc.collect()
warnings.filterwarnings('ignore')

import re,string
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import random
random_seed = 42
random.seed(random_seed)
np.random.seed(random_seed)
torch.manual_seed(random_seed)
torch.cuda.manual_seed(random_seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = True
os.environ['PYTHONHASHSEED'] = str(random_seed)

device is cpu


[34m[1mwandb[0m: Currently logged in as: [33mchanmuzi[0m. Use [1m`wandb login --relogin`[0m to force relogin
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/chanmuzi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/chanmuzi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# !pip install wandb --upgrade

# from kaggle_secrets import UserSecretsClient
# user_secrets = UserSecretsClient()
# wandb_key = user_secrets.get_secret("wandb_key")
# wandb.login(wandb_key)
# !wandb login $key

In [3]:
configs = {
    'model_name':'bert-base-uncased',
    'epochs':4,
    'batch_size':16,
    'learning_rate':8e-6,
    'adamw_lr':9.3e-7,
    'adamw_eps':2.5e-9,
    'exp_name':'bert with BCE'
}

In [4]:
wandb.init(
    entity='chanmuzi',
    project="Disaster Tweets",
    group=configs['model_name'],
    name=configs['exp_name'],
    config=configs,
)

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01671310556666666, max=1.0)…

# Preprocessing, Dataset, DataLoader

### Load CSV file

In [5]:
train = pd.read_csv('./Data/train.csv') # check path!
test = pd.read_csv('./Data/test.csv')
train_len = len(train)

all_data = pd.concat([train,test])

### Define Preprocessing Functions

In [7]:
def remove_tag(text):
    tag = re.compile(r'@\S+')
    return re.sub(tag,'',text)

def remove_URL(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return re.sub(url,'',text)

def remove_html(text):
    html = re.compile(r'<[^>]+>|\([^)]+\)')
    return re.sub(html,'',text)

def remove_punct(text):
    punct = list(string.punctuation)
    table = str.maketrans('','',''.join(punct))
    return text.translate(table)

stop = set(stopwords.words('english'))

In [8]:
all_data['cleaned'] = all_data['text'].apply(lambda x:remove_tag(x))
all_data['cleaned'] = all_data['cleaned'].apply(lambda x:remove_URL(x))
all_data['cleaned'] = all_data['cleaned'].apply(lambda x:remove_html(x))
all_data['cleaned'] = all_data['cleaned'].apply(lambda x:remove_punct(x))
all_data['cleaned'] = all_data['cleaned'].apply(lambda x:x.lower())
all_data['cleaned'] = all_data['cleaned'].apply(lambda x:word_tokenize(x))
all_data['cleaned'] = all_data['cleaned'].apply(lambda x:' '.join([word for word in x if word not in stop]))

### Compose Dataset, DataLoader

In [9]:
train_data,test_data = all_data[:train_len],all_data[train_len:]

In [19]:
class TweetsDataset(Dataset):
    def __init__(self,df,tokenizer,label):
        self.df = df
        self.tokenizer = tokenizer
        self.label = label

    def __len__(self):
        return len(self.df)
    
    def __getitem__(self,idx):
        text = self.df.loc[idx]['text']

        encoded_dict = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            padding='max_length',
            truncation=True,
            max_length=82,
            return_tensors='pt',
            return_token_type_ids=False,
            return_attention_mask=True,
        )

        if self.label:
            labels = self.df.loc[idx]['target']
            return {'input_ids':encoded_dict['input_ids'].squeeze(),
                    'attention_mask':encoded_dict['attention_mask'].squeeze(),
                    'labels':torch.tensor(labels,dtype=torch.float).unsqueeze(dim=0)}
        else:
            return {'input_ids':encoded_dict['input_ids'].squeeze(),
                    'attention_mask':encoded_dict['attention_mask'].squeeze()}

In [20]:
tokenizer = AutoTokenizer.from_pretrained(configs['model_name'])

train_dataset = TweetsDataset(train_data,tokenizer,True)
test_dataset = TweetsDataset(test_data,tokenizer,False)

train_size = int(len(train_dataset) * 0.8)
valid_size = len(train_dataset) - train_size

train_dataset,valid_dataset = random_split(train_dataset,[train_size,valid_size])

In [21]:
def TweetsDataLoader(train_data,test_data,batch_size):
    train_dataset = TweetsDataset(train_data,tokenizer,True)
    test_dataset = TweetsDataset(test_data,tokenizer,False)

    train_size = int(len(train_dataset) * 0.8)
    valid_size = len(train_dataset) - train_size

    train_dataset,valid_dataset = random_split(train_dataset,[train_size,valid_size])

    train_dataloader = DataLoader(train_dataset,batch_size=configs['batch_size'],shuffle=True,pin_memory=True)
    valid_dataloader = DataLoader(valid_dataset,batch_size=configs['batch_size'],shuffle=False,pin_memory=True)
    test_dataloader = DataLoader(test_dataset,batch_size=1,shuffle=False)

    print(f'{len(train_dataset)} train samples')
    print(f'{len(valid_dataset)} valid samples')
    print(f'{len(test_dataset)} test samples')

    return  train_dataloader,valid_dataloader,test_dataloader

In [22]:
train_loader,valid_loader,test_loader = TweetsDataLoader(train_data,test_data,configs['batch_size'])

6090 train samples
1523 valid samples
3263 test samples


In [23]:
class TweetsModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.model = AutoModel.from_pretrained(configs['model_name'])
        self.dropout = nn.Dropout(p=0.5,inplace=False)
        self.fc = nn.Linear(768,1,bias=True)
        self.sigmoid = nn.Sigmoid()

    def forward(self,input_ids,attention_mask):
        output = self.model(input_ids,attention_mask).pooler_output
        output = self.dropout(output)
        output = self.fc(output)
        logits = self.sigmoid(output)
        return logits

In [24]:
model = TweetsModel().to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [25]:
# criterion = nn.CrossEntropyLoss()
criterion = nn.BCELoss()
optimizer = AdamW(model.parameters(),lr=configs['adamw_lr'],eps=configs['adamw_eps'])
metric = f1_score

In [26]:
def train_and_valid(model,train_loader,valid_loader,criterion,optimizer,metric,epochs):
    wandb.watch(model,criterion,log='all',log_freq=10)

    best_model_epoch,valid_loss_values = [],[]
    valid_loss_min = [1]
    for epoch in range(epochs):
        model.train()
        gc.collect()
        pbar = tqdm(train_loader,desc='Training...')
        
        train_losses = 0
        train_steps = 0
        for idx,batch in enumerate(pbar):
            train_steps += 1            

            train_input_ids = batch['input_ids'].to(device)
            train_attention_mask = batch['attention_mask'].to(device)
            train_labels = batch['labels'].to(device)

            optimizer.zero_grad()

            train_outputs = model(train_input_ids,train_attention_mask)

            train_loss = criterion(train_outputs,train_labels)
            train_loss.backward()
            optimizer.step()
            model.zero_grad()
            
            train_losses += train_loss.detach().cpu().numpy().item()
            wandb.log({
                'epoch':epoch,
                'train_loss':train_losses / train_steps
            })
            pbar.set_postfix({'train_loss':train_losses/train_steps})

        print(f'Epoch [{epoch+1}/{epochs}] Train_loss: {train_losses/train_steps}')
        pbar.close()

        with torch.no_grad():
            model.eval()
            gc.collect()
            pabr = tqdm(valid_loader)

            valid_steps = 0
            valid_losses = 0
            valid_score = 0

            y_preds,y_labels = [],[]
            for idx,batch in enumerate(pbar):
                valid_steps += 1

                valid_input_ids = batch['input_ids'].to(device)
                valid_attention_mask = batch['attention_mask'].to(device)
                valid_labels = batch['labels'].to(device)

                valid_outputs = model(valid_input_ids,valid_attention_mask)
                valid_preds = torch.argmax(valid_outputs,dim=1)

                valid_loss = criterion(valid_outputs,valid_labels)
                valid_losses += valid_loss.detach().cpu().numpy().item()

                y_preds.extend(valid_preds.detach().cpu().numpy())
                y_labels.extend(valid_labels.detach().cpu().numpy())
            
            wandb.log({
                'epoch':epoch,
                'valid_loss':valid_losses / valid_steps
            })
            valid_losses /= valid_steps
            valid_score = metric(y_labels,y_preds)
            print(f'Epoch [{epoch+1}/{epochs}] Score: {valid_score}')
            print(f'Epoch [{epoch+1}/{epochs}] Valid_loss: {valid_losses}')

            if valid_losses < min(valid_loss_min):
                print('model improved!')
                torch.save(model.state_dict(), f'save/epoch:{epoch+1}_model.pt')
            else:
                print('model "not" improved..')
            
            valid_loss_min.append(valid_losses)
            print(f'current valid_loss_min:{min(valid_loss_min)}')

        best_model_epoch.append(f'save/bert-base/epoch:{epoch+1}_model.pt')
        valid_loss_values.append(valid_losses)
        print('='*100)
    
    print('Train/Valid Completed!')
    wandb.finish()
    select_bset_model(best_model_epoch,valid_loss_values)
    
    del model,train_loader,valid_loader
    torch.cuda.empty_cache()
    gc.collect()

def select_bset_model(best_model_epoch,valid_loss_values):
    best_model = best_model_epoch[np.array(valid_loss_values).argmin()]
    os.rename(best_model,best_model.split('.pt')[0] + '_best.pt')

In [27]:
print(f'Before training, files in current directory: {os.listdir("save")}')

print('Training and Validation Start!')
print('='*100)

train_and_valid(
    model,
    train_loader,
    valid_loader,
    criterion,
    optimizer,
    metric,
    configs['epochs']
)

print(f'After training, files in current directory: {os.listdir()}')

Before training, files in current directory: ['epoch:4_model.pt', 'epoch:2_model.pt', 'epoch:3_model.pt', 'epoch:1_model.pt']
Training and Validation Start!


Training...: 100%|██████████| 381/381 [13:10<00:00,  2.07s/it, train_loss=0.684]


Epoch [1/4] Train_loss: 0.6836131052395177


  0%|          | 0/96 [00:00<?, ?it/s]

Epoch [1/4] Score: 0.0
Epoch [1/4] Valid_loss: 0.6338065778176616
model improved!
current valid_loss_min:0.6338065778176616


Training...:  26%|██▌       | 100/381 [03:31<09:53,  2.11s/it, train_loss=0.64]


KeyboardInterrupt: 

In [38]:
def inference(model,test_loader):
    test_preds = []
    model.eval()

    with torch.no_grad():
        pbar = tqdm(test_loader)
        for idx,batch in enumerate(pbar):
            test_input_ids = batch['input_ids'].to(device)
            test_attention_mask = batch['attention_mask'].to(device)

            logits,predictions = model(test_input_ids,test_attention_mask)
            predictions = predictions.detach().cpu().numpy()
            test_preds.append(predictions)
        
        return test_preds

In [None]:
for filename in os.listdir('save/'):
    if 'best.pt' in filename:
        best_pt = filename
print(f'BEST model.pt: {best_pt}')
check_point = torch.load('save/'+best_pt)

model = TweetsModel().to(device)
model.load_state_dict(check_point)

predictions = inference(model,test_loader)

In [None]:
sample = pd.read_csv('./Data/sample_submission.csv')
predictions = [int(x.item()) for x in predictions]
sample['target'] = predictions
sample.to_csv('submission.csv',index=False,header=True)