## Load Data, Preprocessing

The dataset we feed to the model consists of tweets (according to the Kaggle guide),
which means there should be lots of colloquial expressions, hashtags, links, etc.

Thus, we have to "clean" this data (that is, the tweet text) before we feed it to our model.

In [1]:
import pandas as pd

# Read the Kaggle train and test splits from the local Data/ directory.
train = pd.read_csv('Data/train.csv')
test = pd.read_csv('Data/test.csv')
# Number of training rows, remembered so the combined frame can be cut apart again.
train_len = train.shape[0]

# Stack both splits (train rows first) so preprocessing is applied to each of them.
# The original row labels of both frames are kept, i.e. no ignore_index.
all_data = pd.concat([train, test])

In [2]:
import re
import string

# Cleaning Functions
def remove_tag(text):
    """Strip @-mentions: each '@' together with the non-whitespace run after it."""
    return re.sub(r'@\S+', '', text)

def remove_URL(text):
    """Delete links from ``text``.

    A link is anything starting with ``http://``, ``https://`` or ``www.`` and
    running up to the next whitespace character.
    """
    link_pattern = re.compile(r'https?://\S+|www\.\S+')
    return link_pattern.sub('', text)

def remove_html(text):
    """Remove ``<...>`` tags and non-empty ``(...)`` groups from ``text``.

    Only the delimited span itself is deleted; text between an opening and a
    closing tag is kept.
    """
    return re.sub(r'<[^>]+>|\([^)]+\)', '', text)

def remove_punct(text):
    """Drop every ASCII punctuation character from ``text``.

    The removed set is ``string.punctuation``:
    !"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~
    """
    # A translation table with only a "delete" argument removes those characters.
    return text.translate(str.maketrans('', '', string.punctuation))



In [3]:
# from spellchecker import SpellChecker

# spell = SpellChecker()
# def correct_spellings(text):
#     corrected_text = []
#     misspelled_words = spell.unknown(text.split())
#     for word in text.split():
#         if word in misspelled_words:
#             corrected_text.append(spell.correction(word))
#         else:
#             corrected_text.append(word)
#     return " ".join(corrected_text)

In [4]:
import nltk
# Fetch the NLTK resources used below: the stop-word lists and the 'punkt'
# package (tokenizer data for word_tokenize).
nltk.download('stopwords')
nltk.download('punkt')

from nltk.corpus import stopwords
# English stop words as a set, used for membership tests when filtering tokens.
stop = set(stopwords.words('english'))

from nltk.tokenize import word_tokenize

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/chanmuzi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Build the 'cleaned' column from the raw tweet text in a single pass:
# strip mentions, links, markup and punctuation, lowercase, tokenize, then
# drop English stop words and join the remaining tokens back into a sentence.
all_data['cleaned'] = (
    all_data['text']
    .apply(remove_tag)
    .apply(remove_URL)
    .apply(remove_html)
    .apply(remove_punct)
    .apply(str.lower)
    .apply(word_tokenize)  # sentence -> list of words
    .apply(lambda tokens: ' '.join(token for token in tokens if token not in stop))
)

## Dataset, DataLoader

In [6]:
# all_data holds the train rows first and the test rows after them,
# so cutting it at position train_len recovers the two splits.
train_data = all_data.iloc[:train_len]
test_data = all_data.iloc[train_len:]

In [7]:
from torch.utils.data import Dataset
import torch

class TweetDataset(Dataset):
    """Map-style dataset that tokenizes one tweet per item.

    Args:
        df: DataFrame holding the tweets. It must contain ``text_column`` and,
            when ``is_grad`` is true, a 'target' column.
        is_grad: True for labelled data (items then carry a 'labels' entry),
            False for unlabelled data.
        tokenizer: object exposing ``encode_plus`` that returns 'input_ids'
            and 'attention_mask' tensors.
        text_column: name of the column that is tokenized. Defaults to 'text'.
        max_length: length every sequence is padded or truncated to.
            Defaults to 84.

    Each item is a dict with 1-D 'input_ids' and 'attention_mask' tensors and,
    for labelled data, a float 'labels' tensor of shape (1,).
    """

    def __init__(self, df, is_grad, tokenizer, text_column='text', max_length=84):
        self.df = df
        self.is_grad = is_grad
        self.tokenizer = tokenizer
        self.text_column = text_column
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        # Positional lookup: samplers hand out positions in range(len(self)),
        # which only coincide with the index labels for a fresh 0..n-1 index.
        row = self.df.iloc[idx]

        encoded_dict = self.tokenizer.encode_plus(
            row[self.text_column],
            add_special_tokens=True,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt',
            return_attention_mask=True,
        )

        # return_tensors='pt' yields tensors with a leading dimension of 1.
        # Drop it in both modes so default batching stacks items to
        # (batch, max_length) instead of (batch, 1, max_length).
        item = {'input_ids': encoded_dict['input_ids'].squeeze(0),
                'attention_mask': encoded_dict['attention_mask'].squeeze(0)}

        if self.is_grad:
            item['labels'] = torch.tensor(row['target'], dtype=torch.float).unsqueeze(dim=0)
        return item

In [8]:
from transformers import BertTokenizer
# Checkpoint whose vocabulary the tokenizer is loaded from.
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)

# Labelled dataset over the training rows (is_grad=True adds the 'labels' entry).
train_dataset = TweetDataset(df=train_data, is_grad=True, tokenizer=tokenizer)

In [9]:
from torch.utils.data import random_split

# Hold out 20% of the labelled tweets for validation.
train_size = int(0.8 * len(train_dataset))
valid_size = len(train_dataset) - train_size

# A seeded generator makes the random split identical on every fresh run.
# NOTE: train_dataset is rebound to the 80% subset here, so re-running this
# cell on its own would split that subset again.
split_generator = torch.Generator().manual_seed(42)
train_dataset,valid_dataset = random_split(train_dataset,
                                           [train_size,valid_size],
                                           generator=split_generator)

print(f'{len(train_dataset)} train samples')
print(f'{len(valid_dataset)} valid samples')

6090 train samples
1523 valid samples


In [10]:
from torch.utils.data import DataLoader

# Pinned (page-locked) host memory only speeds up host-to-GPU copies, so it is
# requested only when CUDA is actually available.
use_pinned_memory = torch.cuda.is_available()

# Training batches are reshuffled every epoch; validation order stays fixed.
train_dataloader = DataLoader(train_dataset,batch_size=32,shuffle=True,pin_memory=use_pinned_memory)
valid_dataloader = DataLoader(valid_dataset,batch_size=32,shuffle=False,pin_memory=use_pinned_memory)

## Model

In [11]:
# Hyper-parameters for the run, looked up by key in the cells below.
configs = dict(
    model_name='bert-base-uncased',
    num_labels=2,
    batch_size=32,
    epochs=10,
    learning_rate=5e-6,
)

In [12]:
import numpy as np
import torch
import torch.nn as nn
from transformers import BertForSequenceClassification

class TweetsModel(nn.Module):
    """Wraps BertForSequenceClassification and emits hard class predictions.

    Args:
        model_name: checkpoint name or path given to ``from_pretrained``.
    """

    def __init__(self, model_name):
        super().__init__()
        self.model = BertForSequenceClassification.from_pretrained(model_name)

    def forward(self, input_ids, attention_mask):
        """Return the arg-max class of every row as a float tensor.

        The logits are detached, moved to the CPU and reduced with arg-max,
        so the result has shape (batch,), lives on the CPU and is not
        connected to the autograd graph.
        """
        scores = self.model(input_ids=input_ids, attention_mask=attention_mask).logits
        predicted = scores.detach().cpu().numpy().argmax(axis=1)
        return torch.tensor(predicted, dtype=torch.float)

In [13]:
# Use CUDA when PyTorch can see a GPU, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('GPU is turning on...' if device == 'cuda' else 'CPU is turning on...')

CPU is turning on...


In [14]:
# Build the classifier wrapper from the configured checkpoint name and move
# its parameters to the selected device.
model = TweetsModel(configs['model_name']).to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

## Tools

In [15]:
# loss function
# Called as loss_fn(y_pred, y_label): model output first, labels second.
import torch.nn as nn

# Cross-entropy criterion handed to train() as its `loss_fn` argument.
loss_fn = nn.CrossEntropyLoss()

In [16]:
# optimizer
from transformers import AdamW

# Take the learning rate from `configs` so the optimizer cannot drift away from
# the declared hyper-parameters (a hard-coded 6e-6 used to disagree with the
# 5e-6 stored under configs['learning_rate']).
optimizer = AdamW(model.parameters(),
                lr=configs['learning_rate'],
                eps=1e-8,
                no_deprecation_warning=True)

In [17]:
# metric
# Called as f1_score(y_label, y_pred): true labels first, predictions second.
from sklearn.metrics import f1_score

# Alias handed to train() as its `metric` argument.
metric = f1_score

## Train

In [39]:
import gc
from tqdm.auto import tqdm

def _class_logits(model, input_ids, attention_mask):
    """Return differentiable class scores of shape (batch, num_classes).

    A wrapper whose forward() returns detached arg-max predictions leaves
    nothing to back-propagate through, so when ``model`` keeps its classifier
    under ``.model`` that inner network is called directly. Any other model is
    called as is. The ``.logits`` attribute of the output is used when present.
    """
    classifier = getattr(model, 'model', model)
    output = classifier(input_ids=input_ids, attention_mask=attention_mask)
    return getattr(output, 'logits', output)


def train(model,device,train_dataloader,valid_dataloader,epochs,loss_fn,optimizer,metric):
    """Fine-tune ``model`` and checkpoint it whenever the validation loss improves.

    Args:
        model: network to train; see ``_class_logits`` for how its class
            scores are obtained.
        device: device the batches are moved to ('cuda' or 'cpu').
        train_dataloader: yields dicts with 'input_ids', 'attention_mask' and
            'labels' (class ids, one per sample, any trailing size-1 dims).
        valid_dataloader: same batch format, used for evaluation only.
        epochs: number of passes over ``train_dataloader``.
        loss_fn: criterion called as ``loss_fn(logits, class_indices)``.
        optimizer: optimizer already bound to the model parameters.
        metric: callable ``metric(y_true, y_pred)`` returning a score.

    Side effects:
        Prints the validation score and loss after every epoch and writes
        ``epoch:<n>_model.pt`` each time the validation loss reaches a new
        minimum across all epochs so far.
    """
    # Tracked across epochs; resetting it per epoch would save every epoch.
    best_valid_loss = float('inf')

    for epoch in range(epochs):
        gc.collect()
        model.train()

        train_loss = 0.0
        train_step = 0
        pbar = tqdm(train_dataloader)

        for batch in pbar:
            optimizer.zero_grad()
            train_step += 1

            train_input_ids = batch['input_ids'].to(device)
            train_attention_mask = batch['attention_mask'].to(device)
            # Flatten to (batch,) class indices. reshape(-1) keeps the batch
            # dimension even for a batch of one, unlike a bare squeeze().
            train_labels = batch['labels'].reshape(-1).long().to(device)

            logits = _class_logits(model,train_input_ids,train_attention_mask)
            loss = loss_fn(logits,train_labels)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            pbar.set_postfix({'train_loss':train_loss/train_step})
        pbar.close()

        model.eval()
        valid_loss = 0.0
        valid_step = 0
        epoch_preds = []
        epoch_labels = []

        with torch.no_grad():
            pbar = tqdm(valid_dataloader)
            for batch in pbar:
                valid_step += 1

                valid_input_ids = batch['input_ids'].to(device)
                valid_attention_mask = batch['attention_mask'].to(device)
                valid_labels = batch['labels'].reshape(-1).long().to(device)

                logits = _class_logits(model,valid_input_ids,valid_attention_mask)
                valid_loss += loss_fn(logits,valid_labels).item()

                epoch_preds.append(logits.argmax(dim=1).cpu())
                epoch_labels.append(valid_labels.cpu())
            pbar.close()

        if valid_step == 0:
            # Nothing to validate on: no score, no loss, no checkpoint decision.
            continue

        valid_loss /= valid_step
        # Score the whole validation set at once; a mean of per-batch scores
        # is not the score of the full set.
        total_valid_score = metric(torch.cat(epoch_labels).numpy(),
                                   torch.cat(epoch_preds).numpy())

        print(f'Epoch [{epoch+1}/{epochs}] Score: {total_valid_score}')
        print(f'Epoch [{epoch+1}/{epochs}] Valid_loss: {valid_loss}')

        if valid_loss < best_valid_loss:
            print('model improved!')
            print('save checkpoint!')
            torch.save(model.state_dict(), f'epoch:{epoch+1}_model.pt')
            best_valid_loss = valid_loss

    gc.collect()

In [40]:
# Use CUDA when PyTorch can see a GPU, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('GPU is turning on...' if device == 'cuda' else 'CPU is turning on...')

CPU is turning on...


In [41]:
print('Training Start!')
print('=' * 100)

# Every argument is passed by name so the long call reads unambiguously.
train(
    model=model,
    device=device,
    train_dataloader=train_dataloader,
    valid_dataloader=valid_dataloader,
    epochs=configs['epochs'],
    loss_fn=loss_fn,
    optimizer=optimizer,
    metric=metric,
)

# Drop the notebook-level references to the model and loaders, then collect garbage.
del model, train_dataloader, valid_dataloader
gc.collect()

Training Start!


  0%|          | 0/191 [00:00<?, ?it/s]

[E thread_pool.cpp:113] Exception in thread pool task: mutex lock failed: Invalid argument


KeyboardInterrupt: 