In [1]:
%cd ..
%cd ..

/root/yhh/assignment-2-text-classification-foxintohumanbeing-main/improvement
/root/yhh/assignment-2-text-classification-foxintohumanbeing-main


In [2]:
import os
import sys
import time
import random

import numpy as np
import pandas as pd

from tqdm import tqdm

import torch
from torch import nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
from torch.utils.tensorboard import SummaryWriter

import torchtext
from torchtext.vocab import GloVe 
from torchtext.data import get_tokenizer

from utils.clean_utils import expand_contractions, clean_texts

2023-04-22 00:09:56.294577: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


 ### Set random seed


In [3]:
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

<torch._C.Generator at 0x7f20daee4910>

 ### Define Hyper-parameters


 build the CONFIG object


In [4]:
configs = {
    'work_dir': 'work_dir2', 
    'device': 'cuda:0',
    'batch': 16, 
    'optimizer_config': {
        'lr': 1e-5, 
    }, 
    'epoch': 10, 
    'model_name': 'bert-base-cased',
    'train_ratio':0.7
}

 ### Define datasets


In [5]:
!pip install transformers

Looking in indexes: https://mirror.baidu.com/pypi/simple
[0m

In [6]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(configs['model_name'])

In [7]:
class TweetDataset(Dataset):

    def __init__(self, fname: str, is_train: bool = True) -> None:
        ''' A dataset object to read tweets.
            @input fname: the .csv file name.
            @input is_train: True if is training dataset, else the testing dataset.
        '''
        super().__init__()
        self.is_train = is_train
        # read in the data
        df = pd.read_csv(fname)#.head(32)
        # preprocessing the data
        df['keyword'] = df['keyword'].fillna('unknown')
        df['text'] = df['text'] + ' ' + df['keyword']
        df['text'] = df['text'].apply(expand_contractions)       
        df['text'] = df['text'].apply(clean_texts)  

        # make the data in a list, for later use.
        self.data = [] # all-in-a-list. (id, bag(input_ids, mask, labels,...))
        for i in range(len(df)):
            bag = tokenizer(df.iloc[i, 3], padding="max_length", truncation=True)
            bag = {k: torch.tensor(v) for k, v in bag.items()}
            if is_train:
                bag['labels'] = torch.tensor(df.iloc[i, 4], dtype=torch.int64)
            
            self.data.append((
                df.iloc[i, 0], 
                bag                    
            ))

        print('Complete data preprocessing with length:', len(self.data))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index: int):
        return self.data[index]


 collate functions


In [8]:
def collate_fn(batch):
    ids, bags = zip(*batch)
    is_train = len(bags[0]) == 4
    
    ret_bag = {}
    input_ids = []
    token_type_ids = []
    attention_mask = []
    labels = []
    for b in bags:
        input_ids.append(b['input_ids'])
        token_type_ids.append(b['token_type_ids'])
        attention_mask.append(b['attention_mask'])
        if is_train:
            labels.append(b['labels'].unsqueeze(0))

    ret_bag['input_ids'] = torch.vstack(input_ids)
    ret_bag['token_type_ids'] = torch.vstack(token_type_ids)
    ret_bag['attention_mask'] = torch.vstack(attention_mask)
    if is_train:
        ret_bag['labels'] = torch.concat(labels)
    return ids, ret_bag

train_set = TweetDataset('nlp-getting-started/train.csv')
test_set = TweetDataset('nlp-getting-started/test.csv', False)

train_size = int(len(train_set) * configs['train_ratio'])
val_size = len(train_set) - train_size

train_set, val_set = torch.utils.data.random_split(train_set, [train_size, val_size])

train_dataloader = DataLoader(train_set,
                batch_size=configs['batch'],
                shuffle=True,
                num_workers=0, 
                collate_fn=collate_fn)
val_dataloader = DataLoader(val_set,
                batch_size=configs['batch'],
                shuffle=True,
                num_workers=0, 
                collate_fn=collate_fn)
test_dataloader = DataLoader(test_set,
                batch_size=configs['batch'],
                shuffle=False,
                num_workers=0, 
                collate_fn=collate_fn)

Complete data preprocessing with length: 7613
Complete data preprocessing with length: 3263


 ### Define model


In [9]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(configs['model_name'], num_labels=2)
model.to(configs['device'])


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

 ### Training


In [10]:
writer = SummaryWriter()
optimizer = torch.optim.AdamW(model.parameters(), **configs['optimizer_config'])
total_epoch = configs['epoch']
best_acc = float('-inf')
step = 0
for epoch in range(total_epoch):
    # train!
    model.train()
    train_loss_sum = 0
    train_correct = 0
    dataset_len = len(train_dataloader.dataset)
    for id, batch in tqdm(train_dataloader):
        batch = {k: v.to(configs['device']) for k, v in batch.items()}

        outputs = model(**batch)

        loss = outputs.loss
        loss.backward()

        # torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()
        optimizer.zero_grad()

        predicted = torch.argmax(outputs.logits, dim=1)
        train_correct += (predicted == batch['labels']).sum()
        train_loss_sum += loss.item() * configs['batch']
    writer.add_scalar('training loss',
                            train_loss_sum/len(train_set),
                            epoch+1)    

    # eval!
    model.eval()
    val_loss_sum = 0
    val_correct = 0
    results_id = []
    results_predict = []
    with torch.no_grad():
        for id, batch in tqdm(val_dataloader):
            batch = {k: v.to(configs['device']) for k, v in batch.items()}
            outputs = model(**batch)
            predicted = torch.argmax(outputs.logits, dim=1)
            val_correct += (predicted == batch['labels']).sum()
            val_loss_sum += outputs.loss.item() * configs['batch']
            results_id.extend(id)
            results_predict.append(predicted.cpu())

    val_loss = val_loss_sum / len(val_set)
    val_acc = val_correct / len(val_set) 
    writer.add_scalar('average validation accuracy', val_acc, epoch+1)
    print('val',val_acc)  

    if val_acc > best_acc:
        best_acc = val_acc 
        pt_path = os.path.join('improvement/improvement2', 'best+BERT.pt')
        torch.save(model.state_dict(), pt_path)
        results_predict = torch.concat(results_predict).tolist()
        prediction = pd.DataFrame()
        prediction['id'] = results_id
        prediction['target'] = results_predict
        prediction.to_csv('improvement/improvement2/validation_result.csv',index=False)
    

100%|██████████| 334/334 [01:51<00:00,  2.99it/s]
100%|██████████| 143/143 [00:16<00:00,  8.57it/s]


val tensor(0.8271, device='cuda:0')


100%|██████████| 334/334 [01:51<00:00,  2.99it/s]
100%|██████████| 143/143 [00:16<00:00,  8.56it/s]


val tensor(0.8157, device='cuda:0')


100%|██████████| 334/334 [01:51<00:00,  2.99it/s]
100%|██████████| 143/143 [00:16<00:00,  8.56it/s]


val tensor(0.8323, device='cuda:0')


100%|██████████| 334/334 [01:51<00:00,  2.99it/s]
100%|██████████| 143/143 [00:16<00:00,  8.56it/s]


val tensor(0.8201, device='cuda:0')


100%|██████████| 334/334 [01:51<00:00,  2.98it/s]
100%|██████████| 143/143 [00:16<00:00,  8.56it/s]


val tensor(0.8205, device='cuda:0')


100%|██████████| 334/334 [01:51<00:00,  2.99it/s]
100%|██████████| 143/143 [00:16<00:00,  8.56it/s]


val tensor(0.8236, device='cuda:0')


100%|██████████| 334/334 [01:51<00:00,  2.99it/s]
100%|██████████| 143/143 [00:16<00:00,  8.60it/s]


val tensor(0.8214, device='cuda:0')


100%|██████████| 334/334 [01:51<00:00,  2.98it/s]
100%|██████████| 143/143 [00:16<00:00,  8.59it/s]


val tensor(0.8319, device='cuda:0')


100%|██████████| 334/334 [01:51<00:00,  2.98it/s]
100%|██████████| 143/143 [00:16<00:00,  8.59it/s]


val tensor(0.8165, device='cuda:0')


100%|██████████| 334/334 [01:52<00:00,  2.98it/s]
100%|██████████| 143/143 [00:16<00:00,  8.59it/s]

val tensor(0.8021, device='cuda:0')





 ### Inference


In [11]:

results_id = []
results_predict = []

model.load_state_dict(torch.load(os.path.join('improvement/improvement2', 'best+BERT.pt')))
model.eval()
with torch.no_grad():
    for id, batch in tqdm(test_dataloader):
        batch = {k: v.to(configs['device']) for k, v in batch.items()}

        outputs = model(**batch)

        predicted = torch.argmax(outputs.logits, dim=1)
        results_id.extend(id)
        results_predict.append(predicted.cpu())

results_predict = torch.concat(results_predict).tolist()
results_df = pd.DataFrame({'id': results_id, 'target': results_predict})

100%|██████████| 204/204 [00:23<00:00,  8.69it/s]


 ### Export the results


In [12]:
results_df.to_csv('prediction_result/predict_bert.csv', index=False)
results_df

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
...,...,...
3258,10861,0
3259,10865,1
3260,10868,1
3261,10874,1
