<a href="https://colab.research.google.com/github/dohyeongkim97/DL_study/blob/master/bert_clf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import pandas as pd
import numpy as np

import random
from transformers import set_seed

seed = 42
random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)
set_seed(seed)

In [2]:
torch.cuda.is_available()
from google.colab import drive

In [3]:
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [4]:
import os
print(os.getcwd())

/content


In [5]:
df = pd.read_csv("./drive/MyDrive/paper_data-master/train.csv")
test = pd.read_csv("./drive/MyDrive/paper_data-master/test.csv")

In [6]:
df

Unnamed: 0,ID,first_party,second_party,facts,first_party_winner
0,TRAIN_0000,Phil A. St. Amant,Herman A. Thompson,"On June 27, 1962, Phil St. Amant, a candidate ...",1
1,TRAIN_0001,Stephen Duncan,Lawrence Owens,Ramon Nelson was riding his bike when he suffe...,0
2,TRAIN_0002,Billy Joe Magwood,"Tony Patterson, Warden, et al.",An Alabama state court convicted Billy Joe Mag...,1
3,TRAIN_0003,Linkletter,Walker,Victor Linkletter was convicted in state court...,0
4,TRAIN_0004,William Earl Fikes,Alabama,"On April 24, 1953 in Selma, Alabama, an intrud...",1
...,...,...,...,...,...
2473,TRAIN_2473,"HollyFrontier Cheyenne Refining, LLC, et al.","Renewable Fuels Association, et al.",Congress amended the Clean Air Act through the...,1
2474,TRAIN_2474,"Grupo Mexicano de Desarrollo, S. A.","Alliance Bond Fund, Inc.","Alliance Bond Fund, Inc., an investment fund, ...",1
2475,TRAIN_2475,Peguero,United States,"In 1992, the District Court sentenced Manuel D...",0
2476,TRAIN_2476,Immigration and Naturalization Service,St. Cyr,"On March 8, 1996, Enrico St. Cyr, a lawful per...",0


In [7]:
os.listdir('./drive/MyDrive/paper_data-master/')

['sample_submission.csv',
 'test.csv',
 'train.csv',
 'judge',
 'training.1600000.processed.noemoticon.csv',
 'model_bert_classification']

In [8]:
from transformers import BertTokenizer
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.data import RandomSampler, SequentialSampler
from torch import optim
from transformers import BertForSequenceClassification
from torch import nn
import math
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

In [9]:
def frame_make(df):
    if 'first_party_winner' in df.columns:
        data = pd.DataFrame(columns = ['text', 'target'])
        # df['first_party_winner'] = df['first_party_winner'].astype(str)
        data['text'] = 'first_party:' + df['first_party'] + 'second_party:' + df['second_party'] + 'facts:' + df['facts'] + '\nwinner: '
        data['target'] = df['first_party_winner']
        return data
    else:
        data = pd.DataFrame(columns = ['text'])
        data['text'] = 'first_party:' + df['first_party'] + 'second_party:' + df['second_party'] + 'facts:' + df['facts'] + '\nwinner: '
        return data


In [10]:
def make_dataset(data, tokenizer, device):
    tokenized = tokenizer(
        text = data.text.tolist(),
        padding= 'longest',
        truncation = True,
        return_tensors = 'pt'
    )
    input_ids = tokenized['input_ids'].to(device)
    attention_mask = tokenized['attention_mask'].to(device)
    if 'label' in data.columns:
        labels = torch.tensor(data.label.values, dtype=torch.long).to(device)
        return TensorDataset(input_ids, attention_mask, labels)
    else:
        return TensorDataset(input_ids, attention_mask)

In [11]:
def get_dataloader(dataset, sampler, batch_size):
    data_sampler = sampler(dataset)
    dataloader = DataLoader(dataset, sampler = data_sampler, batch_size = batch_size)
    return dataloader

In [12]:
df = frame_make(df)

In [13]:
df.columns = ['text', 'label']

In [14]:
epochs = 5
batch_size = 32
device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path='bert-base-multilingual-cased',
    do_lower_case = False
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]



In [15]:
df

Unnamed: 0,text,label
0,first_party:Phil A. St. Amantsecond_party:Herm...,1
1,first_party:Stephen Duncansecond_party:Lawrenc...,0
2,first_party:Billy Joe Magwoodsecond_party:Tony...,1
3,first_party:Linklettersecond_party:Walkerfacts...,0
4,first_party:William Earl Fikessecond_party:Ala...,1
...,...,...
2473,"first_party:HollyFrontier Cheyenne Refining, L...",1
2474,"first_party:Grupo Mexicano de Desarrollo, S. A...",1
2475,first_party:Peguerosecond_party:United Statesf...,0
2476,first_party:Immigration and Naturalization Ser...,0


In [16]:
class_0 = df[df['label'] == 0]
class_1 = df[df['label'] == 1]

class_0_oversampled = class_0.sample(len(class_1)+50, replace=True, random_state=42)
df = pd.concat([class_0_oversampled, class_1], axis=0).sample(frac=1).reset_index(drop=True)

In [17]:
df

Unnamed: 0,text,label
0,first_party:Epic Systems Corporationsecond_par...,1
1,first_party:Arthur Andersen LLP et al.second_p...,1
2,first_party:Gerald T. Martin et ux.second_part...,0
3,first_party:North Carolina Department of Reven...,0
4,first_party:Herbertsecond_party:Landofacts:Ant...,1
...,...,...
3343,first_party:Pharmaceutical Research & Manufact...,0
3344,first_party:Stephen Duncansecond_party:Lawrenc...,0
3345,first_party:Arthur James Lomaxsecond_party:Chr...,0
3346,"first_party:Heart of Atlanta Motel, Inc.second...",0


In [18]:
train, valid, test = np.split(
    df.sample(frac = 1, random_state = 42), [int(0.6*len(df)), int(0.8*len(df))]
)

  return bound(*args, **kwds)


In [19]:
train

Unnamed: 0,text,label
1829,first_party:Michelin Tire Corporationsecond_pa...,0
2627,first_party:Fulton Corporationsecond_party:Fau...,1
1551,"first_party:American Express, et al.second_par...",1
1236,first_party:Federal Express Corporationsecond_...,0
2974,first_party:Burlington Northern and Santa Fe R...,0
...,...,...
1539,first_party:Immigration and Naturalization Ser...,1
3288,first_party:Norfolk Southern Railway Companyse...,1
1911,"first_party:Polar Tankers, Inc.second_party:Ci...",1
1550,first_party:Lockettsecond_party:Ohiofacts:An O...,1


In [20]:
train_dataset = make_dataset(train, tokenizer, device)
train_dataloader = get_dataloader(train_dataset, RandomSampler, batch_size)

valid_dataset = make_dataset(valid, tokenizer, device)
valid_dataloader = get_dataloader(valid_dataset, RandomSampler, batch_size)

test_dataset = make_dataset(test, tokenizer, device)
test_dataloader = get_dataloader(test_dataset, RandomSampler, batch_size)

In [21]:
train_dataset[15]

(tensor([   101,  10422,    168,  14039,    131,  60005,  30204,  10341,  23486,
          10162,    168,  14039,    131,  16264,  65597,  13369,  39159,    131,
          10167,  10105,  97705,  37730,  20394,  10108,  10105,  23626,  10160,
          10319,  93785,  10631,  60005,  30204,  10134,  61487,  10135,  10551,
          92869,  10108,  10422,  16455,  29448,    117,  10105,  13668,  11388,
          14866,  33182,  93890,  10151,  89568,  10157,  37922,  14526,  11639,
          22530,  98235,  10376,  12557,  49219,  10107,  10135,  11408,  92869,
            119, 106273,  19083,  16942,  25635,  27691,  10376,  54186,  10106,
          31671,  10169,  11408, 110165,    117,  10111,  10192,  10221,  15342,
          12141,  54186,  10146,  10114,  16106,    117,  10105,  23626,  14866,
          53365,  60005,  30204,  10114,  12557,  10135,  11408,  92869,    119,
          11301,  10226,  94259,  10111,  49219,  10309,  10741,  80545,  10155,
          10105,  13668,  32

In [22]:
import datetime

In [23]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [24]:
model = BertForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path = 'bert-base-multilingual-cased',
    num_labels = 2,
    output_attentions = False,
    output_hidden_states = False
).to(device)

optimizer = optim.AdamW(model.parameters(), lr = 2e-3, eps = 1e-4)

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
def calc_accuracy(preds, labels):
    pred_flat = np.argmax(preds.cpu().numpy(), axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [26]:
def train(model, optimizer, dataloader, device):
    model = model.to(device)
    model.train()
    train_loss = 0.0
    j = 0

    for input_ids, attention_mask, labels in dataloader:
        outputs = model(
            input_ids = input_ids,
            attention_mask = attention_mask,
            labels = labels
        )

        loss = outputs.loss
        train_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    train_loss = train_loss / len(dataloader)
    return train_loss

In [27]:
def evaluation(model, dataloader):
    with torch.no_grad():
        model.eval()
        criterion = nn.CrossEntropyLoss()
        val_loss, val_accuracy = 0.0, 0.0

        for input_ids, attention_mask, labels in dataloader:
            outputs = model(
                input_ids = input_ids,
                attention_mask = attention_mask,
                labels = labels
            )
            logits = outputs.logits

            loss = criterion(logits, labels)
            logtis = logits.detach().cpu().numpy()
            labels_ids = labels.to('cpu').numpy()
            accuracy = calc_accuracy(logits, labels_ids)

            val_loss += loss
            val_accuracy += accuracy

        val_loss = val_loss/len(dataloader)
        val_accuracy = val_accuracy / len(dataloader)
        return val_loss, val_accuracy

In [28]:
best_loss = 10000
for epoch in range(epochs):
    train_loss = train(model, optimizer, train_dataloader, device)
    val_loss, val_accuracy = evaluation(model, valid_dataloader)
    print(f'Epoch: {epoch+1} train loss: {train_loss:.4f} val loss: {val_loss:.4f} val_acc: {val_accuracy:.4f}')

    if val_loss < best_loss:
        best_loss = val_loss

Epoch: 1 train loss: 0.8479 val loss: 0.8577 val_acc: 0.5089
Epoch: 2 train loss: 0.7630 val loss: 0.7012 val_acc: 0.4912
Epoch: 3 train loss: 0.7184 val loss: 0.6930 val_acc: 0.5087


OutOfMemoryError: CUDA out of memory. Tried to allocate 192.00 MiB. GPU 0 has a total capacity of 14.75 GiB of which 149.06 MiB is free. Process 14892 has 14.60 GiB memory in use. Of the allocated memory 11.86 GiB is allocated by PyTorch, and 2.60 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [29]:
model.save_pretrained('./drive/MyDrive/paper_data-master/model_bert_classification')

In [30]:
def predict(model, dataloader, device):
    model.eval()
    predictions = []

    with torch.no_grad():
        for input_ids, attention_mask in dataloader:
            outputs = model(
                input_ids = input_ids,
                attention_mask = attention_mask
            )
            logits = outputs.logits

            predicted_labels = torch.argmax(logits, dim=1)
            predictions.extend(predicted_labels.cpu().numpy())

    return predictions

In [31]:
test_data = pd.read_csv("./drive/MyDrive/paper_data-master/test.csv")

In [32]:
test_data = frame_make(test_data)

In [33]:
test_data

Unnamed: 0,text
0,first_party:Salernosecond_party:United Statesf...
1,first_party:Milberg Weiss Bershad Hynes and Le...
2,first_party:No. 07-582\t Title: \t Federal Com...
3,first_party:Harold Kaufman second_party:United...
4,first_party:Bergersecond_party:Hanlonfacts:In ...
...,...
1235,"first_party:Haitian Centers Council, Inc., et ..."
1236,first_party:Whitmansecond_party:American Truck...
1237,first_party:Linda A. Matteo and John J. Madiga...
1238,first_party:Washington State Apple Advertising...


In [36]:
del model

In [34]:
test_data_dataset = make_dataset(test_data, tokenizer, device)
test_data_dataloader = get_dataloader(test_data_dataset, RandomSampler, batch_size)

In [35]:
test_predicted = predict(model, test_data_dataloader, device)

OutOfMemoryError: CUDA out of memory. Tried to allocate 192.00 MiB. GPU 0 has a total capacity of 14.75 GiB of which 149.06 MiB is free. Process 14892 has 14.60 GiB memory in use. Of the allocated memory 12.00 GiB is allocated by PyTorch, and 2.47 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
len(test_predicted)

In [None]:
sum(test_predicted)

In [None]:
def predict_for_test(model, dataloader, device):
    model.eval()
    predictions = []

    with torch.no_grad():
        for input_ids, attention_mask, _ in dataloader:
            outputs = model(
                input_ids = input_ids,
                attention_mask = attention_mask
            )
            logits = outputs.logits

            predicted_labels = torch.argmax(logits, dim=1)
            predictions.extend(predicted_labels.cpu().numpy())

    return predictions

In [None]:
train_predicted = predict_for_test(model, train_dataloader, device)

In [None]:
len(train_predicted)

In [None]:
sum(train_predicted)