<a href="https://colab.research.google.com/github/dohyeongkim97/papers/blob/master/bert_clf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import pandas as pd
import numpy as np

import random
from transformers import set_seed

seed = 42
random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)
set_seed(seed)

In [2]:
torch.cuda.is_available()
from google.colab import drive

In [3]:
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [4]:
import os
print(os.getcwd())

/content


In [5]:
df = pd.read_csv("./drive/MyDrive/paper_data-master/train.csv")
test = pd.read_csv("./drive/MyDrive/paper_data-master/test.csv")

In [6]:
df

Unnamed: 0,ID,first_party,second_party,facts,first_party_winner
0,TRAIN_0000,Phil A. St. Amant,Herman A. Thompson,"On June 27, 1962, Phil St. Amant, a candidate ...",1
1,TRAIN_0001,Stephen Duncan,Lawrence Owens,Ramon Nelson was riding his bike when he suffe...,0
2,TRAIN_0002,Billy Joe Magwood,"Tony Patterson, Warden, et al.",An Alabama state court convicted Billy Joe Mag...,1
3,TRAIN_0003,Linkletter,Walker,Victor Linkletter was convicted in state court...,0
4,TRAIN_0004,William Earl Fikes,Alabama,"On April 24, 1953 in Selma, Alabama, an intrud...",1
...,...,...,...,...,...
2473,TRAIN_2473,"HollyFrontier Cheyenne Refining, LLC, et al.","Renewable Fuels Association, et al.",Congress amended the Clean Air Act through the...,1
2474,TRAIN_2474,"Grupo Mexicano de Desarrollo, S. A.","Alliance Bond Fund, Inc.","Alliance Bond Fund, Inc., an investment fund, ...",1
2475,TRAIN_2475,Peguero,United States,"In 1992, the District Court sentenced Manuel D...",0
2476,TRAIN_2476,Immigration and Naturalization Service,St. Cyr,"On March 8, 1996, Enrico St. Cyr, a lawful per...",0


In [7]:
os.listdir('./drive/MyDrive/paper_data-master/')

['sample_submission.csv',
 'test.csv',
 'train.csv',
 'judge',
 'training.1600000.processed.noemoticon.csv',
 'model_bert_classification']

In [8]:
from transformers import BertTokenizer
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.data import RandomSampler, SequentialSampler
from torch import optim
from transformers import BertForSequenceClassification
from torch import nn
import math
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

In [9]:
def frame_make(df):
    if 'first_party_winner' in df.columns:
        data = pd.DataFrame(columns = ['text', 'target'])
        # df['first_party_winner'] = df['first_party_winner'].astype(str)
        data['text'] = 'first_party:' + df['first_party'] + 'second_party:' + df['second_party'] + 'facts:' + df['facts'] + '\nwinner: '
        data['target'] = df['first_party_winner']
        return data
    else:
        data = pd.DataFrame(columns = ['text'])
        data['text'] = 'first_party:' + df['first_party'] + 'second_party:' + df['second_party'] + 'facts:' + df['facts'] + '\nwinner: '
        return data


In [10]:
def make_dataset(data, tokenizer, device):
    tokenized = tokenizer(
        text = data.text.tolist(),
        padding= 'longest',
        truncation = True,
        return_tensors = 'pt'
    )
    input_ids = tokenized['input_ids'].to(device)
    attention_mask = tokenized['attention_mask'].to(device)
    if 'label' in data.columns:
        labels = torch.tensor(data.label.values, dtype=torch.long).to(device)
        return TensorDataset(input_ids, attention_mask, labels)
    else:
        return TensorDataset(input_ids, attention_mask)

In [11]:
def get_dataloader(dataset, sampler, batch_size):
    data_sampler = sampler(dataset)
    dataloader = DataLoader(dataset, sampler = data_sampler, batch_size = batch_size)
    return dataloader

In [12]:
df = frame_make(df)

In [13]:
df.columns = ['text', 'label']

In [14]:
epochs = 5
batch_size = 16
device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path='bert-base-multilingual-cased',
    do_lower_case = False
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]



In [15]:
df

Unnamed: 0,text,label
0,first_party:Phil A. St. Amantsecond_party:Herm...,1
1,first_party:Stephen Duncansecond_party:Lawrenc...,0
2,first_party:Billy Joe Magwoodsecond_party:Tony...,1
3,first_party:Linklettersecond_party:Walkerfacts...,0
4,first_party:William Earl Fikessecond_party:Ala...,1
...,...,...
2473,"first_party:HollyFrontier Cheyenne Refining, L...",1
2474,"first_party:Grupo Mexicano de Desarrollo, S. A...",1
2475,first_party:Peguerosecond_party:United Statesf...,0
2476,first_party:Immigration and Naturalization Ser...,0


In [16]:
class_0 = df[df['label'] == 0]
class_1 = df[df['label'] == 1]

class_0_oversampled = class_0.sample(len(class_1)+100, replace=True, random_state=42)
df = pd.concat([class_0_oversampled, class_1], axis=0).sample(frac=1).reset_index(drop=True)

In [17]:
df

Unnamed: 0,text,label
0,"first_party:City of San Antonio, Texas, On Beh...",0
1,first_party:United Mine Workers of America Hea...,1
2,"first_party:14 Penn Plaza LLC, et al.second_pa...",1
3,first_party:Wildersecond_party:Virginia Hospit...,0
4,first_party:Lisa Madigan et al.second_party:Ha...,0
...,...,...
3393,first_party:Pharmaceutical Research & Manufact...,0
3394,first_party:Stephen Duncansecond_party:Lawrenc...,0
3395,first_party:Arthur James Lomaxsecond_party:Chr...,0
3396,"first_party:Heart of Atlanta Motel, Inc.second...",0


In [18]:
train_df, valid, test = np.split(
    df.sample(frac = 1, random_state = 42), [int(0.6*len(df)), int(0.8*len(df))]
)

  return bound(*args, **kwds)


In [20]:
train_dataset = make_dataset(train_df, tokenizer, device)
train_dataloader = get_dataloader(train_dataset, RandomSampler, batch_size)

valid_dataset = make_dataset(valid, tokenizer, device)
valid_dataloader = get_dataloader(valid_dataset, RandomSampler, batch_size)

test_dataset = make_dataset(test, tokenizer, device)
test_dataloader = get_dataloader(test_dataset, RandomSampler, batch_size)

In [21]:
train_dataset[15]

(tensor([   101,  10422,    168,  14039,    131,  13866,  27964,  10341,  23486,
          10162,    168,  14039,    131,  16198,  58551,  13369,  39159,    131,
          33894,  15962,  10134,  56629,  10226,  99345,  10841,  10261,  26090,
            169,  13595,  18453, 106194,  10114,  10105,  12014,  10108,  10226,
          13578,  10169,    169,  22450,  11519,    119,  11301,  10551,  32684,
          56082,  14010,  10171,  26121,  16198,  58551,  10188,  10151,  56162,
          10108,  41833,  10111,  11059,    169,  90223,    117,  10261,  10134,
          27156,  10111,  61487,  10142,  15962,    100,    187,  12557,    119,
          24153,  15962,  10134,  39157,  11170,  10425,  11088,  10111,    171,
          71189,  11170,  10425,  11088,  80916,  10142,  18477,    117,  10105,
          31624,  10160,  58551,    100,  86696,  23626,  34920,  10189,  58551,
          10134,  23282,  10379,    169,  20998,  19918,  10165,  10111,  10134,
          32862,  10114,    

In [22]:
import datetime

In [23]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [60]:
model = BertForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path = 'bert-base-multilingual-cased',
    num_labels = 2,
    output_attentions = False,
    output_hidden_states = False
).to(device)

optimizer = optim.AdamW(model.parameters(), lr = 2e-4, eps = 1e-4)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
def calc_accuracy(preds, labels):
    pred_flat = np.argmax(preds.cpu().numpy(), axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [61]:
def train(model, optimizer, dataloader, device):
    model = model.to(device)
    model.train()
    train_loss = 0.0
    j = 0

    for input_ids, attention_mask, labels in dataloader:
        optimizer.zero_grad()
        outputs = model(
            input_ids = input_ids.to(device),
            attention_mask = attention_mask.to(device),
            labels = labels.to(device)
        )

        loss = outputs.loss
        train_loss += loss.item()

        # optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    train_loss = train_loss / len(dataloader)
    return train_loss

In [62]:
def evaluation(model, dataloader):
    with torch.no_grad():
        model.eval()
        criterion = nn.CrossEntropyLoss()
        val_loss, val_accuracy = 0.0, 0.0

        for input_ids, attention_mask, labels in dataloader:
            outputs = model(
                input_ids = input_ids,
                attention_mask = attention_mask,
                labels = labels
            )
            logits = outputs.logits

            loss = criterion(logits, labels)
            logtis = logits.detach().cpu().numpy()
            labels_ids = labels.to('cpu').numpy()
            accuracy = calc_accuracy(logits, labels_ids)

            # val_loss += loss
            val_loss += loss.item()
            val_accuracy += accuracy

        val_loss = val_loss/len(dataloader)
        val_accuracy = val_accuracy / len(dataloader)
        return val_loss, val_accuracy

In [63]:
best_loss = 10000
for epoch in range(epochs):
    train_loss = train(model, optimizer, train_dataloader, device)
    val_loss, val_accuracy = evaluation(model, valid_dataloader)
    print(f'Epoch: {epoch+1} train loss: {train_loss:.4f} val loss: {val_loss:.4f} val_acc: {val_accuracy:.4f}')

    if val_loss < best_loss:
        best_loss = val_loss

Epoch: 1 train loss: 0.7173 val loss: 0.7183 val_acc: 0.4826
Epoch: 2 train loss: 0.7041 val loss: 0.7193 val_acc: 0.4855
Epoch: 3 train loss: 0.7046 val loss: 0.7043 val_acc: 0.4884
Epoch: 4 train loss: 0.7003 val loss: 0.6957 val_acc: 0.5145
Epoch: 5 train loss: 0.6990 val loss: 0.7042 val_acc: 0.5116


In [64]:
model.save_pretrained('./drive/MyDrive/paper_data-master/model_bert_classification')

In [65]:
def predict(model, dataloader, device):
    model.to(device)
    model.eval()
    predictions = []

    with torch.no_grad():
        for input_ids, attention_mask in dataloader:
            outputs = model(
                input_ids = input_ids,
                attention_mask = attention_mask
            )
            logits = outputs.logits

            predicted_labels = torch.argmax(logits, dim=1)
            predictions.extend(predicted_labels.cpu().numpy())

    return predictions

In [66]:
test_data = pd.read_csv("./drive/MyDrive/paper_data-master/test.csv")

In [67]:
test_data = frame_make(test_data)

In [68]:
test_data

Unnamed: 0,text
0,first_party:Salernosecond_party:United Statesf...
1,first_party:Milberg Weiss Bershad Hynes and Le...
2,first_party:No. 07-582\t Title: \t Federal Com...
3,first_party:Harold Kaufman second_party:United...
4,first_party:Bergersecond_party:Hanlonfacts:In ...
...,...
1235,"first_party:Haitian Centers Council, Inc., et ..."
1236,first_party:Whitmansecond_party:American Truck...
1237,first_party:Linda A. Matteo and John J. Madiga...
1238,first_party:Washington State Apple Advertising...


In [69]:
valid.label.value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1,350
0,330


In [70]:
test_data_dataset = make_dataset(test_data, tokenizer, device)
test_data_dataloader = get_dataloader(test_data_dataset, RandomSampler, batch_size)

In [71]:
model_path = './drive/MyDrive/paper_data-master/model_bert_classification'

model = BertForSequenceClassification.from_pretrained(model_path)
# tokenizer = BertTokenizer.from_pretrained(model_path)

In [72]:
test_predicted = predict(model, test_data_dataloader, device)

In [73]:
len(test_predicted)

1240

In [74]:
sum(test_predicted)

1240

In [75]:
def predict_for_test(model, dataloader, device):
    model.eval()
    predictions = []

    with torch.no_grad():
        for input_ids, attention_mask, _ in dataloader:
            outputs = model(
                input_ids = input_ids,
                attention_mask = attention_mask
            )
            logits = outputs.logits

            predicted_labels = torch.argmax(logits, dim=1)
            predictions.extend(predicted_labels.cpu().numpy())

    return predictions

In [76]:
train_predicted = predict_for_test(model, train_dataloader, device)

In [77]:
len(train_predicted)

2038

In [78]:
sum(train_predicted)

2038

In [79]:
import torch.nn.functional as F

def predict_probabilities(model, dataloader, device):
    model.to(device)
    model.eval()
    probabilities = []

    with torch.no_grad():
        for input_ids, attention_mask in dataloader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            model.to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            probs = torch.sigmoid(logits).cpu().numpy()
            probabilities.extend(probs)

    return probabilities

In [80]:
train_dataset[0]

(tensor([   101,  10422,    168,  14039,    131,  17099,  10341,  23486,  10162,
            168,  14039,    131,  21474,  10694,  20386,  68088,  13369,  39159,
            131,  10576,  10780,  10250,    117,  10583,    117,  21474,  10694,
          20386,  68088,  10134,  29479,  10160,  10105,  18167,  10108,    169,
          11499,  20251,  10774,  10157,    119,  10117,  45165,  10230,  19288,
          24944,  10957,  10226,  25841,  16691,    117,  10111,  20386,  68088,
          22151,  10114,  12888,  10151,  54131,  11360,  57085,  10230,  11178,
          26686,    119,  10576,  10780,  10270,    117,  11371,  20386,  68088,
          10134,  12647,  10106, 103772,  10135,  10105,  11499,  20251,  10774,
          10157,  18163,    117,    169,  12902,  19288,    117,  10479,  10134,
          10153,  27444,  10189,  20386,  68088,  10374,  59728,  11170,  78067,
            117,  22021,  59648,  11912,  10957,  33295,    169,  25470,  60805,
          10157,  10189,  44

In [81]:
probs = predict_probabilities(model, test_data_dataloader, device)

In [82]:
probs2 = pd.DataFrame(probs)

In [83]:
probs2[0].unique()

array([0.46221724, 0.4622174 , 0.46221787, 0.46221772, 0.46221775,
       0.46221823, 0.46221817, 0.46221793, 0.4622176 , 0.4622178 ,
       0.4622173 , 0.46221802, 0.4622171 , 0.46221796, 0.46221766,
       0.4622175 , 0.4622181 , 0.46221757, 0.4622172 , 0.4622183 ,
       0.46221808, 0.46221825, 0.46221715, 0.46221736, 0.46221837,
       0.462217  , 0.46221694, 0.46221688, 0.46221745, 0.46221685,
       0.46221843, 0.4622168 , 0.46221703], dtype=float32)

In [84]:
def predict_probabilities2(model, dataloader, device):
    model.eval()
    probabilities = []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)

            model.to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            # 로짓 값과 확률 값 출력
            print("Logits:", logits)  # 로짓 값 출력
            probs = torch.sigmoid(logits).cpu().numpy()  # 확률로 변환
            print("Probabilities:", probs)  # 확률 값 출력

            probabilities.extend(probs)

    return probabilities