In [1]:
from transformers import BertTokenizer, BertModel
import torch
from easydict import EasyDict
import torch.nn as nn
import torch.nn.functional as F
from datasets import load_dataset
import pandas as pd
from easydict import EasyDict
import gzip
import json
from sklearn.metrics import accuracy_score
import numpy as np
import evaluate
from transformers import Trainer, TrainingArguments
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
import os
from transformers import EarlyStoppingCallback
import random
from torch.utils.data import Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from torch import cuda

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cuda'

# Question Type Classification

In [None]:
def seed_everything(seed: int):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, the ``PYTHONHASHSEED`` environment
    variable, NumPy's global generator and PyTorch (CPU and every CUDA
    device), then configures cuDNN for deterministic execution.

    Args:
        seed: Seed value applied to all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all visible GPUs, not only the current device.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner may select different kernels from run to run,
    # so it has to be disabled for the deterministic flag above to be useful.
    torch.backends.cudnn.benchmark = False

SEED = 7
seed_everything(SEED)

In [4]:
labels = ["None", "Max", "Min", "Count", "Sum", "Average"]

In [5]:
# Two-way lookup between integer class ids and class names.
Num_labels = len(labels)
id2label = dict(enumerate(labels))
label2id = {name: idx for idx, name in id2label.items()}

In [6]:
print(label2id)
print(id2label)

{'None': 0, 'Max': 1, 'Min': 2, 'Count': 3, 'Sum': 4, 'Average': 5}
{0: 'None', 1: 'Max', 2: 'Min', 3: 'Count', 4: 'Sum', 5: 'Average'}


In [7]:
datasets = load_dataset("wikisql")

In [8]:
# Materialise each loaded split as a pandas DataFrame.
train, val, test = (
    pd.DataFrame(datasets[split_name])
    for split_name in ('train', 'validation', 'test')
)

In [10]:
def _split_columns(frame):
    """Extract the fields used for aggregation classification from one split.

    Args:
        frame: DataFrame with a ``question`` column and dict-valued ``sql``
            and ``table`` columns (read through their ``agg`` and ``header``
            keys respectively).

    Returns:
        Tuple ``(agg, query, header, agg_label)`` of equally long lists, where
        ``agg_label`` is ``agg`` mapped through ``id2label``.
    """
    agg = frame['sql'].apply(lambda sql: sql['agg']).tolist()
    query = frame['question'].tolist()
    header = frame['table'].apply(lambda table: table['header']).tolist()
    agg_label = [id2label[agg_id] for agg_id in agg]
    return agg, query, header, agg_label


def _build_split_frame(agg, query, header, agg_label):
    """Assemble the per-split lists into one DataFrame with a fixed column order."""
    return pd.DataFrame({'query': query, 'header': header, 'agg': agg, 'agg_label': agg_label})


# The same extraction is applied to every split; all intermediate lists keep
# their original names so later cells can still refer to them.
train_agg, train_query, train_header, train_qt = _split_columns(train)
test_agg, test_query, test_header, test_qt = _split_columns(test)
val_agg, val_query, val_header, val_qt = _split_columns(val)

train_data = _build_split_frame(train_agg, train_query, train_header, train_qt)
test_data = _build_split_frame(test_agg, test_query, test_header, test_qt)
val_data = _build_split_frame(val_agg, val_query, val_header, val_qt)

In [13]:
train_data.head()

Unnamed: 0,query,header,agg,agg_label
0,Tell me what the notes are for South Australia,"[State/territory, Text/background colour, Form...",0,
1,What is the current series where the new serie...,"[State/territory, Text/background colour, Form...",0,
2,What is the format for South Australia?,"[State/territory, Text/background colour, Form...",0,
3,Name the background colour for the Australian ...,"[State/territory, Text/background colour, Form...",0,
4,how many times is the fuel propulsion is cng?,"[Order Year, Manufacturer, Model, Fleet Series...",3,Count


In [27]:
def agg_ratio(df):
    """Return the relative frequency of each value in ``df['agg']``.

    The result is a Series indexed by the distinct ``agg`` values, in the
    order produced by ``value_counts`` (most frequent first).
    """
    class_counts = df['agg'].value_counts()
    total = class_counts.sum()
    return class_counts / total

# Class shares of the original (not yet resampled) splits, one row per split.
train_agg_ratio, test_agg_ratio, val_agg_ratio = (
    agg_ratio(split) for split in (train_data, test_data, val_data)
)

pd.DataFrame(
    [train_agg_ratio, test_agg_ratio, val_agg_ratio],
    index=['train', 'test', 'val'],
)

agg,0,3,2,1,5,4
train,0.720539,0.090746,0.057333,0.056091,0.039056,0.036235
test,0.713188,0.091636,0.058761,0.06128,0.038166,0.036969
val,0.714523,0.092507,0.055575,0.060207,0.039069,0.038119


In [28]:
train_data['agg'].value_counts()

agg
0    40606
3     5114
2     3231
1     3161
5     2201
4     2042
Name: count, dtype: int64

In [82]:
len(train_data) * 0.1

5635.5

# Undersampling

In [11]:
def undersample(df, target_col='agg', target_class=0, target_ratio=0.1, random_state=42):
    """Randomly downsample one class and return the shuffled, re-indexed frame.

    Rows whose ``target_col`` equals ``target_class`` are reduced to
    ``int(len(df) * target_ratio)`` rows (or to all of them if the class has
    fewer rows than that); every other row is kept. The ratio is relative to
    the size of the *input* frame, not of the returned one.

    Args:
        df: Input DataFrame; it is not modified.
        target_col: Column holding the class value.
        target_class: Class value to downsample.
        target_ratio: Fraction of ``len(df)`` to keep from ``target_class``.
        random_state: Seed used for both the sampling and the final shuffle.

    Returns:
        A new DataFrame with a fresh ``0..n-1`` index.
    """
    # Split the frame into the class to shrink and everything else.
    majority = df[df[target_col] == target_class]
    others = df[df[target_col] != target_class]

    # Keep only the requested share of the target class. Cap at the class
    # size, because sampling more rows than exist (without replacement)
    # raises a ValueError.
    target_n = min(int(len(df) * target_ratio), len(majority))
    sampled_majority = majority.sample(n=target_n, random_state=random_state)

    # Recombine; sample(frac=1) draws every row once, i.e. it shuffles.
    balanced_df = (
        pd.concat([sampled_majority, others], axis=0)
        .sample(frac=1, random_state=random_state)
        .reset_index(drop=True)
    )
    return balanced_df

In [12]:
# Apply the same class-0 undersampling to every split.
# NOTE(review): the test and validation splits are resampled too, so metrics
# computed on them describe the rebalanced data rather than the original splits.
train_resampled, test_resampled, val_resampled = (
    undersample(split) for split in (train_data, test_data, val_data)
)

In [24]:
[df['agg'].value_counts() for df in [train_resampled, test_resampled, val_resampled]]

[agg
 0    5635
 3    5114
 2    3231
 1    3161
 5    2201
 4    2042
 Name: count, dtype: int64,
 agg
 0    1587
 3    1455
 1     973
 2     933
 5     606
 4     587
 Name: count, dtype: int64,
 agg
 0    842
 3    779
 1    507
 2    468
 5    329
 4    321
 Name: count, dtype: int64]

In [None]:
# Store each split as gzipped JSON Lines so it can be reloaded for tokenization.
def convert_to_jsonl(df, out_path):
    """Write ``df`` to ``out_path`` as gzip-compressed JSON Lines.

    One JSON object is written per row with the keys ``id`` (0-based row
    position), ``query``, ``header``, ``label`` and ``category``.

    Args:
        df: DataFrame with ``query``, ``header``, ``agg`` and ``agg_label``
            columns. Rows are read by position, so any index is accepted.
        out_path: Destination file; opened in gzip text mode and overwritten.
    """
    rows = zip(df['query'], df['header'], df['agg'], df['agg_label'])
    with gzip.open(out_path, 'wt', encoding='utf-8') as f:
        for i, (query, header, label, category) in enumerate(rows):
            # A list of column names becomes one ' * '-separated string.
            # Any other value is only stringified: joining a plain string
            # would insert the separator between its characters.
            if isinstance(header, list):
                header = ' * '.join(str(h) for h in header)
            else:
                header = str(header)

            # json.dumps cannot serialise NumPy integer scalars.
            if isinstance(label, np.integer):
                label = int(label)

            item = {
                'id': i,
                'query': query,
                'header': header,
                'label': label,
                'category': category,
            }

            f.write(json.dumps(item) + '\n')

In [137]:
def header_sep_token(df):
    """Collapse every ``header`` entry of ``df`` into a single string.

    List-valued headers have their items stringified and joined with
    ``' * '``; any other value is passed through ``str``. The ``header``
    column of ``df`` is overwritten in place and the same frame is returned.
    """
    def _as_text(header):
        if not isinstance(header, list):
            return str(header)
        return ' * '.join(str(h) for h in header)

    df['header'] = [_as_text(header) for header in df['header']]
    return df

In [14]:
file_path = '/home/eunji/workspace/kim-internship/Eunji/wikisql_jsonl/'

In [4]:
# Write each resampled split to <file_path>/<split>.jsonl.gz
# (os.path.join(directory, file name) builds the full path).
for split_name, split_df in (
    ('train', train_resampled),
    ('test', test_resampled),
    ('val', val_resampled),
):
    convert_to_jsonl(split_df, os.path.join(file_path, f'{split_name}.jsonl.gz'))

NameError: name 'convert_to_jsonl' is not defined

In [21]:
# The .jsonl.gz files are gzip-compressed, so open one in gzip text mode
# to inspect its first few records.

# Pass the directory and the file name as separate arguments so the path is
# built correctly whether or not `file_path` ends with a separator.
file_check = os.path.join(file_path, 'test.jsonl.gz')
with gzip.open(file_check, 'rt', encoding='utf-8') as f:
    for i, line in enumerate(f):
        print(line)
        # Stops after the line at index 3, i.e. four records are printed.
        if i > 2:
            break

{"id": 0, "query": "Name the most attendance for giants points of 10", "header": "Game * Date * Opponent * Result * Giants points * Opponents * Record * Attendance", "label": 1, "category": "Max"}

{"id": 1, "query": "What is the average (Jericho) with a Population (Total) less than 4,059 in 1986 and a (Barcaldine) higher than 1,779?", "header": "Year * Population (Total) * (Barcaldine) * (Aramac) * (Jericho)", "label": 5, "category": "Average"}

{"id": 2, "query": "Which studio grossed $83,531,958 and ranked lower than 13?", "header": "Rank * Title * Studio * Director(s) * Gross", "label": 0, "category": "None"}

{"id": 3, "query": "What are the fewest number of podiums associated with a Series of formula renault 2000 brazil, and under 1 pole?", "header": "Season * Series * Races * Poles * Wins * Podiums * Points * Position", "label": 2, "category": "Min"}



In [15]:
# Reload the exported splits through the `json` loader.
split_files = {
    'train': 'train.jsonl.gz',
    'validation': 'val.jsonl.gz',
    'test': 'test.jsonl.gz',
}
dataset = load_dataset(
    'json',
    data_files={
        split_name: os.path.join(file_path, file_name)
        for split_name, file_name in split_files.items()
    },
)

In [103]:
# header_sep_token overwrites the `header` column of the frame it receives and
# returns that same frame, so each *_pre_encoding name refers to the same
# object as its *_resampled counterpart.
train_pre_encoding, test_pre_encoding, val_pre_encoding = map(
    header_sep_token,
    (train_resampled, test_resampled, val_resampled),
)

In [126]:
train_pre_encoding['header']

0        Team * Points * Played * Drawn * Lost * Agains...
1           Rank * Player * Country * Earnings( $ ) * Wins
2        Week * Date * Kickoff * Opponent * Final score...
3                    Rank * Gold * Silver * Bronze * Total
4        Lot No. * Diagram * Built * Builder * Fleet nu...
                               ...                        
21379    Driver * Constructor * Laps * Time/Retired * Grid
21380    Matches * Innings * Not out * Runs * High Scor...
21381          Team * City * State * Home venue * Capacity
21382    Game * Date * Team * Score * High points * Hig...
21383    Season * Series * Races * Wins * Poles * F/Lap...
Name: header, Length: 21384, dtype: object

In [16]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def preprocess(example):
    """Tokenize query/header pairs for BERT.

    ``example['query']`` is passed as the first segment and
    ``example['header']`` as the second. Both may be single strings or,
    when called from a batched ``map``, lists of strings.

    Returns:
        The tokenizer output, truncated and padded to 128 tokens.
    """
    return tokenizer(
        example['query'],
        example['header'],
        truncation=True,
        padding='max_length',  # shorter sequences are padded up to max_length
        max_length=128,        # maximum sequence length in tokens
    )

# batched=True hands whole batches of rows to the tokenizer instead of calling
# it once per row; the resulting encodings are the same, only faster to build.
tokenized_dataset = dataset.map(preprocess, batched=True)



In [17]:
from transformers import BertForSequenceClassification

# The number of output classes comes from `Num_labels` (derived from `labels`)
# instead of a hard-coded 6, so the head stays in sync with the label list.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=Num_labels,
    id2label=id2label,
    label2id=label2id,
)
model.to(device)

# The aggregation index ranges over 0-5.
# This is a classification problem -> CrossEntropyLoss.

  return torch.load(checkpoint_file, map_location="cpu")
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassif

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [19]:
# Integer class ids of each resampled split.
# NOTE(review): these come from the in-memory frames while the encodings are
# read back from the exported files; this presumably relies on both having the
# same row order - confirm.
train_labels, val_labels, test_labels = (
    frame['agg'] for frame in (train_resampled, val_resampled, test_resampled)
)

In [20]:
class DataLoader(Dataset):
    """Map-style dataset pairing tokenizer encodings with class labels.

    Despite its name this is a ``Dataset`` subclass: it serves single items
    and does no batching itself.

    Args:
        encodings: Mapping-like object indexed as ``encodings[key][idx]`` for
            the keys ``input_ids``, ``token_type_ids`` and ``attention_mask``.
        labels: Sequence of labels indexed by item position.
    """

    # Encoding fields copied into every item, in this order.
    _FEATURE_KEYS = ('input_ids', 'token_type_ids', 'attention_mask')

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # NOTE(review): if `encodings` is a Hugging Face Dataset, `encodings[key]`
        # may materialise the whole column on every access - verify if slow.
        item = {
            key: torch.tensor(self.encodings[key][idx])
            for key in self._FEATURE_KEYS
        }
        item['labels'] = torch.tensor(self.labels[idx])
        return item

In [21]:
# Wrap each tokenized split together with its labels (as a plain list).
train_dataloader = DataLoader(encodings=tokenized_dataset['train'], labels=list(train_labels))
val_dataloader = DataLoader(encodings=tokenized_dataset['validation'], labels=list(val_labels))
test_dataset = DataLoader(encodings=tokenized_dataset['test'], labels=list(test_labels))

In [22]:
train_dataloader[0]

{'input_ids': tensor([ 101, 2054, 2003, 1996, 3284, 4487, 4246, 2007, 4567, 1997, 1020, 1998,
         2377, 3469, 2084, 2324, 1029,  102, 2136, 1008, 2685, 1008, 2209, 1008,
         4567, 1008, 2439, 1008, 2114, 1008, 4487, 4246,  102,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1

In [23]:
# Metric objects, loaded once and reused on every evaluation pass.
accuracy = evaluate.load("accuracy")
f1_score = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)``; ``predictions`` holds one
            row of per-class logits per example.

    Returns:
        Dict with the keys ``"Accuracy"`` and ``"f1"``, both rounded to three
        decimals.
    """
    logits, references = eval_pred

    # Multi-class case: the predicted class is the one with the largest logit.
    # Converting to probabilities first (softmax) would not change the arg-max.
    predicted_classes = np.argmax(logits, axis=1)

    acc_value = accuracy.compute(
        predictions=predicted_classes,
        references=references,
    )['accuracy']

    # Macro average = unweighted mean of the per-class F1 scores. The original
    # author chose it on the grounds that the classes are similarly sized.
    f1_value = f1_score.compute(
        predictions=predicted_classes,
        references=references,
        average='macro',
    )['f1']

    return {"Accuracy": np.round(acc_value, 3), "f1": np.round(f1_value, 3)}

# Hyperparameters
# NOTE: `lr` is not passed to TrainingArguments below, so the TrainingArguments
# default learning rate is what is actually used.
lr = 4e-5
num_epochs = 2

training_args = TrainingArguments(
    # Output locations
    output_dir=os.path.join(file_path, 'bert-agg'),
    logging_dir=os.path.join(file_path, 'logs'),
    report_to='none',
    # What to run
    do_train=True,
    do_eval=True,
    # Optimisation
    num_train_epochs=num_epochs,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    warmup_steps=100,  # number of steps used for a linear warmup
    weight_decay=0.01,
    fp16=True,
    # Log, evaluate and checkpoint once per epoch
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    # Data loading
    dataloader_num_workers=4,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataloader,
    eval_dataset=val_dataloader,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

In [None]:
# Run inference on the tokenized test split; the predicted class is the
# arg-max over the last axis of the returned predictions.
test_split = tokenized_dataset['test']
preds = trainer.predict(test_split)
pred_labels = np.argmax(preds.predictions, axis=-1)

# Ground-truth labels (list -> array)
true_label = np.array(test_split['label'])

In [None]:
# Accuracy on the test split.
# Stored under its own name: `accuracy` is the metric object that
# compute_metrics calls, and rebinding it to a float here would make any later
# evaluation or training run in this kernel fail.
test_accuracy = accuracy_score(true_label, pred_labels)
print(f"Accuracy: {test_accuracy:.4f}")

In [None]:
from sklearn.metrics import classification_report

print(classification_report(true_label, pred_labels))