In [1]:
import collections
import datetime
import os
import re
import urllib.request

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm
from transformers import BertTokenizer, TFBertModel

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [2]:
# Load the training and validation splits from their Kaggle input CSV files.
train_data, val_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/train-aug10/train_aug10.csv',
        '/kaggle/input/validationdataa/val.csv',
    )
)

In [3]:
# Split each frame into its text column ('conversation') and label column ('class').
x_train, y_train = train_data['conversation'], train_data['class']
x_val, y_val = val_data['conversation'], val_data['class']

In [4]:
# Sanity check: number of texts and labels in each split.
tuple(map(len, (x_train, y_train, x_val, y_val)))

(31600, 31600, 790, 790)

In [5]:
# Integer class id assigned to each conversation class name.
label_mapping = {
    '협박 대화': 0,
    '갈취 대화': 1,
    '직장 내 괴롭힘 대화': 2,
    '기타 괴롭힘 대화': 3
}

# Convert the string labels of the training and validation sets to integer ids.
# The labels are mapped from the source frames instead of from y_train / y_val
# themselves so that this cell is idempotent: re-running
# `y_train = y_train.map(label_mapping)` would look the already-converted
# integers up in the mapping and silently turn every label into NaN.
y_train = train_data['class'].map(label_mapping)
y_val = val_data['class'].map(label_mapping)

# Series.map yields NaN for any value that is not a key of the mapping; fail
# here with a clear message rather than later during array conversion.
assert y_train.notna().all(), "train_data['class'] contains labels missing from label_mapping"
assert y_val.notna().all(), "val_data['class'] contains labels missing from label_mapping"

In [6]:
# Sequence length (in tokens) that every encoded text is padded/truncated to;
# passed as max_seq_len / max_length to the tokenizer calls below.
max_len = 400
# Number of training epochs passed to model.fit.
# NOTE(review): the saved fit output reads "Epoch 1/3", so it was produced with
# a different value than the one set here — re-run the notebook to refresh it.
EPOCH = 2
# Learning rate of the Adam optimizer.
lr = 2e-5

In [7]:
# Load the tokenizer of the 'klue/bert-base' checkpoint; the same checkpoint
# name is used further down when the classification model is built.
tokenizer = BertTokenizer.from_pretrained('klue/bert-base')

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/289 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/425 [00:00<?, ?B/s]

In [8]:
def convert_examples_to_features(examples, labels, max_seq_len, tokenizer):
    """Encode texts and labels into fixed-length BERT input arrays.

    Args:
        examples: Sized iterable of input texts.
        labels: Iterable of integer class ids aligned with ``examples``.
        max_seq_len: Length every encoded sequence is padded/truncated to.
        tokenizer: Object exposing ``pad_token_id`` and
            ``encode(text, max_length=..., padding=..., truncation=...)``
            (e.g. a Hugging Face tokenizer).

    Returns:
        ``((input_ids, attention_masks, token_type_ids), data_labels)``.
        The three feature arrays are integer arrays of shape
        ``(len(examples), max_seq_len)``; ``data_labels`` is an ``int32`` array.

    Raises:
        AssertionError: If an encoded sequence is not ``max_seq_len`` long.
    """
    input_ids, attention_masks, token_type_ids, data_labels = [], [], [], []
    pad_id = tokenizer.pad_token_id

    for example, label in tqdm(zip(examples, labels), total=len(examples)):
        # Integer encoding of the text, padded/truncated to max_seq_len.
        # `padding='max_length'` + `truncation=True` replace the deprecated
        # `pad_to_max_length=True` and make the truncation explicit.
        input_id = tokenizer.encode(
            example,
            max_length=max_seq_len,
            padding='max_length',
            truncation=True,
        )
        assert len(input_id) == max_seq_len, "Error with input length {} vs {}".format(len(input_id), max_seq_len)

        # The attention mask is 1 on real tokens and 0 on padding. Only the
        # trailing run of pad ids is padding; counting every pad id in the
        # sequence would also zero out a real token whenever the pad id
        # happens to occur inside the text itself.
        padding_count = 0
        for token_id in reversed(input_id):
            if token_id != pad_id:
                break
            padding_count += 1
        attention_mask = [1] * (max_seq_len - padding_count) + [0] * padding_count

        # Segment ids: every input is a single segment, so all zeros.
        token_type_id = [0] * max_seq_len

        assert len(attention_mask) == max_seq_len, "Error with attention mask length {} vs {}".format(len(attention_mask), max_seq_len)
        assert len(token_type_id) == max_seq_len, "Error with token type length {} vs {}".format(len(token_type_id), max_seq_len)

        input_ids.append(input_id)
        attention_masks.append(attention_mask)
        token_type_ids.append(token_type_id)
        data_labels.append(label)

    # The explicit reshape keeps the (n_examples, max_seq_len) shape even when
    # no examples were given (a bare np.array([]) would have shape (0,)).
    n_examples = len(input_ids)
    input_ids = np.array(input_ids, dtype=int).reshape(n_examples, max_seq_len)
    attention_masks = np.array(attention_masks, dtype=int).reshape(n_examples, max_seq_len)
    token_type_ids = np.array(token_type_ids, dtype=int).reshape(n_examples, max_seq_len)

    data_labels = np.asarray(data_labels, dtype=np.int32)

    return (input_ids, attention_masks, token_type_ids), data_labels

In [27]:
# Tokenise both splits into fixed-length (max_len) BERT inputs plus label arrays.
train_X, train_y = convert_examples_to_features(x_train, y_train, max_len, tokenizer)
val_X, val_y = convert_examples_to_features(x_val, y_val, max_len, tokenizer)

100%|██████████| 31600/31600 [00:48<00:00, 645.69it/s]
100%|██████████| 790/790 [00:01<00:00, 663.25it/s]


In [10]:
class TFBertForMultiClassClassification(tf.keras.Model):
    """BERT encoder followed by a softmax dense layer for multi-class classification.

    Args:
        model_name: Checkpoint name handed to ``TFBertModel.from_pretrained``.
        num_classes: Number of units (classes) of the output layer.
    """

    def __init__(self, model_name, num_classes):
        super().__init__()
        # from_pt=True: initialise the TF model from the PyTorch checkpoint.
        self.bert = TFBertModel.from_pretrained(model_name, from_pt=True)
        head_initializer = tf.keras.initializers.TruncatedNormal(0.02)
        self.classifier = tf.keras.layers.Dense(
            num_classes,
            kernel_initializer=head_initializer,
            activation='softmax',
            name='classifier',
        )

    def call(self, inputs):
        """Return per-class probabilities for a batch.

        Args:
            inputs: A 3-item sequence ``(input_ids, attention_mask, token_type_ids)``.

        Returns:
            Output of the softmax classifier applied to the second element of
            the BERT outputs.
        """
        input_ids, attention_mask, token_type_ids = inputs
        bert_outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        # Element 1 of the BERT outputs serves as the sentence-level
        # representation fed to the classification head.
        sentence_repr = bert_outputs[1]
        return self.classifier(sentence_repr)

In [11]:
# Build the classifier on top of the 'klue/bert-base' checkpoint with four output classes.
model = TFBertForMultiClassClassification(model_name="klue/bert-base", num_classes=4)

Downloading pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.bias', 'bert.embeddings.position_ids', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the 

In [12]:
# The labels are integer class ids, hence the sparse categorical cross-entropy.
loss = tf.keras.losses.SparseCategoricalCrossentropy()
optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
model.compile(
    loss=loss,
    optimizer=optimizer,
    metrics=['accuracy'],
)

In [None]:
# Fine-tune the model on the tokenised training features.
# Validation must use the tokenised arrays (val_X, val_y) produced by
# convert_examples_to_features; the raw x_val / y_val Series hold text and
# labels, not the (input_ids, attention_mask, token_type_ids) triple that the
# model's call() unpacks.
history = model.fit(
    train_X, train_y,
    epochs=EPOCH, batch_size=16,
    validation_data=(val_X, val_y)
)

Epoch 1/3
  19/1975 [..............................] - ETA: 45:41 - loss: 0.0052 - accuracy: 1.0000

In [24]:
def get_predicted_class(new_sentence):
    """Predict the conversation class name of a single text.

    Uses the notebook-level ``tokenizer``, ``model`` and ``max_len``.

    Args:
        new_sentence: Text to classify.

    Returns:
        The class name (one of the four Korean class labels) with the highest
        predicted score.
    """
    # `padding='max_length'` + `truncation=True` replace the deprecated
    # `pad_to_max_length=True` and make the truncation explicit.
    input_id = tokenizer.encode(new_sentence,
                                max_length=max_len,
                                padding='max_length',
                                truncation=True)

    # Only the trailing run of pad ids is padding; counting every pad id would
    # also mask a real token if the pad id occurs inside the text itself.
    pad_id = tokenizer.pad_token_id
    padding_count = 0
    for token_id in reversed(input_id):
        if token_id != pad_id:
            break
        padding_count += 1
    attention_mask = [1] * (max_len - padding_count) + [0] * padding_count
    token_type_id = [0] * max_len

    # Add a batch dimension of size 1 to each input.
    input_ids = np.array([input_id])
    attention_masks = np.array([attention_mask])
    token_type_ids = np.array([token_type_id])

    encoded_input = [input_ids, attention_masks, token_type_ids]
    # verbose=0: this function is called once per row, so the default
    # per-call progress bar would flood the cell output.
    score = model.predict(encoded_input, verbose=0)[0]

    # Order matches the integer ids the model was trained with (0..3).
    class_names = ['협박 대화', '갈취 대화', '직장 내 괴롭힘 대화', '기타 괴롭힘 대화']
    predicted_class = class_names[np.argmax(score)]

    return predicted_class

In [17]:
# Read the test set and transpose it (so the t_000, t_001, ... ids become the rows).
df = pd.read_json('/kaggle/input/etstest/test.json')
df_flipped = df.T

# Show the first 5 rows
df_flipped.head(5)

Unnamed: 0,text
t_000,아가씨 담배한갑주소 네 4500원입니다 어 네 지갑어디갔지 에이 버스에서 잃어버렸나...
t_001,우리팀에서 다른팀으로 갈 사람 없나? 그럼 영지씨가 가는건 어때? 네? 제가요? ...
t_002,너 오늘 그게 뭐야 네 제가 뭘 잘못했나요.? 제대로 좀 하지 네 똑바로 좀 하지 ...
t_004,아무튼 앞으로 니가 내 와이파이야. .응 와이파이 온. 켰어. 반말? 주인님이라고도...
t_005,그러니까 빨리 말해. 선생님 제발 살려주십시오. 비밀번호 틀릴 때마다 손톱 하나씩...


In [18]:
# Submission code (two-digit string) for each conversation class name.
class_mapping = dict(zip(
    ['협박 대화', '갈취 대화', '직장 내 괴롭힘 대화', '기타 괴롭힘 대화'],
    ['00', '01', '02', '03'],
))

In [25]:
# Predict a class name for the text of every row and translate it into the
# submission code via class_mapping.
predicted_codes = df_flipped['text'].apply(
    lambda text: class_mapping[get_predicted_class(text)]
)
df_flipped['class'] = predicted_codes

# Drop the 'text' column so only the predicted class remains.
df_flipped.drop(columns=['text'], inplace=True)



In [26]:
# Get the current date and time.
now = datetime.datetime.now()

# Format it as YYYYMMDDTHHMM, e.g. 20230810T1530.
date_time_str = now.strftime("%Y%m%dT%H%M")

# Build a timestamped file name so successive runs do not overwrite each other.
file_name = f"submission{date_time_str}.csv"

# Write the predictions to the timestamped submission CSV.
df_flipped.to_csv(file_name)