In [None]:
import sys
import numpy as np
import pandas as pd
import random
from tqdm.notebook import tqdm
import os
import re
import time

from transformers import BertTokenizer, AdamWeightDecay, TFRobertaModel, TFBertModel

import tensorflow as tf
import keras
from keras.callbacks import EarlyStopping, ModelCheckpoint

import sklearn
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import StratifiedKFold

In [None]:
# Google Drive locations for input data, saved model weights and submissions
data_path='/content/drive/MyDrive/Dacon/뉴스 토픽 분류 AI/data/'
model_path='/content/drive/MyDrive/Dacon/뉴스 토픽 분류 AI/model/'
sub_path='/content/drive/MyDrive/Dacon/뉴스 토픽 분류 AI/sub/'

# Read the three CSV files from the data directory, in this order:
# training set, test set, submission template
train, test, sample_submission = (
    pd.read_csv(os.path.join(data_path, csv_name))
    for csv_name in ('train_data.csv', 'test_data.csv', 'sample_submission.csv')
)

# Tokenizer for the klue/roberta-large checkpoint (loaded through BertTokenizer).
# This is the module-level `tokenizer` that convert_data() uses.
tokenizer = BertTokenizer.from_pretrained('klue/roberta-large')

# 5. Processing confusing cases
- Extract unusual sentences (near-identical titles that carry different labels) using token similarity, for the frequently confused labels 1, 2 and 3

In [None]:
# Calculate text similarity
def return_similarity(a, b):
    """Return the Jaccard similarity of two sets: |a ∩ b| / |a ∪ b|.

    Args:
        a: first set (here: the set of tokens of one title).
        b: second set.

    Returns:
        A float in [0.0, 1.0]. Two empty sets give 0.0 rather than raising
        ZeroDivisionError, so that empty token lists never count as similar.
    """
    shared = len(a.intersection(b))
    # |a ∪ b| computed by inclusion-exclusion
    union_size = len(a) + len(b) - shared
    if union_size == 0:
        return 0.0
    return float(shared) / union_size

In [None]:
# BERT tokenizer used to split titles into tokens for the similarity check
tokenizer_bert=BertTokenizer.from_pretrained('klue/bert-base')

# Rows of the train data whose label is 1, 2 or 3
train_data=train
# .copy() makes train_1_2_3 an independent frame, so adding columns to it later
# does not trigger SettingWithCopyWarning or depend on view/copy semantics.
train_1_2_3=train_data[train_data['topic_idx'].isin([1,2,3])].copy()
# Reuse the index of the filtered frame instead of evaluating the filter twice
train_1_2_3_index=train_1_2_3.index

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=248477.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=125.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=289.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=428.0, style=ProgressStyle(description_…




In [None]:
# To calculate text similarity, tokenize every title with label 1, 2 or 3 and
# store the space-joined tokens in a new 'token_list' column.
# The Series is indexed by the original row labels so it aligns on assignment.
token_strings = pd.Series(
    [' '.join(tokenizer_bert.tokenize(title)) for title in train_1_2_3['title']],
    index=train_1_2_3_index,
    dtype=object,
)

# assign() returns a new frame, so no chained assignment (df[col][i] = ...) is
# needed; chained assignment is unreliable and is a silent no-op under pandas
# Copy-on-Write.
train_1_2_3 = train_1_2_3.assign(token_list=token_strings)

# In the original train data every other row keeps an empty token list
train['token_list'] = ''
train.loc[train_1_2_3_index, 'token_list'] = token_strings

In [None]:
# Collect pairs of sentences with high text similarity but different labels.
# set_train_log holds one token set per row of `train` and is indexed below by
# the row labels in train_1_2_3_index.
# NOTE(review): assumes train has the default 0..n-1 index so labels equal positions — confirm
set_train_log = [set(log.split()) for log in train['token_list']]
sim_idx=[]
sim_i=[]
sim_j=[]
top_i=[]
top_j=[]

# Label lookup as a plain dict: avoids a pandas scalar lookup inside the O(n^2) loop
topic_of = dict(zip(train_1_2_3_index, train_1_2_3['topic_idx'].values))

# If the similarity is at least 0.5 and the labels differ, the pair is judged as
# a similar sentence. Both loops run over all indexes, so each pair is recorded
# in both orders: (i, j) and (j, i).
for i in tqdm(train_1_2_3_index):
    tokens_i = set_train_log[i]
    topic_i = topic_of[i]
    for j in train_1_2_3_index:
        if i==j:
            continue
        topic_j = topic_of[j]
        # Cheap label comparison first; the set similarity is only computed
        # for pairs that could actually be recorded.
        if topic_i == topic_j:
            continue
        if return_similarity(tokens_i, set_train_log[j]) >= 0.5:
            top_i.append(topic_i)
            top_j.append(topic_j)
            sim_i.append(i)
            sim_j.append(j)

HBox(children=(FloatProgress(value=0.0, max=19517.0), HTML(value='')))




In [None]:
# Pairs of similar titles whose labels nevertheless differ.
# top_1 / top_2 are the topic labels of sent1 / sent2 respectively.
crazy = pd.DataFrame(
    {
        'sent1_idx': sim_i,
        'sent2_idx': sim_j,
        'sent1': train_1_2_3['title'].loc[sim_i].to_numpy(),
        'sent2': train_1_2_3['title'].loc[sim_j].to_numpy(),
        'top_1': top_i,
        'top_2': top_j,
    }
)
crazy

Unnamed: 0,sent1_idx,sent2_idx,sent1,sent2,top_1,top_2
0,928,9817,인사말 하는 황창규 회장,기조연설 하는 황창규 KT회장,1,2
1,928,13291,인사말 하는 황창규 회장,KT 부스 살펴보는 황창규 회장,1,2
2,1155,7900,네이버 CEO 아름다운 바통 터치,네이버 CEO 아름다운 바통 터치종합,1,2
3,1576,11793,경기도 11개 시 19시 오존주의보 해제,경기도 25개 시·군 오존주의보 모두 해제,3,2
4,1606,8075,삼성 새만금 투자약속 철회 진실 밝혀라 목소리 봇물,삼성 새만금 투자약속 철회 진실 밝혀라 목소리 봇물종합,1,2
...,...,...,...,...,...,...
243,43378,16697,거래소 지엔코 시황변동 조회공시 요구,거래소 대성산업에 시황변동 관련 조회공시 요구,1,2
244,43381,37874,그래픽 경력단절 여성 경제활동 실태,파주시 경력단절 여성 경제활동 촉진 조례 제정,1,2
245,43548,17796,한국작가회의 새 임원 선출… 이경자 이사장과 한창훈 사무총장,한국작가회의 새 이사장으로 선출된 이경자 작가,2,3
246,43559,21728,LH 청년·신혼부부 매입임대 6천850가구 입주자 모집,LH 신혼부부·청년주택 109가구 입주자 모집,2,1


In [None]:
# Persist the table of odd pairs as a .npy file in the data directory.
# np.save is handed the DataFrame itself, so it is stored as an array without column names.
np.save(os.path.join(data_path,'sim_1_2_3_0point5_klue_bert.npy'),crazy)

# 6. Modeling & Prediction

+ klue/roberta-large
+ Stratified 5-fold ensemble
+ Back-translated data is added to the training data, and the confusing-label sentences are always included in the training set
+ AdamWeightDecay optimizer

In [None]:
# Set seed
def reset_seeds(seed, reset_graph_with_backend=None):
    """Seed NumPy, `random` and TensorFlow, optionally resetting the graphs first.

    Args:
        seed: base seed. NumPy is seeded with `seed`, `random` with `seed+100`
            and TensorFlow with `seed+200`.
        reset_graph_with_backend: optional object exposing `clear_session()`
            (presumably the Keras backend module — confirm with callers). When
            given, its session is cleared and the default TensorFlow graph is
            reset before seeding.
    """
    backend = reset_graph_with_backend
    if backend is not None:
        backend.clear_session()
        tf.compat.v1.reset_default_graph()
        print("KERAS AND TENSORFLOW GRAPHS RESET")

    # Each library gets its own offset from the base seed
    seeders = (
        (np.random.seed, 0),
        (random.seed, 100),
        (tf.compat.v1.set_random_seed, 200),
    )
    for set_seed, offset in seeders:
        set_seed(seed + offset)

    # NOTE(review): an empty CUDA_VISIBLE_DEVICES hides all GPUs from any CUDA
    # context created after this point — confirm this is intended before
    # training a large model.
    os.environ['CUDA_VISIBLE_DEVICES'] = ''
    print(f"RANDOM SEEDS RESET {seed}")

SEED = 1514
reset_seeds(SEED)

In [None]:
# Back-translated version of the training titles
back_train=pd.read_csv(data_path+'back_train_fin2.csv')
back_train=back_train[['index','back_title','topic_idx']]
# Rename explicitly instead of copying train.columns: train may already carry
# extra columns (e.g. 'token_list'), which would make a positional column
# assignment fail with a length mismatch.
back_train=back_train.rename(columns={'back_title':'title'})

# Load the pairs of odd sentences extracted earlier.
# NOTE: allow_pickle=True unpickles arbitrary objects; only load a file that
# this notebook wrote itself.
crazy=pd.DataFrame(np.load(data_path+'sim_1_2_3_0point5_klue_bert.npy',allow_pickle=True),columns=['sent1_idx','sent2_idx','sent1','sent2','top_1','top_2'])
# Row indexes of every sentence that appears on either side of a pair.
# (The previous version unpacked sent1_idx twice and never used sent2_idx.)
sim_idx=list(set([*crazy.sent1_idx.values,*crazy.sent2_idx.values]))

In [None]:
# Cross validation: stratified 5-fold split on the topic label, shuffled with a fixed seed
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# One (train_idx, valid_idx) pair of index arrays per fold
folds=[]

# Unusual sentences (sim_idx) must be included in the training data of every fold
# Separate the train set and the validation set in each fold
# NOTE(review): sim_idx is merged with the row positions produced by skf.split,
# which presumes train has the default 0..n-1 index — confirm
for train_idx, valid_idx in skf.split(train, train['topic_idx']):
    # Union of the fold's training rows and the unusual sentences (duplicates removed by the set)
    train_idx = np.array(list(set(list(train_idx)+list(sim_idx))))
    # Validation rows minus the unusual sentences, so no row is in both sets
    valid_idx = np.array(list(set(set(valid_idx)-set(sim_idx))))
    folds.append((train_idx, valid_idx))

In [None]:
def convert_data(data_df,case,mask_token):
    """Tokenize the DATA_COLUMN texts of `data_df` into fixed-length model inputs.

    Uses the module-level `tokenizer`, `SEQ_LEN`, `DATA_COLUMN` and `LABEL_COLUMN`.

    Args:
        data_df: DataFrame with a DATA_COLUMN column, plus a LABEL_COLUMN
            column when case is 'train'.
        case: 'train' to also return the labels, 'test' to return inputs only.
        mask_token: token id whose occurrences are counted as padding when the
            attention mask is built.

    Returns:
        case == 'train': ([tokens, masks, segments], targets)
        case == 'test':  [tokens, masks, segments]
        tokens, masks and segments are arrays with one row of length SEQ_LEN
        per input row.

    Raises:
        ValueError: if `case` is neither 'train' nor 'test' (previously the
            function silently returned None).
    """
    if case not in ('train', 'test'):
        raise ValueError("case must be 'train' or 'test', got {!r}".format(case))
    with_labels = case == 'train'

    texts = data_df[DATA_COLUMN]
    labels = data_df[LABEL_COLUMN] if with_labels else None

    tokens, masks, segments, targets = [], [], [], []

    for i in tqdm(range(len(data_df))):
        # Positional access (.iloc) so the frame does not need a 0..n-1 index
        token = tokenizer.encode(texts.iloc[i], max_length=SEQ_LEN, padding='max_length',truncation=True)

        # Input mask: 1 for real tokens, 0 for padding. The padding is taken to
        # be the trailing `num_zeros` positions of the sequence.
        num_zeros = token.count(mask_token)
        mask = [1]*(SEQ_LEN-num_zeros) + [0]*num_zeros

        # Segment ids: every token belongs to segment 0
        segment = [0]*SEQ_LEN

        tokens.append(token)
        masks.append(mask)
        segments.append(segment)

        if with_labels:
            targets.append(labels.iloc[i])

    # Convert to array format
    inputs = [np.array(tokens), np.array(masks), np.array(segments)]
    if with_labels:
        return inputs, np.array(targets)
    return inputs

 
# Load data and convert to BERT input format
def load_data(pandas_dataframe,case,mask_token):
    """Cast the text/label columns and convert the frame to model inputs.

    Args:
        pandas_dataframe: DataFrame with a DATA_COLUMN column, plus a
            LABEL_COLUMN column when case is 'train'. It is not modified.
        case: 'train' (inputs and labels) or 'test' (inputs only).
        mask_token: forwarded unchanged to convert_data.

    Returns:
        case == 'train': (data_x, data_y) as returned by convert_data.
        case == 'test':  data_x as returned by convert_data.

    Raises:
        ValueError: if `case` is neither 'train' nor 'test' (previously the
            function silently returned None).
    """
    if case not in ('train', 'test'):
        raise ValueError("case must be 'train' or 'test', got {!r}".format(case))

    # Work on a copy so the caller's frame (e.g. the global `test`) is not mutated
    data_df = pandas_dataframe.copy()
    data_df[DATA_COLUMN] = data_df[DATA_COLUMN].astype(str)
    if case=='train':
        data_df[LABEL_COLUMN] = data_df[LABEL_COLUMN].astype(int)
    return convert_data(data_df,case,mask_token)

# Define max_len and the text / label column names used by load_data and convert_data
SEQ_LEN = 30
DATA_COLUMN = "title"
LABEL_COLUMN = "topic_idx"

# The literal 1 passed to load_data below is its `mask_token` argument.

# train: for every fold, the fold's training rows plus the back-translated
# versions of the same rows. The results are bound to train_x0..train_x4 and
# train_y0..train_y4, the names the training loop looks up.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0.
for k, (fold_train_idx, _) in enumerate(folds):
    fold_train_df = pd.concat(
        [train.iloc[fold_train_idx], back_train.iloc[fold_train_idx]]
    ).reset_index(drop=True)
    globals()['train_x{}'.format(k)], globals()['train_y{}'.format(k)] = load_data(fold_train_df,'train',1)

# valid: original (not back-translated) rows only, bound to valid_x0..valid_x4 / valid_y0..valid_y4
for k, (_, fold_valid_idx) in enumerate(folds):
    fold_valid_df = train.iloc[fold_valid_idx].reset_index(drop=True)
    globals()['valid_x{}'.format(k)], globals()['valid_y{}'.format(k)] = load_data(fold_valid_df,'train',1)

# test
test_x = load_data(test,'test',1)

HBox(children=(FloatProgress(value=0.0, max=73104.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=73120.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=73144.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=73120.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=73114.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=9102.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=9094.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=9082.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=9094.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=9097.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=9131.0), HTML(value='')))




In [None]:
# Define RobertaModel using pretrained model
class Klue_RobertaClassifier(tf.keras.Model):
    """klue/roberta-large encoder followed by dropout and a dense classification head.

    Args:
        num_class: number of output classes (units of the final dense layer).
    """

    def __init__(self, num_class):
        super().__init__()

        # Pretrained encoder, converted from the PyTorch checkpoint
        self.bert = TFRobertaModel.from_pretrained("klue/roberta-large", from_pt=True)
        self.dropout = tf.keras.layers.Dropout(self.bert.config.hidden_dropout_prob)
        # initializer_range is passed as `stddev`. The first positional
        # argument of TruncatedNormal is `mean`, so passing it positionally
        # shifted the mean and left stddev at its default.
        self.classifier = tf.keras.layers.Dense(
            num_class,
            kernel_initializer=tf.keras.initializers.TruncatedNormal(
                stddev=self.bert.config.initializer_range, seed=42),
            name="classifier")

    def call(self, inputs, attention_mask=None, token_type_ids=None, training=False):
        """Return the unnormalised class logits for `inputs`.

        `inputs`, `attention_mask` and `token_type_ids` are forwarded to the
        encoder unchanged; `training` switches the head's dropout on or off.
        """
        # outputs value : sequence_output, pooled_output, (hidden_states), (attentions)
        outputs = self.bert(inputs, attention_mask=attention_mask, token_type_ids=token_type_ids)
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output, training=training)
        logits = self.classifier(pooled_output)

        return logits

klue_roberta_model = Klue_RobertaClassifier(num_class=7)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1346854671.0, style=ProgressStyle(descr…




Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaModel: ['lm_head.dense.bias', 'roberta.embeddings.position_ids', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing TFRobertaModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaModel were not initialized from the PyTorch model and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream 

In [None]:
# Train one model per fold in a for loop and store the best weights of each fold.
for fold_no in range(5):
  print('########## Fold {} : \n'.format(fold_no))

  # Inputs prepared earlier for this fold
  fold_train_x = globals()['train_x{}'.format(fold_no)]
  fold_train_y = globals()['train_y{}'.format(fold_no)]
  fold_valid_x = globals()['valid_x{}'.format(fold_no)]
  fold_valid_y = globals()['valid_y{}'.format(fold_no)]

  # A fresh model for every fold
  klue_roberta_model = Klue_RobertaClassifier(num_class=7)

  # Loss on raw logits, AdamWeightDecay optimizer, accuracy as the metric
  klue_roberta_model.compile(
      optimizer=AdamWeightDecay(1e-5,weight_decay_rate=1e-4),
      loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
      metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')],
  )

  # Early stopping on validation accuracy to prevent overfitting
  earlystop_callback = EarlyStopping(monitor='val_accuracy', min_delta=0.0001,patience=2)

  # Keep only the weights of the epoch with the best validation accuracy
  checkpoint_path = os.path.join(model_path,'weight_klue_roberta_back_skf_fold_v0{}.h5'.format(fold_no))
  cp_callback = ModelCheckpoint(checkpoint_path, monitor='val_accuracy', verbose=1, save_best_only=True, save_weights_only=True)

  # Training
  history = klue_roberta_model.fit(
      fold_train_x, fold_train_y,
      epochs=3, batch_size=32,
      validation_data=(fold_valid_x, fold_valid_y),
      callbacks=[earlystop_callback, cp_callback],
  )

  # Restore the best checkpoint of this fold before evaluating it
  klue_roberta_model.load_weights(checkpoint_path)

  preds=tf.argmax(klue_roberta_model.predict(fold_valid_x),axis=1)

  print('Validation set ACC: ',accuracy_score(fold_valid_y,preds))
  print('Validation set Confusion Matrix: \n',confusion_matrix(fold_valid_y,preds))

In [None]:
# Load the saved weights of every fold in turn and predict the test data.
# klue_roberta_model is the instance already in scope; only its weights are
# swapped. Each fold's output is also bound to results_0..results_4.
results_test_list=[]
for fold_no in range(0,5):
  klue_roberta_model.load_weights(os.path.join(model_path,'weight_klue_roberta_back_skf_fold_v0{}.h5'.format(fold_no)))
  fold_output = klue_roberta_model.predict(test_x)
  globals()['results_{}'.format(fold_no)] = fold_output
  results_test_list.append(fold_output)

# Save the list of per-fold predictions for the test data
np.save(data_path+'results_klue_roberta_back_list_v01.npy',results_test_list)

In [None]:
# Ensemble prediction
# Average the model outputs of the five folds and take the arg-max class per row
ensemble_sum = results_test_list[0]
for fold_output in results_test_list[1:5]:
    ensemble_sum = ensemble_sum + fold_output
test_pred=tf.argmax(ensemble_sum/5,axis=1)
sample_submission['topic_idx']=test_pred

# Preview the predictions next to the test titles
pd.merge(test,sample_submission).head(30)

Unnamed: 0,index,title,topic_idx
663,46317,美연준 美경제 완만한 성장 지속…단기 낙관론 유지,4
1218,46872,한국당 특검 거론하며 曺 낙마 총공세…구속수사감 檢압박종합,6
7837,53491,北 김정일 5주기 띄우기…사이렌에 묵념도종합,6
8164,53818,소재·부품 기업 오찬 간담회 참석한 성윤모 장관,6
3564,49218,문 대통령 불합리한 보호무역 조치에 당당하고 결연히 대응,6
2115,47769,아시아나 마일리지로 갤럭시S9 싸게 산다…5천대 한정 판매,0
1549,47203,北 러시아에 원산블라디보스토크 여객선 취항 제안,6
2986,48640,LG유플러스 골드번호 5천개 추첨행사,0
7597,53251,특징주 오성엘에스티 무상감자·유상증자 소식에 신저가종합,1
2,45656,내년부터 국가RD 평가 때 논문건수는 반영 않는다,2
