In [12]:
import os
# Display the current working directory; the relative 'data/...' paths read
# later in this notebook are resolved against it.
os.getcwd()
# Uncomment to switch into the project folder when the working directory differs:
# os.chdir('drive/MyDrive/NLP/Dacon_NH competition/')

'/content/drive/My Drive/NLP/Dacon_NH competition'

In [13]:
%pip install wandb -q

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SpatialDropout1D, LSTM, Dropout, Dense
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import EarlyStopping

import wandb
from wandb.keras import WandbCallback
# Authenticate this session with Weights & Biases before any sweep or run is created.
wandb.login()

True

In [15]:
import logging
# Raise the threshold of the "wandb" logger so only ERROR (and above) records pass.
logger = logging.getLogger(name="wandb")
logger.setLevel(level=logging.ERROR)

In [16]:
# Load the training CSV; missing cells become empty strings so that every
# entry of the 'text' column can be split as a string.
train = pd.read_csv('data/news_train_preprocessing_Mecab_12.24.csv').fillna('')

# Split each text on single spaces, then record the longest token count,
# which is used as the padding length for every sequence below.
token_list = list(map(lambda text: text.split(' '), train['text']))
max_len = max(map(len, token_list))

def text2sequence(train_text, max_len, max_words = 10000):
  """Fit a Keras ``Tokenizer`` on the training texts and return padded integer sequences.

  Args:
    train_text: Training documents handed to ``Tokenizer.fit_on_texts`` and
      ``Tokenizer.texts_to_sequences``.
    max_len: Length passed to ``pad_sequences`` as ``maxlen`` (the caller in
      this notebook passes the longest document's token count).
    max_words: Value passed to the tokenizer as ``num_words``. Defaults to
      10000, the value that was previously hard-coded here.

  Returns:
    A tuple ``(X_train, vocabulary, vocab_size, tokenizer)``:
      X_train: output of ``pad_sequences`` for the encoded training texts.
      vocabulary: the tokenizer's ``word_index`` mapping.
      vocab_size: ``len(word_index) + 1``.
      tokenizer: the fitted tokenizer, so other texts can be encoded identically.
  """
  tokenizer = Tokenizer(num_words = max_words)              # Keras text vectorizer
  tokenizer.fit_on_texts(train_text)                        # build the word index from the training texts only
  train_X_seq = tokenizer.texts_to_sequences(train_text)    # map each token to its integer id
  vocabulary = tokenizer.word_index
  # +1 accounts for the padding value 0. NOTE: word_index holds every word seen
  # during fitting, so this count is NOT capped by `max_words`; a model fed with
  # these sequences should size its embedding by `max_words`, not by this value.
  vocab_size = len(vocabulary) + 1
  X_train = pad_sequences(train_X_seq, maxlen = max_len)    # pad every sequence to the requested length

  return X_train, vocabulary, vocab_size, tokenizer

# Labels and padded integer sequences for the training set.
train_y = train['info']
train_X, vocabulary, vocab_size, vectorizer = text2sequence(token_list, max_len)

# Load the test CSV and split its text on single spaces, as done for training.
test = pd.read_csv('data/news_test_preprocessing_Mecab_12.24.csv').fillna('')
token_list_test = list(map(lambda text: text.split(' '), test['text']))

# Encode the test tokens with the tokenizer fitted on the training texts,
# then pad them to the same length as the training sequences.
test_X_seq = vectorizer.texts_to_sequences(token_list_test)
X_test = pad_sequences(test_X_seq, maxlen = max_len)

# Submission template whose 'info' column is filled with predictions later.
submission_data = pd.read_csv('data/sample_submission.csv')

In [23]:
# Sweep configuration: the search strategy, the metric to optimise, and the
# candidate values of every hyper-parameter to search through.

sweep_config = {
    'method': 'random', # grid, random
    'metric': {'name': 'val_loss', 'goal': 'minimize'},
    # Each hyper-parameter is exposed to wandb as {'values': [candidates...]}.
    'parameters': {
        param_name: {'values': candidates}
        for param_name, candidates in {
            'embedding_dim': [50, 100, 150, 200],
            'SpatialDropout1D': [0.1, 0.2, 0.3, 0.4, 0.5],
            'LSTM_hidden': [16, 32, 64],
            'dropout_rate1': [0.2, 0.3, 0.4, 0.5],
            'dropout_rate2': [0.2, 0.3, 0.4, 0.5],
            'regularizer': [0.01, 0.001, 0.0001],
            'learning_rate': [1e-2, 1e-3, 1e-4, 3e-4, 3e-5, 1e-5],
            'optimizer': ['adam', 'rmsprop'],
            'batch_size': [32, 64],
            'epochs': [3, 5, 10, 20],
        }.items()
    },
}

In [24]:
# Register a new sweep with wandb and keep the returned id for the agent.
# Arguments:
#     – sweep_config: the sweep configuration dictionary defined above
#     – entity: the username (or team) that owns the sweep
#     – project: the project the sweep is created under

sweep_id = wandb.sweep(
    sweep_config,
    entity="ldc",
    project="NH_competition_keras_Embedding",
)

Create sweep with ID: snalj0xq
Sweep URL: https://wandb.ai/ldc/NH_competition_test/sweeps/snalj0xq


In [25]:
def train():
  """Run one sweep trial: build, fit and apply an Embedding + LSTM binary classifier.

  Hyper-parameters are read from ``wandb.config`` (falling back to
  ``default_config`` below). The function takes no arguments and returns
  nothing; it relies on notebook-level globals: ``train_X``, ``train_y``,
  ``X_test``, ``max_len``, ``vectorizer`` and ``submission_data``.

  Side effects:
    - starts a wandb run and logs training through ``WandbCallback``;
    - overwrites ``submission_data['info']`` in memory with 0/1 predictions for
      ``X_test``. Nothing is written to disk here, so each trial replaces the
      previous trial's predictions.

  Raises:
    ValueError: if ``config.optimizer`` is neither ``'rmsprop'`` nor ``'adam'``.

  NOTE(review): this function reuses the name ``train``, which the data-loading
  cell binds to the training DataFrame. Re-running that cell after this one
  rebinds ``train`` to the DataFrame, and the reverse order breaks
  ``train['info']`` — consider renaming together with the ``wandb.agent`` call.
  """
  # Default values for hyper-parameters we're going to sweep over
  default_config = {'embedding_dim' : 50, 'SpatialDropout1D' : 0.3, 'LSTM_hidden' : 64, 'dropout_rate1' : 0.2, 'dropout_rate2' : 0.3, 
                  'regularizer' : 0.001, 'batch_size' : 32, 'learning_rate' : 0.001, 'optimizer' : 'rmsprop', 'epochs' : 10}

  # Initialize a new wandb run
  wandb.init(config = default_config)

  # Config is a variable that holds and saves hyperparameters and inputs
  config = wandb.config

  # The embedding table must cover every index the fitted tokenizer can emit.
  # Take the size from the tokenizer itself instead of repeating its magic
  # number; when num_words is unset, every word index (+1 for padding 0) is possible.
  max_words = vectorizer.num_words or len(vectorizer.word_index) + 1
  embedding_dim = config.embedding_dim      # swept between 50 and 200

  model = Sequential()
  model.add(Embedding(input_dim = max_words, output_dim = embedding_dim, input_length = max_len))
  model.add(SpatialDropout1D(config.SpatialDropout1D))
  model.add(LSTM(config.LSTM_hidden))
  model.add(Dropout(config.dropout_rate1))
  model.add(Dense(32, activation = 'relu', kernel_regularizer = regularizers.l2(config.regularizer)))
  model.add(Dropout(config.dropout_rate2))
  model.add(Dense(1, activation='sigmoid'))

  if config.optimizer == 'rmsprop':
    optimizer = tf.keras.optimizers.RMSprop(learning_rate = config.learning_rate, rho=0.9, momentum=0.0, epsilon=1e-07, centered=False)
  elif config.optimizer == 'adam':
    optimizer = tf.keras.optimizers.Adam(learning_rate = config.learning_rate, beta_1=0.9, beta_2=0.999)
  else:
    # Fail with a clear message instead of a NameError on `optimizer` below.
    raise ValueError("Unsupported optimizer: {!r}".format(config.optimizer))

  model.compile(optimizer = optimizer, loss = 'binary_crossentropy', metrics = ['acc'])

  # callbacks
  # restore_best_weights=True so that the test predictions below come from the
  # epoch with the lowest val_loss rather than from up to `patience` epochs later.
  early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=3,
                                                    mode='min', restore_best_weights=True)
  wandb_check = WandbCallback()
  callbacks_list = [early_stop, wandb_check]

  model.fit(train_X, train_y, epochs=config.epochs, batch_size=config.batch_size, validation_split=0.2, callbacks = callbacks_list)

  # Threshold the sigmoid outputs at 0.5 and flatten to one label per test row.
  pred_test = model.predict(X_test)
  submission_data.loc[:,'info'] = np.where(pred_test > 0.5, 1,0).reshape(-1)

In [None]:
# Launch a sweep agent that repeatedly runs the training function
# Arguments:
#     – sweep_id: the sweep to run - this was returned above by wandb.sweep()
#     – function: function that defines your model architecture and trains it
#     – count: number of runs this agent executes
wandb.agent(sweep_id, function=train, count=150)