In [None]:
!pip install keras-tuner

Collecting keras-tuner
  Downloading keras_tuner-1.4.7-py3-none-any.whl (129 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/129.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.1/129.1 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
Collecting kt-legacy (from keras-tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras-tuner
Successfully installed keras-tuner-1.4.7 kt-legacy-1.0.5


In [None]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from keras_tuner import RandomSearch

In [None]:
# Load the training and validation splits from CSV.
# NOTE(review): no drive.mount(...) call is visible in this notebook;
# presumably Google Drive must already be mounted at /content/drive — confirm.
train_data, validation_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/Colab Notebooks/training.csv',
        '/content/drive/MyDrive/Colab Notebooks/validation.csv',
    )
)

In [None]:
def preprocess_data(df):
    """Keep only the two address columns and discard incomplete rows.

    Args:
        df: DataFrame containing at least the 'Origin Address' and
            'ex Add' columns.

    Returns:
        A new DataFrame with just those two columns, without any row in
        which either of them is missing. The original index labels of the
        surviving rows are preserved; ``df`` itself is not modified.
    """
    address_columns = ['Origin Address', 'ex Add']
    return df.loc[:, address_columns].dropna()

# Apply the same column selection / NaN filtering to both splits.
train_data, validation_data = map(preprocess_data, (train_data, validation_data))


In [None]:
# Create and fit the tokenizers, then convert both splits to padded id arrays.
input_texts_train = train_data['ex Add'].tolist()
input_texts_validation = validation_data['ex Add'].tolist()
target_texts_train = train_data['Origin Address'].tolist()
target_texts_validation = validation_data['Origin Address'].tolist()

# Input side ('ex Add').
# NOTE(review): the vocabulary is fitted on training AND validation text, so
# validation tokens are known to the tokenizer before training — confirm this
# is intended.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(input_texts_train + input_texts_validation)

train_sequences = tokenizer.texts_to_sequences(input_texts_train)
validation_sequences = tokenizer.texts_to_sequences(input_texts_validation)

# Longest tokenized input across both splits; every array below is padded
# (at the end) to this length.
max_length = max(map(len, train_sequences + validation_sequences))
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding='post')
validation_padded = pad_sequences(validation_sequences, maxlen=max_length, padding='post')

# Target side ('Origin Address'), with its own vocabulary.
address_tokenizer = Tokenizer()
address_tokenizer.fit_on_texts(target_texts_train + target_texts_validation)

train_labels = address_tokenizer.texts_to_sequences(target_texts_train)
validation_labels = address_tokenizer.texts_to_sequences(target_texts_validation)

# NOTE(review): labels are padded to the INPUT max_length; a label sequence
# longer than that would presumably be truncated by pad_sequences — verify.
train_labels_padded = pad_sequences(train_labels, maxlen=max_length, padding='post')
validation_labels_padded = pad_sequences(validation_labels, maxlen=max_length, padding='post')

def build_model(hp):
    """Build and compile one candidate model for the hyperparameter search.

    The architecture is Embedding -> LSTM (returning sequences) -> LSTM ->
    Dense(relu) -> Dense(softmax). Both LSTM layers share the single
    'lstm_units' hyperparameter.

    Args:
        hp: hyperparameter object supplied by the tuner; used to sample
            'embedding_dim', 'lstm_units', 'dense_units' (each 32..128 in
            steps of 32) and 'learning_rate' (1e-5..1e-2, log sampling).

    Returns:
        A compiled Sequential model using Adam, sparse categorical
        cross-entropy loss and the accuracy metric.

    Relies on the module-level ``tokenizer``, ``address_tokenizer`` and
    ``max_length`` for the vocabulary sizes and the input length.
    """
    # +1 on both vocabulary sizes, matching the original sizing.
    # NOTE(review): presumably this reserves id 0 for padding — confirm.
    input_vocab_size = len(tokenizer.word_index) + 1
    output_vocab_size = len(address_tokenizer.word_index) + 1

    embedding_dim = hp.Int('embedding_dim', min_value=32, max_value=128, step=32)
    lstm_units = hp.Int('lstm_units', min_value=32, max_value=128, step=32)
    dense_units = hp.Int('dense_units', min_value=32, max_value=128, step=32)

    layers = [
        Embedding(input_dim=input_vocab_size,
                  output_dim=embedding_dim,
                  input_length=max_length),
        LSTM(lstm_units, return_sequences=True),
        LSTM(lstm_units),
        Dense(dense_units, activation='relu'),
        # One softmax over the address vocabulary per input sample.
        Dense(output_vocab_size, activation='softmax'),
    ]
    model = Sequential(layers)

    learning_rate = hp.Float('learning_rate', min_value=1e-5, max_value=1e-2, sampling='LOG')
    model.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model


In [None]:
# Early stopping: end a trial once val_loss has not improved for 5 epochs
# and roll the model back to its best weights.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Random search over the space declared in build_model, ranked by
# validation accuracy: 10 trials, one training run per trial.
# seed makes the sampled hyperparameter combinations repeatable between
# runs; model weight initialisation is still unseeded.
# NOTE(review): `overwrite` is not passed, so an existing
# model_tuning/AddressTranslation directory is presumably resumed rather
# than recomputed — confirm this is what you want after editing build_model.
tuner = RandomSearch(
    build_model,
    objective='val_accuracy',
    max_trials=10,
    executions_per_trial=1,
    seed=42,
    directory='model_tuning',
    project_name='AddressTranslation'
)


In [None]:
# Run the hyperparameter search.
# NOTE(review): the targets are np.argmax over each padded label sequence,
# i.e. the POSITION of the largest token id in the sequence, not a token /
# class id from the address vocabulary. The model's softmax is sized to that
# vocabulary, so these targets look unintended — confirm what a label
# should be before trusting the reported accuracy.
train_targets = np.argmax(train_labels_padded, axis=1)
validation_targets = np.argmax(validation_labels_padded, axis=1)

tuner.search(
    train_padded,
    train_targets,
    epochs=50,
    validation_data=(validation_padded, validation_targets),
    callbacks=[early_stopping],
)

# Keep the single best model found by the search.
best_model = tuner.get_best_models(num_models=1)[0]

Trial 10 Complete [00h 02m 05s]
val_accuracy: 0.9088654518127441

Best val_accuracy So Far: 0.9088654518127441
Total elapsed time: 00h 24m 05s


In [12]:
# Print the hyperparameters of the best trial.
top_hyperparameter_sets = tuner.get_best_hyperparameters(num_trials=1)
best_hyperparameters = top_hyperparameter_sets[0]
print(
    f"Embedding Dimension: {best_hyperparameters.get('embedding_dim')}",
    f"LSTM Units: {best_hyperparameters.get('lstm_units')}",
    f"Dense Units: {best_hyperparameters.get('dense_units')}",
    f"Learning Rate: {best_hyperparameters.get('learning_rate')}",
    sep="\n",
)

Embedding Dimension: 128
LSTM Units: 128
Dense Units: 32
Learning Rate: 0.000602751646027759


In [None]:
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score

# Evaluate the best model on the validation split.
validation_probabilities = best_model.predict(validation_padded)
# Collapse each softmax row to the index of its most probable class.
validation_predictions = np.argmax(validation_probabilities, axis=1)

# Rebuild the targets the same way they were passed to tuner.search().
# This does not flatten anything: it takes, per sample, the position of the
# largest token id in the padded label sequence.
# NOTE(review): that position is not an address-vocabulary class id, so the
# metrics below measure agreement with that positional target only —
# confirm the intended label definition.
validation_labels_flat = np.argmax(validation_labels_padded, axis=1)

# Calculate metrics. With average='weighted', classes that are never
# predicted have an undefined precision; zero_division=0 scores them as 0
# explicitly (the same value as the default, minus the warning).
accuracy = accuracy_score(validation_labels_flat, validation_predictions)
precision = precision_score(validation_labels_flat, validation_predictions,
                            average='weighted', zero_division=0)
recall = recall_score(validation_labels_flat, validation_predictions,
                      average='weighted', zero_division=0)
f1 = f1_score(validation_labels_flat, validation_predictions,
              average='weighted', zero_division=0)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')

Accuracy: 0.9088654680719157
Precision: 0.8998771238543575
Recall: 0.9088654680719157
F1 Score: 0.8843403634691157
