# 머신러닝[A] 기말 PBL 프로젝트
---
## 두 문장 사이의 의미적 유사성 판별
- https://www.kaggle.com/competitions/quora-question-pairs/data
- 인공신경망 알고리즘 활용
- 2011051 최준서
---

# 1. 라이브러리 설치 및 불러오기

In [6]:
!pip install keras tensorflow numpy pandas matplotlib tensorflow-macos tensorflow-metal



In [38]:
import tensorflow as tf

# GPU 설정
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    try:
        # GPU 메모리 제한 설정 (선택사항)
        for gpu in physical_devices:
            tf.config.experimental.set_memory_growth(gpu, True)
        print("GPU is set for training")
    except RuntimeError as e:
        print(e)
else:
    print("No GPU found. Training on CPU")

Physical devices cannot be modified after being initialized


In [50]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Subtract, Dropout
from tensorflow.keras.models import Model
import matplotlib.pyplot as plt
%matplotlib inline
from tensorflow.keras.callbacks import Callback

---
# 2. 데이터 불러오기

In [51]:
# Quora 데이터셋 파일 경로 설정
data_path = '/Users/junseo/PycharmProjects/MachineLearning_project_semantics/data'

# 데이터 경로 생성
train_path = os.path.join(data_path, 'train.csv')
test_path = os.path.join(data_path, 'test.csv')

# 데이터 읽기
train_data = pd.read_csv(train_path)
test_data = pd.read_csv(test_path)

In [53]:
# train 데이터 확인
print(train_data.head())

   id  qid1  qid2                                          question1  \
0   0     1     2  What is the step by step guide to invest in sh...   
1   1     3     4  What is the story of Kohinoor (Koh-i-Noor) Dia...   
2   2     5     6  How can I increase the speed of my internet co...   
3   3     7     8  Why am I mentally very lonely? How can I solve...   
4   4     9    10  Which one dissolve in water quikly sugar, salt...   

                                           question2  is_duplicate  
0  What is the step by step guide to invest in sh...             0  
1  What would happen if the Indian government sto...             0  
2  How can Internet speed be increased by hacking...             0  
3  Find the remainder when [math]23^{24}[/math] i...             0  
4            Which fish would survive in salt water?             0  


In [54]:
# test 데이터 확인
print(test_data.head())

  test_id                                          question1  \
0       0  How does the Surface Pro himself 4 compare wit...   
1       1  Should I have a hair transplant at age 24? How...   
2       2  What but is the best way to send money from Ch...   
3       3                        Which food not emulsifiers?   
4       4                   How "aberystwyth" start reading?   

                                           question2  
0  Why did Microsoft choose core m3 and not core ...  
1        How much cost does hair transplant require?  
2                      What you send money to China?  
3                                  What foods fibre?  
4                     How their can I start reading?  


---
# 3. 데이터 전처리

In [55]:
# 결측치 제거
train_data = train_data.dropna(subset=['question1', 'question2', 'is_duplicate'])
test_data = test_data.fillna('')

# 질문과 레이블 추출
questions1 = train_data['question1'].values
questions2 = train_data['question2'].values
labels = train_data['is_duplicate'].values

# 토크나이저 초기화 및 텍스트 토큰화
vocab_size = 20000  # 최대 단어 수
max_len = 50  # 패딩 길이

tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(list(questions1) + list(questions2))

# 텍스트를 시퀀스로 변환
sequences1 = tokenizer.texts_to_sequences(questions1)
sequences2 = tokenizer.texts_to_sequences(questions2)

# 패딩 처리
padded1 = pad_sequences(sequences1, maxlen=max_len, padding='post')
padded2 = pad_sequences(sequences2, maxlen=max_len, padding='post')

# 데이터 분리
X_train_1, X_val_1, X_train_2, X_val_2, y_train, y_val = train_test_split(
    padded1, padded2, labels, test_size=0.2, random_state=42
)

---
# 5. 모델 설계

In [61]:
embedding_dim = 128

# 입력 정의
input_q1 = Input(shape=(max_len,))
input_q2 = Input(shape=(max_len,))

# 임베딩 층
embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_len)
q1_embedded = embedding(input_q1)
q2_embedded = embedding(input_q2)

# LSTM 층
shared_lstm = LSTM(64)
q1_encoded = shared_lstm(q1_embedded)
q2_encoded = shared_lstm(q2_embedded)

# 두 벡터의 차이 계산
subtracted = Subtract()([q1_encoded, q2_encoded])

# 밀집 층
dense = Dense(64, activation='relu')(subtracted)
dense = Dropout(0.4)(dense)
output = Dense(1, activation='sigmoid')(dense)

# 모델 생성
model = Model(inputs=[input_q1, input_q2], outputs=output)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [62]:
model.summary()

---
# 6. 시각화 함수 정의

In [63]:
def plot_training_history(history):
    # Accuracy Plot
    plt.figure(figsize=(12, 5))

    plt.subplot(1, 2, 1)
    plt.plot(history.history['accuracy'], label='Train Accuracy')
    plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
    plt.title('Model Accuracy')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()

    # Loss Plot
    plt.subplot(1, 2, 2)
    plt.plot(history.history['loss'], label='Train Loss')
    plt.plot(history.history['val_loss'], label='Validation Loss')
    plt.title('Model Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()

    plt.tight_layout()
    plt.show()


---
# 7. 모델 학습

In [68]:
# 모델 학습
history = model.fit(
    [X_train_1, X_train_2], y_train,
    validation_data=([X_val_1, X_val_2], y_val),
    epochs=10,
    batch_size=64,
    verbose=1
)
plot_training_history(history)

Epoch 1/10
[1m5054/5054[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m120s[0m 24ms/step - accuracy: 0.6612 - loss: 0.6018 - val_accuracy: 0.7236 - val_loss: 0.5525
Epoch 2/10
[1m5054/5054[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m117s[0m 23ms/step - accuracy: 0.7508 - loss: 0.5157 - val_accuracy: 0.7731 - val_loss: 0.4903
Epoch 3/10
[1m5054/5054[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m121s[0m 24ms/step - accuracy: 0.8028 - loss: 0.4313 - val_accuracy: 0.7879 - val_loss: 0.4533
Epoch 4/10
[1m5054/5054[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m118s[0m 23ms/step - accuracy: 0.8381 - loss: 0.3679 - val_accuracy: 0.8027 - val_loss: 0.4484
Epoch 5/10
[1m5054/5054[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m120s[0m 24ms/step - accuracy: 0.8665 - loss: 0.3159 - val_accuracy: 0.8111 - val_loss: 0.4543
Epoch 6/10
[1m 714/5054[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m1:36[0m 22ms/step - accuracy: 0.8956 - loss: 0.2624

KeyboardInterrupt: 

---
# 6. 모델 테스트

In [72]:
# 테스트 데이터 전처리
test_sequences1 = tokenizer.texts_to_sequences(test_data['question1'])
test_sequences2 = tokenizer.texts_to_sequences(test_data['question2'])

test_padded1 = pad_sequences(test_sequences1, maxlen=max_len, padding='post')
test_padded2 = pad_sequences(test_sequences2, maxlen=max_len, padding='post')

# 모델 예측
predictions = model.predict([test_padded1, test_padded2])

# 예측값 이진화 (0 또는 1로 변환)
predicted_labels = (predictions > 0.5).astype(int).flatten()

# 결과를 test_data에 추가
test_data['is_duplicate_pred'] = predicted_labels

# 예측값 확인
print(test_data[['id', 'question1', 'question2', 'is_duplicate_pred']].head())

# 결과 저장
test_data[['id', 'is_duplicate_pred']].to_csv('test_predictions.csv', index=False)

[1m111359/111359[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m439s[0m 4ms/step


KeyError: "['id'] not in index"

In [67]:
# 모델 초기화
# 기존 모델 폐기
del model  # 메모리에서 모델 제거 (선택 사항)

# 모델을 다시 정의하여 초기화
input_q1 = Input(shape=(max_len,))
input_q2 = Input(shape=(max_len,))

embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_len)
q1_embedded = embedding(input_q1)
q2_embedded = embedding(input_q2)

shared_lstm = LSTM(64)
q1_encoded = shared_lstm(q1_embedded)
q2_encoded = shared_lstm(q2_embedded)

subtracted = Subtract()([q1_encoded, q2_encoded])
dense = Dense(64, activation='relu')(subtracted)
dense = Dropout(0.2)(dense)
output = Dense(1, activation='sigmoid')(dense)

model = Model(inputs=[input_q1, input_q2], outputs=output)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

print("모델이 초기화되었습니다.")

모델이 초기화되었습니다.
