<a href="https://colab.research.google.com/github/dmswl0707/Dacon_AI_Competition/blob/main/%EC%98%81%ED%99%94_%EB%A6%AC%EB%B7%B0_%EA%B0%90%EC%84%B1%EB%B6%84%EB%A5%98_%EB%AA%A8%EB%8D%B8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
import os
import pandas as pd
from google.colab import drive
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Dense,Flatten, Bidirectional,LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tqdm import tqdm
!pip install konlpy
from konlpy.tag import Okt

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [36]:
train = '/content/drive/My Drive/Colab Notebooks/dataset/train.csv'
test = '/content/drive/My Drive/Colab Notebooks/dataset/test.csv'
sub = '/content/drive/My Drive/Colab Notebooks/dataset/sample_submission.csv'

df = pd.read_csv(train)
testset = pd.read_csv(test)
submission = pd.read_csv(sub)

df.head(3)

Unnamed: 0,id,document,label
0,1,영상이나 음악이 이쁘다 해도 미화시킨 불륜일뿐,0
1,2,히치콕이 이 영화를 봤다면 분명 박수를 쳤을듯...,1
2,3,괜찮은 음악영화가 또 나왔군요!!! 따뜻한 겨울이 될 것 같아요~,1


In [4]:
df.shape

(5000, 3)

In [5]:
# 긍정 부정 간 레이블 균형이 비슷함

df=df.drop(columns='id')
df.value_counts('label')

label
0    2564
1    2436
dtype: int64

In [6]:
df.isnull().sum()

document    0
label       0
dtype: int64

### 데이터 전처리
1. 정규 표현식 제거
2. 불용어 제거
3. 토큰화
4. 인코딩 작업



In [7]:
# 정규 표현식 제거
df['document']=df['document'].str.replace("[^ㄱ-ㅎ ㅏ-ㅣ 가-힣]", "")
df.head(3)

Unnamed: 0,document,label
0,영상이나 음악이 이쁘다 해도 미화시킨 불륜일뿐,0
1,히치콕이 이 영화를 봤다면 분명 박수를 쳤을듯,1
2,괜찮은 음악영화가 또 나왔군요 따뜻한 겨울이 될 것 같아요,1


In [8]:
# 불용어 제거
# 불용어를 직접 정의할수도 있고, set(stop_words.split(" "))을 사용하기도 함

stop_words = ['의', '가', '이', '은', '들', '는', '좀', '잘', '걍', '과', '도', '를', '으로', '자', '에', '와', '한', '하다']
x_train = []
x_test = []
okt = Okt()

def remove_stopwords(list_, df):
  for sent in tqdm(df):
    token = okt.morphs(sent, stem=True)
    rem_stopwords = [word for word in token if not word in stop_words]
    list_.append(rem_stopwords)

In [9]:
# train set 가공
remove_stopwords(x_train, df['document'])

100%|██████████| 5000/5000 [00:20<00:00, 242.61it/s]


In [10]:
# test set 가공
remove_stopwords(x_test, testset['document'])

100%|██████████| 5000/5000 [00:09<00:00, 509.11it/s]


In [11]:
# 토큰화 작업
# 인덱스 확인하기
tokenizer = Tokenizer()
tokenizer.fit_on_texts(x_train)

In [12]:
vocab_size =len(tokenizer.word_index)
#print(tokenizer.word_index)

In [13]:
# 정수 인코딩 작업
# 각 단어마다 인덱스를 주어 인덱스 번호를 갖는 시퀀스로 만듬
tokenizer = Tokenizer(vocab_size)
tokenizer.fit_on_texts(x_train)
x_train=tokenizer.texts_to_sequences(x_train)

In [14]:
# 패딩의 길이 임의로 설정
x_train=pad_sequences(x_train, maxlen=20)
x_test=pad_sequences(x_train, maxlen=20)

y_train=df['label']

### 신경망 모델링

In [103]:
model = Sequential()
model.add(Embedding(vocab_size, 1500))
model.add(Bidirectional(LSTM(500, activation='elu', recurrent_dropout=0.2)))
model.add(Dense(600))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

model.summary()

Model: "sequential_18"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_18 (Embedding)    (None, None, 1500)        9972000   
                                                                 
 bidirectional_21 (Bidirecti  (None, 1000)             8004000   
 onal)                                                           
                                                                 
 dense_23 (Dense)            (None, 600)               600600    
                                                                 
 flatten_12 (Flatten)        (None, 600)               0         
                                                                 
 dense_24 (Dense)            (None, 1)                 601       
                                                                 
Total params: 18,577,201
Trainable params: 18,577,201
Non-trainable params: 0
_________________________________________

### 신경망 학습

In [111]:
EarlyStop = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)
Checkpoint = ModelCheckpoint('checkpoint.h5', monitor='val_loss', mode='min', save_best_only=True)
scheduler =  ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, min_lr=0.0001)

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
history = model.fit(x_train, y_train, epochs=30, callbacks=[EarlyStop, Checkpoint, scheduler], batch_size=256, validation_split=0.2)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 00009: early stopping


### 신경망 추론

In [112]:
inference = load_model('checkpoint.h5')
y_pred = inference.predict(x_test)



In [113]:
def submission_ypred():
    for i in range(len(y_pred)):
        if y_pred[i] <0.5 :
            y_pred[i] = 0
        else:
            y_pred[i] = 1
    
    

submission_ypred()
submission['label']=y_pred
submission.to_csv('/content/drive/MyDrive/Colab Notebooks/dataset/BidirectionalLSTM.csv', index=False)