<a href="https://colab.research.google.com/github/dinggga/super-broccoli/blob/main/220509ss.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **NLP #4. Embedding**



**1. Keras Embedding Layer**

In [9]:
import pandas as pd
train = pd.read_csv('/content/drive/MyDrive/ESAA_OB/220502M/train.csv')

In [10]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

def text2sequence(train_text, max_len=100):
    
    tokenizer = Tokenizer() #keras의 vectorizing 함수 호출
    tokenizer.fit_on_texts(train_text) #train 문장에 fit
    train_X_seq = tokenizer.texts_to_sequences(train_text) #각 토큰들에 정수 부여
    vocab_size = len(tokenizer.word_index) + 1 #모델에 알려줄 vocabulary의 크기 계산
    print('vocab_size : ', vocab_size)
    X_train = pad_sequences(train_X_seq, maxlen = max_len) #설정한 문장의 최대 길이만큼 padding
    
    return X_train, vocab_size, tokenizer

train_X, vocab_size, vectorizer = text2sequence(train['text'], max_len = 100)

vocab_size :  42331


In [11]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding

max_len=100
model = Sequential()
model.add(Embedding(vocab_size, 128, input_length=max_len))

**2. word2vec**

In [14]:
import gensim
word2vec = gensim.models.KeyedVectors.load_word2vec_format('/content/drive/MyDrive/ESAA_OB/220509M/GoogleNews-vectors-negative300.bin.gz', binary = True)

In [19]:
import numpy as np

embedding_matrix = np.zeros((vocab_size, 300))

for index, word in enumerate(vectorizer.word_index):
  if word in word2vec:
    embedding_vector = word2vec[word]
    embedding_matrix[index] = embedding_vector
  else:
    print('word2vec에 없는 단어입니다.')
    break

word2vec에 없는 단어입니다.


In [20]:
model = Sequential()
model.add(Embedding(vocab_size, 300, weights = [embedding_matrix], input_length = max_len))

**3. glove**



In [22]:
glove = dict()
f = open('/content/drive/MyDrive/ESAA_OB/220509M/word-embeddings/glove/glove.txt')
for line in f:
    values = line.split()
    word = values[0]
    vector = np.asarray(values[1:], dtype='float32')
    glove[word] = vector
f.close()

In [23]:
embedding_matrix = np.zeros((vocab_size, 100)) #300차원의 임베딩 매트릭스 생성

for index, word in enumerate(vectorizer.word_index): #vocabulary에 있는 토큰들을 하나씩 넘겨줍니다.
    if word in glove: #넘겨 받은 토큰이 word2vec에 존재하면(이미 훈련이 된 토큰이라는 뜻)
        embedding_vector = glove[word] #해당 토큰에 해당하는 vector를 불러오고
        embedding_matrix[index] = embedding_vector #해당 위치의 embedding_mxtrix에 저장합니다.
    else:
        print("glove 없는 단어입니다.")
        break

glove 없는 단어입니다.


In [25]:
model = Sequential()
model.add(Embedding(vocab_size, 100,weights = [embedding_matrix], input_length = max_len))

**4. Fasttext**

In [30]:
# 파일 encoding이 안됨 ㅠㅠ
from gensim.models.keyedvectors import KeyedVectors
FastText = KeyedVectors.load_word2vec_format('/content/drive/MyDrive/ESAA_OB/220509M/word-embeddings/fasttext/fasttext.bin', binary = True, encoding='unicode_escape')

ValueError: ignored

In [None]:
embedding_matrix = np.zeros((vocab_size, 300)) #300차원의 임베딩 매트릭스 생성

for index, word in enumerate(vectorizer.word_index): #vocabulary에 있는 토큰들을 하나씩 넘겨줍니다.
    if word in word2vec: #넘겨 받은 토큰이 word2vec에 존재하면(이미 훈련이 된 토큰이라는 뜻)
        embedding_vector = word2vec[word] #해당 토큰에 해당하는 vector를 불러오고
        embedding_matrix[index] = embedding_vector 

In [None]:
model = Sequential()
model.add(Embedding(vocab_size, 300,weights = [embedding_matrix], input_length = max_len))

# **NLP #5. Modeling**

**1. 간단한 전처리 + 형태소 분석**

In [None]:
!curl -s https://raw.githubusercontent.com/teddylee777/machine-learning/master/99-Misc/01-Colab/mecab-colab.sh | bash

In [34]:
#음 tqdm 계속 에러나서 #2 글에 있던 방식으로 했습니다 ㅎ..
from konlpy.tag import Okt
import re

def text_preprocessing(text_list):
    
    stopwords = ['을', '를', '이', '가', '은', '는', 'null'] #불용어 설정
    tokenizer = Okt() #형태소 분석기 
    token_list = []
    
    for text in text_list:
        txt = re.sub('[^가-힣a-z]', ' ', text) #한글과 영어 소문자만 남기고 다른 글자 모두 제거
        token = tokenizer.morphs(txt) #형태소 분석
        token = [t for t in token if t not in stopwords or type(t) != float] #형태소 분석 결과 중 stopwords에 해당하지 않는 것만 추출
        token_list.append(token)
        
    return token_list, tokenizer

train['token'], okt = text_preprocessing(train['text'])

**2. vectorization**

In [35]:
def text2sequence(train_text, max_len=1000):
    
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(train_text)
    train_X_seq = tokenizer.texts_to_sequences(train_text)
    vocab_size = len(tokenizer.word_index) + 1
    print('vocab_size : ', vocab_size)
    X_train = pad_sequences(train_X_seq, maxlen = max_len)
    return X_train, vocab_size, tokenizer

train_y = train['author']
train_X, vocab_size, vectorizer = text2sequence(train['token'], max_len = 100)
print(train_X.shape, train_y.shape)

vocab_size :  36342
(54879, 100) (54879,)


**3. Embedding**

In [36]:
word2vec = gensim.models.KeyedVectors.load_word2vec_format('/content/drive/MyDrive/ESAA_OB/220509M/GoogleNews-vectors-negative300.bin.gz', binary = True)
embedding_matrix = np.zeros((vocab_size, 300))

for index, word in enumerate(vectorizer.word_index):
    if word in word2vec:
        embedding_vector = word2vec[word] 
        embedding_matrix[index] = embedding_vector 
    else:
        print("word2vec에 없는 단어입니다.")
        break

word2vec에 없는 단어입니다.


**4. Modeling**

In [37]:
def LSTM(vocab_size, max_len=1000):
    model = Sequential()
    model.add(Embedding(vocab_size, 300,weights = [embedding_matrx], input_length = max_len)) #임베딩 가중치 적용 코드
    model.add(SpatialDropout1D(0.3))
    model.add(LSTM(64))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu', kernel_regularizer = regularizers.l2(0.001)))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics='accuracy')
    model.summary()
    return model