In [1]:
!pip install fasttext

Collecting fasttext
[?25l  Downloading https://files.pythonhosted.org/packages/f8/85/e2b368ab6d3528827b147fdb814f8189acc981a4bc2f99ab894650e05c40/fasttext-0.9.2.tar.gz (68kB)
[K     |████▊                           | 10kB 19.0MB/s eta 0:00:01[K     |█████████▌                      | 20kB 24.5MB/s eta 0:00:01[K     |██████████████▎                 | 30kB 26.5MB/s eta 0:00:01[K     |███████████████████             | 40kB 18.8MB/s eta 0:00:01[K     |███████████████████████▉        | 51kB 20.0MB/s eta 0:00:01[K     |████████████████████████████▋   | 61kB 17.4MB/s eta 0:00:01[K     |████████████████████████████████| 71kB 6.5MB/s 
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp37-cp37m-linux_x86_64.whl size=3094600 sha256=45139b817d2fb02b93d0cc2c29f05083e819c12c17610c61bb97e0241fdc2424
  Stored in directory: /root/.cache/pip/wheels/98/ba/7f/b154944a1cf5a8cee9

In [3]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Disabled one-time download of the Korean fastText model.
# The Dataset class in this notebook loads 'cc.ko.300.bin' from the working
# directory, so that file must already exist before Dataset is constructed
# (presumably it is what the call below fetches — confirm before relying on it).
#import fasttext.util
#fasttext.util.download_model('ko', if_exists='ignore')

In [7]:
# library 
import tensorflow as tf
import numpy as np
import fasttext
import numpy as np
from tqdm import tqdm
import math
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [None]:
EMBEDDING_DIM = 300   # dimensionality of the fastText word embeddings

def trash(x):
    """Remove domain stop words from a token sequence.

    Args:
        x: iterable of tokens.

    Returns:
        A new list with every token that is not one of the stop words,
        in the original order.
    """
    stop_words = ('부대사업', '사업', '부대', '각호', '판매업')
    kept = []
    for token in x:
        if token in stop_words:
            continue
        kept.append(token)
    return kept

# Truncate a token sequence to at most n items
def end_token(x, n):
    """Return the first n items of x when x is longer than n, otherwise x itself.

    Args:
        x: a sliceable sequence (e.g. a list of tokens).
        n: maximum number of leading items to keep.

    Returns:
        x[:n] if len(x) > n; the same object x (not a copy) otherwise.
    """
    return x[:n] if len(x) > n else x

def _as_object_array(sequences):
    """Pack variable-length token lists into a 1-D object array, one list per element."""
    packed = np.empty(len(sequences), dtype=object)
    # Assign element by element so NumPy never tries to merge the lists into a
    # 2-D array (which fails for ragged input and changes shape for equal lengths).
    for i, seq in enumerate(sequences):
        packed[i] = seq
    return packed


# Build the X / y train and test data
def read_corpus(path, max_tokens=100):
    """Load the pickled corpus and return a stratified train/test split.

    Rows whose 'big' label is "O", "U" or "T" are dropped. The last column is
    assumed to hold one iterable of tokens per row (TODO confirm against the
    pickle's schema); stop words are removed with trash() and each row is
    truncated to its first `max_tokens` tokens with end_token().

    Args:
        path: path to a pickled pandas DataFrame with a 'big' label column.
        max_tokens: maximum number of leading tokens kept per row.

    Returns:
        (Xtrain, Xtest, Ytrain, Ytest): X arrays are 1-D object arrays of
        token lists; Y arrays are the matching rows of the one-hot encoded
        'big' labels. The split is 80/20, stratified on the labels, with
        random_state=0.
    """
    # NOTE(security): unpickling can execute arbitrary code; only load trusted files.
    data = pd.read_pickle(path)
    # TODO: classes O, U, T are excluded here; they should be handled by rule-based logic later.
    data = data.query('big not in ["O","U","T"]')

    tokens = data.iloc[:, -1].apply(trash)   # strip stop words
    # Keep only the first `max_tokens` tokens per row
    # (idea noted by the author: also try combining [50:] + [:50] slices).
    # The original called .apply on a plain list here, which raises AttributeError.
    X = [end_token(x, n=max_tokens) for x in tokens]

    Y = pd.get_dummies(data['big']).values   # one-hot encode the top-level class

    Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.2,
                                                    random_state=0,
                                                    stratify=Y)

    # Token lists are ragged, so they must be stored as an object array;
    # np.array() on ragged lists raises on current NumPy versions.
    return _as_object_array(Xtrain), _as_object_array(Xtest), np.array(Ytrain), np.array(Ytest)


## Pad or truncate every token list to a common length
def pad(data, max_len=100):
    """Bring every token list in `data` to exactly `max_len` items.

    Args:
        data: iterable of token lists.
        max_len: target length; 0 means "use the longest list in `data`".

    Returns:
        (max_len, result): the length actually used and a list with one
        entry per input. Inputs of length >= max_len are sliced to max_len;
        shorter ones are extended with '' (empty-string) padding tokens.
    """
    if max_len == 0:
        max_len = max(len(tokens) for tokens in data)

    def _fit(tokens):
        # Positive shortfall -> pad with empty strings; otherwise cut to max_len.
        missing = max_len - len(tokens)
        if missing > 0:
            return tokens + [''] * missing
        return tokens[:max_len]

    padded = [_fit(tokens) for tokens in tqdm(data, desc='Padding')]
    return max_len, padded


## Apply padding
def preprocess(tokenized_sentences):
    """Pad the tokenized sentences with pad()'s default max_len and return only the padded lists."""
    # pad() returns (max_len, padded); the length is not needed by callers.
    return pad(tokenized_sentences)[1]

### Keras Sequence that turns padded token lists into fastText embeddings batch by batch
class Dataset(tf.keras.utils.Sequence):
    """Batch generator that embeds tokens with a fastText model on demand.

    Word vectors are looked up per batch in __getitem__ rather than
    materialised for the whole corpus up front.

    Args:
        x_set: sliceable collection of token lists; assumed to be already
            padded to a common length so a batch stacks into one array
            (TODO confirm at call sites).
        y_set: sliceable collection of labels aligned with x_set.
        batch_size: number of samples per batch.
        fasttext_model_path: path of the fastText binary model to load.
    """

    # Maps model path -> loaded fastText model, shared by every instance so
    # the model file is loaded once per path per process.
    fasttext_model_cache = {}

    def __init__(self, x_set, y_set, batch_size, fasttext_model_path='cc.ko.300.bin'):
        self.x_set = x_set
        self.y_set = y_set
        self.batch_size = batch_size

        # TODO (from the author): try further training of the fastText model later.
        if fasttext_model_path not in Dataset.fasttext_model_cache:
            Dataset.fasttext_model_cache[fasttext_model_path] = fasttext.load_model(fasttext_model_path)

        self.fasttext_model = Dataset.fasttext_model_cache[fasttext_model_path]

    def __len__(self):
        """Number of batches; the last batch may be smaller than batch_size."""
        return math.ceil(len(self.x_set) / self.batch_size)

    def __getitem__(self, idx):
        """Return (embedded_batch, label_batch) for batch number idx."""
        start = idx * self.batch_size
        stop = (idx + 1) * self.batch_size
        padded_sentences = self.x_set[start:stop]
        word_vectors = [self.get_word_vectors(padded_sentence) for padded_sentence in padded_sentences]
        batch_y = self.y_set[start:stop]

        return np.array(word_vectors), np.array(batch_y)

    ## Look up the word vectors for one sentence
    def get_word_vectors(self, words):
        """Embed one token list; falsy tokens (the '' padding) become zero vectors.

        Returns:
            Array with one row of length EMBEDDING_DIM per token.
        """
        # Zero padding for the LSTM. Built as float32 so it matches the dtype
        # of the fastText vectors and does not upcast the whole batch to float64.
        zero_vector = np.zeros((EMBEDDING_DIM,), dtype=np.float32)
        result = []
        for word in words:
            if not word:
                result.append(zero_vector)
            else:
                result.append(self.fasttext_model.get_word_vector(word))

        return np.array(result)

In [None]:
# if __name__ == '__main__':
# import argparse

# parser = argparse.ArgumentParser()               
# parser.add_argument('data')
# parser.add_argument('--batch-size', type=int, default=128)            
# parser.add_argument('--test-batch-size', type=int)
# parser.add_argument('--epochs', type=int, default=10)    
# args = parser.parse_args()

# Run configuration: corpus location, batch size and number of training epochs.
path = './data/0504_alltoken.pkl'
batch_size =  128
epoch = 10


#### train
train_sentences, test_sentences,train_labels,test_labels= read_corpus(path)      # build the X / y train and test splits
train_padded_sentences = preprocess(train_sentences)   # pad the training sentences

train_dataset = Dataset(train_padded_sentences, train_labels, batch_size)    # batched training Dataset

Padding: 100%|██████████| 1262969/1262969 [00:08<00:00, 147890.49it/s]


In [None]:
### test
test_padded_sentences = preprocess(test_sentences)       # pad the held-out sentences the same way as the training set
test_batch_size = batch_size                             # reuse the training batch size
test_dataset = Dataset(test_padded_sentences, test_labels, test_batch_size)     # batched test Dataset

Padding: 100%|██████████| 315743/315743 [00:05<00:00, 60953.12it/s] 


## Modeling

In [None]:
## Model-building function
def build_model():
    """Build the (uncompiled) classifier and print its summary.

    Architecture: two stacked bidirectional LSTMs (128 units per direction)
    over variable-length sequences of EMBEDDING_DIM-sized vectors, a
    64-unit dense layer with no activation, and a 17-way softmax output.

    Returns:
        The tf.keras.Sequential model.
    """
    keras_layers = tf.keras.layers
    network = tf.keras.Sequential([
        keras_layers.Bidirectional(
            keras_layers.LSTM(128, return_sequences=True),
            input_shape=(None, EMBEDDING_DIM)),
        keras_layers.Bidirectional(keras_layers.LSTM(128)),
        keras_layers.Dense(64),
        keras_layers.Dense(17, activation='softmax'),   # 17 top-level classes
    ])
    network.summary()

    return network

In [None]:
# Build the network, compile it with categorical cross-entropy (labels are
# one-hot rows) and the Adam optimizer, then train on the batched training set.
model = build_model()
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(train_dataset, epochs=epoch)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional (Bidirectional (None, None, 256)         439296    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 256)               394240    
_________________________________________________________________
dense (Dense)                (None, 64)                16448     
_________________________________________________________________
dense_1 (Dense)              (None, 17)                1105      
Total params: 851,089
Trainable params: 851,089
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


IndexError: ignored

In [None]:
# Persist the trained model under ./model/classfier.fasttext.model
# (the "classfier" spelling is the actual on-disk name, so it is left as-is).
model.save('./model/classfier.{}.model'.format('fasttext'))



INFO:tensorflow:Assets written to: ./model/classfier.fasttext.model/assets


INFO:tensorflow:Assets written to: ./model/classfier.fasttext.model/assets


In [None]:
# Evaluate on the held-out test batches and report the returned loss and accuracy.
test_loss, test_accuracy = model.evaluate(test_dataset)
print('test_loss', test_loss)
print('test_accuracy', test_accuracy)

test_loss 0.5448988676071167
test_accuracy 0.8392806649208069


In [None]:
# Inspect the model's input/output tensors and each layer's input shape.
# Plain loops are used instead of list comprehensions so the cell does not
# build throwaway lists of None and echo one as its output.
for i in model.inputs:
    print(i.shape, i.dtype)
for o in model.outputs:
    print(o.shape, o.dtype)
for l in model.layers:
    print(l.name, l.input_shape, l.dtype)