# Sparse Matrix(Tfidf) + DNN model

1. csv file load
2. label + texts
3. texts 전처리 : 텍스트 벡터화
4. 희소행렬(sparse matrix)
5. DNN model 생성

In [1]:

import pandas as pd
import numpy as np
import string
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from tensorflow.keras import Sequential # model 생성
from tensorflow.keras.layers import Dense # layer 생성
from sklearn.metrics import accuracy_score



# Load the raw CSV. The file has no header row, so pandas assigns the
# integer column names 0 and 1.
temp_spam = pd.read_csv(
    "C:/IITT/6_Tensorflow/data/temp_spam_data2.csv",
    encoding="utf-8",
    header=None,
)
# Print a summary of the frame (row count, columns, dtypes).
temp_spam.info()



# 1. Select variables: column 0 is used as the label, column 1 as the text
label, texts = temp_spam[0], temp_spam[1]
len(label) # 5574


# 2. Data preprocessing
# Encode the target as a dummy variable: 'spam' -> 1, anything else -> 0
target = [int(x == 'spam') for x in label]
print('target :', target)
# Convert to an ndarray only after printing, so the printed form stays a list
target = np.array(target)

# Text preprocessing
def text_prepro(texts):
    """Normalize a collection of raw text messages.

    Each message is lower-cased, stripped of every character found in
    ``string.punctuation`` and ``string.digits``, and has its runs of
    whitespace collapsed to single spaces (leading/trailing whitespace
    is dropped).

    Args:
        texts: Iterable of strings.

    Returns:
        A new list of cleaned strings, in the same order as the input.
    """
    # One deletion table covers both punctuation and digits.
    drop_chars = str.maketrans('', '', string.punctuation + string.digits)

    cleaned = []
    for message in texts:
        stripped = message.lower().translate(drop_chars)
        # split() with no arguments discards all surrounding/extra whitespace
        cleaned.append(' '.join(stripped.split()))
    return cleaned


# Clean every message with text_prepro before tokenizing
text = text_prepro(texts)
text[0]



# 3. Tokenization : texts -> token
# NOTE(review): the tokenizer is fitted on every message, before the
# train/validation split performed later in this cell, so the word
# statistics also reflect rows that end up in the validation set.
tokenizer = Tokenizer(num_words=4000)
tokenizer.fit_on_texts(text)
# word_index maps each word to an integer index; its recorded size (8629)
# is larger than num_words
token = tokenizer.word_index
print(len(token)) # 8629



# 4. Sparse matrix : tfidf
# One row per message; the recorded shape shows num_words (4000) columns
x_data = tokenizer.texts_to_matrix(text, mode='tfidf')
x_data.shape # (5574, 4000)


# 5. Dataset split (70% train / 30% validation)
# stratify=target keeps the spam/ham ratio the same in both partitions,
# and a fixed random_state makes the split repeatable between runs.
x_train, x_val, y_train, y_val = train_test_split(
    x_data, target, test_size=0.3, stratify=target, random_state=42)


# 6. DNN layers
# Take the feature width from the training matrix instead of repeating the
# tokenizer's num_words as a literal, so the two cannot drift apart.
input_shape = x_train.shape[1:]

model = Sequential()

# Two ReLU hidden layers followed by a single sigmoid unit that outputs
# the spam probability for the binary target
model.add(Dense(64, input_shape=input_shape, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.summary()



# 7. Compile (configure the training setup) / training
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train for 5 epochs, reporting validation metrics after each epoch
model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    batch_size=512,
    epochs=5,
    verbose=1,
)


# Final loss/accuracy on the validation set
loss, score = model.evaluate(x_val, y_val)
print("loss = {:.5f}, accuracy = {:.5f}".format(loss, score))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5574 entries, 0 to 5573
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       5574 non-null   object
 1   1       5574 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB
target : [0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 64)                256064    
_________________________________________________________________
dense_1 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 33        
Total params: 258,177
Trainable params: 258,177
Non-trainable params: 0
_________________________________________________________________
Train on 3901 samples, validate on 1673 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


loss = 0.11493, accuracy = 0.97848
