## 지도 학습을 위한 데이터 분리 전처리 작업

### 1) zip 함수를 이용해 X, Y 분리

In [2]:
# Each [feature, label] pair is handed to zip as its own argument, so zip
# groups the first elements into X and the second elements into y.
X, y = zip(
    ['a', 1],
    ['b', 2],
    ['c', 3],
)
print(X, y, sep='\n')

('a', 'b', 'c')
(1, 2, 3)


In [4]:
# Same split, but starting from one list of [feature, label] pairs:
# the * operator unpacks the list so every pair becomes one zip argument.
sequences = [
    ['a', 1],
    ['b', 2],
    ['c', 3],
]
X, y = zip(*sequences)
print(f"{X}\n{y}")

('a', 'b', 'c')
(1, 2, 3)


### 2) 데이터프레임을 이용해 분리



In [9]:
import pandas as pd

# Column headers: the mail body text and a 0/1 spam flag.
columns = ['메일 본문', '스팸 메일 유무']

# One [mail body, spam flag] row per sample mail.
values = [
    ['당신에게 드리는 마지막 혜택!', 1],
    ['내일 뵐 수 있을지 확인 부탁드...', 0],
    ['도연씨. 잘 지내시죠? 오랜만입...', 0],
    ['(광고) AI로 주가를 예측할 수 있다!', 1],
]

df = pd.DataFrame(data=values, columns=columns)
df  # bare expression so the notebook renders the table

Unnamed: 0,메일 본문,스팸 메일 유무
0,당신에게 드리는 마지막 혜택!,1
1,내일 뵐 수 있을지 확인 부탁드...,0
2,도연씨. 잘 지내시죠? 오랜만입...,0
3,(광고) AI로 주가를 예측할 수 있다!,1


In [11]:
# Select the text column as the features (X) and the flag column as the
# labels (y); each selection is a pandas Series.
X, y = df['메일 본문'], df['스팸 메일 유무']
print(X, y, sep='\n')

0          당신에게 드리는 마지막 혜택!
1      내일 뵐 수 있을지 확인 부탁드...
2      도연씨. 잘 지내시죠? 오랜만입...
3    (광고) AI로 주가를 예측할 수 있다!
Name: 메일 본문, dtype: object
0    1
1    0
2    0
3    1
Name: 스팸 메일 유무, dtype: int64


### 3) Numpy를 이용한 분리

In [4]:
import numpy as np

# 4x4 matrix holding 0..15.
ar = np.arange(16).reshape(4, 4)
print(ar)

# Features: every row, every column except the last one.
X = ar[:, :-1]
print(X)

# Labels: every row, last column only (a 1-D array).
y = ar[:, -1]
print(y)

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]
 [12 13 14 15]]
[[ 0  1  2]
 [ 4  5  6]
 [ 8  9 10]
 [12 13 14]]
[ 3  7 11 15]


## 테스트 데이터 분리

### 1) 사이킷런(scikit-learn)을 이용해 분리

In [10]:
from sklearn.model_selection import train_test_split

# Toy data: 5 samples with 2 features each, labelled 0..4.
X = np.arange(10).reshape(5, 2)
y = range(5)

# test_size=0.3 asks for 30% of the samples as the test set;
# random_state fixes the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1234,
)

print(X, list(y), sep='\n')
print(X_train, X_test, sep='\n')
print(y_train, y_test, sep='\n')

[[0 1]
 [2 3]
 [4 5]
 [6 7]
 [8 9]]
[0, 1, 2, 3, 4]
[[2 3]
 [4 5]
 [6 7]]
[[8 9]
 [0 1]]
[1, 2, 3]
[4, 0]


### 2) 수동 분리

In [14]:
# Toy data: 12 samples with 2 features each, labelled 0..11.
X = np.arange(24).reshape(12, 2)
y = range(12)
print(X, list(y), sep='\n')

# 80% of the samples (truncated to an integer) go to the training set;
# whatever remains becomes the test set.
n_of_train = int(0.8 * len(X))
n_of_test = len(X) - n_of_train
print(n_of_train, n_of_test, sep='\n')

# No shuffling: the leading rows are the training set and the trailing
# rows are the test set.
X_train, X_test = X[:n_of_train], X[n_of_train:]
y_train, y_test = y[:n_of_train], y[n_of_train:]

print(X_test, list(y_test), sep='\n')

[[ 0  1]
 [ 2  3]
 [ 4  5]
 [ 6  7]
 [ 8  9]
 [10 11]
 [12 13]
 [14 15]
 [16 17]
 [18 19]
 [20 21]
 [22 23]]
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
9
3
[[18 19]
 [20 21]
 [22 23]]
[9, 10, 11]


## 정수 인코딩(Integer Encoding)

컴퓨터는 텍스트보다 숫자를 더 잘 처리한다.  
-> 자연어 처리에는 텍스트를 숫자로 바꾸는 여러 기법이 존재하며,
   이 기법들을 사용하려면 먼저 각 단어를 고유한 정수에 매핑(mapping)하는
   전처리 작업이 필요하다.

### 1) 정수 인코딩(Integer Encoding)

In [22]:
# Plan for this section: sort the words by frequency, then assign integers
# so that the most frequent word gets the lowest number.
import nltk
from nltk.tokenize import sent_tokenize

nltk.download('punkt')  # fetch the 'punkt' tokenizer package

# Split the raw paragraph into a list of sentences.
text = sent_tokenize(
    "A barber is a person. a barber is good person. a barber is huge person. he Knew A Secret! The Secret He Kept is huge secret. Huge secret. His barber kept his word. a barber kept his word. His barber kept his secret. But keeping and keeping such a huge secret to himself was driving the barber crazy. the barber went up a huge mountain."
)
print(text)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Chris\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


['A barber is a person.', 'a barber is good person.', 'a barber is huge person.', 'he Knew A Secret!', 'The Secret He Kept is huge secret.', 'Huge secret.', 'His barber kept his word.', 'a barber kept his word.', 'His barber kept his secret.', 'But keeping and keeping such a huge secret to himself was driving the barber crazy.', 'the barber went up a huge mountain.']


In [25]:
# Cleaning + word tokenization + word-frequency counting
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

vocab = Counter()   # word -> number of occurrences across all sentences
sentences = []      # one list of kept tokens per input sentence

for sentence in text:
    # Lower-case every token, then drop English stopwords and any token
    # of two characters or fewer.
    lowered = [token.lower() for token in word_tokenize(sentence)]
    kept = [w for w in lowered if w not in stop_words and len(w) > 2]
    vocab.update(kept)
    sentences.append(kept)

print(sentences)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Chris\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


[['barber', 'person'], ['barber', 'good', 'person'], ['barber', 'huge', 'person'], ['knew', 'secret'], ['secret', 'kept', 'huge', 'secret'], ['huge', 'secret'], ['barber', 'kept', 'word'], ['barber', 'kept', 'word'], ['barber', 'kept', 'secret'], ['keeping', 'keeping', 'huge', 'secret', 'driving', 'barber', 'crazy'], ['barber', 'went', 'huge', 'mountain']]


In [29]:
print(vocab)

# (word, count) pairs ordered from most to least frequent; words with
# equal counts keep the order in which they were first counted.
vocab_sorted = vocab.most_common()
print(vocab_sorted)

# Words seen only once are skipped; the remaining words are numbered
# 1, 2, 3, ... in descending order of frequency.
word_to_index = {
    word: index
    for index, (word, _) in enumerate(
        (item for item in vocab_sorted if item[1] > 1), start=1
    )
}
print(word_to_index)

Counter({'barber': 8, 'secret': 6, 'huge': 5, 'kept': 4, 'person': 3, 'word': 2, 'keeping': 2, 'good': 1, 'knew': 1, 'driving': 1, 'crazy': 1, 'went': 1, 'mountain': 1})
[('barber', 8), ('secret', 6), ('huge', 5), ('kept', 4), ('person', 3), ('word', 2), ('keeping', 2), ('good', 1), ('knew', 1), ('driving', 1), ('crazy', 1), ('went', 1), ('mountain', 1)]
{'barber': 1, 'secret': 2, 'huge': 3, 'kept': 4, 'person': 5, 'word': 6, 'keeping': 7}


### 2) 케라스(Keras)의 텍스트 전처리

In [49]:
from keras.preprocessing.text import Tokenizer

# The whole paragraph is kept as a single-element list of documents.
text = ["A barber is a person. a barber is good person. a barber is huge person. he Knew A Secret! The Secret He Kept is huge secret. Huge secret. His barber kept his word. a barber kept his word. His barber kept his secret. But keeping and keeping such a huge secret to himself was driving the barber crazy. the barber went up a huge mountain."]

# A list has no `.type` attribute (`text.type` raises AttributeError);
# the built-in type() is the way to inspect what kind of object this is.
type(text)

Using TensorFlow backend.


ModuleNotFoundError: No module named 'tensorflow'