## nltk 모듈의 sent_tokenize
입력 텍스트를 문장 단위로 토큰화해준다.

In [1]:
from nltk import sent_tokenize
import nltk
nltk.download('punkt')

text_sample = 'The Matrix is everywhere its all around us, here even in this room. \
               You can see it out your window or on your television. \
               You feel it when you go to work, or go to church or pay your taxes.'
sentences = sent_tokenize(text=text_sample)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/datawhales/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
sentences

['The Matrix is everywhere its all around us, here even in this room.',
 'You can see it out your window or on your television.',
 'You feel it when you go to work, or go to church or pay your taxes.']

## nltk 모듈의 word_tokenize
입력 텍스트를 단어 단위로 토큰화해준다.

In [4]:
from nltk import word_tokenize

sentence = 'The Matrix is everywhere its all around us, here even in this room.'
words = word_tokenize(sentence)
print(words)

['The', 'Matrix', 'is', 'everywhere', 'its', 'all', 'around', 'us', ',', 'here', 'even', 'in', 'this', 'room', '.']


## 여러 개 문장으로 이루어진 텍스트 데이터를 문장별로 단어 토큰화하는 함수 작성

In [6]:
from nltk import sent_tokenize, word_tokenize

def tokenize_text(text):
    """ 주어진 텍스트를 먼저 문장 단위로 토큰화하고,
        각 문장을 단어 단위로 토큰화하는 함수.
    """
    sentences = sent_tokenize(text)
    word_tokens = [word_tokenize(sentence) for sentence in sentences]
    return word_tokens

In [7]:
word_tokens = tokenize_text(text_sample)
print(word_tokens)

[['The', 'Matrix', 'is', 'everywhere', 'its', 'all', 'around', 'us', ',', 'here', 'even', 'in', 'this', 'room', '.'], ['You', 'can', 'see', 'it', 'out', 'your', 'window', 'or', 'on', 'your', 'television', '.'], ['You', 'feel', 'it', 'when', 'you', 'go', 'to', 'work', ',', 'or', 'go', 'to', 'church', 'or', 'pay', 'your', 'taxes', '.']]


## stopwords
stopwords란 텍스트 분석에 있어서 큰 의미가 없는 단어를 말한다. 문법적인 특성으로 인해 텍스트에 자주 나타나지만 문장을 이해하는데 중요한 특성으로 사용되지 않는 단어들이 해당된다.

In [8]:
import nltk
nltk.download('stopwords')

print(f"영어 stopwords 개수: {len(nltk.corpus.stopwords.words('english'))}")
print(nltk.corpus.stopwords.words('english')[:10])

영어 stopwords 개수: 179
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/datawhales/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
# 단어 단위로 토큰화된 text_sample인 word_tokens에서 stopwords 제거
stopwords = nltk.corpus.stopwords.words('english')
tokens = []
for sent in word_tokens:
    filtered_words = [word.lower() for word in sent if word.lower() not in stopwords]
    tokens.append(filtered_words)
print(tokens)

[['matrix', 'everywhere', 'around', 'us', ',', 'even', 'room', '.'], ['see', 'window', 'television', '.'], ['feel', 'go', 'work', ',', 'go', 'church', 'pay', 'taxes', '.']]


## Stemming & Lemmatization

In [14]:
from nltk.stem import LancasterStemmer
stemmer = LancasterStemmer()

print(stemmer.stem('working'), stemmer.stem('works'), stemmer.stem('worked'))
print(stemmer.stem('amusing'), stemmer.stem('amuses'), stemmer.stem('amused'))
print(stemmer.stem('happier'), stemmer.stem('happiest'))
print(stemmer.stem('fancier'), stemmer.stem('faciest'))

work work work
amus amus amus
happy happiest
fant faciest


In [15]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

lemma = WordNetLemmatizer()
print(lemma.lemmatize('amusing', 'v'), lemma.lemmatize('amuses', 'v'), lemma.lemmatize('amused', 'v'))

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/datawhales/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


amuse amuse amuse


In [16]:
print(lemma.lemmatize('happier', 'a'), lemma.lemmatize('happiest', 'a'))
print(lemma.lemmatize('fancier', 'a'), lemma.lemmatize('fanciest', 'a'))

happy happy
fancy fancy


## Bag of Words - BOW
Bag of Words 모델은 문서가 가지는 모든 단어를 문맥이나 순서를 무시하고 각 단어에 대해 빈도(frequency) 값을 부여하여 피처 값을 추출하는 모델이다.  
BOW 모델의 단점으로는 semantic context(문맥적 의미)를 반영하기 어렵다는 것과 sparse matrix 문제가 존재한다.  
sparse matrix 문제란 문서마다 서로 다른 단어로 구성되기 때문에 하나의 문서에 있는 단어가 전체 단어 종류 중 극히 일부분이 되므로 대부분의 데이터가 0으로 채워지는 것을 말한다. 이러한 sparse matrix는 ML 알고리즘의 예측 성능을 떨어뜨리게 된다.

BOW 모델에서의 feature vectorization.  
text 데이터를 수치화해야 하는데, 모든 문서에서 모든 단어를 column 형태로 나열하고 각 문서에서 해당 단어의 횟수를 값으로 부여하여 M * N 형태의 matrix를 생성한다.  

## TF-IDF
TF-IDF란 Term Frequency Inverse Document Frequency의 약자로, 개별 문서에서 자주 나타나는 단어일수록 높은 가중치를 주되, 모든 문서에서 전반적으로 자주 나타나는 단어에 대해서는 패털티를 주는 방식으로 값을 부여하는 방식을 말한다.  


## Sparse Matrix의 처리 - COO, CSR 방식
COO(Coordinate) 방식은 0이 아닌 데이터만 별도의 데이터 배열에 저장하고 그 데이터가 가리키는 행과 열의 위치를 별도의 배열로 저장하는 방식.  
CSR(Compressed Sparse Row) 방식은 COO 방식이 위치를 표현할 때 같은 데이터를 반복적으로 사용하는 것을 해결한 방식.  

In [19]:
# COO
import numpy as np

dense = np.array([[3,0,1], [0,2,0]])

from scipy import sparse

data = np.array([3, 1, 2])

# (0, 0)에 3, (0, 2)에 1, (1, 1)에 2
row_pos = np.array([0, 0, 1])
col_pos = np.array([0, 2, 1])

sparse_coo = sparse.coo_matrix((data, (row_pos, col_pos)))

In [20]:
sparse_coo.toarray()

array([[3, 0, 1],
       [0, 2, 0]])

In [22]:
# CSR
from scipy import sparse
dense2 = np.array([[0,0,1,0,0,5],
                   [1,4,0,3,2,5],
                   [0,6,0,3,0,0],
                   [2,0,0,0,0,0],
                   [0,0,0,7,0,8],
                   [1,0,0,0,0,0]])

# 0이 아닌 데이터
data2 = np.array([1, 5, 1, 4, 3, 2, 5, 6, 3, 2, 7, 8, 1])

row_pos = np.array([0,0,1,1,1,1,1,2,2,3,4,4,5])
col_pos = np.array([2,5,0,1,3,4,5,1,3,0,3,5,0])

sparse_coo = sparse.coo_matrix((data2, (row_pos, col_pos)))

row_pos_ind = np.array([0,2,7,9,10,12,13])

sparse_csr = sparse.csr_matrix((data2, col_pos, row_pos_ind))

print(sparse_coo.toarray())
print('-'*20)
print(sparse_csr.toarray())

[[0 0 1 0 0 5]
 [1 4 0 3 2 5]
 [0 6 0 3 0 0]
 [2 0 0 0 0 0]
 [0 0 0 7 0 8]
 [1 0 0 0 0 0]]
--------------------
[[0 0 1 0 0 5]
 [1 4 0 3 2 5]
 [0 6 0 3 0 0]
 [2 0 0 0 0 0]
 [0 0 0 7 0 8]
 [1 0 0 0 0 0]]


In [23]:
## example
dense3 = np.array([[0,0,1,0,0,5],
                   [1,4,0,3,2,5],
                   [0,6,0,3,0,0],
                   [2,0,0,0,0,0],
                   [0,0,0,7,0,8],
                   [1,0,0,0,0,0]])

coo = sparse.coo_matrix(dense3)
csr = sparse.csr_matrix(dense3)


In [26]:
print(coo)

  (0, 2)	1
  (0, 5)	5
  (1, 0)	1
  (1, 1)	4
  (1, 3)	3
  (1, 4)	2
  (1, 5)	5
  (2, 1)	6
  (2, 3)	3
  (3, 0)	2
  (4, 3)	7
  (4, 5)	8
  (5, 0)	1


In [27]:
print(csr)

  (0, 2)	1
  (0, 5)	5
  (1, 0)	1
  (1, 1)	4
  (1, 3)	3
  (1, 4)	2
  (1, 5)	5
  (2, 1)	6
  (2, 3)	3
  (3, 0)	2
  (4, 3)	7
  (4, 5)	8
  (5, 0)	1


## 20newsgroups Classification

In [28]:
from sklearn.datasets import fetch_20newsgroups

news_data = fetch_20newsgroups(subset='all', random_state=156)

news_data.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [30]:
len(news_data.data)

18846

In [31]:
news_data.data[0]

'From: egreen@east.sun.com (Ed Green - Pixel Cruncher)\nSubject: Re: Observation re: helmets\nOrganization: Sun Microsystems, RTP, NC\nLines: 21\nDistribution: world\nReply-To: egreen@east.sun.com\nNNTP-Posting-Host: laser.east.sun.com\n\nIn article 211353@mavenry.altcit.eskimo.com, maven@mavenry.altcit.eskimo.com (Norman Hamer) writes:\n> \n> The question for the day is re: passenger helmets, if you don\'t know for \n>certain who\'s gonna ride with you (like say you meet them at a .... church \n>meeting, yeah, that\'s the ticket)... What are some guidelines? Should I just \n>pick up another shoei in my size to have a backup helmet (XL), or should I \n>maybe get an inexpensive one of a smaller size to accomodate my likely \n>passenger? \n\nIf your primary concern is protecting the passenger in the event of a\ncrash, have him or her fitted for a helmet that is their size.  If your\nprimary concern is complying with stupid helmet laws, carry a real big\nspare (you can put a big or small 

In [32]:
news_data.data[1]



In [38]:
import pandas as pd

for key in news_data.keys():
    print(len(news_data[key]))

18846
18846
20
18846
9442


In [39]:
news_data.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [40]:
news_data.target

array([ 8,  8, 12, ...,  7,  3,  9])

In [56]:
pd.Series(news_data.target).value_counts().sort_index()

0     799
1     973
2     985
3     982
4     963
5     988
6     975
7     990
8     996
9     994
10    999
11    991
12    984
13    990
14    987
15    997
16    910
17    940
18    775
19    628
dtype: int64

In [57]:
print(news_data.data[0])

From: egreen@east.sun.com (Ed Green - Pixel Cruncher)
Subject: Re: Observation re: helmets
Organization: Sun Microsystems, RTP, NC
Lines: 21
Distribution: world
Reply-To: egreen@east.sun.com
NNTP-Posting-Host: laser.east.sun.com

In article 211353@mavenry.altcit.eskimo.com, maven@mavenry.altcit.eskimo.com (Norman Hamer) writes:
> 
> The question for the day is re: passenger helmets, if you don't know for 
>certain who's gonna ride with you (like say you meet them at a .... church 
>meeting, yeah, that's the ticket)... What are some guidelines? Should I just 
>pick up another shoei in my size to have a backup helmet (XL), or should I 
>maybe get an inexpensive one of a smaller size to accomodate my likely 
>passenger? 

If your primary concern is protecting the passenger in the event of a
crash, have him or her fitted for a helmet that is their size.  If your
primary concern is complying with stupid helmet laws, carry a real big
spare (you can put a big or small head in a big helmet, bu

### train, test data

In [58]:
from sklearn.datasets import fetch_20newsgroups

train_news = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'), random_state=156)

train_x = train_news.data
train_y = train_news.target

test_news = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'), random_state=156)

test_x = test_news.data
test_y = test_news.target
print(f"Train data size: {len(train_x)}, Test data size: {len(test_x)}")

Train data size: 11314, Test data size: 7532


## CountVectorizer 이용

In [64]:
from sklearn.feature_extraction.text import CountVectorizer

cnt_vect = CountVectorizer()
cnt_vect.fit(train_x)
train_x_cnt_vect = cnt_vect.transform(train_x)

# test data도 train data와 같은 feature를 가지도록 transform
test_x_cnt_vect = cnt_vect.transform(test_x)

print(train_x_cnt_vect.shape)

(11314, 101631)


train data를 CountVectorizer를 이용하여 feature extraction을 진행한 결과 11314개 문서에서 feature라고 할 수 있는 단어가 101631개 만들어졌음을 확인할 수 있다. 이를 이용하여 로지스틱 회귀를 적용하여 뉴스그룹에 대한 classification을 할 수 있다.

In [65]:
train_x_cnt_vect[0]

<1x101631 sparse matrix of type '<class 'numpy.int64'>'
	with 55 stored elements in Compressed Sparse Row format>

In [67]:
print(train_x_cnt_vect)

  (0, 2223)	1
  (0, 16251)	1
  (0, 16406)	1
  (0, 17936)	1
  (0, 18903)	1
  (0, 19756)	1
  (0, 20123)	1
  (0, 21987)	1
  (0, 23663)	2
  (0, 23790)	1
  (0, 24444)	1
  (0, 25370)	1
  (0, 25590)	3
  (0, 26271)	3
  (0, 26277)	1
  (0, 26992)	1
  (0, 28805)	1
  (0, 31939)	1
  (0, 33551)	1
  (0, 33799)	1
  (0, 35147)	2
  (0, 38824)	1
  (0, 41715)	1
  (0, 43217)	2
  (0, 43961)	2
  :	:
  (11313, 82046)	1
  (11313, 82393)	1
  (11313, 83426)	1
  (11313, 84598)	1
  (11313, 84995)	1
  (11313, 86607)	1
  (11313, 88273)	1
  (11313, 88532)	5
  (11313, 88694)	1
  (11313, 88755)	1
  (11313, 88767)	2
  (11313, 89360)	5
  (11313, 90780)	1
  (11313, 92629)	2
  (11313, 92875)	1
  (11313, 93870)	1
  (11313, 96138)	1
  (11313, 96429)	3
  (11313, 96433)	1
  (11313, 96683)	1
  (11313, 96917)	1
  (11313, 96940)	1
  (11313, 97181)	1
  (11313, 99908)	1
  (11313, 100208)	1


In [68]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

classifier = LogisticRegression()
classifier.fit(train_x_cnt_vect, train_y)
pred = classifier.predict(test_x_cnt_vect)
print(f"CountVectorized Logistic Regression Accuracy: {accuracy_score(test_y, pred):.3f}")

AttributeError: 'str' object has no attribute 'decode'

In [69]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer()
tfidf_vect.fit(train_x)
train_x_tfidf_vect = tfidf_vect.transform(train_x)
test_x_tfidf_vect = tfidf_vect.transform(test_x)

classifier = LogisticRegression()
classifier.fit(train_x_tfidf_vect, train_y)
pred = classifier.predict(test_x_tfidf_vect)
print(f"TF-IDF Logistic Regression Accuracy: {accuracy_score(test_y, pred):.3f}")

TF-IDF Logistic Regression Accuracy: 0.674
