In [None]:
# Term Frequency - Inverse Document Frequency
# W = TF(특정 단어가 문서 내에 등장하는 빈도 수) * log(N/Df)
# log를 사용하는 이유 : N/Df가 굉장히 큰 값인데 가중치이기에 log로 크기를 줄여 사용

# 상대적으로 덜 등장하는 값이 중요할 가능성이 높기 때문에 inverse함
# Rank-Frequency Law (Zipf’s Law) : Frequency(빈도 수) * Rank(중요도) = Constant
# 정보검색 시스템(Information Retrieval System) : 질의어와 관련있는 document의 상대적인 부합 정도를 반환
# Rank ≅ 𝛂𝑻𝑭 ∗ 𝜷[𝑳𝒐𝒈𝜸𝑵 − 𝑳𝒐𝒈𝜸𝑫𝑭 + 𝟏]

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = [
    'This is the first document.',
    'This is the second second document.',
    'And the third one.',
    'Is this the first document?']

vect1 = CountVectorizer().fit(corpus)
tf = vect1.transform(corpus)

feature_names = vect1.get_feature_names() # 어휘사전에 있는 어휘를 리스트 형태로 반환
print("Term:{}".format(feature_names[:]))
print(tf.toarray(), "\n")

vect2 = TfidfVectorizer().fit(corpus)
tfidf = vect2.transform(corpus)
print(tfidf.toarray()) # 가중치로 인해 실수로 나타남

Term:['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
[[0 1 1 1 0 0 1 0 1]
 [0 1 0 1 0 2 1 0 1]
 [1 0 0 0 1 0 1 1 0]
 [0 1 1 1 0 0 1 0 1]] 

[[0.         0.43877674 0.54197657 0.43877674 0.         0.
  0.35872874 0.         0.43877674]
 [0.         0.27230147 0.         0.27230147 0.         0.85322574
  0.22262429 0.         0.27230147]
 [0.55280532 0.         0.         0.         0.55280532 0.
  0.28847675 0.55280532 0.        ]
 [0.         0.43877674 0.54197657 0.43877674 0.         0.
  0.35872874 0.         0.43877674]]


In [None]:
# n-gram 방식
# gram : unit -> 어휘사전 unit(term)의 갯수
# ngram_range(1,1) : min = 1, max = 1
# unit이 term일 필요는 없음 ex) char

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

bards_words = ["The fool doth think he is wise,",
              "but the wise man knows himself to be a fool"]

cv = CountVectorizer(ngram_range = (1,1)).fit(bards_words)
print("어휘사전의 크기: {}".format(len(cv.vocabulary_)))
print("어휘사전:\n{}".format(cv.get_feature_names()))
print("변환된 데이터:\n{}".format(cv.transform(bards_words).toarray()))

어휘사전의 크기: 13
어휘사전:
['be', 'but', 'doth', 'fool', 'he', 'himself', 'is', 'knows', 'man', 'the', 'think', 'to', 'wise']
변환된 데이터:
[[0 0 1 1 1 0 1 0 0 1 1 0 1]
 [1 1 0 1 0 1 0 1 1 1 0 1 1]]


In [7]:
from sklearn.feature_extraction.text import CountVectorizer

bards_words = ["The fool doth think he is wise,",
              "but the wise man knows himself to be a fool"]

cv = CountVectorizer(ngram_range = (1,3)).fit(bards_words)
print("어휘사전의 크기: {}".format(len(cv.vocabulary_)))
print("어휘사전:\n{}".format(cv.get_feature_names()))
print("변환된 데이터:\n{}".format(cv.transform(bards_words).toarray()))

어휘사전의 크기: 39
어휘사전:
['be', 'be fool', 'but', 'but the', 'but the wise', 'doth', 'doth think', 'doth think he', 'fool', 'fool doth', 'fool doth think', 'he', 'he is', 'he is wise', 'himself', 'himself to', 'himself to be', 'is', 'is wise', 'knows', 'knows himself', 'knows himself to', 'man', 'man knows', 'man knows himself', 'the', 'the fool', 'the fool doth', 'the wise', 'the wise man', 'think', 'think he', 'think he is', 'to', 'to be', 'to be fool', 'wise', 'wise man', 'wise man knows']
변환된 데이터:
[[0 0 0 0 0 1 1 1 1 1 1 1 1 1 0 0 0 1 1 0 0 0 0 0 0 1 1 1 0 0 1 1 1 0 0 0
  1 0 0]
 [1 1 1 1 1 0 0 0 1 0 0 0 0 0 1 1 1 0 0 1 1 1 1 1 1 1 0 0 1 1 0 0 0 1 1 1
  1 1 1]]


In [None]:
# relevant : 부합하는, 관련있는
# append : 첨부하다
# substitue : 대체하다
# accuracy : 정확성

In [1]:
# import nltk
# nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [1]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

ps = PorterStemmer() # stemmer 객체 생성

example_words = ["python", "pythoner", "pythoning", "pythoned", "pythonly"]
for s in example_words:
    print(ps.stem(s))

python
python
python
python
pythonli


In [3]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

ps = PorterStemmer()

new_text = "It is important to be very pythonly while you are pythoning with python"
words = word_tokenize(new_text) # 토큰화(찢기)
print(words)
print("\n")

for w in words:
    print(ps.stem(w)) # 어간 추출(stemming)
print("\n")

# 화면에 출력하지 않고 term 리스트로 유지
result = [ps.stem(w) for w in words]
print(result)

['It', 'is', 'important', 'to', 'be', 'very', 'pythonly', 'while', 'you', 'are', 'pythoning', 'with', 'python']


It
is
import
to
be
veri
pythonli
while
you
are
python
with
python


['It', 'is', 'import', 'to', 'be', 'veri', 'pythonli', 'while', 'you', 'are', 'python', 'with', 'python']


In [5]:
import nltk
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer

stemmer = PorterStemmer()

def tokenize(text):
    tokens = nltk.word_tokenize(text)
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

# CountVectorizer에 토큰을 생성하는 별도의 함수(tokenize)를 지정함
# 함수 내부에서 토큰화와 stemming을 진행
vect = CountVectorizer(tokenizer = tokenize, stop_words = 'english') # tokenizer가 token_pattern보다 우선순위가 높은 것으로 추정
vect.fit(["The swimmer likes swimming."])

sentence1 = vect.transform(["The swimmer likes swimming."])
sentence2 = vect.transform(["The swimmer swim. ."])

print(vect.get_feature_names()) # 어휘사전에 있는 어휘를 리스트 형태로 반환

# sentence1과 2는 sparse matrix
print(sentence1.toarray())
print(sentence2.toarray())

['.', 'like', 'swim', 'swimmer']
[[1 1 1 1]]
[[2 0 1 1]]




In [3]:
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
import nltk
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
import numpy as np
import re

reviews_train = load_files("/Users/leedonghyeok/Downloads/aclImdb/train")
reviews_test = load_files("/Users/leedonghyeok/Downloads/aclImdb/test")

text_train, y_train = reviews_train.data, reviews_train.target
text_test, y_test = reviews_test.data, reviews_test.target

print("테스트 데이터의 수: {}".format(len(text_test)))
print("클래스별 샘플 수 (테스트 데이터): {}".format(np.bincount(y_test)))

text_train = [doc.replace(b"<br />", b" ")  for doc in text_train]
text_test = [doc.replace(b"<br />", b" ")  for doc in text_test]

stemmer = PorterStemmer()

def tokenize(text):
    tokens = nltk.word_tokenize(text)
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

vect = CountVectorizer(tokenizer = tokenize, stop_words = 'english').fit(text_train)
X_train = vect.transform(text_train)
X_test = vect.transform(text_test)

clf = LogisticRegression()
clf.fit(X_train, y_train)
pre = clf.predict(X_test)

ac_score = accuracy_score(y_test,pre)
print("정답률 = ", ac_score)

테스트 데이터의 수: 25000
클래스별 샘플 수 (테스트 데이터): [12500 12500]




정답률 =  0.86488


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# accuray_score : maximum iteration 횟수가 존재하여 돌리면 돌릴수록 수치가 올라갈 듯

In [None]:
# P(x|y) >= P(x,y)
# P(x,y) = P(x) P(y) (if x and y are statistically independent)
# P(x,y) = P(y,x) -> P(x|y) = P(y|x) P(x) / P(y) : Bayesian theorem