In [1]:
from sklearn.feature_extraction import DictVectorizer
v = DictVectorizer(sparse=False)
D = [{'A':1, 'B':2}, {'B':3, 'C': 1}]
X = v.fit_transform(D)
X

array([[1., 2., 0.],
       [0., 3., 1.]])

In [2]:
v.feature_names_

['A', 'B', 'C']

In [3]:
v.transform({'C':4, 'D':3}) # D는 없는걸로 나옴

array([[0., 0., 4.]])

## CountVectorizer
- 문서를 넣어주면 알아서 토큰리스트를 변환해주고 빈도를 새고 BOW로 변환까지 시켜줌
- 개발자가 직접 만들어야하기 때문에 거의 쓸 수 없음

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
corpus = [
    'This is the first document.',
    'This is the second second document',
    'And the third one.',
    'Is this the first document?',
    'The last document?'
]

vect = CountVectorizer()
vect.fit(corpus)
vect.vocabulary_

{'this': 9,
 'is': 3,
 'the': 7,
 'first': 2,
 'document': 1,
 'second': 6,
 'and': 0,
 'third': 8,
 'one': 5,
 'last': 4}

In [5]:
vect.transform(['This is the second document.']).toarray()

array([[0, 1, 0, 1, 0, 0, 1, 1, 0, 1]])

In [6]:
vect.transform(['Something completely new.']).toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

## TF-IDF 인코딩


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidv = TfidfVectorizer().fit(corpus)
tfidv.transform(corpus).toarray()

array([[0.        , 0.38947624, 0.55775063, 0.4629834 , 0.        ,
        0.        , 0.        , 0.32941651, 0.        , 0.4629834 ],
       [0.        , 0.24151532, 0.        , 0.28709733, 0.        ,
        0.        , 0.85737594, 0.20427211, 0.        , 0.28709733],
       [0.55666851, 0.        , 0.        , 0.        , 0.        ,
        0.55666851, 0.        , 0.26525553, 0.55666851, 0.        ],
       [0.        , 0.38947624, 0.55775063, 0.4629834 , 0.        ,
        0.        , 0.        , 0.32941651, 0.        , 0.4629834 ],
       [0.        , 0.45333103, 0.        , 0.        , 0.80465933,
        0.        , 0.        , 0.38342448, 0.        , 0.        ]])

## 해시 트릭(Hash Trick)

## gensim

In [8]:
corpus = [
    'This is the first document.',
    'This is the second second document',
    'And the third one.',
    'Is this the first document?',
    'The last document?'
]

In [9]:
token_list = [[text for text in doc.split()] for doc in corpus]
token_list

[['This', 'is', 'the', 'first', 'document.'],
 ['This', 'is', 'the', 'second', 'second', 'document'],
 ['And', 'the', 'third', 'one.'],
 ['Is', 'this', 'the', 'first', 'document?'],
 ['The', 'last', 'document?']]

In [10]:
!pip install --upgrade gensim



In [11]:
!pip install --upgrade gensim



In [12]:
!python3 -m pip install --upgrade gensim



In [13]:
from gensim.corpora import Dictionary

In [14]:
from nltk.corpus import movie_reviews
from nltk.util import ngrams

sentences = []
for tokens in movie_reviews.sents():
    bigram = ngrams(tokens, 2, pad_left=True, pad_right=True, left_pad_symbol="SS", right_pad_symbol="SE")
    sentences += [t for t in bigram]

sentences[:17]

[('SS', 'plot'),
 ('plot', ':'),
 (':', 'two'),
 ('two', 'teen'),
 ('teen', 'couples'),
 ('couples', 'go'),
 ('go', 'to'),
 ('to', 'a'),
 ('a', 'church'),
 ('church', 'party'),
 ('party', ','),
 (',', 'drink'),
 ('drink', 'and'),
 ('and', 'then'),
 ('then', 'drive'),
 ('drive', '.'),
 ('.', 'SE')]

In [15]:
from nltk import ConditionalFreqDist

cfd = ConditionalFreqDist(sentences)

In [16]:
cfd["SS"].most_common(5)


[('the', 8071), ('.', 3173), ('it', 3136), ('i', 2471), ('but', 1814)]

In [17]:
import matplotlib.pyplot as plt
 

cfd["SS"].plot(5, title="문장의 첫단어 분포")
plt.show()

In [18]:
from nltk.probability import ConditionalProbDist, MLEProbDist
cpd = ConditionalProbDist(cfd, MLEProbDist)

In [19]:
cpd["i"].prob("am")

0.018562267971650354

In [20]:
cpd["i"].prob("is")

0.0002249971875351558

In [21]:
cpd["we"].prob("are")

0.08504504504504505

In [22]:
cpd["we"].prob("is")

0.0

In [23]:
def sentence_score(s):
    p = 0.0
    for i in range(len(s) - 1):
        c = s[i]
        w = s[i + 1]
        p += np.log(cpd[c].prob(w) + np.finfo(float).eps)
    return np.exp(p)

In [24]:
test_sentence = ["i", "like", "the", "movie", "."]

sentence_score(test_sentence)

2.740764134071561e-06

In [25]:
def generate_sentence(seed=None):
    if seed is not None:
        import random
        random.seed(seed)
    c = "SS"
    sentence = []
    while True:
        if c not in cpd:
            break
        w = cpd[c].generate()

        if w == "SE":
            break
        elif w in ["i", "ii", "iii"]:
            w2 = w.upper()
        elif w in ["mr", "luc", "i", "robin", "williams", "cindy", "crawford"]:
            w2 = w.title()
        else:
            w2 = w

        if c == "SS":
            sentence.append(w2.title())
        elif c in ["`", "\"", "'", "("]:
            sentence.append(w2)
        elif w in ["'", ".", ",", ")", ":", ";", "?"]:
            sentence.append(w2)
        else:
            sentence.append(" " + w2)

        c = w
    return "".join(sentence)

In [26]:
generate_sentence()

"She't."

## Soynlp

In [27]:
!wget https://raw.githubusercontent.com/lovit/soynlp/master/tutorials/2016-10-20.txt -O 2016-10-20.txt

--2021-06-20 16:09:25--  https://raw.githubusercontent.com/lovit/soynlp/master/tutorials/2016-10-20.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 43694449 (42M) [text/plain]
Saving to: `2016-10-20.txt'


2021-06-20 16:09:32 (11.5 MB/s) - `2016-10-20.txt' saved [43694449/43694449]



In [28]:
import sys
print(sys.executable)

/Library/Frameworks/Python.framework/Versions/3.8/bin/python3


In [29]:
sys.path.append('/Library/Frameworks/Python.framework/Versions/3.8/bin/python3')

In [30]:
print(sys.path)

['/Users/sangjulee1/Documents/dss0/dataScience_fastCampus/07.머신러닝', '/Library/Frameworks/Python.framework/Versions/3.8/lib/python38.zip', '/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8', '/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/lib-dynload', '', '/Users/sangjulee1/Library/Python/3.8/lib/python/site-packages', '/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages', '/Users/sangjulee1/Library/Python/3.8/lib/python/site-packages/IPython/extensions', '/Users/sangjulee1/.ipython', '/Library/Frameworks/Python.framework/Versions/3.8/bin/python3']


In [31]:
!pip install soynlp




In [32]:
from soynlp import DoublespaceLineCorpus

# 문서 단위 말뭉치 생성 
corpus = DoublespaceLineCorpus("2016-10-20.txt")
len(corpus)  # 문서의 갯수

30091

In [34]:

import ssl
ssl._create_default_https_context = ssl._create_unverified_context

In [35]:
import nltk
nltk.download('movie_reviews')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     /Users/sangjulee1/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


True

In [36]:
from nltk.corpus import movie_reviews
sentence = [ list(s) for s in movie_reviews.sents() ]

In [37]:
sentence[0]

['plot',
 ':',
 'two',
 'teen',
 'couples',
 'go',
 'to',
 'a',
 'church',
 'party',
 ',',
 'drink',
 'and',
 'then',
 'drive',
 '.']

In [38]:
from gensim.models.word2vec import Word2Vec

In [39]:
model = Word2Vec(sentences)

In [40]:
model.init_sims(replace=True)

In [41]:
model.wv.similarity('actor', 'actress')

0.87093997

In [42]:
model.wv.similarity('he', 'she')

0.98096657

In [43]:
model.wv.similarity('actor', 'he')

0.26820204

In [44]:
model.wv.most_similar("accident")

[('expedition', 0.8901711106300354),
 ('experiment', 0.8870561122894287),
 ('elevator', 0.8835715651512146),
 ('authority', 0.8817894458770752),
 ('edge', 0.8772596120834351),
 ('trunk', 0.8746817111968994),
 ('enemy', 0.8714993596076965),
 ('operation', 0.8683598041534424),
 ('settled', 0.8663983345031738),
 ('saint', 0.8661829233169556)]

In [45]:
model.wv.most_similar(positive=['she', 'actor'], negative='actress', topn=1)

[('who', -0.035959843546152115)]