<a href="https://colab.research.google.com/github/git933/Machine-Learning/blob/main/04_text_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive inside the Colab runtime at /content/drive.
# NOTE(review): none of the visible cells read from the mounted drive — confirm
# this cell is still needed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# DictVectorizer

In [24]:
from sklearn.feature_extraction import DictVectorizer

# Two samples described as {feature name: value} mappings. A feature that is
# absent from a sample shows up as 0 in that sample's row.
D = [
    {"A": 1, "B": 2},
    {"B": 3, "C": 1},
]

# sparse=False yields a dense NumPy array rather than a SciPy sparse matrix.
v = DictVectorizer(sparse=False)
X = v.fit_transform(D)
X

array([[1., 2., 0.],
       [0., 3., 1.]])

In [3]:
# Feature names learned by fit_transform, in the column order of the matrix above.
v.feature_names_

['A', 'B', 'C']

In [4]:
# 'D' was not present when the vectorizer was fitted, so it is dropped from the
# result; only the known 'C' column receives a value.
new_sample = {"C": 4, "D": 3}
v.transform(new_sample)

array([[0., 0., 4.]])

# CountVectorizer

In [25]:
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus reused by the cells that follow.
corpus = [
    "This is the first document.",
    "This is the second document.",
    "And the third one.",
    "Is this the first document?",
    "The last document?",
]

# fit() returns the vectorizer itself, so construction and fitting are chained.
vect = CountVectorizer().fit(corpus)
vect.vocabulary_  # mapping: token -> column index

{'and': 0,
 'document': 1,
 'first': 2,
 'is': 3,
 'last': 4,
 'one': 5,
 'second': 6,
 'the': 7,
 'third': 8,
 'this': 9}

In [6]:
# Count vector for a single sentence; columns follow vect.vocabulary_.
known_sentence = ["This is the second document."]
vect.transform(known_sentence).toarray()

array([[0, 1, 0, 1, 0, 0, 1, 1, 0, 1]])

In [7]:
# None of these words are in the fitted vocabulary, so every count is zero.
unknown_sentence = ["Something completely new."]
vect.transform(unknown_sentence).toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [8]:
# Document-term count matrix for the whole corpus (one row per document).
corpus_counts = vect.transform(corpus)
corpus_counts.toarray()

array([[0, 1, 1, 1, 0, 0, 0, 1, 0, 1],
       [0, 1, 0, 1, 0, 0, 1, 1, 0, 1],
       [1, 0, 0, 0, 0, 1, 0, 1, 1, 0],
       [0, 1, 1, 1, 0, 0, 0, 1, 0, 1],
       [0, 1, 0, 0, 1, 0, 0, 1, 0, 0]])

# stop words

In [10]:
# Exclude a hand-picked list of words from the vocabulary.
custom_stop_words = ["and", "is", "the", "this"]
vect = CountVectorizer(stop_words=custom_stop_words)
vect.fit(corpus)
vect.vocabulary_

{'document': 0, 'first': 1, 'last': 2, 'one': 3, 'second': 4, 'third': 5}

In [11]:
# Use scikit-learn's built-in English stop-word list instead of a custom one.
vect = CountVectorizer(stop_words="english")
vect.fit(corpus)
vect.vocabulary_

{'document': 0, 'second': 1}

# 토큰

In [12]:
# Build the vocabulary from individual characters instead of words.
vect = CountVectorizer(analyzer="char")
vect.fit(corpus)
vect.vocabulary_

{' ': 0,
 '.': 1,
 '?': 2,
 'a': 3,
 'c': 4,
 'd': 5,
 'e': 6,
 'f': 7,
 'h': 8,
 'i': 9,
 'l': 10,
 'm': 11,
 'n': 12,
 'o': 13,
 'r': 14,
 's': 15,
 't': 16,
 'u': 17}

In [15]:
# Keep only tokens matching the regex: a "t" followed by one or more word
# characters. The pattern is written as a raw string so the backslash reaches
# the regex engine unchanged; in a plain string literal "\w" is an invalid
# escape sequence and Python emits a DeprecationWarning/SyntaxWarning for it.
vect = CountVectorizer(token_pattern=r"t\w+").fit(corpus)
vect.vocabulary_

{'the': 0, 'third': 1, 'this': 2}

In [26]:
import nltk

# word_tokenize needs the Punkt tokenizer data. Older NLTK releases load it
# from the "punkt" package, newer ones from "punkt_tab", so fetch both to keep
# the cell working regardless of the installed NLTK version.
for resource in ("punkt", "punkt_tab"):
    nltk.download(resource)

# A custom tokenizer replaces the default regex-based tokenization, so
# token_pattern is set to None: this makes the intent explicit and avoids
# scikit-learn's "token_pattern will not be used" warning.
vect = CountVectorizer(tokenizer=nltk.word_tokenize, token_pattern=None).fit(corpus)
vect.vocabulary_

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!




{'.': 0,
 '?': 1,
 'and': 2,
 'document': 3,
 'first': 4,
 'is': 5,
 'last': 6,
 'one': 7,
 'second': 8,
 'the': 9,
 'third': 10,
 'this': 11}

# n-그램

In [17]:
# ngram_range=(2, 2): the vocabulary consists of word bigrams only.
vect = CountVectorizer(ngram_range=(2, 2))
vect.fit(corpus)
vect.vocabulary_

{'and the': 0,
 'first document': 1,
 'is the': 2,
 'is this': 3,
 'last document': 4,
 'second document': 5,
 'the first': 6,
 'the last': 7,
 'the second': 8,
 'the third': 9,
 'third one': 10,
 'this is': 11,
 'this the': 12}

In [18]:
# Unigrams and bigrams built only from tokens that start with "t".
# The pattern is a raw string so the backslash reaches the regex engine
# unchanged; in a plain string literal "\w" is an invalid escape sequence and
# Python emits a DeprecationWarning/SyntaxWarning for it.
vect = CountVectorizer(ngram_range=(1, 2), token_pattern=r"t\w+").fit(corpus)
vect.vocabulary_

{'the': 0, 'the third': 1, 'third': 2, 'this': 3, 'this the': 4}

# 빈도수

In [19]:
# Integer thresholds are document counts: keep terms that appear in at least 2
# and at most 4 documents. Terms outside that range are collected in stop_words_.
vect = CountVectorizer(min_df=2, max_df=4)
vect.fit(corpus)
vect.vocabulary_, vect.stop_words_

({'document': 0, 'first': 1, 'is': 2, 'this': 3},
 {'and', 'last', 'one', 'second', 'the', 'third'})

In [20]:
# Total occurrences of each retained term over the whole corpus (column sums).
term_counts = vect.transform(corpus).toarray()
term_counts.sum(axis=0)

array([4, 2, 3, 3])

# TF-IDF

In [32]:
import os
import sys
import numpy

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

In [30]:
from sklearn.feature_extraction.text import TfidfTransformer

In [33]:
# Fit TF-IDF weights on the corpus, then display the weighted document-term matrix.
tfidv = TfidfVectorizer()
tfidv.fit(corpus)
tfidv.transform(corpus).toarray()

array([[0.        , 0.38947624, 0.55775063, 0.4629834 , 0.        ,
        0.        , 0.        , 0.32941651, 0.        , 0.4629834 ],
       [0.        , 0.36055905, 0.        , 0.42860858, 0.        ,
        0.        , 0.63998972, 0.30495853, 0.        , 0.42860858],
       [0.55666851, 0.        , 0.        , 0.        , 0.        ,
        0.55666851, 0.        , 0.26525553, 0.55666851, 0.        ],
       [0.        , 0.38947624, 0.55775063, 0.4629834 , 0.        ,
        0.        , 0.        , 0.32941651, 0.        , 0.4629834 ],
       [0.        , 0.45333103, 0.        , 0.        , 0.80465933,
        0.        , 0.        , 0.38342448, 0.        , 0.        ]])

# hashing trick

In [34]:
from sklearn.datasets import fetch_20newsgroups

# Fetch the 20 newsgroups text dataset (downloaded on first use) and report how
# many documents were loaded.
twenty = fetch_20newsgroups()
n_documents = len(twenty.data)
n_documents

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


11314

In [35]:
# Baseline timing: CountVectorizer first builds a vocabulary (fit) and then
# converts the documents to counts (transform).
%time CountVectorizer().fit(twenty.data).transform(twenty.data);

CPU times: user 6.16 s, sys: 23.8 ms, total: 6.19 s
Wall time: 6.2 s


<11314x130107 sparse matrix of type '<class 'numpy.int64'>'
	with 1787565 stored elements in Compressed Sparse Row format>

In [36]:
from sklearn.feature_extraction.text import HashingVectorizer

# n_features sets the number of columns in the transformed matrix.
hv = HashingVectorizer(n_features=300000)

In [37]:
# Timing for the hashing approach: hv is used for transform directly after
# construction, with no fit step in between.
%time hv.transform(twenty.data);

CPU times: user 2.19 s, sys: 4.87 ms, total: 2.2 s
Wall time: 2.21 s


<11314x300000 sparse matrix of type '<class 'numpy.float64'>'
	with 1786336 stored elements in Compressed Sparse Row format>