# <font color='red'>Text processing - Part 2</font>

# Handle text with sklearn

## CountVectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# A one-sentence corpus from which the vocabulary is learned.
text = ["The quick brown fox jumped over the lazy dog."]

# Create the vectorizer and learn the vocabulary in a single chained call
# (fit() hands back the fitted vectorizer, so the result can be bound directly).
vectorizer = CountVectorizer().fit(text)

# Show the vocabulary learned from the corpus.
print(vectorizer.vocabulary_)

In [None]:
# Encode the corpus with the vectorizer fitted in the previous cell.
vector = vectorizer.transform(text)

# Report, one per line: the shape of the result, its container type,
# and its contents converted to a dense array.
print(vector.shape, type(vector), vector.toarray(), sep="\n")

In [None]:
# Encode a second document with the already-fitted vectorizer (no re-fit here).
# "puppy" does not occur in the sentence the CountVectorizer cell was fitted on.
text2 = ["the puppy"]
# Note: this rebinds `vector`, replacing the encoding produced by the previous cell.
vector = vectorizer.transform(text2)
# Print the encoding as a dense array.
print(vector.toarray())

## TfidfVectorizer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Three short documents; "the", "dog" and "fox" each occur in more than one.
text = [
    "The quick brown fox jumped over the lazy dog.",
    "The dog.",
    "The fox",
]

# Create the TF-IDF vectorizer and fit it on the corpus in one chained call.
vectorizer = TfidfVectorizer().fit(text)

# Print the learned vocabulary, then the fitted idf_ values, one per line.
print(vectorizer.vocabulary_, vectorizer.idf_, sep="\n")

In [None]:
# Encode a single document from the corpus.
# Exercise: change the index to 0, 1 or 2 to encode each of the documents.
vector = vectorizer.transform([text[2]])

# Show the shape of the encoding followed by its dense values.
print(vector.shape, vector.toarray(), sep="\n")

## HashingVectorizer

In [None]:
from sklearn.feature_extraction.text import HashingVectorizer

# Single-sentence corpus used for the hashing example.
text = ["The quick brown fox jumped over the lazy dog."]

# Create the hashing transform with a fixed output width of 20 features.
# No fit() call follows in this notebook: the next cell calls transform() directly.
vectorizer = HashingVectorizer(n_features=20)

In [None]:
# Encode the corpus with the hashing vectorizer created in the previous cell.
vector = vectorizer.transform(text)

# Show the shape of the encoding followed by its dense values.
print(vector.shape, vector.toarray(), sep="\n")

# Handle text with Keras

Tokenizing text with `text_to_word_sequence()`.



In [None]:
from keras.preprocessing.text import text_to_word_sequence 

# The sentence to split into words.
text = "The quick brown fox jumped over the lazy dog."

# Run the Keras helper on the sentence and display the resulting sequence.
result = text_to_word_sequence(text)
print(result)

In [None]:
from keras.preprocessing.text import text_to_word_sequence 

# The sentence whose vocabulary is measured.
text = "The quick brown fox jumped over the lazy dog."

# Collapse the word sequence into a set so repeated words count once,
# then use the number of distinct words as the vocabulary-size estimate.
words = {*text_to_word_sequence(text)}
vocab_size = len(words)
print(vocab_size)

Integer encoding with `one_hot()`.

In [None]:
from keras.preprocessing.text import text_to_word_sequence 
from keras.preprocessing.text import one_hot

# The sentence to encode.
text = "The quick brown fox jumped over the lazy dog."

# Estimate the vocabulary size as the number of distinct words in the sentence.
words = {*text_to_word_sequence(text)}
vocab_size = len(words)
print(vocab_size)

# Integer-encode the sentence with one_hot(); its second argument is the
# vocabulary-size estimate scaled by 55.3 and rounded to an integer.
# NOTE(review): 55.3 is an unexplained multiplier (the hashing_trick cell in
# this notebook uses 1.3) - confirm the value is intentional.
result = one_hot(text, round(vocab_size * 55.3))
print(result)

Hash encoding with `hashing_trick()`.

In [None]:
from keras.preprocessing.text import text_to_word_sequence 
from keras.preprocessing.text import hashing_trick

# The sentence to encode.
text = "The quick brown fox jumped over the lazy dog."

# Estimate the vocabulary size as the number of distinct words in the sentence.
words = {*text_to_word_sequence(text)}
vocab_size = len(words)
print(vocab_size)

# Hash-encode the sentence with hashing_trick(), selecting the 'md5' hash
# function; the second argument is the vocabulary-size estimate scaled by 1.3
# and rounded to an integer.
result = hashing_trick(text, round(vocab_size * 1.3), hash_function="md5")
print(result)

## Tokenizer API

In [None]:
from keras.preprocessing.text import Tokenizer 

# Five tiny documents used to fit the tokenizer.
docs = [
    "Well done!",
    "Good work",
    "Great effort",
    "nice work",
    "Excellent!",
]

# Build a tokenizer with default settings and fit it on the documents.
t = Tokenizer()
t.fit_on_texts(docs)

# Display the attributes populated by fitting, each with a label.
print("word count ->", t.word_counts)
print("document count ->", t.document_count)
print("word index ->", t.word_index)
print("word docs ->", t.word_docs)

In [None]:
# Vectorize the documents into a matrix with the fitted tokenizer, using the
# "count" mode, and display the result.
encoded_docs = t.texts_to_matrix(docs, mode="count")
print(encoded_docs)

In [None]:
# Vectorize the documents into a matrix with the fitted tokenizer, using the
# "freq" mode, and display the result.
encoded_docs = t.texts_to_matrix(docs, mode="freq")
print(encoded_docs)

In [None]:
# Vectorize the documents into a matrix with the fitted tokenizer, using the
# "tfidf" mode, and display the result.
encoded_docs = t.texts_to_matrix(docs, mode="tfidf")
print(encoded_docs)