In [2]:
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

2023-08-08 23:50:08.320333: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Record the installed scikit-learn version alongside the results.
sklearn.__version__

'1.3.0'

In [4]:
# Toy corpus of five short sentences; every vectorizer and tokenizer cell
# in this notebook is fitted on this same list.
sentences = [
    'I love my dog.',
    'I love my cat.',
    'I love my dog and love my cat',
    'You love my dog!',
    'Do you think my dog is amazing?'
]
sentences

['I love my dog.',
 'I love my cat.',
 'I love my dog and love my cat',
 'You love my dog!',
 'Do you think my dog is amazing?']

In [5]:
# Term-count vectorizer created with all default settings.
count_vectorizer = CountVectorizer()
count_vectorizer

In [6]:
# Learn the vocabulary from the corpus and encode it in one step; the result
# is a sparse matrix with one row per sentence and one column per term.
features = count_vectorizer.fit_transform(sentences)
features

<5x10 sparse matrix of type '<class 'numpy.int64'>'
	with 22 stored elements in Compressed Sparse Row format>

In [7]:
# (number of sentences, number of vocabulary terms)
features.shape

(5, 10)

In [8]:
# Convert the sparse count matrix to a dense array so the individual counts
# can be inspected directly (cheap here because the corpus is tiny).
vectorized_sentences = features.toarray()
vectorized_sentences

array([[0, 0, 0, 0, 1, 0, 1, 1, 0, 0],
       [0, 0, 1, 0, 0, 0, 1, 1, 0, 0],
       [0, 1, 1, 0, 1, 0, 2, 2, 0, 0],
       [0, 0, 0, 0, 1, 0, 1, 1, 0, 1],
       [1, 0, 0, 1, 1, 1, 0, 1, 1, 1]])

In [9]:
# Vocabulary terms learned by the vectorizer; used below as column labels
# for the count matrix.
# NOTE(review): 'i' occurs in the corpus but is not in this vocabulary —
# presumably the default token pattern skips one-character tokens; confirm
# against the CountVectorizer documentation.
feature_names = count_vectorizer.get_feature_names_out()
feature_names

array(['amazing', 'and', 'cat', 'do', 'dog', 'is', 'love', 'my', 'think',
       'you'], dtype=object)

In [10]:
# Label the dense count matrix with its vocabulary terms for readability.
# Note: the name `df` is rebound to the TF-IDF frame in a later cell, so this
# count frame is only available until that cell runs.
df = pd.DataFrame(vectorized_sentences, columns=feature_names)
df

Unnamed: 0,amazing,and,cat,do,dog,is,love,my,think,you
0,0,0,0,0,1,0,1,1,0,0
1,0,0,1,0,0,0,1,1,0,0
2,0,1,1,0,1,0,2,2,0,0
3,0,0,0,0,1,0,1,1,0,1
4,1,0,0,1,1,1,0,1,1,1


In [11]:
# TF-IDF vectorizer created with all default settings.
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer

In [12]:
# Fit on the same corpus and encode it; the result is a sparse float matrix
# with one row per sentence and one column per term.
tfidf_sentences = tfidf_vectorizer.fit_transform(sentences)
tfidf_sentences

<5x10 sparse matrix of type '<class 'numpy.float64'>'
	with 22 stored elements in Compressed Sparse Row format>

In [13]:
# Dense copy of the TF-IDF matrix so the individual weights can be inspected.
tfidf_vect_sentences = tfidf_sentences.toarray()
tfidf_vect_sentences

array([[0.        , 0.        , 0.        , 0.        , 0.60685614,
        0.        , 0.60685614, 0.51327503, 0.        , 0.        ],
       [0.        , 0.        , 0.73792244, 0.        , 0.        ,
        0.        , 0.51528988, 0.43582888, 0.        , 0.        ],
       [0.        , 0.49110884, 0.39622352, 0.        , 0.27668216,
        0.        , 0.55336431, 0.46803199, 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.45805379,
        0.        , 0.45805379, 0.38741896, 0.        , 0.65595732],
       [0.43872423, 0.        , 0.        , 0.43872423, 0.24716958,
        0.43872423, 0.        , 0.20905445, 0.43872423, 0.35395995]])

In [14]:
# (number of sentences, number of vocabulary terms)
tfidf_vect_sentences.shape

(5, 10)

In [15]:
# Vocabulary terms learned by the TF-IDF vectorizer; used below as column
# labels for the TF-IDF matrix.
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
tfidf_feature_names

array(['amazing', 'and', 'cat', 'do', 'dog', 'is', 'love', 'my', 'think',
       'you'], dtype=object)

In [16]:
# Label the dense TF-IDF matrix with its vocabulary terms.
# Note: this rebinds `df`, replacing the count-based frame built earlier.
df = pd.DataFrame(tfidf_vect_sentences, columns=tfidf_feature_names)
df

Unnamed: 0,amazing,and,cat,do,dog,is,love,my,think,you
0,0.0,0.0,0.0,0.0,0.606856,0.0,0.606856,0.513275,0.0,0.0
1,0.0,0.0,0.737922,0.0,0.0,0.0,0.51529,0.435829,0.0,0.0
2,0.0,0.491109,0.396224,0.0,0.276682,0.0,0.553364,0.468032,0.0,0.0
3,0.0,0.0,0.0,0.0,0.458054,0.0,0.458054,0.387419,0.0,0.655957
4,0.438724,0.0,0.0,0.438724,0.24717,0.438724,0.0,0.209054,0.438724,0.35396


In [17]:
# Keras word-level tokenizer. '<OOV>' is registered as the out-of-vocabulary
# token. NOTE(review): num_words=100 presumably caps how many of the most
# frequent words are used when encoding; the corpus here is far smaller than
# that, so the cap should not drop anything — confirm against the Keras docs.
tokenizer = Tokenizer(num_words=100, oov_token='<OOV>')
tokenizer

<keras.src.preprocessing.text.Tokenizer at 0x7f7e00fba100>

In [18]:
# Build the word -> integer-id vocabulary from the corpus. In the resulting
# mapping the OOV token takes id 1 and the words appear lower-cased with
# punctuation removed.
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
word_index

{'<OOV>': 1,
 'my': 2,
 'love': 3,
 'dog': 4,
 'i': 5,
 'cat': 6,
 'you': 7,
 'and': 8,
 'do': 9,
 'think': 10,
 'is': 11,
 'amazing': 12}

In [19]:
# Inverse lookup table: integer id -> word.
tokenizer.index_word

{1: '<OOV>',
 2: 'my',
 3: 'love',
 4: 'dog',
 5: 'i',
 6: 'cat',
 7: 'you',
 8: 'and',
 9: 'do',
 10: 'think',
 11: 'is',
 12: 'amazing'}

In [20]:
# Encode every sentence as a list of word ids, then show the original texts
# next to their encoded form.
sequences = tokenizer.texts_to_sequences(sentences)
sentences, sequences

(['I love my dog.',
  'I love my cat.',
  'I love my dog and love my cat',
  'You love my dog!',
  'Do you think my dog is amazing?'],
 [[5, 3, 2, 4],
  [5, 3, 2, 6],
  [5, 3, 2, 4, 8, 3, 2, 6],
  [7, 3, 2, 4],
  [9, 7, 10, 2, 4, 11, 12]])

In [21]:
# Right-pad every sequence with 0 so all rows share the length of the longest
# sequence in the batch (no maxlen is given).
# NOTE(review): truncating='post' presumably only takes effect once maxlen is
# set, so nothing is cut here — confirm against the pad_sequences docs.
padded = pad_sequences(sequences, padding='post', truncating='post')
padded

array([[ 5,  3,  2,  4,  0,  0,  0,  0],
       [ 5,  3,  2,  6,  0,  0,  0,  0],
       [ 5,  3,  2,  4,  8,  3,  2,  6],
       [ 7,  3,  2,  4,  0,  0,  0,  0],
       [ 9,  7, 10,  2,  4, 11, 12,  0]], dtype=int32)

In [22]:
# Show the id -> word table again (same mapping as displayed earlier); the
# decoding loop in the next cell reads from it.
tokenizer.index_word

{1: '<OOV>',
 2: 'my',
 3: 'love',
 4: 'dog',
 5: 'i',
 6: 'cat',
 7: 'you',
 8: 'and',
 9: 'do',
 10: 'think',
 11: 'is',
 12: 'amazing'}

In [23]:
# Decode each id sequence back into words through the tokenizer's
# id -> word table and print one reconstructed sentence per line.
for sequence in sequences:
    sent = [tokenizer.index_word[idx] for idx in sequence]
    print(' '.join(sent))

i love my dog
i love my cat
i love my dog and love my cat
you love my dog
do you think my dog is amazing


In [24]:
# One-hot encode the padded id matrix. The depth of the last axis is pinned
# to the tokenizer's vocabulary size (+1 for the padding id 0) instead of
# being inferred from the largest id that happens to occur in `padded`, so
# the encoding width stays the same for any batch encoded with this tokenizer.
ohe = to_categorical(padded, num_classes=len(tokenizer.word_index) + 1)
ohe.shape, ohe

((5, 8, 13),
 array([[[0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
         [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
 
        [[0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
         [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [1., 0., 0., 0., 0., 0., 0., 0