# Forming a vocabulary from a given text, using:
- CountVectorizer
- TF-IDF (normalized)

In [23]:
# %pip install scikit-learn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [15]:
# Read the corpus: one document per non-empty line of the text file.
# The encoding is passed explicitly so the result does not depend on the
# platform's default locale encoding (assumes the file is UTF-8 -- adjust if
# it was saved with a different encoding).
with open('textfile.txt', 'r', encoding='utf-8') as f:
    stripped_lines = (line.strip() for line in f)
    # Skip blank lines (e.g. a trailing empty line) so they do not become
    # empty documents, i.e. all-zero rows in the vectorized matrices.
    corpus = [doc for doc in stripped_lines if doc]
corpus

['This is the first document.',
 'This document is the second document.',
 'And this is the third one.',
 'Is this the first document?']

## Using CountVectorizer

In [19]:
# Bag-of-words model: first learn the vocabulary from the corpus, then encode
# each document as a row of raw token counts (document-term matrix).
vectorizer_cv = CountVectorizer().fit(corpus)
X_cv = vectorizer_cv.transform(corpus)

In [20]:
# Get the vocabulary: the array of tokens learned by the fitted CountVectorizer.
# Position i in this array labels column i of the count matrix X_cv.
vectorizer_cv.get_feature_names_out()

array(['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third',
       'this'], dtype=object)

In [21]:
# Print vectors: convert the sparse result to a dense array for display.
# One row per document in `corpus`, one column per vocabulary token; each
# entry is the number of times that token occurs in that document.
X_cv.toarray()

array([[0, 1, 1, 1, 0, 0, 1, 0, 1],
       [0, 2, 0, 1, 0, 1, 1, 0, 1],
       [1, 0, 0, 1, 1, 0, 1, 1, 1],
       [0, 1, 1, 1, 0, 0, 1, 0, 1]], dtype=int64)

## Using TF-IDF (normalized)

In [24]:
# TF-IDF model: first learn the vocabulary and IDF weights from the corpus,
# then encode each document as a row of TF-IDF weights (rows are
# L2-normalized with the vectorizer's default settings).
vectorizer_tfidf = TfidfVectorizer().fit(corpus)
X_tfidf = vectorizer_tfidf.transform(corpus)

In [25]:
# Get the features: the array of tokens learned by the fitted TfidfVectorizer.
# Position i in this array labels column i of the TF-IDF matrix X_tfidf.
vectorizer_tfidf.get_feature_names_out()

array(['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third',
       'this'], dtype=object)

In [26]:
# Print vectors: convert the sparse TF-IDF result to a dense array for display.
# One row per document in `corpus`, one column per vocabulary token; each
# entry is that token's TF-IDF weight in that document (0 when absent).
X_tfidf.toarray()

array([[0.        , 0.46979139, 0.58028582, 0.38408524, 0.        ,
        0.        , 0.38408524, 0.        , 0.38408524],
       [0.        , 0.6876236 , 0.        , 0.28108867, 0.        ,
        0.53864762, 0.28108867, 0.        , 0.28108867],
       [0.51184851, 0.        , 0.        , 0.26710379, 0.51184851,
        0.        , 0.26710379, 0.51184851, 0.26710379],
       [0.        , 0.46979139, 0.58028582, 0.38408524, 0.        ,
        0.        , 0.38408524, 0.        , 0.38408524]])