# Notebook Imports

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Generate sample data

In [3]:
# Toy corpus: three short sentences with overlapping vocabulary, so the
# differences between the vectorization schemes are easy to see by eye.
corpus = [
    'Martin is not a bad person.',
    'Kevin, is the brother of Martin.',
    'Kevin is a bad person.',
]

# Bag-of-Words (BoW)

In [4]:
# Count-based vectorizer: original casing is kept (lowercase=False) and
# scikit-learn's built-in English stop-word list is filtered out.
vectorizer = CountVectorizer(
    stop_words='english',
    lowercase=False,
)

# Learn the vocabulary from the corpus and encode every sentence as a row
# of per-term counts (one column per vocabulary entry).
BoW = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names_out())
print(BoW.toarray())

['Kevin' 'Martin' 'bad' 'brother' 'person']
[[0 1 1 0 1]
 [1 1 0 1 0]
 [1 0 1 0 1]]


# Bag-of-N-Grams

In [5]:
# Bag-of-n-grams vectorizer restricted to bigrams: ngram_range=(2, 2) keeps
# only two-word sequences. Stop words are removed before the n-grams are
# formed, so a bigram can join words that were not adjacent in the raw
# sentence (e.g. 'Martin bad' from "Martin is not a bad person.").
vectorizer2 = CountVectorizer(
    analyzer='word',
    ngram_range=(2, 2),
    stop_words='english',
    lowercase=False,
)

# Fit on the corpus, then display the learned bigram vocabulary
Bo2G = vectorizer2.fit_transform(corpus)
vectorizer2.get_feature_names_out()

array(['Kevin bad', 'Kevin brother', 'Martin bad', 'bad person',
       'brother Martin'], dtype=object)

In [6]:
# Show the bigram counts as a dense array: one row per sentence in the
# corpus, one column per bigram in the fitted vocabulary.
Bo2G.toarray()

array([[0, 0, 1, 1, 0],
       [0, 1, 0, 0, 1],
       [1, 0, 0, 1, 0]], dtype=int64)

# Term Frequency-Inverse Document Frequency (TF-IDF)

In [12]:
# TF-IDF vectorizer: casing preserved, English stop words removed, and each
# document vector rescaled to unit Euclidean length (norm='l2').
TFIDF = TfidfVectorizer(
    stop_words='english',
    lowercase=False,
    norm='l2',
)

# Fit the vectorizer on the corpus and compute the TF-IDF weighted matrix,
# then display the learned vocabulary
TFIDFtext = TFIDF.fit_transform(corpus)
TFIDF.get_feature_names_out()

array(['Kevin', 'Martin', 'bad', 'brother', 'person'], dtype=object)

In [13]:
# Printing the sparse matrix lists only its stored entries, one per line,
# as "(document index, vocabulary index)  TF-IDF weight".
print(TFIDFtext)

  (0, 4)	0.5773502691896257
  (0, 2)	0.5773502691896257
  (0, 1)	0.5773502691896257
  (1, 3)	0.680918560398684
  (1, 0)	0.5178561161676974
  (1, 1)	0.5178561161676974
  (2, 0)	0.5773502691896257
  (2, 4)	0.5773502691896257
  (2, 2)	0.5773502691896257
