Bag-of-words, TF-IDF, and NLTK syllable-tokenization examples, adapted from https://www.dataknowsall.com/bowtfidf.html

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from string import punctuation

from nltk.util import ngrams
from nltk.tokenize import SyllableTokenizer
from nltk import word_tokenize
from nltk.tokenize import LegalitySyllableTokenizer
import nltk
# Fetch the "punkt" tokenizer data that the notebook's word_tokenize calls rely on.
# This touches the network and the local nltk_data directory on first run.
# NOTE(review): newer NLTK releases may look up "punkt_tab" instead of "punkt" —
# confirm against the installed NLTK version.
nltk.download("punkt")


[nltk_data] Downloading package punkt to /home/jupyter-
[nltk_data]     zkm1/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
# Toy corpus shared by every example below: five short sentences drawn from a
# small, overlapping vocabulary ("tune" and "fish" each appear in four of them),
# so the effect of the different weighting schemes is easy to see by eye.
corpus = [
    "Tune a hyperparameter.",
    "You can tune a piano but you can't tune a fish.",
    "Fish who eat fish, catch fish.",
    "People can tune a fish or a hyperparameter.",
    "It is hard to catch fish and tune it.",
]

In [3]:
# Bag-of-words baseline: CountVectorizer turns each sentence into a row of raw
# term counts, dropping scikit-learn's built-in English stop words.
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(corpus)
# fit_transform returns a SciPy sparse matrix. Densify it with .toarray() rather
# than the `.A` shorthand, which SciPy has deprecated and which fails on recent
# releases.
pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

Unnamed: 0,catch,eat,fish,hard,hyperparameter,people,piano,tune
0,0,0,0,0,1,0,0,1
1,0,0,1,0,0,0,1,2
2,1,1,3,0,0,0,0,0
3,0,0,1,0,1,1,0,1
4,1,0,1,1,0,0,0,1


In [4]:
# Term-frequency weighting only: with use_idf=False, TfidfVectorizer skips the
# inverse-document-frequency factor, so each row is just the term counts scaled
# by the vectorizer's default L2 normalisation.
vectorizer = TfidfVectorizer(stop_words='english', use_idf=False)
X = vectorizer.fit_transform(corpus)
# Densify with .toarray() (the `.A` shorthand is deprecated in SciPy and fails
# on recent releases), then round to three decimals for display.
df = pd.DataFrame(np.round(X.toarray(), 3), columns=vectorizer.get_feature_names_out())
df


Unnamed: 0,catch,eat,fish,hard,hyperparameter,people,piano,tune
0,0.0,0.0,0.0,0.0,0.707,0.0,0.0,0.707
1,0.0,0.0,0.408,0.0,0.0,0.0,0.408,0.816
2,0.302,0.302,0.905,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.5,0.0,0.5,0.5,0.0,0.5
4,0.5,0.0,0.5,0.5,0.0,0.0,0.0,0.5


In [5]:
# Full TF-IDF weighting: use_idf=True multiplies each term frequency by its
# inverse document frequency before normalising, so terms found in fewer
# sentences (e.g. "piano") gain weight relative to ones found in most of them
# (e.g. "tune", "fish").
vectorizer = TfidfVectorizer(stop_words='english', use_idf=True)
X = vectorizer.fit_transform(corpus)
# Densify with .toarray() (the `.A` shorthand is deprecated in SciPy and fails
# on recent releases), then round to three decimals for display.
df = pd.DataFrame(np.round(X.toarray(), 3), columns=vectorizer.get_feature_names_out())
df


Unnamed: 0,catch,eat,fish,hard,hyperparameter,people,piano,tune
0,0.0,0.0,0.0,0.0,0.82,0.0,0.0,0.573
1,0.0,0.0,0.35,0.0,0.0,0.0,0.622,0.701
2,0.38,0.471,0.796,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.373,0.0,0.534,0.661,0.0,0.373
4,0.534,0.0,0.373,0.661,0.0,0.0,0.0,0.373


In [15]:
# Syllable tokenization of a single word.
SSP = SyllableTokenizer()
s = SSP.tokenize('justification')
print("Syllables: ",s)

# Word-level tokenization of the whole corpus, joined into one string.
# Punctuation marks and contraction pieces come back as separate tokens.
w = word_tokenize(" ".join(corpus))
print("Words: ",w)

# Syllables for every word token. Reuse `w` instead of tokenizing the corpus a
# second time; the result is identical. Punctuation tokens are passed through
# the syllable tokenizer too, exactly as before.
result = [SSP.tokenize(token) for token in w]
print("Syllables in sentence: ",result)


Syllables:  ['jus', 'ti', 'fi', 'ca', 'tion']
Words:  ['Tune', 'a', 'hyperparameter', '.', 'You', 'can', 'tune', 'a', 'piano', 'but', 'you', 'ca', "n't", 'tune', 'a', 'fish', '.', 'Fish', 'who', 'eat', 'fish', ',', 'catch', 'fish', '.', 'People', 'can', 'tune', 'a', 'fish', 'or', 'a', 'hyperparameter', '.', 'It', 'is', 'hard', 'to', 'catch', 'fish', 'and', 'tune', 'it', '.']
Syllables in sentence:  [['Tu', 'ne'], ['a'], ['hy', 'per', 'pa', 'ra', 'me', 'ter'], ['.'], ['Yo', 'u'], ['can'], ['tu', 'ne'], ['a'], ['pia', 'no'], ['but'], ['yo', 'u'], ['ca'], ["n't"], ['tu', 'ne'], ['a'], ['fish'], ['.'], ['Fish'], ['who'], ['eat'], ['fish'], [','], ['catch'], ['fish'], ['.'], ['Peo', 'ple'], ['can'], ['tu', 'ne'], ['a'], ['fish'], ['or'], ['a'], ['hy', 'per', 'pa', 'ra', 'me', 'ter'], ['.'], ['It'], ['is'], ['hard'], ['to'], ['catch'], ['fish'], ['and'], ['tu', 'ne'], ['it'], ['.']]
