In [None]:
!pip install sklearn

In [12]:
################ TOKENIZATION & STOP WORDS ##################

from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

text1 = """Yesterday I went fishing. I don't fish that often, 
so I didn't catch any fish. I was told I'd enjoy myself, 
but it didn't really seem that fun."""

text2 ="""I am sai Hemanth Reddy"""

list_of_docs = [text1, text2]
# Placeholder for a custom stop-word list. It is NOT passed to the vectorizer below, which uses stop_words='english'.
list_of_stop_words = []

# By default CountVectorizer only keeps tokens of 2 or more characters; token_pattern=r"(?u)\b\w+\b" also keeps single-character tokens.
# Character-based tokenization is available through analyzer='char'.
# stop_words accepts either the built-in 'english' list or a custom list of words.
vectorizer = CountVectorizer(token_pattern=r"(?u)\b\w+\b", lowercase=True, strip_accents='unicode', analyzer='word', stop_words = 'english')
# fit_transform learns the vocabulary and returns a sparse document-term matrix.
matrix = vectorizer.fit_transform(list_of_docs)

# \w does not match an apostrophe, so contractions are split in two: "didn't" becomes "didn" and "t"
# (hence the d / didn / don / t columns in the table). No fix for this is applied here.
# toarray() densifies the sparse matrix so it can be shown as a DataFrame with one row per document.
vectors = pd.DataFrame(matrix.toarray(), columns= vectorizer.get_feature_names_out(), index=["doc1", "doc2"])
vectors

Unnamed: 0,catch,d,didn,don,enjoy,fish,fishing,fun,hemanth,really,reddy,sai,t,told,went,yesterday
doc1,1,1,2,1,1,2,1,1,0,1,0,0,3,1,1,1
doc2,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0


In [5]:
# To use stop words for a language other than English, use nltk
!pip install nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [8]:
import nltk
# Fetch the NLTK stop-word corpus (the log below shows it is skipped when already up to date).
nltk.download("stopwords")
from nltk.corpus import stopwords
# stopwords.words is given the language name; the English list is loaded here.
list_of_stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
################ STEMMING AND LEMMATIZATION ##################

# Plain tokenization does not treat "walk" and "walking" as the same word.
# This increases the dimension of the vector, and "walk" ends up no closer to "walking" than it is to "duck".
# Two popular methods are used to avoid this: stemming & lemmatization.
# Stemming just chops off the last part of the word.
# Lemmatization uses actual language rules.

#////////////////// STEMMING \\\\\\\\\\\\\\\\\\\
# Many heuristic-based stemming algorithms are available; NLTK has the Porter stemmer built in.
from nltk.stem import PorterStemmer
porter = PorterStemmer()
porter.stem("walking")

'walk'

In [15]:
#////////////////// LEMMATIZATION \\\\\\\\\\\\\\\\\\\

from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

nltk.download("wordnet") # the WordNet data only needs to be downloaded once

lemmatizer = WordNetLemmatizer()
# Without a pos argument the word is looked up as a noun (the lemmatizer's default):
# "mice" becomes "mouse", but "going" is returned unchanged.
print(lemmatizer.lemmatize("mice"))
print(lemmatizer.lemmatize("going"))
# Tagging the word as a verb lets the lemmatizer reduce "going" to "go".
print(lemmatizer.lemmatize("going", pos=wordnet.VERB))

[nltk_data] Downloading package wordnet to /root/nltk_data...


mouse
going
go


In [9]:
################# Working more with stemming & lemmatization ###############
from nltk.stem import PorterStemmer

porter = PorterStemmer()

# Regular inflections of "walk" all collapse to the same stem.
for inflected in ("walking", "walked", "walks"):
  print(porter.stem(inflected))

# The irregular past tense "ran" is left as it is by the stemmer.
print("..", porter.stem("ran"), "..")

for suffixed in ("running", "bosses"):
  print(porter.stem(suffixed))

# A stem does not have to be a real word.
print("..", porter.stem("replacement"), "..")
# The prefix "un" is not removed, because "necessary" and "unnecessary" are completely different words.
print("...", porter.stem("unnecessary"), "...")
print("...", porter.stem("berry"), "...")

walk
walk
walk
.. ran ..
run
boss
.. replac ..
... unnecessari ...
... berri ...


In [11]:
import nltk
from nltk.stem import WordNetLemmatizer

# Lemmatization essentially amounts to looking words up in a database.
# The data in this case is part of NLTK's WordNet package, which is downloaded by calling nltk.download.
nltk.download("wordnet")

from nltk.corpus import wordnet

lemmatizer = WordNetLemmatizer()

# No pos is given here, so "walking" is returned unchanged.
lemmatizer.lemmatize("walking")

[nltk_data] Downloading package wordnet to /root/nltk_data...


'walking'

In [12]:
lemmatizer.lemmatize("walking", pos=wordnet.VERB)
# pos = part of speech; tagging "walking" as a verb lets it be reduced to "walk"

'walk'

In [13]:
# No pos argument: "going" is not treated as a verb, so it is returned unchanged.
lemmatizer.lemmatize("going")

'going'

In [15]:
lemmatizer.lemmatize("going", pos=wordnet.VERB)

'go'

In [16]:
lemmatizer.lemmatize("ran", pos=wordnet.VERB)

'run'

In [17]:
lemmatizer.lemmatize("mice")

'mouse'

In [18]:
lemmatizer.lemmatize("was")
# No pos is given, so "was" is not treated as a verb; the trailing "s" appears to be
# stripped as if it were a plural ending, which gives the non-word "wa".

'wa'

In [19]:
lemmatizer.lemmatize("was", pos=wordnet.VERB)

'be'

In [21]:
lemmatizer.lemmatize("is")

'is'

In [22]:
# Without an adjective pos tag, "better" is returned unchanged.
lemmatizer.lemmatize("better")

'better'

In [23]:
lemmatizer.lemmatize("better", pos=wordnet.ADJ)

'good'

In [26]:
#To input correct POS for every word 
def get_wordnet_pos(treebank_tag):
  """Translate a Treebank-style POS tag into a WordNet part-of-speech constant.

  Tags beginning with 'J' map to wordnet.ADJ, 'V' to wordnet.VERB and
  'R' to wordnet.ADV. Every other tag, including the 'N...' noun tags,
  maps to wordnet.NOUN.
  """
  prefix_to_pos = (
    ("J", wordnet.ADJ),
    ("V", wordnet.VERB),
    ("R", wordnet.ADV),
  )
  for prefix, wordnet_pos in prefix_to_pos:
    if treebank_tag.startswith(prefix):
      return wordnet_pos
  # Noun tags and anything unrecognised share the same fallback.
  return wordnet.NOUN

In [25]:
# Download the averaged perceptron tagger data (a POS tagger model; nltk.pos_tag is called on a sentence in a later cell).
nltk.download("averaged_perceptron_tagger")

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [28]:
# Split the sentence on whitespace to get a list of word tokens.
sentence = "donald Trump has a devoted following".split()

# pos_tag returns a list of (word, tag) pairs, e.g. ('has', 'VBZ').
words_and_tags = nltk.pos_tag(sentence)
words_and_tags

[('donald', 'NN'),
 ('Trump', 'NNP'),
 ('has', 'VBZ'),
 ('a', 'DT'),
 ('devoted', 'VBN'),
 ('following', 'NN')]

In [29]:
# Lemmatize every word, passing the WordNet pos derived from its tagger tag.
for word, tag in words_and_tags:
  lemma = lemmatizer.lemmatize(word,pos=get_wordnet_pos(tag))
  print(lemma, end=" ")
# In the sentence above "following" is a noun, not a verb, so it
# should not be reduced; it was tagged NN and is left unchanged.

donald Trump have a devote following 