### NLP: Cleaning, Stop Words Removal, Stemming, Lemmatization, Bag of Words, TF-IDF
##### Saurabh Chatterjee

In [20]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer


In [9]:
# Sample text used throughout the notebook. Adjacent string literals are
# concatenated by Python, so this is one string with one sentence per source line.
paragraph = (
    "Naruto is a Japanese manga series written and illustrated by Masashi Kishimoto. "
    "It tells the story of Naruto Uzumaki, a young ninja who seeks recognition from his peers and dreams of becoming the Hokage, the leader of his village. "
    "Naruto is one of the best-selling manga series of all time, having 250 million copies in circulation worldwide in 47 countries and regions, with 153 million copies in Japan alone and remaining 97 million copies elsewhere. "
    "On the day of Naruto Uzamaki's birth the village of Konoha was attacked by the 9-tailed fox demon. "
    "In order to protect the village Naruto's father the 4th hokage sacrificed his life and sealed the demon in his new born son. "
    "13 years later Naruto graduates the ninja academy and becomes a shinobi with goal to be the hokage of the village. "
    "Joining him are rival Sasuke Uchiha who attempts to gain power to avenge his clan after they were murdered by his older brother Itachi. "
    "And Sakura Haruno who is Naruto's love interest who of course loves his rival Sasuke. "
    "But when itachi returns to the village after the chunnin exams and Sasuke proves to be powerless against him. "
    "Sasuke will fall to the villainous Orochimaru to gain power. "
    "Naruto must do everything in his power to stop his friend from loosing himself to darkness even if it means losing himself."
)

In [10]:
paragraph

"Naruto is a Japanese manga series written and illustrated by Masashi Kishimoto. It tells the story of Naruto Uzumaki, a young ninja who seeks recognition from his peers and dreams of becoming the Hokage, the leader of his village. Naruto is one of the best-selling manga series of all time, having 250 million copies in circulation worldwide in 47 countries and regions, with 153 million copies in Japan alone and remaining 97 million copies elsewhere. On the day of Naruto Uzamaki's birth the village of Konoha was attacked by the 9-tailed fox demon. In order to protect the village Naruto's father the 4th hokage sacrificed his life and sealed the demon in his new born son. 13 years later Naruto graduates the ninja academy and becomes a shinobi with goal to be the hokage of the village. Joining him are rival Sasuke Uchiha who attempts to gain power to avenge his clan after they were murdered by his older brother Itachi. And Sakura Haruno who is Naruto's love interest who of course loves his

In [None]:
## TOKENIZATION (setup): fetch the Punkt data package ahead of sentence tokenization
nltk.download('punkt')      # Download the Punkt package (NLTK reports "already up-to-date" when it is present)

In [14]:
## TOKENIZATION: split the paragraph into sentences (sentence tokenization)

sentences = nltk.sent_tokenize(paragraph)       # returns a list with one string per sentence

[nltk_data] Downloading package punkt to C:\Users\Saurabh
[nltk_data]     Chatterjee\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [15]:
sentences       # sentence list

['Naruto is a Japanese manga series written and illustrated by Masashi Kishimoto.',
 'It tells the story of Naruto Uzumaki, a young ninja who seeks recognition from his peers and dreams of becoming the Hokage, the leader of his village.',
 'Naruto is one of the best-selling manga series of all time, having 250 million copies in circulation worldwide in 47 countries and regions, with 153 million copies in Japan alone and remaining 97 million copies elsewhere.',
 "On the day of Naruto Uzamaki's birth the village of Konoha was attacked by the 9-tailed fox demon.",
 "In order to protect the village Naruto's father the 4th hokage sacrificed his life and sealed the demon in his new born son.",
 '13 years later Naruto graduates the ninja academy and becomes a shinobi with goal to be the hokage of the village.',
 'Joining him are rival Sasuke Uchiha who attempts to gain power to avenge his clan after they were murdered by his older brother Itachi.',
 "And Sakura Haruno who is Naruto's love int

In [26]:
## STEMMING
# Print the Porter stem of a few sample words, one per line.

stemmer = PorterStemmer()

for sample_word in ('going', 'history', 'drinking', 'goes'):
    print(stemmer.stem(sample_word))

go
histori
drink
goe


In [None]:
## LEMMATIZATION (setup): fetch the WordNet data package ahead of lemmatization
nltk.download('wordnet')        # download the WordNet package

In [31]:
## LEMMATIZATION
# Print the WordNet lemma of the same sample words used in the stemming demo.
# No part-of-speech is passed, so lemmatize() uses its default POS; pass
# pos='v' explicitly if verb forms such as 'going' should be reduced to 'go'.

lemmatizer = WordNetLemmatizer()

for sample_word in ('going', 'history', 'drinking', 'goes'):
    print(lemmatizer.lemmatize(sample_word))

going
history
drinking
go


#### Text Preprocessing: CLEANING

In [27]:
import re       # REGULAR EXPRESSION **

In [32]:
# Cleaned sentences: every character other than a-z / A-Z is replaced by a
# single space (digits and punctuation included), then the text is lower-cased.
corpus = [
    re.sub('[^a-zA-Z]', ' ', sentence).lower()
    for sentence in sentences
]

corpus

['naruto is a japanese manga series written and illustrated by masashi kishimoto ',
 'it tells the story of naruto uzumaki  a young ninja who seeks recognition from his peers and dreams of becoming the hokage  the leader of his village ',
 'naruto is one of the best selling manga series of all time  having     million copies in circulation worldwide in    countries and regions  with     million copies in japan alone and remaining    million copies elsewhere ',
 'on the day of naruto uzamaki s birth the village of konoha was attacked by the   tailed fox demon ',
 'in order to protect the village naruto s father the  th hokage sacrificed his life and sealed the demon in his new born son ',
 '   years later naruto graduates the ninja academy and becomes a shinobi with goal to be the hokage of the village ',
 'joining him are rival sasuke uchiha who attempts to gain power to avenge his clan after they were murdered by his older brother itachi ',
 'and sakura haruno who is naruto s love int

#### Text Preprocessing: STEMMING

In [35]:
# Build the English stop-word set once. The original rebuilt it for every
# single word, repeating the same loop-invariant work on each iteration.
english_stopwords = set(stopwords.words('english'))

for sentence in corpus:
    words = nltk.word_tokenize(sentence)    # word-tokenize the cleaned sentence into a list of tokens

    for word in words:
        if word not in english_stopwords:   # ignore stop-words
            print(stemmer.stem(word))       # stemming

naruto
japanes
manga
seri
written
illustr
masashi
kishimoto
tell
stori
naruto
uzumaki
young
ninja
seek
recognit
peer
dream
becom
hokag
leader
villag
naruto
one
best
sell
manga
seri
time
million
copi
circul
worldwid
countri
region
million
copi
japan
alon
remain
million
copi
elsewher
day
naruto
uzamaki
birth
villag
konoha
attack
tail
fox
demon
order
protect
villag
naruto
father
th
hokag
sacrif
life
seal
demon
new
born
son
year
later
naruto
graduat
ninja
academi
becom
shinobi
goal
hokag
villag
join
rival
sasuk
uchiha
attempt
gain
power
aveng
clan
murder
older
brother
itachi
sakura
haruno
naruto
love
interest
cours
love
rival
sasuk
itachi
return
villag
chunnin
exam
sasuk
prove
powerless
sasuk
fall
villain
orochimaru
gain
power
naruto
must
everyth
power
stop
friend
loos
dark
even
mean
lose


#### Text Preprocessing: LEMMATIZATION

In [36]:
# Build the English stop-word set once. The original rebuilt it for every
# single word, repeating the same loop-invariant work on each iteration.
english_stopwords = set(stopwords.words('english'))

for sentence in corpus:
    words = nltk.word_tokenize(sentence)    # word-tokenize the cleaned sentence into a list of tokens

    for word in words:
        if word not in english_stopwords:       # ignore stop-words
            print(lemmatizer.lemmatize(word))   # lemmatization (no POS passed, so the default POS applies)

naruto
japanese
manga
series
written
illustrated
masashi
kishimoto
tell
story
naruto
uzumaki
young
ninja
seek
recognition
peer
dream
becoming
hokage
leader
village
naruto
one
best
selling
manga
series
time
million
copy
circulation
worldwide
country
region
million
copy
japan
alone
remaining
million
copy
elsewhere
day
naruto
uzamaki
birth
village
konoha
attacked
tailed
fox
demon
order
protect
village
naruto
father
th
hokage
sacrificed
life
sealed
demon
new
born
son
year
later
naruto
graduate
ninja
academy
becomes
shinobi
goal
hokage
village
joining
rival
sasuke
uchiha
attempt
gain
power
avenge
clan
murdered
older
brother
itachi
sakura
haruno
naruto
love
interest
course
love
rival
sasuke
itachi
return
village
chunnin
exam
sasuke
prof
powerless
sasuke
fall
villainous
orochimaru
gain
power
naruto
must
everything
power
stop
friend
loosing
darkness
even
mean
losing


#### Text Preprocessing: Cleaning, Stop-words Removal, Lemmatization

In [47]:
import re       # regular expressions

# Full preprocessing pass: clean -> lower-case -> drop stop-words -> lemmatize.
# The stop-word set is built once here; the original rebuilt it for every
# word inside the comprehension, which is loop-invariant work.
english_stopwords = set(stopwords.words('english'))

corpus = []

for sentence in sentences:
    # Cleaning: replace every character other than a-z / A-Z with a space, then lower-case.
    review = re.sub('[^a-zA-Z]', ' ', sentence).lower()

    # split() with no argument breaks on runs of whitespace, giving the word list;
    # stop-words are skipped and each remaining word is lemmatized.
    kept_words = [
        lemmatizer.lemmatize(word)
        for word in review.split()
        if word not in english_stopwords
    ]

    corpus.append(' '.join(kept_words))     # join the word list back into one sentence string
    

#### Bag of Words

In [59]:
from sklearn.feature_extraction.text import CountVectorizer

## BAG of WORDS
# Default settings are used here. To also count bi-grams and tri-grams,
# construct the vectorizer as CountVectorizer(ngram_range=(1, 3)) instead.
countvec = CountVectorizer()

# Learn the vocabulary from the preprocessed corpus and build the count matrix.
X_bow = countvec.fit_transform(corpus)

In [60]:
countvec.vocabulary_       # Corpus vocabulary dictionary in BoW: maps each word to its column index

{'naruto': 54,
 'japanese': 38,
 'manga': 48,
 'series': 77,
 'written': 92,
 'illustrated': 34,
 'masashi': 49,
 'kishimoto': 40,
 'tell': 83,
 'story': 81,
 'uzumaki': 88,
 'young': 94,
 'ninja': 56,
 'seek': 75,
 'recognition': 66,
 'peer': 61,
 'dream': 20,
 'becoming': 6,
 'hokage': 33,
 'leader': 43,
 'village': 89,
 'one': 58,
 'best': 7,
 'selling': 76,
 'time': 85,
 'million': 51,
 'copy': 14,
 'circulation': 12,
 'worldwide': 91,
 'country': 15,
 'region': 67,
 'japan': 37,
 'alone': 1,
 'remaining': 68,
 'elsewhere': 21,
 'day': 18,
 'uzamaki': 87,
 'birth': 8,
 'konoha': 41,
 'attacked': 2,
 'tailed': 82,
 'fox': 27,
 'demon': 19,
 'order': 59,
 'protect': 65,
 'father': 26,
 'th': 84,
 'sacrificed': 71,
 'life': 44,
 'sealed': 74,
 'new': 55,
 'born': 9,
 'son': 79,
 'year': 93,
 'later': 42,
 'graduate': 31,
 'academy': 0,
 'becomes': 5,
 'shinobi': 78,
 'goal': 30,
 'joining': 39,
 'rival': 70,
 'sasuke': 73,
 'uchiha': 86,
 'attempt': 3,
 'gain': 29,
 'power': 62,
 'ave

In [61]:
# checking BoW-VECTOR of first sentence in the Corpus
print(corpus[0])

X_bow[0].toarray()      

naruto japanese manga series written illustrated masashi kishimoto


array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0,
        0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 0, 0]], dtype=int64)

#### TF-IDF: Term Frequency - Inverse Document Frequency

In [62]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer with default settings.
tf_idf_vec = TfidfVectorizer()
# Alternative: TfidfVectorizer(ngram_range=(1,3), max_features=10)
#   ngram_range=(1,3): consider single words, bi-grams and tri-grams
#   max_features=10:   keep only the top 10 features

# Learn the vocabulary from the preprocessed corpus and build the TF-IDF matrix.
X_tfidf = tf_idf_vec.fit_transform(corpus)

In [63]:
# Print the first preprocessed sentence, then display its TF-IDF row as a dense array.
print(corpus[0])

X_tfidf[0].toarray()        # TF-IDF vector of the first sentence of the corpus

naruto japanese manga series written illustrated masashi kishimoto


array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.38708586,
        0.        , 0.        , 0.        , 0.38708586, 0.        ,
        0.38708586, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.3308669 , 0.38708586,
        0.        , 0.        , 0.        , 0.        , 0.178541  ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.  