In [54]:
import nltk
nltk.download('punkt')
from nltk import tokenize
nltk.download("stopwords")
from nltk.corpus import stopwords
from string import punctuation
from nltk.stem import PorterStemmer
nltk.download('wordnet')
nltk.download('gutenberg')
from sklearn.feature_extraction.text import CountVectorizer


from nltk.corpus import gutenberg




[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jiwue\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jiwue\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jiwue\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\jiwue\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


In [3]:
raw_txt = """Welcome to the world of Deep Learning for NLP! We're in this together, and we'll learn together. 
NLP is amazing, and Deep Learning makes it even more fun. Let's learn!"""

In [4]:
tokenize.sent_tokenize(raw_txt)

['Welcome to the world of Deep Learning for NLP!',
 "We're in this together, and we'll learn together.",
 'NLP is amazing, and Deep Learning makes it even more fun.',
 "Let's learn!"]

In [5]:
txt_sents = tokenize.sent_tokenize(raw_txt)

In [6]:
type(txt_sents), len(txt_sents)

(list, 4)

In [7]:
txt_words = [tokenize.word_tokenize(sent) for sent in txt_sents]
type(txt_words), type(txt_words[0])

(list, list)

In [8]:
print(txt_words[:2])

[['Welcome', 'to', 'the', 'world', 'of', 'Deep', 'Learning', 'for', 'NLP', '!'], ['We', "'re", 'in', 'this', 'together', ',', 'and', 'we', "'ll", 'learn', 'together', '.']]


## Normalizing Case

In [9]:
raw_txt = raw_txt.lower()

In [10]:
txt_sents = [sent.lower() for sent in txt_sents]
txt_sents

['welcome to the world of deep learning for nlp!',
 "we're in this together, and we'll learn together.",
 'nlp is amazing, and deep learning makes it even more fun.',
 "let's learn!"]

## Remove Punctuation 

In [13]:
list_punct = list(punctuation)
print(list_punct)

['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~']


In [14]:
def drop_punct(input_tokens):
    return [token for token in input_tokens if token not in list_punct]

In [15]:
drop_punct(["let",".","us",".","go","!"])

['let', 'us', 'go']

In [16]:
txt_words_nopunct = [drop_punct(sent) for sent in txt_words]
print(txt_words_nopunct)

[['Welcome', 'to', 'the', 'world', 'of', 'Deep', 'Learning', 'for', 'NLP'], ['We', "'re", 'in', 'this', 'together', 'and', 'we', "'ll", 'learn', 'together'], ['NLP', 'is', 'amazing', 'and', 'Deep', 'Learning', 'makes', 'it', 'even', 'more', 'fun'], ['Let', "'s", 'learn']]


## Remove stop words

In [23]:
list_stop = stopwords.words("english")
len(list_stop)

179

In [24]:
print(list_stop[:50])

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be']


In [26]:
txt_sents = tokenize.sent_tokenize(raw_txt.lower())
txt_words = [tokenize.word_tokenize(sent) for sent in txt_sents]

from string import punctuation
stop_punct = list(punctuation)

from nltk.corpus import stopwords
stop_nltk = stopwords.words("english")

stop_final = stop_punct + stop_nltk

In [27]:
def drop_stop(input_tokens):
    return [token for token in input_tokens if token not in stop_final]

In [28]:
txt_words_nostop = [drop_stop(sent) for sent in txt_words]

In [29]:
print(txt_words_nostop[0])

['welcome', 'world', 'deep', 'learning', 'nlp']


In [35]:
stemmer_p = PorterStemmer()

In [36]:
print([stemmer_p.stem(token) for token in txt_words_nostop[0]])

['welcom', 'world', 'deep', 'learn', 'nlp']


In [37]:
txt_words_stem = [[stemmer_p.stem(token) for token in sent] for sent in txt_words_nostop]

In [38]:
txt_words_stem

[['welcom', 'world', 'deep', 'learn', 'nlp'],
 ["'re", 'togeth', "'ll", 'learn', 'togeth'],
 ['nlp', 'amaz', 'deep', 'learn', 'make', 'even', 'fun'],
 ['let', "'s", 'learn']]

In [47]:
txt_words_nostop = nltk.corpus.gutenberg.raw(gutenberg.fileids())

In [49]:
target_terms = ["nlp","deep","learn"]

In [50]:
def get_onehot(sent):
    return [1 if term in  sent else 0 for term in target_terms]

In [51]:
one_hot_mat = [get_onehot(sent) for sent in txt_words_nostop]

In [52]:
import numpy as np

In [53]:
np.array(one_hot_mat)

array([[0, 0, 0],
       [0, 0, 0],
       [0, 0, 0],
       ...,
       [0, 0, 0],
       [0, 0, 0],
       [0, 0, 0]])

In [55]:
vectorizer = CountVectorizer(max_features = 5)

In [56]:
vectorizer.fit(txt_sents)

In [57]:
vectorizer.vocabulary_

{'deep': 1, 'we': 4, 'together': 3, 'and': 0, 'learn': 2}

In [58]:
txt_dtm = vectorizer.fit_transform(txt_sents)

In [59]:
txt_dtm.toarray()

array([[0, 1, 0, 0, 0],
       [1, 0, 1, 2, 2],
       [1, 1, 0, 0, 0],
       [0, 0, 1, 0, 0]], dtype=int64)

In [64]:
txt_words_stem

[['welcom', 'world', 'deep', 'learn', 'nlp'],
 ["'re", 'togeth', "'ll", 'learn', 'togeth'],
 ['nlp', 'amaz', 'deep', 'learn', 'make', 'even', 'fun'],
 ['let', "'s", 'learn']]