In [62]:
import nltk
import numpy as np

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


from nltk import tokenize
from string import punctuation
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
raw_txt = """Welcome to the world of Deep Learning for NLP!\
             We're in this together, and we'll learn together.\
             NLP is amazing,\
             and Deep Learning makes it even more fun.\
             Let's learn!"""

In [4]:
print(raw_txt)

Welcome to the world of Deep Learning for NLP!             We're in this together, and we'll learn together.             NLP is amazing,             and Deep Learning makes it even more fun.             Let's learn!


In [7]:
txt_sents = tokenize.sent_tokenize(raw_txt)
print(txt_sents)

['Welcome to the world of Deep Learning for NLP!', "We're in this together, and we'll learn together.", 'NLP is amazing,             and Deep Learning makes it even more fun.', "Let's learn!"]


In [8]:
print(type(txt_sents))

<class 'list'>


In [9]:
print(len(txt_sents))

4


In [10]:
for i in range(len(txt_sents)):
  print(txt_sents[i])

Welcome to the world of Deep Learning for NLP!
We're in this together, and we'll learn together.
NLP is amazing,             and Deep Learning makes it even more fun.
Let's learn!


In [11]:
txt_word_sentence1 = tokenize.word_tokenize(txt_sents[0])
print(len(txt_word_sentence1))
print(txt_word_sentence1)

10
['Welcome', 'to', 'the', 'world', 'of', 'Deep', 'Learning', 'for', 'NLP', '!']


In [12]:
txt_words = [tokenize.word_tokenize(i) for i in txt_sents]
print(txt_words)

[['Welcome', 'to', 'the', 'world', 'of', 'Deep', 'Learning', 'for', 'NLP', '!'], ['We', "'re", 'in', 'this', 'together', ',', 'and', 'we', "'ll", 'learn', 'together', '.'], ['NLP', 'is', 'amazing', ',', 'and', 'Deep', 'Learning', 'makes', 'it', 'even', 'more', 'fun', '.'], ['Let', "'s", 'learn', '!']]


In [14]:
raw_txt_lower = raw_txt.lower()
print(raw_txt_lower)

welcome to the world of deep learning for nlp!             we're in this together, and we'll learn together.             nlp is amazing,             and deep learning makes it even more fun.             let's learn!


In [15]:
print(type(raw_txt))
txt_sents_lower = [sent.lower() for sent in txt_sents]
print(txt_sents_lower)

<class 'str'>
['welcome to the world of deep learning for nlp!', "we're in this together, and we'll learn together.", 'nlp is amazing,             and deep learning makes it even more fun.', "let's learn!"]


In [19]:
list_punct = list(punctuation)
print(list_punct)
print(len(list_punct))

['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~']
32


In [32]:
txt_words_lower = [tokenize.word_tokenize(i) for i in txt_sents_lower]

In [22]:
def drop_punct(x):
  drop = [i for i in x if i not in list_punct]
  return drop

In [26]:
txt_words_nopunct = [drop_punct(i) for i in txt_words]
print(txt_words_nopunct)

[['Welcome', 'to', 'the', 'world', 'of', 'Deep', 'Learning', 'for', 'NLP'], ['We', "'re", 'in', 'this', 'together', 'and', 'we', "'ll", 'learn', 'together'], ['NLP', 'is', 'amazing', 'and', 'Deep', 'Learning', 'makes', 'it', 'even', 'more', 'fun'], ['Let', "'s", 'learn']]


In [29]:
list_stop = stopwords.words('english')
print(list_stop)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [28]:
list_final = list_punct + list_stop
print(list_final)

['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', 'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when'

In [52]:
def drop_punct_stop(x):
  drop = [i for i in x if i not in list_final]
  return drop

In [53]:
txt_final = [drop_punct_stop(i) for i in txt_words_lower]
print(txt_final)

[['welcome', 'world', 'deep', 'learning', 'nlp'], ["'re", 'together', "'ll", 'learn', 'together'], ['nlp', 'amazing', 'deep', 'learning', 'makes', 'even', 'fun'], ['let', "'s", 'learn']]


In [35]:
stemer_p = PorterStemmer()
stem1 = stemer_p.stem('Driving')
print(stem1)

drive


In [37]:
stem2 = stemer_p.stem(txt_sents[0])
print(stem2)

welcome to the world of deep learning for nlp!


In [39]:
txt_words = [tokenize.word_tokenize(sent) for sent in txt_sents]
txt = '''I mustered all my drive,\ 
	   drove to the driving school!'''
tokens = tokenize.word_tokenize(txt)

In [40]:
print([stemer_p.stem(word) for word in tokens])

['I', 'muster', 'all', 'my', 'drive', ',', '\\', 'drove', 'to', 'the', 'drive', 'school', '!']


In [43]:
lemmatizer = WordNetLemmatizer()

In [44]:
lem_test = lemmatizer.lemmatize("ponies")

In [45]:
print(lem_test)

pony


In [46]:
lem_test2 = lemmatizer.lemmatize("mice")
print(lem_test2)

mouse


In [47]:
lem_test3 = lemmatizer.lemmatize("geese")
print(lem_test3)

goose


In [48]:
stem3 = stemer_p.stem('Mice')
print(stem3)

mice


In [55]:
#txt_final
target_terms = ['nlp', 'deep', 'learn']

def get_onehot(x):
  y = [1 if i in x else 0 for i in target_terms]
  return y

one_hot_matrix = [get_onehot(i) for i in txt_final]
print(txt_final)
print(np.array(one_hot_matrix))

[['welcome', 'world', 'deep', 'learning', 'nlp'], ["'re", 'together', "'ll", 'learn', 'together'], ['nlp', 'amazing', 'deep', 'learning', 'makes', 'even', 'fun'], ['let', "'s", 'learn']]
[[1 1 0]
 [0 0 1]
 [1 1 0]
 [0 0 1]]


In [60]:
vectorizer = CountVectorizer(max_features = 5)
vectorizer.fit(txt_sents)
print(vectorizer.vocabulary_)

{'deep': 1, 'we': 4, 'together': 3, 'and': 0, 'learn': 2}


In [61]:
txt_dtm = vectorizer.fit_transform(txt_sents)
print(txt_dtm.toarray())

[[0 1 0 0 0]
 [1 0 1 2 2]
 [1 1 0 0 0]
 [0 0 1 0 0]]


In [63]:
vectorizer_tfidf = TfidfVectorizer(max_features=5)
vectorizer_tfidf.fit(txt_sents)
print(vectorizer_tfidf.vocabulary_)

{'deep': 1, 'we': 4, 'together': 3, 'and': 0, 'learn': 2}


In [64]:
txt_tfidf = vectorizer_tfidf.transform(txt_sents)
print(txt_tfidf.toarray())

[[0.         1.         0.         0.         0.        ]
 [0.25932364 0.         0.25932364 0.65783832 0.65783832]
 [0.70710678 0.70710678 0.         0.         0.        ]
 [0.         0.         1.         0.         0.        ]]
