In [2]:
import string

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from tensorflow.keras.layers import Dense, Embedding, Input, SimpleRNN
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [4]:
# Punkt tokenizer models, needed by nltk's word_tokenize (used in the tokenization cell).
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
# Stop-word lists; read later through stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
# WordNet corpus, the dictionary behind WordNetLemmatizer.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
# Sample sentence for the preprocessing walkthrough
# (punctuation removal, tokenization, stop-word removal, stemming, lemmatization).
text = "As in classical machine learning it is based on the performance on the validation set, but typically does not involve does not include a very thorough search."

In [12]:
import string

In [13]:
# Drop every ASCII punctuation character (string.punctuation) from the sentence.
text_no_punct = text.translate(str.maketrans('', '', string.punctuation))

In [14]:
text_no_punct

'As in classical machine learning it is based on the performance on the validation set but typically does not involve does not include a very thorough search'

In [15]:
# From here on `text` is the punctuation-free sentence; the original string
# is no longer reachable under this name.
text = text_no_punct

In [20]:
# TOKENIZATION: split text into individual words (tokens).
# word_tokenize is imported with the other dependencies in the notebook's
# import cell, so this cell no longer carries its own import.
tokens = word_tokenize(text)
tokens

['As',
 'in',
 'classical',
 'machine',
 'learning',
 'it',
 'is',
 'based',
 'on',
 'the',
 'performance',
 'on',
 'the',
 'validation',
 'set',
 'but',
 'typically',
 'does',
 'not',
 'involve',
 'does',
 'not',
 'include',
 'a',
 'very',
 'thorough',
 'search']

In [21]:
# STOP-WORD REMOVAL: drop common, uninformative words ('the', 'and', 'is').
# The comparison is done on the lower-cased token, so capitalised stop words
# such as 'As' are removed too; surviving tokens keep their original casing.
stop_words = set(stopwords.words('english'))
filtered = list(filter(lambda word: word.lower() not in stop_words, tokens))
filtered

['classical',
 'machine',
 'learning',
 'based',
 'performance',
 'validation',
 'set',
 'typically',
 'involve',
 'include',
 'thorough',
 'search']

In [22]:
# STEMMING: reduce words to their root form by rule-based suffix stripping.
# Examples: running -> run, studies -> studi (a stem need not be a real word).
stemmer = PorterStemmer()
stems = list(map(stemmer.stem, filtered))
stems

['classic',
 'machin',
 'learn',
 'base',
 'perform',
 'valid',
 'set',
 'typic',
 'involv',
 'includ',
 'thorough',
 'search']

In [23]:
# LEMMATIZATION: dictionary-based (WordNet) reduction to a base form.
# Example: better -> good, which requires looking the word up as an adjective (pos='a').
# lemmatize() is called without a part-of-speech tag here, so each word is
# treated as a noun by default; that is why forms such as 'learning' and
# 'based' come back unchanged in the output.
lemmatizer = WordNetLemmatizer()
lemmas = list(map(lemmatizer.lemmatize, filtered))
lemmas

['classical',
 'machine',
 'learning',
 'based',
 'performance',
 'validation',
 'set',
 'typically',
 'involve',
 'include',
 'thorough',
 'search']

In [28]:
# BAG-OF-WORDS REPRESENTATION: count how many times each word appears in each document.
# Each string in `words` is one document (one row of the resulting table);
# each column is one vocabulary term learned by the vectorizer.
words = [
    "NLP is fun and exciting",
    "NLP helps computers understand humman language",
    "As in classical machine learning it is based on the performance on",
    "the validation set, but typically does not involve does not include a very thorough search",
]
cv = CountVectorizer()
cv.fit(words)
X_word = cv.transform(words)
df = pd.DataFrame(data=X_word.toarray(), columns=cv.get_feature_names_out())

In [29]:
df

Unnamed: 0,and,as,based,but,classical,computers,does,exciting,fun,helps,...,on,performance,search,set,the,thorough,typically,understand,validation,very
0,1,0,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
2,0,1,1,0,1,0,0,0,0,0,...,2,1,0,0,1,0,0,0,0,0
3,0,0,0,1,0,0,2,0,0,0,...,0,0,1,1,1,1,1,0,1,1


In [30]:
# INTEGER ENCODING: map every word to an integer id.
# These ids (not the raw strings) are what deep learning NLP models consume,
# e.g. as input to an Embedding layer.
sentences = [
    "NLP is fun and exciting",
    "NLP helps computers understand humman language",
    "As in classical machine learning it is based on the performance on",
    "the validation set, but typically does not involve does not include a very thorough search",
]
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)  # builds the vocabulary in place
word_index = tokenizer.word_index
word_index

{'nlp': 1,
 'is': 2,
 'on': 3,
 'the': 4,
 'does': 5,
 'not': 6,
 'fun': 7,
 'and': 8,
 'exciting': 9,
 'helps': 10,
 'computers': 11,
 'understand': 12,
 'humman': 13,
 'language': 14,
 'as': 15,
 'in': 16,
 'classical': 17,
 'machine': 18,
 'learning': 19,
 'it': 20,
 'based': 21,
 'performance': 22,
 'validation': 23,
 'set': 24,
 'but': 25,
 'typically': 26,
 'involve': 27,
 'include': 28,
 'a': 29,
 'very': 30,
 'thorough': 31,
 'search': 32}

In [32]:
# Convert each sentence to a sequence of integers:
# every word is replaced by its id from tokenizer.word_index.
sequences = tokenizer.texts_to_sequences(sentences)
sequences

[[1, 2, 7, 8, 9],
 [1, 10, 11, 12, 13, 14],
 [15, 16, 17, 18, 19, 20, 2, 21, 3, 4, 22, 3],
 [4, 23, 24, 25, 26, 5, 6, 27, 5, 6, 28, 29, 30, 31, 32]]

In [33]:
# PADDING
# Pad sequences so they all have the same length.
# No maxlen is given, so rows are padded up to the longest sequence;
# padding='post' appends the zeros at the end of each shorter row.
padded = pad_sequences(sequences, padding = 'post')
padded

array([[ 1,  2,  7,  8,  9,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 1, 10, 11, 12, 13, 14,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [15, 16, 17, 18, 19, 20,  2, 21,  3,  4, 22,  3,  0,  0,  0],
       [ 4, 23, 24, 25, 26,  5,  6, 27,  5,  6, 28, 29, 30, 31, 32]])

In [34]:
# SIMPLE NEXT-WORD PREDICTION WITH AN RNN
# Single training sentence; the model below learns to predict each word
# from the words that precede it.
data = "Finding the optimal learning rate is important for achieving good results in a reasonable training time."

In [36]:
# Remove ASCII punctuation from the training sentence (here: the final full stop).
data = data.translate(str.maketrans('', '', string.punctuation))
data

'Finding the optimal learning rate is important for achieving good results in a reasonable training time'

In [None]:
# Stop-word removal for the training sentence.
# The tokens are taken from `data` itself: the notebook-level `tokens` variable
# still holds the words of the earlier example sentence at this point.
# `data` is deliberately left as a string, because the following cells pass it
# to Tokenizer.fit_on_texts and word_tokenize, which expect text, not a list.
stop_words = set(stopwords.words('english'))
data_tokens = word_tokenize(data)
filtered = [w for w in data_tokens if w.lower() not in stop_words]

In [40]:
# Tokenize and create vocabulary
tok = Tokenizer()
# fit_on_texts updates `tok` in place and returns None, so its result is not kept.
tok.fit_on_texts([data])
# One input text -> one id sequence; take that single sequence.
sequence = tok.texts_to_sequences([data])[0]
# Word ids start at 1; the extra slot accounts for id 0, used for padding.
vocab_size = len(tok.word_index) + 1
sequence


[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]

In [41]:
# Word-level tokens of the training sentence.
# Note: this rebinds `tokens`, which previously held the tokens of the first example sentence.
tokens = word_tokenize(data)
tokens

['Finding',
 'the',
 'optimal',
 'learning',
 'rate',
 'is',
 'important',
 'for',
 'achieving',
 'good',
 'results',
 'in',
 'a',
 'reasonable',
 'training',
 'time']

In [42]:
# Stop-word removal on the training sentence's tokens (case-insensitive match,
# original casing kept). The result is shown for illustration; the training
# `sequence` was built from the unfiltered `data` string.
stop_words = set(stopwords.words('english'))
filtered = list(filter(lambda word: word.lower() not in stop_words, tokens))
filtered

['Finding',
 'optimal',
 'learning',
 'rate',
 'important',
 'achieving',
 'good',
 'results',
 'reasonable',
 'training',
 'time']

In [43]:
# Build (context -> next word) training pairs from the id sequence:
# for every position after the first, the input is all ids before it
# and the target is the id at that position.
X = [sequence[:end] for end in range(1, len(sequence))]
y = list(sequence[1:])

In [45]:
# Give every context a fixed length of 5:
# shorter prefixes are left-padded with 0 (padding='pre'), and longer ones keep
# only their last 5 ids (see the rows of the output below).
X = pad_sequences(X, maxlen = 5, padding = 'pre')
# Targets as an integer array, one next-word id per row of X.
y = np.array(y)
X

array([[ 0,  0,  0,  0,  1],
       [ 0,  0,  0,  1,  2],
       [ 0,  0,  1,  2,  3],
       [ 0,  1,  2,  3,  4],
       [ 1,  2,  3,  4,  5],
       [ 2,  3,  4,  5,  6],
       [ 3,  4,  5,  6,  7],
       [ 4,  5,  6,  7,  8],
       [ 5,  6,  7,  8,  9],
       [ 6,  7,  8,  9, 10],
       [ 7,  8,  9, 10, 11],
       [ 8,  9, 10, 11, 12],
       [ 9, 10, 11, 12, 13],
       [10, 11, 12, 13, 14],
       [11, 12, 13, 14, 15]])

In [46]:
# Next-word model: ids -> 10-d embeddings -> SimpleRNN(20) -> softmax over the vocabulary.
# The fixed context length is declared with an explicit Input layer; the
# `input_length` argument of Embedding is deprecated in Keras 3 (it is ignored
# with a warning). Declaring the input shape also builds the model up front.
model = Sequential([
    Input(shape=(5,)),                        # 5 word ids per sample (matches maxlen used for X)
    Embedding(vocab_size, 10),                # one 10-d vector per vocabulary id (incl. padding id 0)
    SimpleRNN(20, activation='tanh'),
    Dense(vocab_size, activation='softmax'),  # PREDICT THE NEXT WORD: one probability per id
])



In [48]:
# The targets in y are integer word ids (not one-hot vectors), hence the
# sparse variant of categorical cross-entropy.
model.compile(loss = 'sparse_categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

In [49]:
# Print the layer-by-layer overview of the model.
model.summary()

In [51]:
# Train for 100 epochs on the (context, next-word) pairs built above;
# verbose = 1 prints a progress line per epoch.
model.fit(X, y, epochs = 100, verbose = 1)

Epoch 1/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step - accuracy: 0.2000 - loss: 2.7503
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step - accuracy: 0.2000 - loss: 2.7406
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step - accuracy: 0.2000 - loss: 2.7307
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step - accuracy: 0.2000 - loss: 2.7207
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step - accuracy: 0.2667 - loss: 2.7105
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step - accuracy: 0.4000 - loss: 2.7001
Epoch 7/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step - accuracy: 0.4000 - loss: 2.6894
Epoch 8/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 256ms/step - accuracy: 0.4000 - loss: 2.6784
Epoch 9/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

<keras.src.callbacks.history.History at 0x23a70308da0>

In [52]:
# PREDICTING THE NEXT WORD
test_text = "Finding"
# Encode the prompt with the same vocabulary and pad it exactly like the training inputs.
seq = tok.texts_to_sequences([test_text])[0]
padded_seq = pad_sequences([seq], maxlen=5, padding='pre')
# One row of probabilities, one entry per vocabulary id (including padding id 0).
pred = model.predict(padded_seq, verbose=1)
pred_index = int(np.argmax(pred, axis=-1)[0])
# Id 0 is the padding slot and has no entry in tok.index_word, so a plain
# dictionary lookup would raise KeyError if the model ever ranked it highest.
pred_word = tok.index_word.get(pred_index, '<pad>')
pred_word

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 378ms/step


'the'

In [53]:
pred

array([[0.0131345 , 0.02713269, 0.4185501 , 0.18564577, 0.12849826,
        0.01953771, 0.00178974, 0.00771518, 0.02049799, 0.00506964,
        0.00133503, 0.00383882, 0.02056559, 0.02135329, 0.03700764,
        0.02219756, 0.06613045]], dtype=float32)