In [1]:
import requests
from bs4 import BeautifulSoup

In [2]:
lst  = ["https://en.wikipedia.org/wiki/Mahatma_Gandhi",
        "https://en.wikipedia.org/wiki/Ratan_Tata",
        "https://en.wikipedia.org/wiki/P._V._Sindhu",
        "https://en.wikipedia.org/wiki/India",
       "https://en.wikipedia.org/wiki/Deep_learning"]


In [3]:
import re

def remove_number(text):
    return re.sub(r'\[\d+\]', '', text)

def scrap_text(lst, filename):
    all_content = []
    with open(filename, "w", encoding='utf-8') as f:
        for url in lst:
            res = requests.get(url)
            if res.status_code==200:
                soup = BeautifulSoup(res.text,'html.parser')
            
                for div in soup.find_all('div'):
                    for p in div.find_all("p"):
                        content = p.get_text().strip()
                        cleaned_content = remove_number(content)
                        # print(cleaned_content)
                        all_content.append(cleaned_content)
                        f.write(cleaned_content+"\n")
            else:
                print("Something wrong!!!")
    return all_content

In [4]:
scraped_text = scrap_text(lst, "scraped_clen_text.txt")
print("Scraping complete.")


Scraping complete.


In [5]:
with open("/kaggle/working/scraped_clen_text.txt","r") as f:
    text = f.read().strip()

In [6]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

In [7]:
tokenizer = Tokenizer()

tokenizer.fit_on_texts([text])
len(tokenizer.word_index)

7505

In [8]:
input_sequences = []

for sentences in text.split("\n"):
    tokenized_sent = tokenizer.texts_to_sequences([sentences])[0]
    for i in range(1, len(tokenized_sent)):
        input_sequences.append(tokenized_sent[:i+1])

In [9]:
# input_sequences

In [10]:
max_len = max([len(x) for x in input_sequences])
max_len

276

In [11]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

padded_input_sequences = pad_sequences(input_sequences, maxlen=max_len, padding="pre")

In [12]:
X = padded_input_sequences[:,:-1]
y = padded_input_sequences[:,-1]

In [13]:
X.shape

(235206, 275)

In [14]:
y.shape

(235206,)

In [15]:
from tensorflow.keras.utils import to_categorical

y = to_categorical(y, num_classes=7506)
y.shape

(235206, 7506)

In [16]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [17]:
model = Sequential()

model.add(Embedding(7506, 100, input_length=276)) # Embedding helps to convert sparse vector to dense vector, for every word 100 dim vector 
model.add(LSTM(500))
model.add(Dense(7506, activation='softmax'))

model.build(input_shape=(None, 276))



In [18]:
model.compile(loss="categorical_crossentropy",optimizer="adam", metrics=['accuracy'])

In [19]:
model.summary()

In [20]:
model.fit(X,y,epochs=5)

Epoch 1/5
[1m7351/7351[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m306s[0m 41ms/step - accuracy: 0.1254 - loss: 6.2846
Epoch 2/5
[1m7351/7351[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m303s[0m 41ms/step - accuracy: 0.7220 - loss: 1.3489
Epoch 3/5
[1m7351/7351[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m303s[0m 41ms/step - accuracy: 0.9566 - loss: 0.2431
Epoch 4/5
[1m7351/7351[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m302s[0m 41ms/step - accuracy: 0.9664 - loss: 0.1642
Epoch 5/5
[1m7351/7351[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m301s[0m 41ms/step - accuracy: 0.9603 - loss: 0.1739


<keras.src.callbacks.history.History at 0x7b69e8fbf5b0>

In [71]:
import numpy as np


def predict_next_word(text,num_words=30):
    for i in range(num_words):
        # tokenizer
        token_text = tokenizer.texts_to_sequences([text])[0]
        
        # padding
        padded_token_text = pad_sequences([token_text],maxlen=276, padding="pre")
    
        # predict
        predicted_word = None

        pos = np.argmax(model.predict(padded_token_text,verbose=0))
    
        for word, index in tokenizer.word_index.items():
            if index==pos:
                predicted_word = word
                break

        if predicted_word is None:
            break

        text = text + " " + predicted_word

        print(predicted_word, end=" ")


user_input = input("User input: ")
predict_next_word(user_input)


User input:  Deep learning is a subset of


machine learning that focuses on utilizing neural networks to perform tasks such as classification regression and representation learning the field takes inspiration from biological neuroscience and is centered around stacking 