In [14]:
import requests
from bs4 import BeautifulSoup
import re

lst  = ["https://en.wikipedia.org/wiki/Predictive_text",
        "https://en.wikipedia.org/wiki/Natural_language_processing"
        # "https://en.wikipedia.org/wiki/Mahatma_Gandhi",
        # "https://en.wikipedia.org/wiki/Ratan_Tata",
        # "https://en.wikipedia.org/wiki/P._V._Sindhu",
        # "https://en.wikipedia.org/wiki/India",
        # "https://en.wikipedia.org/wiki/Deep_learning",
        # "https://en.wikipedia.org/wiki/Generative_artificial_intelligence",
        # "https://en.wikipedia.org/wiki/Amazon_(company)",
        # "https://en.wikipedia.org/wiki/Gmail",
        ]


def remove_number(text):
    return re.sub(r'\[\d+\]', '', text)

def scrap_text(lst, filename):
    all_content = []
    with open(filename, "w", encoding='utf-8') as f:
        for url in lst:
            res = requests.get(url)
            if res.status_code==200:
                soup = BeautifulSoup(res.text,'html.parser')
            
                for div in soup.find_all('div'):
                    for p in div.find_all("p"):
                        content = p.get_text().strip()
                        cleaned_content = remove_number(content)
                        # print(cleaned_content)
                        all_content.append(cleaned_content)
                        f.write(cleaned_content+"\n")
            else:
                print("Something wrong!!!")
    return all_content


scraped_text = scrap_text(lst, "text.txt")
print("Scraping complete.")


Scraping complete.


In [27]:
import pandas as pd 
import numpy as np 
import tensorflow as tf 
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SimpleRNN

In [28]:
with open("/kaggle/working/text.txt","r") as f:
    text = f.read().strip()

In [29]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
len(tokenizer.word_index)

882

In [30]:
input_sequences = []

for sentences in text.split("\n"):
    tokenized_sent = tokenizer.texts_to_sequences([sentences])[0]
    for i in range(1, len(tokenized_sent)):
        input_sequences.append(tokenized_sent[:i+1])

In [31]:
max_len = max([len(x) for x in input_sequences])
max_len

178

In [32]:
padded_input_sequences = pad_sequences(input_sequences, maxlen=max_len, padding="pre")

In [33]:
X = padded_input_sequences[:,:-1]
y = padded_input_sequences[:,-1]

In [34]:
X.shape

(15036, 177)

In [35]:
y.shape

(15036,)

In [36]:
y = to_categorical(y, num_classes=883)
y.shape

(15036, 883)

In [38]:
# Base model
model = Sequential()

model.add(Embedding(883, 100, input_length=178))
model.add(SimpleRNN(512))
model.add(Dense(883, activation="softmax"))

model.build(input_shape=(None,178))

In [39]:
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

In [40]:
model.summary()

In [43]:
import dagshub
import mlflow
import mlflow.keras

In [45]:
mlflow.set_tracking_uri("https://dagshub.com/gauravbosamiya/end-to-end-mlops-pipeline-next-word-predictor.mlflow")
dagshub.init(repo_owner="gauravbosamiya",repo_name="end-to-end-mlops-pipeline-next-word-predictor", mlflow=True)

mlflow.set_experiment("SimpleRNN")

<Experiment: artifact_location='mlflow-artifacts:/05f0a83fe8a34f1fa0eaccf30381260d', creation_time=1743517723307, experiment_id='1', last_update_time=1743517723307, lifecycle_stage='active', name='SimpleRNN', tags={}>

In [46]:
import logging
import os 
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logging.info("Satrting Mlflow run...")

with mlflow.start_run():
    try:
        
        # log params
        logging.info("Logging preprocessing parameters...")
        mlflow.log_param("epochs", 5),
        # mlflow.log_param("batch_size", 32)
        mlflow.log_param("optimizer", "Adam")
        mlflow.log_param("loss_function", "categorical_crossentropy")
        
        
        logging.info("Logging model parameters...")
        mlflow.log_param("embedding_dim", 100)
        mlflow.log_param("SimpleRNN units", 512)
        # mlflow.log_param("dropout_rate", 0.3)
        mlflow.log_param("input_length", 178)
        mlflow.log_param("num_classes", 883)

        
        history = model.fit(X, y, epochs=5)
        
        
        for epoch, (train_loss, train_acc) in enumerate(zip(history.history['loss'], history.history['accuracy'])):
            mlflow.log_metric(f"train_loss_epoch_{epoch}", train_loss)
            mlflow.log_metric(f"train_accuracy_epoch_{epoch}", train_acc)
            
            
        logging.info("Saving and logging the model...")
        mlflow.keras.log_model(model,"model")
        
        
        logging.info("Model training and logging completed.")
        
    except Exception as e:
        logging.error(f"An error occurred: {e}", exc_info=True)
         

Epoch 1/5
[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m111s[0m 232ms/step - accuracy: 0.0353 - loss: 6.6487
Epoch 2/5
[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m107s[0m 227ms/step - accuracy: 0.0518 - loss: 6.4071
Epoch 3/5
[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m107s[0m 228ms/step - accuracy: 0.1002 - loss: 6.1312
Epoch 4/5
[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m107s[0m 227ms/step - accuracy: 0.1844 - loss: 5.3847
Epoch 5/5
[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m107s[0m 227ms/step - accuracy: 0.2504 - loss: 4.6097




🏃 View run masked-shrimp-729 at: https://dagshub.com/gauravbosamiya/end-to-end-mlops-pipeline-next-word-predictor.mlflow/#/experiments/1/runs/22d9caf951b84893b3697e153c0006a3
🧪 View experiment at: https://dagshub.com/gauravbosamiya/end-to-end-mlops-pipeline-next-word-predictor.mlflow/#/experiments/1
