In [None]:
import kagglehub
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
from tqdm import tqdm
import mlflow 
import re
import os
import shutil
import tensorflow as tf

  from .autonotebook import tqdm as notebook_tqdm


In [46]:
# Display the installed TensorFlow version so the outputs below can be tied to a specific release.
tf.__version__

'2.18.0'

In [None]:
# Locate the news-sentiment CSV, downloading it from Kaggle when it is not
# already present, then load it into `df`.
#
# `path` always ends up pointing at ../dataset/news.csv, whichever branch ran.
DATASET_DIR = "../dataset"
path = os.path.join(DATASET_DIR, "news.csv")

try:
    # Test for the CSV itself rather than the directory: an empty or
    # half-populated ../dataset/ must still trigger a download.
    if not os.path.isfile(path):
        print("Dataset doesn't exist, downloading it...")
        os.makedirs(DATASET_DIR, exist_ok=True)
        download_dir = kagglehub.dataset_download("myrios/news-sentiment-analysis")
        print(f"Dataset original path: {download_dir}")
        # Copy only the CSV to the expected location. Moving the whole download
        # directory would nest it inside ../dataset/ and leave `path` dangling.
        # assumes the Kaggle dataset ships news.csv at its top level — TODO confirm
        shutil.copy(os.path.join(download_dir, "news.csv"), path)
        print("Dataset was downloaded and put in dataset/ directory.")
    else:
        print("Loading news dataset... ")
    print(f"Dataset path: {path}")
except Exception as e:
    # Report the failure, then re-raise: continuing would only fail later in
    # pd.read_csv with a less helpful error.
    print(f"Dataset loading error: {e}")
    raise

df = pd.read_csv(path)
df.head(3)

Unnamed: 0,date,news,neg,neu,pos,compound,sentiment
0,2007-07-07,It was a long antipodean night. While there’s ...,0.059,0.878,0.064,0.0516,POSITIVE
1,2007-07-07,In Mexico there are no licensing or registrati...,0.044,0.956,0.0,-0.296,NEGATIVE
2,2007-07-07,The government has until Monday to protect the...,0.0,0.894,0.106,0.3818,POSITIVE


In [4]:
def clean_text(text):
    """Strip a bytes-literal/quote wrapper and stray backslashes from a string.

    Steps:
      1. If the text looks like the repr of a bytes literal (``b'...'`` or
         ``b"..."``), drop the ``b`` prefix together with the enclosing quotes.
      2. Otherwise, if the text is wrapped in a matching pair of quotes, drop
         that pair.
      3. Collapse every run of backslashes into a single backslash.
      4. Remove each remaining backslash that is neither preceded nor followed
         by a digit.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string. Non-string inputs (e.g. ``None`` or a float NaN
        coming from a missing cell) yield an empty string.
    """
    if not isinstance(text, str):
        return ""

    quotes = ('"', "'")
    # Only treat a leading 'b' as a bytes-literal marker when it is followed by
    # a quote that is closed at the end; ordinary words such as "banks" must
    # keep their first letter.
    if len(text) >= 3 and text[0] == 'b' and text[1] in quotes and text[-1] == text[1]:
        text = text[2:-1]
    elif len(text) >= 2 and text[0] in quotes and text[-1] == text[0]:
        text = text[1:-1]

    text = re.sub(r'\\+', r'\\', text)
    text = re.sub(r'(?<!\d)\\(?!\d)', '', text)
    return text

In [5]:
# Binary target: 1 when the sentiment is POSITIVE, 0 for every other value.
is_positive = df['sentiment'] == 'POSITIVE'
df['label'] = is_positive.astype('int64')
df['clean_news'] = df['news'].map(clean_text)

BATCH_SIZE  = 64
BUFFER_SIZE = df.shape[0]

# NOTE(review): the pipeline is built from the raw `news` column; `clean_news`
# is computed above but not consumed here — confirm which one is intended.
features = df["news"].values
targets = df["label"].values
dataset = tf.data.Dataset.from_tensor_slices((features, targets))

# The first 20% of rows (in file order) become the test split; the rest train.
test_size = int(0.2 * len(df))
test_dataset = dataset.take(test_size)
train_dataset = dataset.skip(test_size)

# Only the training split is shuffled; both splits are batched and prefetched.
train_dataset = train_dataset.shuffle(BUFFER_SIZE)
train_dataset = train_dataset.batch(BATCH_SIZE)
train_dataset = train_dataset.prefetch(tf.data.AUTOTUNE)

test_dataset = test_dataset.batch(BATCH_SIZE)
test_dataset = test_dataset.prefetch(tf.data.AUTOTUNE)

In [15]:
# Fit a text vectorizer, capped at VOCAB_SIZE tokens, on the training text.
VOCAB_SIZE = 1000
encoder = tf.keras.layers.TextVectorization(max_tokens=VOCAB_SIZE)

# adapt() needs the text only, so the label half of each pair is dropped.
train_text = train_dataset.map(lambda text, label: text)
encoder.adapt(train_text)

In [18]:
# Keep the learned vocabulary as an array and preview its first 20 entries.
learned_tokens = encoder.get_vocabulary()
vocab = np.array(learned_tokens)
vocab[:20]

array(['', '[UNK]', 'the', 'a', 'of', 'to', 'and', 'in', 'for', 'on',
       'is', 'that', 'with', 'at', 'as', 'are', 'new', 'from', 'an',
       'his'], dtype='<U16')

In [22]:
class MyCallback(tf.keras.callbacks.Callback):
    """Stop training once the validation loss is at or below a threshold.

    Args:
        threshold: Validation-loss value at or below which training is
            stopped. Defaults to 0.15.
    """

    def __init__(self, threshold=0.15):
        super().__init__()
        self.threshold = threshold

    def on_epoch_end(self, epoch, logs=None):
        """Request a stop when this epoch's `val_loss` reaches the threshold.

        Args:
            epoch: Index of the epoch that just finished (unused).
            logs: Metric dictionary for the epoch; may be None or lack
                `val_loss` when no validation data is supplied.
        """
        # `logs=None` avoids a shared mutable default; a missing `val_loss`
        # is skipped instead of raising on the `None <= float` comparison.
        val_loss = (logs or {}).get('val_loss')
        if val_loss is None:
            return
        if val_loss <= self.threshold:
            print("\nModel performed good on validation data, stopping training!")
            self.model.stop_training = True

# Two stopping rules: the custom validation-loss threshold callback, and
# Keras early stopping on `val_loss` with a patience of 7 epochs.
callbacks = MyCallback()
earlystop = tf.keras.callbacks.EarlyStopping(
    patience=7,
    monitor='val_loss',
)

In [27]:
# Layers are created in the same order in which they are stacked.
vocabulary_size = len(encoder.get_vocabulary())
embedding = tf.keras.layers.Embedding(
    input_dim=vocabulary_size,
    output_dim=64,
    mask_zero=True,
)
# First recurrent block returns the full sequence so the second can consume it.
bilstm_sequence = tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(128, return_sequences=True)
)
bilstm_summary = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64))
dense_64 = tf.keras.layers.Dense(64, activation='relu')
dense_32 = tf.keras.layers.Dense(32, activation='relu')
# Single output unit with no activation.
output_unit = tf.keras.layers.Dense(1)

model = tf.keras.Sequential([
    encoder,
    embedding,
    bilstm_sequence,
    bilstm_summary,
    dense_64,
    dense_32,
    output_unit,
])

In [29]:
# The loss is configured with from_logits=True; the optimizer is Adam at 1e-4.
# NOTE(review): the 'accuracy' string presumably resolves to a binary accuracy
# thresholded at 0.5, which would then be applied to the model's raw outputs —
# confirm against the Keras metric documentation.
bce_on_logits = tf.keras.losses.BinaryCrossentropy(from_logits=True)
adam = tf.keras.optimizers.Adam(1e-4)

model.compile(
    loss=bce_on_logits,
    optimizer=adam,
    metrics=['accuracy'],
)

In [30]:
# Train for up to 15 epochs; each validation pass runs 30 batches.
# NOTE(review): `test_dataset` serves as validation data here and is also the
# set passed to model.evaluate later — confirm this reuse is intended.
training_callbacks = [callbacks, earlystop]
history = model.fit(
    train_dataset,
    validation_data=test_dataset,
    validation_steps=30,
    epochs=15,
    callbacks=training_callbacks,
)

Epoch 1/15
[1m5344/5344[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m460s[0m 85ms/step - accuracy: 0.6610 - loss: 0.5632 - val_accuracy: 0.7635 - val_loss: 0.5618
Epoch 2/15
[1m5344/5344[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m513s[0m 96ms/step - accuracy: 0.7282 - loss: 0.4973 - val_accuracy: 0.7599 - val_loss: 0.5604
Epoch 3/15
[1m5344/5344[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m550s[0m 103ms/step - accuracy: 0.7328 - loss: 0.4905 - val_accuracy: 0.7266 - val_loss: 0.5973
Epoch 4/15
[1m5344/5344[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m589s[0m 110ms/step - accuracy: 0.7379 - loss: 0.4837 - val_accuracy: 0.7328 - val_loss: 0.5922
Epoch 5/15
[1m5344/5344[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m778s[0m 145ms/step - accuracy: 0.7384 - loss: 0.4814 - val_accuracy: 0.7469 - val_loss: 0.5746
Epoch 6/15
[1m5344/5344[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m716s[0m 134ms/step - accuracy: 0.7409 - loss: 0.4779 - val_accuracy: 0.7172 - val_loss: 0

In [32]:
# Evaluate on the test split and report both returned values.
evaluation = model.evaluate(test_dataset)
test_loss, test_acc = evaluation
for caption, value in (('Test Loss:', test_loss), ('Test Accuracy:', test_acc)):
    print(caption, value)

[1m1336/1336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m109s[0m 82ms/step - accuracy: 0.7330 - loss: 0.5540
Test Loss: 0.5334145426750183
Test Accuracy: 0.7383620142936707


In [None]:
# Persist the trained model. The format is taken from the ".h5" extension; the
# former save_format="tf" argument contradicted that extension and is
# deprecated under Keras 3.
# NOTE(review): confirm the HDF5 format preserves the adapted TextVectorization
# vocabulary; if not, a ".keras" path may be needed.
MODEL_PATH = "./training_saved_models/bidirectional_LSTM_models/v1.h5"
# Create the target directory up front in case it does not exist yet.
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
model.save(MODEL_PATH)



In [33]:
# Point the MLflow client at the tracking server used by this notebook.
TRACKING_URI = "http://localhost:8080"
mlflow.set_tracking_uri(uri=TRACKING_URI)

In [109]:
# Record the training run in MLflow: tags, hyper-parameters, the test accuracy
# and the model itself.
# NOTE(review): the experiment and run names contain typos ("Biderctional",
# "expirement"). They are runtime identifiers, so they are kept verbatim here;
# renaming would start a new experiment — confirm before changing.
mlflow.set_experiment("Biderctional RNN")

params = {
    "epochs": 15,
    "optimizer": "adam",
    "lr": 1e-4,
    "LSTM units": 128,
}

metrics = {"accuracy": test_acc}

run_tags = {
    "Model Name": "Bidirectional LSTM",
    "Model Architecture": "One embedding layer, Two Bidirectional LSTM layers (128, 64), Three dense layers (64, 32, 1).",
}

with mlflow.start_run(run_name="first training expirement"):
    for tag_key, tag_value in run_tags.items():
        mlflow.set_tag(tag_key, tag_value)
    mlflow.log_params(params)
    mlflow.log_metrics(metrics)
    mlflow.tensorflow.log_model(model, artifact_path="model")




🏃 View run first training expirement at: http://localhost:8080/#/experiments/659168015968450415/runs/72614fe35527415cb98db56d1dd77393
🧪 View experiment at: http://localhost:8080/#/experiments/659168015968450415
