<a href="https://colab.research.google.com/github/dinuka-kasun-medis/NLP/blob/main/Model_0_Naive_Bayes_(baseline).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

* To create our baseline, we'll create a Scikit-Learn Pipeline using the TF-IDF (term frequency-inverse document frequency) formula to convert our words to numbers and then model them with the [Multinomial Naive Bayes algorithm](https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html#sklearn.naive_bayes.MultinomialNB).
* This algorithm was chosen by referring to the [Scikit-Learn machine learning map](https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html).

In [None]:
# Download helper functions script
!wget https://raw.githubusercontent.com/mrdbourke/tensorflow-deep-learning/main/extras/helper_functions.py

In [13]:
from helper_functions import unzip_data, create_tensorboard_callback, plot_loss_curves, compare_historys

In [None]:
# Download data (same as from Kaggle)
!wget "https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip"

# Unzip data
unzip_data("nlp_getting_started.zip")


In [None]:
import pandas as pd

# Read the two CSV files into DataFrames, then preview the training frame
train_df, test_df = pd.read_csv("train.csv"), pd.read_csv("test.csv")
train_df.head()

In [None]:
# Shuffle the training rows and preview the result
train_df_shuffeled = train_df.sample(
    frac=1,           # sample 100% of the rows, i.e. a full reshuffle
    random_state=42,  # fixed seed so the shuffled order is repeatable
)
train_df_shuffeled.head()

In [None]:
# Preview the first rows of the frame read from test.csv
test_df.head()

In [None]:
# Count how many training examples carry each value of the "target" column
train_df["target"].value_counts()

In [21]:
# Split the shuffled training data into training and validation sets.
# train_test_split is imported in this cell because no other visible cell brings it into scope;
# without the import a fresh-kernel "Run All" stops here with a NameError.
from sklearn.model_selection import train_test_split

train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    train_df_shuffeled["text"].to_numpy(),    # inputs: the "text" column as a NumPy array
    train_df_shuffeled["target"].to_numpy(),  # labels: the "target" column as a NumPy array
    test_size=0.1,                            # hold out 10% of the samples for validation
    random_state=42)                          # fixed seed so the split is reproducible

In [22]:
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization

# Demonstration instance that spells out TextVectorization's arguments one by one.
# NOTE(review): the variable name `text_vetorizer` is missing a "c" and is not referenced by any
# other visible cell; the vectorizer that gets adapted and used is the separately created
# `text_vectorizer`. Confirm nothing else relies on this name before renaming or removing it.
text_vetorizer = TextVectorization(max_tokens=None, # how many words in the vocabulary (all of the different words in your text)
                                    standardize="lower_and_strip_punctuation", # how to process text
                                    split="whitespace", # how to split tokens
                                    ngrams=None, # create groups of n-words?
                                    output_mode="int", # how to map tokens to numbers
                                    output_sequence_length=None) # how long should the output sequence of tokens be?
                                    # pad_to_max_tokens=True) # Not valid if using max_tokens=None

In [23]:
# Configure the text vectorizer that the following cells adapt and reuse
max_vocab_length = 10000  # upper bound on the number of tokens kept in the vocabulary
max_length = 15  # every output sequence has this many token ids (how many words of a Tweet the model sees)

text_vectorizer = TextVectorization(
    max_tokens=max_vocab_length,
    output_sequence_length=max_length,
    output_mode="int",  # map each token to an integer id
)

In [24]:
# Fit the text vectorizer to the training text
# (only train_sentences is passed to adapt(); val_sentences is not used in this step)
text_vectorizer.adapt(train_sentences)

In [31]:
# Print a random window of 5 consecutive rows from the shuffled training frame
import random

# randint is inclusive on both ends, so starting at most 5 rows before the end keeps the window in range
window_start = random.randint(0, len(train_df)-5)
sample_rows = train_df_shuffeled[["text", "target"]].iloc[window_start:window_start+5]
for text, target in sample_rows.itertuples(index=False):
  label_note = "(real disaster)" if target > 0 else "(not real disaster)"
  print(f"Target: {target}", label_note)
  print(f"Text:\n{text}\n")
  print("---\n")

Target: 1 (real disaster)
Text:
70 Years After Atomic Bombs Japan Still Struggles With War Past http://t.co/5wfXbAQMBK The anniversary of the devastation wrought by theÛ_

---

Target: 0 (not real disaster)
Text:
My lifelong all-time favorite song is 'Landslide'.  This song has gotten me through a lot of though times &amp;... http://t.co/RfB3JXbiEJ

---

Target: 0 (not real disaster)
Text:
I hear the mumbling i hear the cackling i got em scared shook panicking

---

Target: 1 (real disaster)
Text:
SMH photographer Wolter Peeters was on the front line with NSW Rural Fire Service crews laÛ_ http://t.co/gXe7nHwZ3e http://t.co/sRbqlMuwbV

---

Target: 1 (real disaster)
Text:
I added a video to a @YouTube playlist http://t.co/y2Mt6v13E8 Doc: Volcanoes and Earthquakes - Inside the Volcano

---



In [32]:
# Draw one training sentence at random, print it, then display its token ids
random_sentence = random.choice(train_sentences)
print(f"Original text:\n{random_sentence}      \n\nVectorized version:")
# The vectorizer is called on a one-element list, so the result has a leading batch dimension of 1
text_vectorizer([random_sentence])

Original text:
@BLutz10 But the rioting began prior to the decision for the indictment so you're not really making sense at this pointÛ_      

Vectorized version:


<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[6083,   30,    2,  367, 1550, 2782,    5,    2, 3067,   10,    2,
           1,   28,  172,   34]])>

In [26]:
 #Get the unique words in the vocabulary
words_in_vocab = text_vectorizer.get_vocabulary()
# Head of the list = most common tokens (including the [UNK] token for "unknown" words),
# tail of the list = least common tokens
top_5_words, bottom_5_words = words_in_vocab[:5], words_in_vocab[-5:]
print(f"Number of words in vocab: {len(words_in_vocab)}",
      f"Top 5 most common words: {top_5_words}",
      f"Bottom 5 least common words: {bottom_5_words}",
      sep="\n")

Number of words in vocab: 10000
Top 5 most common words: ['', '[UNK]', 'the', 'a', 'in']
Bottom 5 least common words: ['pages', 'paeds', 'pads', 'padres', 'paddytomlinson1']


In [27]:
from tensorflow.keras import layers

tf.random.set_seed(42)  # seed TensorFlow's global RNG before the layer is created

embedding = layers.Embedding(
    input_dim=max_vocab_length,        # number of distinct token ids (same value the vectorizer uses for max_tokens)
    output_dim=128,                    # length of the vector produced for each token
    embeddings_initializer="uniform",  # initialize the embedding weights randomly
    input_length=max_length,           # how long each input sequence is
    name="embedding_1",
)

embedding

<keras.src.layers.core.embedding.Embedding at 0x7f413978aa40>

In [33]:
# Draw one training sentence at random and print it
random_sentence = random.choice(train_sentences)
print(f"Original text:\n{random_sentence}      \n\nEmbedded version:")

# Tokenize the sentence (as a one-element batch), then pass the token ids through the embedding layer
sample_embed = embedding(text_vectorizer([random_sentence]))
sample_embed

Original text:
Hail The New Caesars! http://t.co/GzMoBlsJxu http://t.co/5CGtqfk2uR      

Embedded version:


<tf.Tensor: shape=(1, 15, 128), dtype=float32, numpy=
array([[[-0.01099086,  0.04020396,  0.01949554, ..., -0.01642676,
          0.01561412,  0.04102248],
        [-0.04557574, -0.0071545 , -0.03565868, ..., -0.0355046 ,
          0.0373396 ,  0.04740373],
        [-0.02463679,  0.02006156,  0.02470035, ..., -0.02898964,
         -0.03201852, -0.01155498],
        ...,
        [ 0.04659345,  0.02252786, -0.04973537, ...,  0.02735918,
          0.04532952,  0.01837719],
        [ 0.04659345,  0.02252786, -0.04973537, ...,  0.02735918,
          0.04532952,  0.01837719],
        [ 0.04659345,  0.02252786, -0.04973537, ...,  0.02735918,
          0.04532952,  0.01837719]]], dtype=float32)>

In [34]:
# Model 0: baseline (TF-IDF features fed into Multinomial Naive Bayes)

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Two named steps: "tfidf" converts the raw sentences to numbers, "clf" models them
model_0 = Pipeline(steps=[
    ("tfidf", TfidfVectorizer()),
    ("clf", MultinomialNB()),
])

# Train the whole pipeline on the training sentences and their labels
model_0.fit(train_sentences, train_labels)

In [35]:
# Score the fitted pipeline on the validation split; the value is multiplied by 100 below
# so that it prints as a percentage
baseline_score = model_0.score(val_sentences, val_labels)
print(f"Our baseline model achieves an accuracy of: {baseline_score*100:.2f}%")

Our baseline model achieves an accuracy of: 79.27%


In [36]:
# Make predictions on the validation sentences and preview the first 20 predicted labels
baseline_preds = model_0.predict(val_sentences)
baseline_preds[:20]

array([1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1])

In [37]:
# Function to evaluate: accuracy, precision, recall, f1-score
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def calculate_results(y_true, y_pred):
  """
  Summarises classification quality as accuracy, precision, recall and f1 score.

  Args:
  -----
  y_true = true labels in the form of a 1D array
  y_pred = predicted labels in the form of a 1D array

  Returns a dictionary with the keys "accuracy", "precision", "recall" and "f1".
  Note the differing scales: accuracy is multiplied by 100 (a percentage), while
  precision, recall and f1 are returned exactly as computed with average="weighted".
  """
  # The fourth element of the returned tuple is not needed here
  precision, recall, f1, _unused = precision_recall_fscore_support(y_true, y_pred, average="weighted")
  return {"accuracy": accuracy_score(y_true, y_pred) * 100,
          "precision": precision,
          "recall": recall,
          "f1": f1}

In [38]:
# Get baseline results
baseline_results = calculate_results(y_true=val_labels,
                                     y_pred=baseline_preds)
baseline_results

{'accuracy': 79.26509186351706,
 'precision': 0.8111390004213173,
 'recall': 0.7926509186351706,
 'f1': 0.7862189758049549}