# Models and Tokenizers Validity Check

## Ensuring TensorFlow Validity

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf

In [2]:
# loading the dataset
# Read the raw CSV and drop the FILE_NAME column in one expression, so `df`
# is never bound to an intermediate frame that still carries that column
# (avoids the `inplace=True` mutation).
df = pd.read_csv('dataset/spam_email_raw_text.csv').drop(columns=['FILE_NAME'])

In [3]:
# loading the tokenizer
import json
# The context manager closes the file even if parsing or tokenizer
# reconstruction raises (the manual open()/close() pair leaked the handle).
with open('tf_tokenizer.json', 'r') as fid1:
    # NOTE(review): json.load must yield the JSON string that
    # tokenizer_from_json expects, i.e. the file presumably holds the
    # tokenizer config wrapped as a JSON string — confirm against the export code.
    tf_tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(json.load(fid1))

In [4]:
# loading the model
# Reads the saved Keras model from 'tf_model.h5'; the path is relative to the
# notebook's working directory.
tf_model = tf.keras.models.load_model('tf_model.h5')

In [5]:
# loading the params
# tf_params supplies the 'max_length', 'padding_type' and 'trunc_type' values
# used when padding the sequences for evaluation.
# The context manager closes the file even if json.load raises.
with open('tf_params.json', 'r') as fid2:
    tf_params = json.load(fid2)

In [6]:
# evaluating on the whole dataset
# step 1: convert every message into a sequence with the loaded tokenizer
df_sequences = tf_tokenizer.texts_to_sequences(df['MESSAGE'])
# step 2: pad/truncate all sequences using the saved preprocessing parameters
df_padded = tf.keras.preprocessing.sequence.pad_sequences(
    df_sequences,
    maxlen=tf_params['max_length'],
    padding=tf_params['padding_type'],
    truncating=tf_params['trunc_type'],
)
# step 3: last expression of the cell, so the evaluate() result is displayed
tf_model.evaluate(df_padded, df['CATEGORY'])



[0.007317504845559597, 0.9981021285057068]

## Ensuring NLTK Validity

In [7]:
# loading the model
import joblib
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load 'nltk_model.joblib' when it comes from a trusted source.
nltk_model = joblib.load('nltk_model.joblib')

In [8]:
# loading parameters and processing functions
from nltk_functions import text_to_tokens, text_to_count_vector
# nltk_features is passed to text_to_count_vector for every message.
# The context manager closes the file even if json.load raises.
with open('nltk_features.json', 'r') as fid:
    nltk_features = json.load(fid)

In [9]:
from nltk import RegexpTokenizer, WordNetLemmatizer
from nltk.corpus import stopwords
# Shared text-processing objects; df_to_X_y passes all three to
# text_to_count_vector for every message.
# NOTE(review): presumably the NLTK 'stopwords' and 'wordnet' data must already
# be downloaded for these calls to work — confirm for a fresh environment.
tokenizer = RegexpTokenizer(r"\w+")  # pattern matches runs of word characters
lemmatizer = WordNetLemmatizer()
stop_words = stopwords.words('english')  # English stop-word list

# a function to process the whole dataframe of messages
def df_to_X_y(df):
    """Turn a dataframe of messages into a feature matrix and a label vector.

    Reads the ``CATEGORY`` and ``MESSAGE`` columns of ``df``. Every message is
    converted with ``text_to_count_vector``, which receives the notebook-level
    ``nltk_features``, ``tokenizer``, ``lemmatizer``, ``stop_words`` and ``np``.

    Returns:
        A tuple ``(X, y)`` of integer numpy arrays: ``X`` is built from one
        count vector per message, ``y`` holds the ``CATEGORY`` values.
    """
    labels = df.CATEGORY.to_numpy().astype(int)
    rows = [
        text_to_count_vector(text, nltk_features, tokenizer, lemmatizer, stop_words, np)
        for text in df.MESSAGE
    ]
    features = np.array(rows).astype(int)
    return features, labels

# build the count-vector matrix X and the integer labels y for every row of df
X, y = df_to_X_y(df)

In [10]:
# Score the loaded model on the whole dataset; the value is displayed as the
# cell output. NOTE(review): the metric depends on the estimator type
# (presumably mean accuracy for a scikit-learn classifier) — confirm.
nltk_model.score(X, y)

0.9934437543133195