In [11]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.layers import TimeDistributed, Dense, Embedding, LSTM, Reshape
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import Sequence
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint, LearningRateScheduler
from sklearn.model_selection import train_test_split
import json
import math
import errno
physical_devices = tf.config.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)

In [2]:
LEARNING_RATE = 0.1
# Scheduling a learning-rate to produce different effects for gradient of loss wrt to the weights
# Different Learning Rates will affect the model differently by updating different % of weights of the model.
START_CHAR = "\t"
END_CHAR = "\n"
LSTM_UNITS = 300
DENSE_UNITS = 100
VOCAB_SIZE = 70
PROB_THRESHOLD = 1e-9
VERBOSITY = 2
EPOCHS = 2
MAX_LENGTH = 32
EMBEDDING_DIMENSION = 16
BATCH_SIZE = 32
METRICS = [tf.keras.metrics.CategoricalAccuracy()]

In [3]:
# Preprocessing the inputs and the outputs for the model --- 
# For Example password 'passwd'
# Adding a \t at the Start for the input - \tpasswd 
# Adding a \n at the End for the model output - passwd\n
# Here \t predicts p, p predicts a and so on..
def loadPreprocessInputOutput(FILE_NAME, START_CHAR, END_CHAR):
    try:
        assert os.path.isfile(FILE_NAME)
        assert isinstance(START_CHAR, str)
        assert isinstance(END_CHAR, str)
    except:
        raise Exception("Incorrect Inputs. Try again.")
        return
    inputPasswords = []
    outputPasswords = []
    listPasswords = []
    with open(FILE_NAME, "r") as pass_file:
        while(True):
            single_pass = pass_file.readline().rstrip("\n")
            if(single_pass == ""):
                break
            else:
                inputPasswords.append(START_CHAR + single_pass)
                outputPasswords.append(single_pass + END_CHAR)
                listPasswords.append(START_CHAR + single_pass + END_CHAR)
        return(inputPasswords, outputPasswords, listPasswords)

In [None]:
inputPasswords, outPasswords, listPasswords = loadPreprocessInputOutput("/home/rm/BE_Project/Embedding/Data/ascii_rockyou_less_than_thirty_two_cleaned.txt", START_CHAR, END_CHAR)

In [None]:
# Sanity Check
print(inputPasswords[5:10])
print(len(inputPasswords))
print(outPasswords[5:10])
print(len(outPasswords))
print(listPasswords[5:10])
print(len(listPasswords))

In [4]:
# Save the passwords:
# Do not open file - will not be shown properly
def writePreprocessedPasswordFromList(passwordList, typeList):
    passwordFileName = typeList + "_preprocessed.txt"
    with open(passwordFileName, "w") as pass_file:
        for password in passwordList:
            if(typeList.lower() == "input"):
                pass_file.write(password+"\n")
            else:
                pass_file.write(password)
    print(f"{typeList} passwords written completely to : {passwordFileName}")

In [None]:
# Write the passwords
writePreprocessedPasswordFromList(inputPasswords, "input")
writePreprocessedPasswordFromList(outPasswords, "output")
writePreprocessedPasswordFromList(listPasswords, "list")

In [4]:
# Sanity Check
# Output will not be proper.
!head -n 5 input_preprocessed.txt
!head -n 5 output_preprocessed.txt
!head -n 5 list_preprocessed.txt

	123456
	12345
	123456789
	password
	iloveyou
123456
12345
123456789
password
iloveyou
	123456
	12345
	123456789
	password
	iloveyou


In [None]:
# Specifying the Tokenizer -- leaving num_words as blank to include as many 
# unique characters as possible.
# Fit the tokenizer on the text and then save the tokenizer.
passwordTokenizer = Tokenizer(filters = "", lower = True, char_level = True)

In [5]:
# Utility Function to Save the Tokenizer Configuration
def saveTokenizer(TOKENIZER, OUTPUT_PATH):
    tokenizerConfigString = TOKENIZER.to_json()
    with open(OUTPUT_PATH+".json", "w") as op_file:
        op_file.write(tokenizerConfigString)

In [6]:
# Utility Function to Load the Tokenizer Configuration
def loadTokenizer(TOKENIZER_FILE_PATH):
    _, file_extension = os.path.splitext(TOKENIZER_FILE_PATH)
    if(file_extension != ".json"):
        raise Exception("Incorrect File.")
        return
    else:
        with open(TOKENIZER_FILE_PATH, "r") as tokenizer_cfg_file:
            tokenizer_config = tokenizer_cfg_file.read()
            tokenizer_cfg = json.loads(tokenizer_config)
            passwordTokenizer = tf.keras.preprocessing.text.tokenizer_from_json(json.dumps(tokenizer_cfg))
            return passwordTokenizer

In [None]:
# Fitting the Tokenizer on data:
passwordTokenizer.fit_on_texts(listPasswords)
passwordTokenizer.get_config()

In [None]:
# Save Tokenizer -- for later use and load it to avoid re-fitting
saveTokenizer(passwordTokenizer, "prototypeTokenizer")

In [7]:
passwordTokenizer = loadTokenizer("prototypeTokenizer.json")
# Sanity Check
passwordTokenizer.get_config()

{'num_words': None,
 'filters': '',
 'lower': True,
 'split': ' ',
 'char_level': True,
 'oov_token': None,
 'document_count': 14105697,
 'word_counts': '{"\\t": 14105697, "1": 6729506, "2": 5234401, "3": 3765169, "4": 3389487, "5": 3352338, "6": 3116090, "\\n": 14105697, "7": 3098762, "8": 3565308, "9": 3853241, "p": 1619704, "a": 8828625, "s": 4154066, "w": 799568, "o": 5173138, "r": 4576620, "d": 2484237, "i": 5553157, "l": 4460473, "v": 1050793, "e": 7203479, "y": 2373398, "u": 2307207, "n": 4827931, "c": 2608350, "k": 2012166, "b": 2110821, "g": 1717216, "m": 3205286, "j": 1237461, "h": 2335137, "q": 178503, "t": 3425154, "0": 5735322, "f": 981496, "z": 763424, "x": 479316, "!": 142923, ";": 12258, "-": 133010, "*": 123842, ".": 248717, "?": 18301, ",": 29750, "/": 48190, "#": 48873, "@": 107880, "$": 36029, "%": 10254, "^": 6394, "&": 26359, "+": 26989, "\'": 15335, "[": 7682, "]": 10802, "<": 9561, "_": 193006, ">": 2458, "=": 18365, "\\\\": 25832, "\\"": 3637, ":": 6858, "(": 1

In [36]:
def getLength(FILE_PATH):
    count = 0
    with open(FILE_PATH, "r") as f:
        for count, _ in enumerate(f):
            pass
    count = count + 1
    return count

In [37]:
# Function to preprocess and get validation data
def getValidationData(VALID_X_FILE_PATH, VALID_Y_FILE_PATH, VOCAB_SIZE, TOKENIZER, MAX_LENGTH, VALIDATION_BATCH_SIZE):
    total_passwords = getLength(VALID_X_FILE_PATH)
    total_batches = math.floor(total_passwords / VALIDATION_BATCH_SIZE)
    total_passwords_to_read = total_batches * VALIDATION_BATCH_SIZE
    valid_passwords = []
    count = 0
    valid_y = []
    flag = True
    with open(VALID_X_FILE_PATH, "r") as valid_file:
        for _, password in enumerate(valid_file):
            if(_ < total_passwords_to_read):
                valid_passwords.append(password.rstrip("\n"))
    with open(VALID_Y_FILE_PATH, "r") as valid_true_file:
        for _, true_password in enumerate(valid_true_file):
            if(_ < total_passwords_to_read):
                valid_y.append(true_password)
    valid_encoded_passwords = TOKENIZER.texts_to_sequences(valid_passwords)
    valid_y_encoded_passwords = TOKENIZER.texts_to_sequences(valid_y)
    valid_padded_passwords = pad_sequences(valid_encoded_passwords, padding = "post", maxlen = (MAX_LENGTH + 1))
    valid_y_padded_passwords = pad_sequences(valid_y_encoded_passwords, padding = "post", maxlen = (MAX_LENGTH + 1))
    for x_valid_password, y_valid_password in zip(valid_padded_passwords, valid_y_padded_passwords):
        count += 1
        if(count % 500 == 0):
            print(f"Total passwords processed {count}")
        if(flag):
            final_x_valid = np.array(x_valid_password).reshape(1, (MAX_LENGTH + 1))
            temp_y_valid = np.array(y_valid_password).reshape(1, (MAX_LENGTH + 1))
            final_y_valid = to_categorical(temp_y_valid, num_classes = (VOCAB_SIZE + 1))
            flag = False
        else:
            final_x_valid = np.concatenate((final_x_valid, np.array(x_valid_password).reshape(1, (MAX_LENGTH + 1))), axis = 0)
            temp_y_valid = np.array(y_valid_password).reshape(1, (MAX_LENGTH + 1))                                 
            final_y_valid = np.concatenate((final_y_valid, to_categorical(temp_y_valid, num_classes = (VOCAB_SIZE + 1))), axis = 0)
    return(final_x_valid, final_y_valid) 

In [157]:
# Create an input pipeline for feeding encoded passwords -- 
# Pass train_passwords as None if you want to train on entire set of passwords mentioned in password_file
class lstmNetworkInputSequence(tf.keras.utils.Sequence):
    def __init__(self, train_passwords, password_file_x, password_file_y, batch_size, tokenizer, max_length, vocab_size):
        self.batch_size = batch_size
        self.train_passwords = train_passwords
        self.password_file = password_file_x
        self.password_file_y = password_file_y
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.vocab_size = vocab_size
    
    def __len__(self):
        with open(self.password_file, "r") as pass_file:
            for count, _ in enumerate(pass_file):
                pass
        total_passwords = count + 1
        if(self.train_passwords is not None):
            if(self.train_passwords < total_passwords):
                return math.floor(self.train_passwords / self.batch_size)
            else:
                return math.floor(total_passwords / self.batch_size)
        else:
            return math.floor(total_passwords / self.batch_size)
        
    def __getitem__(self, index):
        # This could be slow, will try to improve speed later by saving encoded passwords.
        # batch_enc_padded_passwords = None
        # batch_y_true = None
        flag = True
        batch_enc_padded_passwords = []
        batch_y_true = []
        # temp_x = []
        # temp_y = []
        batch_passwords = []
        batch_y_passwords = []
        batch_password_index = list(range((index - 1) * self.batch_size, ((index) * self.batch_size)))
        with open(self.password_file, "r") as pass_file:
            for count, password in enumerate(pass_file):
                if(count in batch_password_index):
                    batch_passwords.append(password.rstrip("\n"))
                else:
                    continue
        with open(self.password_file_y, "r") as pass_file_y:
            for count_y, password_y in enumerate(pass_file_y):
                if(count_y in batch_password_index):
                    batch_y_passwords.append(password_y)
                else:
                    continue
        #print(f"{batch_passwords}\n\n\n{batch_y_passwords}")
        encoded_passwords = self.tokenizer.texts_to_sequences(batch_passwords)
        padded_encoded_passwords = pad_sequences(encoded_passwords, padding = "post", maxlen = (self.max_length + 1))
        encoded_y_passwords = self.tokenizer.texts_to_sequences(batch_y_passwords)
        padded_encoded_y_passwords = pad_sequences(encoded_y_passwords, padding = "post", maxlen = (self.max_length + 1))
        for encoded_password, encoded_y_password in zip(padded_encoded_passwords, padded_encoded_y_passwords):
            reshaped_example = np.array(encoded_password).reshape(1, (self.max_length + 1))
            reshaped_y_example = np.array(encoded_y_password).reshape(1, (self.max_length + 1))
            if(flag):
                batch_enc_padded_passwords = reshaped_example
                batch_y_true = to_categorical(y = reshaped_y_example, num_classes = (self.vocab_size + 1))
                flag = False
            else:
                batch_enc_padded_passwords = np.concatenate((batch_enc_padded_passwords, reshaped_example), axis = 0)
                batch_y_true = np.concatenate((batch_y_true, to_categorical(y = reshaped_y_example,  num_classes = (self.vocab_size + 1))), axis = 0)
        return np.array(batch_enc_padded_passwords), np.array(batch_y_true)

In [46]:
# 1. Preparing the Data to splitting into train and validation
# 2. Then prepare the train Data to split again into train and test
# 3. Don't train on test 
# 4. Modify hyperparameters on validate
# 5. Plot using TensorBoard for both train and validation for each epoch 
# 6. Necessary to train if for atleast one epoch, to plot the graphs
# Or we can choose to plot for each batch - but will be very
# Resource intensive and will slow us down considerably.
# 7. train_test_split is given the input and output arrays as X & Y
X_train_test_train, X_validation, Y_train_test_train, Y_validation = train_test_split(inputPasswords, outPasswords, test_size = 0.05, shuffle = True)
X_train, X_test, Y_train, Y_test = train_test_split(X_train_test_train, Y_train_test_train, test_size = 0.1, shuffle = True)

# Sanity Check
print(f"{repr(X_train_test_train[100])}\t{repr(Y_train_test_train[100])}")
print(f"{repr(X_validation[100])}\t{repr(Y_validation[100])}")
print(f"{repr(X_train[100])}\t{repr(Y_train[100])}")
print(f"{repr(X_test[100])}\t{repr(Y_test[100])}")

NameError: name 'inputPasswords' is not defined

In [14]:
# Preparing to save models - dedicated methods for it
# We also save them after each epoch using checkpoint callback.
# So saving manually is optional.
def saveKerasModel(MODEL, OUTPUT_MODEL_PATH):
    # Saves the model to the disk, saves both the architecture and 
    # the configuration.
    try:
        assert isinstance(MODEL, Model)
        MODEL.save(OUTPUT_MODEL_PATH)
        print(f"[+] Model has been successfully saved to {OUTPUT_MODEL_PATH}")
    except:
        raise Exception("Model instance is incorrect. Failed!")
        return   

In [15]:
# Utility Function to load the model manually.
# Is Extremely Important!
def loadKerasModel(INPUT_MODEL_PATH):
    try:
        assert os.path.isfile(INPUT_MODEL_PATH)
        loaded_model = tf.keras.load_model(INPUT_MODEL_PATH)
    except:
        raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), INPUT_MODEL_PATH)

In [None]:
!mkdir train
!mkdir test
!mkdir validation

In [16]:
# Utility Function to write the output to file -- 
def writeOutput(FILE_PATH, FILE_TYPE_TRAIN, PASS_LIST_X, PASS_LIST_Y):
    with open(os.path.join(FILE_PATH, FILE_TYPE_TRAIN + "_X" + ".txt"), "w") as x_file:
        for input_password in PASS_LIST_X:
            x_file.write(input_password + "\n")
    with open(os.path.join(FILE_PATH, FILE_TYPE_TRAIN + "_Y" + ".txt"), "w") as y_file:
        for out_password in PASS_LIST_Y:
            y_file.write(out_password)
    print("[+] Done!")

In [None]:
# Save the split data into their respective directories --
writeOutput("./train", "train", X_train, Y_train)
writeOutput("./test", "test", X_test, Y_test)
writeOutput("./validation", "validation", X_validation, Y_validation)

In [17]:
# Sanity Check 
!head -n 2 ./train/train_X.txt 
!head -n 2 ./train/train_Y.txt
!head -n 2 ./test/test_X.txt 
!head -n 2 ./test/test_Y.txt 
!head -n 2 ./validation/validation_X.txt 
!head -n 2 ./validation/validation_Y.txt 
!wc -l ./train/train_X.txt 
!wc -l ./train/train_Y.txt 
!wc -l ./test/test_X.txt
!wc -l ./test/test_Y.txt
!wc -l ./validation/validation_X.txt 
!wc -l ./validation/validation_Y.txt 

	rissaab
	0835777414
rissaab
0835777414
	ebdevils24
	catarro
ebdevils24
catarro
	xarahj
	chaus19
xarahj
chaus19
12060370 ./train/train_X.txt
12060370 ./train/train_Y.txt
1340042 ./test/test_X.txt
1340042 ./test/test_Y.txt
705285 ./validation/validation_X.txt
705285 ./validation/validation_Y.txt


In [None]:
# Preparing the file directory structure for callbacks:
!mkdir checkpoints
!mkdir tensorboard_log_dir

In [18]:
# Configuring all model callbacks
# 1. LearningRate Callback
# 2. ModelCheckpoint Callback
# 3. TensorBoard Callback
# 4. GradientClipping Callback
# Gradient Clipping is required because LSTMs/GRUs greatly suffer from both
# Exploding and Vanishing Gradients which presents as big problem and sets
# all values of delta-wt, delta-b to NaN causing numerical instability.
# This occurs because of Gradient-Descent through Time or as we call it
# Back-Propogation through time algorithm causing the gradients to be multiplied 
# with <1 or >1 weight values across all timesteps leaving the network extremely
# vulnerable to numerical instability.

# LearningRateScheduler - Using Exponential Learning Rate Decay
# First defining a 'schedule' for learning rate decay
def expLearningRateDecay(epoch):
   initial_lrate = 0.1
   k = 0.1
   lrate = initial_lrate * math.exp(-k * epoch)
   return lrate

# Scheduler Callback:
train_learning_rate_callback = LearningRateScheduler(schedule = expLearningRateDecay, verbose = 1)

# Checkpoint Callback:
train_checkpoint_callback = ModelCheckpoint("./checkpoints/Checkpoint-{epoch:03d}", verbose = 1, save_weights_only = False, save_freq = "epoch")

# TensorBoard Callback:
train_tensorboard_callback = TensorBoard(log_dir = "./tensorboard_log_dir", histogram_freq = 1, write_graph = True, write_images = True, update_freq = 'batch')

# GradientClipping Callback

# GradientClipping has to be done manually by computing the gradient wrt 
# the loss of each trainable parameter and then clip it.
# After which the gradients have to be applied to the weights / biases / gates
# thus preventing exploding/vanishing gradient.

# This will only be done if the model experiences any sort of numerical 
# instability and throws NaN Exceptions since this process has capacity 
# to cripple model's speed.


In [123]:
# Creating Optimizer Instance for Training Model
adam_optimizer = Adam(learning_rate = LEARNING_RATE, beta_1 = 0.9, beta_2 = 0.999, epsilon = 1e-07)

In [148]:
# Building teacher-forcing training model
# Must include shared architecture for following layers --
# 1. Embedding Layer
# 2. LSTM_1 
# 3. LSTM_2
# 4. LSTM_3
# 5. Dense_1
# 6. Dense_2 (Softmax - Outputs Probability Distribution over |VOCAB_SIZE + <PAD_TOKEN>|)

# The shared layers are defined below -- 
shared_embedding_layer = Embedding(input_dim = (VOCAB_SIZE + 1), output_dim = EMBEDDING_DIMENSION, mask_zero = True)
shared_lstm_1 = LSTM(LSTM_UNITS, return_sequences = True, return_state = True)
shared_lstm_2 = LSTM(LSTM_UNITS, return_sequences = True, return_state = True)
shared_lstm_3 = LSTM(LSTM_UNITS, return_sequences = True, return_state = True)
shared_dense_1 = TimeDistributed(Dense(DENSE_UNITS, activation = "relu"))
shared_dense_op = TimeDistributed(Dense((VOCAB_SIZE + 1), activation = "softmax"))

In [149]:
# 1. Defining the Teacher-Force Training Model:
# 2. Do not use initial_state call argument for the LSTM layers
# 3. Use the initial_state call arguemnt for 'all' the LSTM layers 
# in the Inference Model

train_input = Input(shape = ((MAX_LENGTH + 1),))
train_emb_op = shared_embedding_layer(train_input)
train_lstm_1_op, train_lstm_1_hidden, train_lstm_1_cell = shared_lstm_1(train_emb_op)
train_lstm_2_op, train_lstm_2_hidden, train_lstm_2_cell = shared_lstm_2(train_lstm_1_op)
train_lstm_3_op, train_lstm_3_hidden, train_lstm_3_cell = shared_lstm_3(train_lstm_2_op)
train_dense_1_op = shared_dense_1(train_lstm_3_op)
train_model_op = shared_dense_op(train_dense_1_op)

train_model = Model(inputs = train_input, outputs = train_model_op)

# Check the model summary 
print(train_model.summary())

Model: "model_24"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_57 (InputLayer)        [(None, 33)]              0         
_________________________________________________________________
embedding_10 (Embedding)     (None, 33, 16)            1136      
_________________________________________________________________
lstm_30 (LSTM)               [(None, 33, 300), (None,  380400    
_________________________________________________________________
lstm_31 (LSTM)               [(None, 33, 300), (None,  721200    
_________________________________________________________________
lstm_32 (LSTM)               [(None, 33, 300), (None,  721200    
_________________________________________________________________
time_distributed_20 (TimeDis (None, 33, 100)           30100     
_________________________________________________________________
time_distributed_21 (TimeDis (None, 33, 71)            717

In [150]:
# Compiling the Model with specified callbacks and metrics:
train_model.compile(optimizer = adam_optimizer, loss = "categorical_crossentropy", metrics = METRICS)

In [112]:
# Defining the Inference Model
# Needs the initial_state call argument for all the LSTM layers

inference_input = Input(shape = (1,))
lstm_1_hidden = Input(shape = (LSTM_UNITS,))
lstm_1_cell = Input(shape = (LSTM_UNITS,))
lstm_2_hidden = Input(shape = (LSTM_UNITS,))
lstm_2_cell = Input(shape = (LSTM_UNITS,))
lstm_3_hidden = Input(shape = (LSTM_UNITS,))
lstm_3_cell = Input(shape = (LSTM_UNITS,))
inference_emb_op = shared_embedding_layer(inference_input)
inference_lstm_1_op, inference_lstm_1_hidden, inference_lstm_1_cell = shared_lstm_1(inference_emb_op, initial_state = [lstm_1_hidden, lstm_1_cell])
inference_lstm_2_op, inference_lstm_2_hidden, inference_lstm_2_cell = shared_lstm_2(inference_lstm_1_op, initial_state = [lstm_2_hidden, lstm_2_cell])
inference_lstm_3_op, inference_lstm_3_hidden, inference_lstm_3_cell = shared_lstm_3(inference_lstm_2_op, initial_state = [lstm_3_hidden, lstm_3_cell])
inference_dense_1_op = shared_dense_1(inference_lstm_3_op)
inference_model_op = shared_dense_op(inference_dense_1_op)

inputs_list = [inference_input, lstm_1_hidden, lstm_1_cell, lstm_2_hidden, lstm_2_cell, lstm_3_hidden, lstm_3_cell]
outputs_list = [inference_lstm_1_hidden, inference_lstm_1_cell, inference_lstm_2_hidden, inference_lstm_2_cell, inference_lstm_3_hidden, inference_lstm_3_cell]

inference_model = Model(inputs = inputs_list, outputs = outputs_list)

print(inference_model.summary())

Model: "model_17"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_44 (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding_6 (Embedding)         multiple             1136        input_44[0][0]                   
__________________________________________________________________________________________________
input_45 (InputLayer)           [(None, 300)]        0                                            
__________________________________________________________________________________________________
input_46 (InputLayer)           [(None, 300)]        0                                            
___________________________________________________________________________________________

In [158]:
# Load the validation data into volatile memory so we 
# don't have to re-perform all the calculation during each 
# validation test run.
# Fitting the model might be very resource intensive
# Due to data fetch from physical disk and prior pre-processing.
input_pipeline = lstmNetworkInputSequence(None, "./train/train_X.txt", "./train/train_Y.txt", BATCH_SIZE, passwordTokenizer, MAX_LENGTH, VOCAB_SIZE)
print("[+] Sequence Object Created!")
validation_data = getValidationData("./validation/X.txt", "./validation/Y.txt", VOCAB_SIZE, passwordTokenizer, MAX_LENGTH, BATCH_SIZE)
result = input_pipeline.__getitem__(2)
x = result[0]
y = result[1]
print("[+] Validation Data is read into volatile memory!")

[+] Pipeline Object Created!
[+] Validation Data is read into volatile memory!


In [159]:
train_model_history = train_model.fit(x = input_pipeline, epochs = 1, batch_size = 1)



ValueError: in user code:

    /home/rm/anaconda3/envs/GRU/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:571 train_function  *
        outputs = self.distribute_strategy.run(
    /home/rm/anaconda3/envs/GRU/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:951 run  **
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    /home/rm/anaconda3/envs/GRU/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:2290 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    /home/rm/anaconda3/envs/GRU/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:2649 _call_for_each_replica
        return fn(*args, **kwargs)
    /home/rm/anaconda3/envs/GRU/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:532 train_step  **
        loss = self.compiled_loss(
    /home/rm/anaconda3/envs/GRU/lib/python3.8/site-packages/tensorflow/python/keras/engine/compile_utils.py:205 __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    /home/rm/anaconda3/envs/GRU/lib/python3.8/site-packages/tensorflow/python/keras/losses.py:143 __call__
        losses = self.call(y_true, y_pred)
    /home/rm/anaconda3/envs/GRU/lib/python3.8/site-packages/tensorflow/python/keras/losses.py:246 call
        return self.fn(y_true, y_pred, **self._fn_kwargs)
    /home/rm/anaconda3/envs/GRU/lib/python3.8/site-packages/tensorflow/python/keras/losses.py:1527 categorical_crossentropy
        return K.categorical_crossentropy(y_true, y_pred, from_logits=from_logits)
    /home/rm/anaconda3/envs/GRU/lib/python3.8/site-packages/tensorflow/python/keras/backend.py:4561 categorical_crossentropy
        target.shape.assert_is_compatible_with(output.shape)
    /home/rm/anaconda3/envs/GRU/lib/python3.8/site-packages/tensorflow/python/framework/tensor_shape.py:1117 assert_is_compatible_with
        raise ValueError("Shapes %s and %s are incompatible" % (self, other))

    ValueError: Shapes (None, 1) and (None, 1, 71) are incompatible


In [99]:
print(input_pipeline.batch_size)
print(input_pipeline.train_passwords)
print(input_pipeline.password_file)
print(input_pipeline.password_file_y)
print(input_pipeline.tokenizer)
print(input_pipeline.max_length)
print(input_pipeline.vocab_size)

32
None
./train/train_X.txt
./train/train_Y.txt
<keras_preprocessing.text.Tokenizer object at 0x7f4bdc9f5610>
32
70


In [100]:
input_pipeline.__len__()

376886

In [160]:
result = input_pipeline.__getitem__(1)
print(result[0].shape)
print(result[1].shape)

(32, 33)
(32, 33, 71)


In [None]:
!wc -l ./train/train_X.txt
!wc -l ./train/train_Y.txt

In [None]:
math.floor(3/2)

In [106]:
result = input_pipeline.__getitem__(int(getLength("./train/train_X.txt") / 32))
print(type(result))
x = result[0]
print(x)
print(x.shape)
y = result[1]
print(y.shape)
train_model.evaluate(x, y)

<class 'tuple'>
[[ 1 18 18 ...  0  0  0]
 [ 1 13 17 ...  0  0  0]
 [ 1 12  7 ...  0  0  0]
 ...
 [ 1  3 10 ...  0  0  0]
 [ 1 23 27 ...  0  0  0]
 [ 1  6  5 ...  0  0  0]]
(32, 33)
(32, 33, 71)


[1.2392356395721436, 0.7121211886405945]

In [54]:
reshape = Reshape(target_shape = (MAX_LENGTH+1,))

In [56]:
result[0][0].shape

(33,)

In [64]:
train_input.get_shape().as_list()

[None, 33]

In [136]:
arr1 = np.array([1,2,3])
print(arr1.shape)
arr2 = np.array([1,2,3]).reshape(1,3)

(3,)


In [137]:
np.array(arr1).shape

(3,)

In [145]:
x

array([[ 1, 31,  3, ...,  0,  0,  0],
       [ 1, 31,  3, ...,  0,  0,  0],
       [ 1, 31,  3, ...,  0,  0,  0],
       ...,
       [ 1, 31,  3, ...,  0,  0,  0],
       [ 1, 31,  3, ...,  0,  0,  0],
       [ 1, 31,  3, ...,  0,  0,  0]], dtype=int32)

In [146]:
y

array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]],

       ...,

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0.

In [153]:
x.shape

(96, 33)