# Imports

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
import gc

# Cleaning Data

In [2]:
# NOTE(review): this is an absolute, machine-specific path, so the notebook only
# runs where this exact file exists. Consider a configurable data directory.
file_path = r'C:\Users\KrayZ\Documents\CMPSC.codes\Password 3\Password 3\output2.txt'

with open(file_path, 'r') as file:
    lines = file.readlines()

# Every record must have exactly three whitespace-separated fields, matching the
# three column names used below. At most two splits are performed, so any extra
# whitespace stays inside the third field.
data = []
for line_number, line in enumerate(lines, start=1):
    fields = line.strip().split(maxsplit=2)
    if not fields:
        # Blank lines (e.g. a trailing newline at the end of the file) would
        # otherwise become all-None rows that break the later string handling.
        continue
    if len(fields) != 3:
        # Fail here with the line number instead of letting a None-padded row
        # raise an opaque TypeError in a later cell.
        raise ValueError(
            f'Line {line_number} of {file_path!r} has {len(fields)} field(s); expected 3'
        )
    data.append(fields)

encrypted_data_df = pd.DataFrame(data, columns=['PlainText', 'SHA256', 'RSA'])

In [3]:
def split_col(text, num_parts=10):
    """Split ``text`` into ``num_parts`` consecutive pieces.

    The first ``num_parts - 1`` pieces each contain ``len(text) // num_parts``
    characters. The final piece receives everything that is left, so joining
    the pieces in order always reproduces ``text`` exactly.

    Parameters
    ----------
    text : str
        The string to split (any sliceable sequence with ``len`` works).
    num_parts : int, default 10
        Number of pieces to return. The default keeps the original
        ten-way split.

    Returns
    -------
    list
        ``num_parts`` slices of ``text``. When ``text`` is shorter than
        ``num_parts``, the leading pieces are empty and the last piece
        holds the whole input.

    Raises
    ------
    ValueError
        If ``num_parts`` is smaller than 1.
    """
    if num_parts < 1:
        raise ValueError("num_parts must be at least 1")

    part_length = len(text) // num_parts
    # Equal-sized leading pieces.
    pieces = [text[i * part_length:(i + 1) * part_length] for i in range(num_parts - 1)]
    # The last piece absorbs the remainder of the integer division.
    pieces.append(text[(num_parts - 1) * part_length:])

    return pieces

## SHA256 Data

In [4]:
# Column labels 'part1' .. 'part10', one per piece of a split string.
split_column_names = [f'part{number}' for number in range(1, 11)]

In [102]:
# Split every SHA256 string into pieces, then spread the pieces over one
# column each while keeping the row index of the source frame.
split_sha256_cols = encrypted_data_df['SHA256'].apply(split_col)
sha256_df = pd.DataFrame(
    split_sha256_cols.tolist(),
    columns=split_column_names,
    index=encrypted_data_df.index,
)

# Character count of each plain-text entry.
sha256_df['Length'] = encrypted_data_df['PlainText'].map(len)

In [6]:
# Show the last 10 rows of the split SHA256 frame as the cell output.
sha256_df.tail(10)

Unnamed: 0,part1,part2,part3,part4,part5,part6,part7,part8,part9,part10,Length
43,de5a6f,78116e,ca62d7,fc5ce1,59d23a,e6b889,b365a1,739ad2,cf36f9,25a140d0cc,1
44,50e721,e49c01,3f00c6,2cf59f,216354,2a9d8d,f02464,efeb61,5d3105,1b0fddc326,1
45,fcb5f4,0df9be,6bae66,c1d77a,6c1596,8866a9,e6cbd7,314ca4,32b019,d17392f6f4,1
46,2d7116,42b726,b04401,627ca9,fbac32,f5c853,0fb190,3cc4db,022587,17921a4881,1
47,4b68ab,3847fe,da7d6c,62c1fb,cbeebf,a35eab,7351ed,5e78f4,ddadea,5df64b8015,1
48,a1fce4,363854,ff888c,ff4b8e,7875d6,00c268,239041,2a8cf7,9b37d0,b11148b0fa,1
49,18f538,4d58bc,b1bba0,bcd9e6,a6781d,1a6ac2,cc280c,330ecb,ab6cb7,931b721552,1
50,594e51,9ae499,312b29,433b7d,d8a97f,f068de,fcba97,55b6d5,d00e84,c524d67b06,1
51,bbeebd,879e1d,ff6918,546dc0,c179fd,de505f,2a2159,1c9a9c,96e36b,054ec5af83,1
52,961b6d,d3ede3,cb8ecb,aacbd6,8de040,cd78eb,2ed588,9130cc,eb4c49,268ea4d506,2


## RSA Data

In [7]:
# Split every RSA string into pieces, then spread the pieces over one
# column each while keeping the row index of the source frame.
split_rsa_cols = encrypted_data_df['RSA'].apply(split_col)
rsa_df = pd.DataFrame(
    split_rsa_cols.tolist(),
    columns=split_column_names,
    index=encrypted_data_df.index,
)

# Character count of each plain-text entry.
rsa_df['Length'] = encrypted_data_df['PlainText'].map(len)

In [8]:
# Show the last 10 rows of the split RSA frame as the cell output.
rsa_df.tail(10)

Unnamed: 0,part1,part2,part3,part4,part5,part6,part7,part8,part9,part10,Length
43,04b9738cf9507f43b325fb67df1b6459bd8573a1a85b4a...,5043cbfa620fb1cc3f52f3cec85670706c4380a004cdfa...,bd36fbbe2c13fe9ec13c85f6ef4d6310deee7e75a00f14...,f4885e54cc6079cf6e9746b49b37809a7d96dd41a500ca...,f27735828380e74e745f30b8e07e6b875391579e066456...,58d0778e6fb2109df4a1da4ec240b707940c1441f89d96...,9693a2fc61d569c8eb63387183ce7ccd83c4e8070ed0f9...,406c10af703b12db04ce624a5a6f61bc7169ee9a3d8761...,567d88b947f3b51f7a6768d900d8c611c8f42f493a865a...,654e4a7c1d718961c6a62ea08cfe21125c4bd325bab77d...,1
44,4a3bee66214d528931da8f669ca293a2f7d81a94afcf60...,4739cac3448dcf9433d239a8c1817bd20d8f3e8c45b9fa...,9b5bce042c51b4c837648ab0ad551b65865269a2862016...,61d7ad77a2ba593bf4394119821fa095e37552275fde4b...,a5b60f1aa649d1bb4555a2c8621784542ba0d205c63b65...,e168917e38e40c972fe5406ac99724e123a6e48d77e4de...,438253eea6f44e6e08caedfba9dd67e4537741e2cd95eb...,44e45515661f98e8a3c2bc95986462f16d5d64fb3b8644...,5d37482deec1dc97a1821ac0bc42d5eefa93a5cd9fcca3...,fd055b220c38528c395d96f0f893082473d52f583f55ee...,1
45,b4a28df766c4becfbdb1dcdf41ac635b3a0d9818bb485a...,8f4267f8f35c86ea763d845a79921cfea17675a7b49f14...,74c0ae56bd67841c6e209481ba955166cf66cc1d8ad5f0...,f0b2299b80bf0d7a875e494a74e7bd1325bc58523fa2c5...,b6145e500ee6b1142b4e684ed6759b43375e50265b71e0...,6680c693e813fc10cf601c36c9441a6278628c352db671...,1bffa082541816d5948c5d949bebf270496cc21bcefe64...,e3d65816b637966891cb55e24620237d85c3ced5e107c5...,da2f7c04490374960a3b53c7d9c59e7629a2460377eafc...,d348f65b31965dcc1492ddd9fba62920fdbe6907923fa4...,1
46,0683958081a0fc7e70326bca231833a4b01013323618fc...,eb45765627211b186dda466be418e0a95e1121727f0577...,5cf30996536ce9f4886b12a2647d7921ea9a398d9e537c...,400f09afcf7f8ea40b68ee318e4bd957d83f26d4a7d0de...,4ab6b46012fa7af5743bec4b1d78db3d0bf2077a518dc0...,2c9f050929ca568dcc973033bac67c900e5107e140bae3...,d28199f42dc57b5d19fbaad1dfffcbf6ca21272a74933d...,101fc180e88db527b582bea19ba246e1eff5c480c40503...,f5f230f3d25f5606ba4617a606cbe15025d34e4822a49d...,af1c2c7eff314433457c871e44831b292642e5d521ed64...,1
47,b651b62dc7357a5d612fb6e899f75c2dfc0c0f3e3b934e...,524c1f0990111b0655462bd3fca39cfa7f1dfae6db2ffc...,acbec46bca028cf6cdb03fc2f45d492b8fe2191f98c09e...,59054e6f3b6361c1201c9ced27cd7c3ba78bb1e98227bf...,c1b9bad86076ff886a73ae3b52ca1191a328fa6e36e867...,ee579f091f5db76e82f02e64fd7433f523dc5e2309ff59...,d9a3ca9976070bc30c0ef7a9d755d8e7f922fb9ae350d6...,7ed1ce64f157ee574c36f2674c5a545f761d98378d032a...,07067d459719ef86c0e9a65e2c81c04fb8f4a76807b512...,7ec951b8008d4700e53b22c45be46f7f196dd4c0fa66c0...,1
48,5f71da09331d1075df7365acdf9422f8de39b5c927921d...,8f26b1518624a162410915f3f84d004172b5fa5a649717...,173372c1047ddb7fbb878f4d1197635860fb52366fc942...,b74caad230fe95f922299377d80cedcf1932c6f90f6c0c...,9d5dacd8617b3014f43f46a5c77b666e6abc1a2dc82a90...,5514be7cd18081f8b5e29825d47aefd91f2947594a1480...,339d3d4d4dd5efbe3504f81a06da3acafee72fd8d2d12c...,e9a71c6cf2bf7c37a36509027df37c7dfe23ad585d7fca...,e3b6944b4aba5cf8e93ad429881ef62ff5d6698d4ba230...,0c6123cfb581418297e52c8ae200a3da0b6d0eaa7e7b7f...,1
49,b6798fdc305ae8a3b1896b1d87f8b7d649af55f21c3864...,c1e81c1523e08cc4f642263ab4c628b267e655f31e1031...,dbdef9ca11856079d9c76e0c2c5ee75c363c54ff32cffe...,074c4e42cd8482e0a4700648e1b86ef73139c716c86dd4...,97942673146ff58ff3d27048ee7f3a599f03995a3b8676...,99f2a796995d461864ca9b8504375dded739a4c5012bae...,4c8424797d2bdaab889b1788cea473ec4ee3012d2d8855...,7111f111f186e413af430f1b6bbd037b1dca8077e6f898...,f9cdc03cfcaead3a895f8851fcb6cb1eca019d2b705834...,b23ca2b58187b53abc07ba339b6432e89af3ed05033256...,1
50,a2a832cb44ebaed64c87e1b5af5262bf15ea962643b0a9...,1e711d21d687d0eebaa2ff254e5613f2e084596dc441eb...,b80f23954d893b84a0e713cd192655384e97541198e7c2...,2c2392268347dda00ab30dc0d738c6171032aacb8bc42e...,5a8bbef1e2b097cd108aae6e36d9c1d8ba15dd1eaf9ef7...,f75fe07feb8a0277bc6bc6300a0fbe0c785fcaab754040...,ff1bfa81f6df472c4fde4af4dce93724f46e9af29d222f...,bcd01c129980b3fc58ec91b7b603f6ac72014e135f7932...,8e918aa0ced19e369cb1aae13d9af656fef2a438de3156...,64825aaf1f603650727b1da506952ff4c5e8a0557d9dee...,1
51,808932cc8c439130eef1cb37f73517d12bfbf0a9d38216...,5e8181697132afac0ba311c37f5e5aae03ff5335d61a06...,b2b7c7a455db06f8949e319245a9be50d82a8ca8c61185...,e65b641edf87e9336d0386f958a3de73438e3673489730...,5c50c83ecb15fe3f80066d696873db9d50b477dfcdb48c...,df96d7d4b22af6225cc28c62d303bbf0d7fccc5155c9c6...,2c3036f5a08bac3508c6c318115a7f2676fb4695ace191...,f37a74d4ce85af259f155d57771f29ada0af99cfb1e842...,d0d268d4ac29b87fe44a59ee036628101cd38263f4f885...,d19f1d1990d0469b2c5a4dc11f25edb1717d62bfa4f439...,1
52,8ee6af3b1ed8994aaed646ab85e217db2fea2af478ad48...,ffa485b16fec63d6df3ed7465ad4d597fe15f328a57e41...,4577649bcc8fec038c5fa23d98e8cb23d5790c094ec704...,5a09dbc374d0b7c340c910c58f09fc1708c1f5df25363d...,ed2a99dd69d673307c9443f9f40852cb318f0c9ccda160...,680cf5a74c97c95a503d0bce95b71e6267d329142a5eae...,3023b9292e557f280ca9e41872c2323222ddf09d2ba34d...,fedde40f12d18431a57b1b4b50a16cd621def96253209b...,831d9788dd74698eeef9a0692d44e10a15204e2b75b87a...,781d33bf0f9c9b4dbaa1bba192ed6bf183e9c725a522e1...,2


## Check Data Split

In [50]:
def check_data(df, num_split, encrypt_type, reference_df=None):
    """Report rows whose split pieces do not re-assemble into the original text.

    Columns ``part1`` .. ``part<num_split>`` of ``df`` are concatenated row by
    row and compared with column ``encrypt_type`` of the reference frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the ``part<i>`` columns.
    num_split : int
        Number of ``part<i>`` columns to concatenate, starting at ``part1``.
    encrypt_type : str
        Name of the reference column holding the unsplit text.
    reference_df : pandas.DataFrame, optional
        Frame holding the unsplit text. When omitted, the notebook-level
        ``encrypted_data_df`` is used, which is what the original
        three-argument call did.

    Returns
    -------
    pandas.Series
        Boolean Series containing only the mismatching rows (every value is
        ``False``), indexed like ``df``. Empty when every row matches.
    """
    if reference_df is None:
        # Backward-compatible default: compare against the notebook's source frame.
        reference_df = encrypted_data_df

    # Start from a Series of empty strings so the accumulator has one type
    # throughout, rather than starting as a plain str that turns into a Series.
    combined_text = pd.Series("", index=df.index)
    for i in range(1, num_split + 1):
        combined_text = combined_text + df['part' + str(i)]

    matches = combined_text == reference_df[encrypt_type]

    return matches[~matches]


In [53]:
# Rows whose ten RSA pieces do not concatenate back to the original value.
rsa_split_errors = check_data(rsa_df, 10, "RSA")
print(f'The following rows were not properly split for RSA: \n{rsa_split_errors}')

The following rows were not properly split: 
Series([], dtype: bool)


In [54]:
# Rows whose ten SHA256 pieces do not concatenate back to the original value.
sha256_split_errors = check_data(sha256_df, 10, "SHA256")
print(f'The following rows were not properly split for SHA256: \n{sha256_split_errors}')

The following rows were not properly split for SHA256: 
Series([], dtype: bool)


## Combine Datasets

In [29]:
def hex_to_float(hex_string):
    """Parse ``hex_string`` as a base-16 integer and return it as ``np.float64``.

    ``int`` raises ``ValueError`` when the string is not valid hexadecimal
    (this includes the empty string).
    """
    return np.float64(int(hex_string, base=16))

In [30]:
# Features: every hex piece converted to a float (element-wise DataFrame.map).
sha256_features = sha256_df[split_column_names]
sha256_data = sha256_features.map(hex_to_float).to_numpy()
# Labels: the plain-text lengths as a single-column 2-D array.
sha256_labels = sha256_df['Length'].to_numpy().reshape(-1, 1)

In [31]:
# Features: every hex piece converted to a float (element-wise DataFrame.map).
rsa_features = rsa_df[split_column_names]
rsa_data = rsa_features.map(hex_to_float).to_numpy()
# Labels: the plain-text lengths as a single-column 2-D array.
rsa_labels = rsa_df['Length'].to_numpy().reshape(-1, 1)

In [104]:
# Only the SHA256 arrays are used as the dataset right now.
# The two commented-out lines are the alternative that stacks the SHA256 and
# RSA data/label arrays row-wise into one combined dataset:
# dataset = np.vstack((sha256_data, rsa_data))
# labels = np.vstack((sha256_labels, rsa_labels))

dataset = sha256_data
labels = sha256_labels

In [105]:
# Feature-matrix shape on the first line, label-array shape on the second.
print(dataset.shape, labels.shape, sep='\n')

(53, 10)
(53, 1)


## Split Data into Training and Testing

In [122]:
# Hold out 10% of the rows for testing; rows are shuffled with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    dataset,
    labels,
    random_state=42,
    shuffle=True,
    test_size=0.10,
)

In [123]:
# Rebind each split to its TensorFlow tensor equivalent, in the same order.
X_train, X_test, Y_train, Y_test = (
    tf.convert_to_tensor(split)
    for split in (X_train, X_test, Y_train, Y_test)
)

In [124]:
# Display the shape of the training feature tensor as the cell output.
X_train.shape

TensorShape([47, 10])

# Model

In [125]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Fully connected network: 10 input features -> 64 (ReLU) -> 64 (ReLU) -> 1 output.
model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(10,)))
model.add(Dense(64, activation='relu'))
model.add(Dense(1))

In [126]:
# Train with Adam on mean squared error and also report mean absolute error.
model.compile(
    loss='mean_squared_error',
    optimizer='adam',
    metrics=['mae'],
)

# Print the layer-by-layer overview of the model.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 64)                704       
                                                                 
 dense_1 (Dense)             (None, 64)                4160      
                                                                 
 dense_2 (Dense)             (None, 1)                 65        
                                                                 
Total params: 4,929
Trainable params: 4,929
Non-trainable params: 0
_________________________________________________________________


In [127]:
# Train for 50 epochs in batches of 32, with 20% of the training data
# set aside for validation.
model.fit(
    x=X_train,
    y=Y_train,
    batch_size=32,
    epochs=50,
    validation_split=0.2,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x2461e512670>

In [128]:
# Clear memory after training
tf.keras.backend.clear_session()
gc.collect()

1524