<a href="https://colab.research.google.com/github/drawnator/PLN-grupo-19/blob/main/Grupo19_PLN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Ferramenta Otimizadora De Aleatoriedade em senhas
---
Modelo de linguagem que auxilia a tornar senhas mais fortes, analisando a entrada e dando sugestões que tornariam a senha mais difícil de adivinhar.

Base de dados utilizada:
[rockyou.txt](https://github.com/brannondorsey/naive-hashcat/releases/download/data/rockyou.txt)

Assuntos:
- Análise de frequência
- Masked language model

Tecnologias utilizadas:
- BERT
- RNN
- Árvores de decisão
- Senha aleatória

## Preparando o ambiente de execução (imports necessários)

In [None]:
import os

import numpy as np
import pandas as pd
import requests
import tensorflow as tf
from sklearn.model_selection import train_test_split

from keras.layers import Embedding, Dense, Dropout, Input, SimpleRNN
from keras.models import Sequential
from keras.optimizers import Adam
from keras.utils import Sequence


## Preparando os dados

In [None]:
# Downloading through the notebook is slow: if possible, download the file
# manually and drag it into the Colab files pane instead. The download below
# only runs when "rockyou.txt" is not already present, so a fresh kernel can
# still run the notebook from top to bottom.
url = "https://github.com/brannondorsey/naive-hashcat/releases/download/data/rockyou.txt"
if not os.path.exists("rockyou.txt"):
  # Stream into a temporary file and rename at the end, so an interrupted
  # download never leaves a truncated "rockyou.txt" for the next cell to read.
  with requests.get(url, stream=True, timeout=60) as response:
    response.raise_for_status()
    with open("rockyou.txt.part", "wb") as partial_file:
      for chunk in response.iter_content(chunk_size=1 << 20):
        partial_file.write(chunk)
  os.replace("rockyou.txt.part", "rockyou.txt")

In [None]:
# Load the wordlist (latin-1 encoded) into a one-column table, one line per row.
# Lines are stored exactly as read, so each entry still carries its newline.
with open("rockyou.txt", mode="r", encoding='latin-1') as wordlist:
  df = pd.DataFrame({'password': wordlist.readlines()})

In [None]:
# Remove the newline characters from every password entry (literal match, no regex).
newline_free = df['password'].str.replace('\n', '', regex=False)
df['password'] = newline_free

In [None]:
# Optional CSV export of the loaded passwords, currently disabled:
# if not os.path.exists("rockyou.csv"):
#   df.to_csv("rockyou.csv")

In [None]:
# (number of rows, number of columns) of the loaded password table
df.shape

(6874685, 1)

# Métodos de avaliar a qualidade de uma senha

## RNN

Previsibilidade de caracteres: com base em uma parte da senha, quão fácil é prever o resto dela?


## Tokenização

Transformando os caracteres de uma senha em tokens para que eles estejam no formato adequado para o processamento.

In [None]:
# Character-level vocabulary: every distinct character that occurs in any
# password, in sorted order, mapped to consecutive integer ids and back.
all_chars = sorted({ch for password in df['password'] for ch in password})
char_to_int = dict(zip(all_chars, range(len(all_chars))))
int_to_char = dict(enumerate(all_chars))

In [None]:
# Append the special tokens after the character ids. setdefault keeps an id
# that is already assigned, so re-running this cell does not renumber anything.
# (Plain `char_to_int[tok] = len(char_to_int)` gives all four tokens the same
# id on a second run, because the dict no longer grows once the keys exist.)
for special_token in ('[MASK]', '[CLS]', '[SEP]', '[PAD]'):
  char_to_int.setdefault(special_token, len(char_to_int))
# Rebuild the inverse mapping so it also covers the special tokens.
int_to_char = {i: char for char, i in char_to_int.items()}

In [None]:
class RNNTokenizer:
  """Character-level tokenizer that encodes a password as a fixed-length id list.

  Args:
    char_to_int: mapping from token (a character or a special-token string)
      to integer id. Must contain '[MASK]', '[CLS]', '[SEP]' and '[PAD]'.
    int_to_char: inverse mapping (id -> token); stored on the instance.
    max_length: length of every encoded sequence.

  Attributes:
    mask_token_id, cls_token_id, sep_token_id, pad_token_id: ids of the
      special tokens, looked up in `char_to_int`.
    vocab_size: number of entries in `char_to_int`.
  """

  def __init__(self, char_to_int,int_to_char,max_length = 32):
    self.char_to_int = char_to_int
    self.int_to_char = int_to_char
    self.mask_token_id = char_to_int['[MASK]']
    self.cls_token_id = char_to_int['[CLS]']
    self.sep_token_id = char_to_int['[SEP]']
    self.pad_token_id = char_to_int['[PAD]']
    self.max_length = max_length
    self.vocab_size = len(char_to_int)

  def __call__(self,text):
    """Encode `text` as a list of exactly `max_length` ids.

    Characters past `max_length` are dropped; shorter inputs are padded on
    the right with the '[PAD]' id.

    Raises:
      KeyError: if `text` contains a character that is not in the vocabulary.
    """
    # Use the vocabulary held by this instance, not a module-level global, so
    # the tokenizer works with whatever mapping it was constructed with.
    token = [self.char_to_int[ch] for ch in text[:self.max_length]]
    token.extend([self.pad_token_id] * (self.max_length - len(token)))
    return token

In [None]:
# Tokenizer instance shared by the RNN cells below (default max_length).
rnntokenizer = RNNTokenizer(char_to_int=char_to_int, int_to_char=int_to_char)

In [None]:
class RNNDataloader(Sequence):
  """Keras batch source for next-character prediction on passwords.

  Each sample is a password prefix followed by the '[MASK]' id and padded to
  the tokenizer's `max_length`; the label is the id of the character that was
  replaced by the mask.

  Args:
    dataframe: sliceable, iterable collection of password strings (the
      notebook passes a pandas Series).
    tokenizer: object exposing `__call__(text)`, `max_length`,
      `mask_token_id` and `pad_token_id`.
    batch_size: number of passwords per batch.
    **kwargs: forwarded unchanged to the Keras base-class constructor.
  """

  def __init__(self,dataframe,tokenizer,batch_size=100,**kwargs):
    # Keras warns when a subclass does not call the base constructor.
    super().__init__(**kwargs)
    self.dataframe = dataframe
    self.tokenizer = tokenizer
    self.batch_size = batch_size

  def mask_and_tokens(self,password):
      """Build one (input ids, label id) pair from `password`.

      A cut position `i` is drawn at random; the input is the first `i`
      token ids, then the mask id, then padding, and the label is token `i`.

      Returns:
        (input_seq, label): a list of `max_length` ids and a single id.
      """
      # Only positions inside both the password and the encoded window can be
      # masked. randint(1, usable) needs usable >= 2, so passwords with fewer
      # than two characters fall back to position 0 (no prefix) instead of
      # raising "ValueError: low >= high". For an empty password the label is
      # then the padding id.
      usable = min(len(password), self.tokenizer.max_length)
      i = np.random.randint(1, usable) if usable > 1 else 0
      tokens = self.tokenizer(password)
      input_seq = tokens[:i]
      label = tokens[i]
      input_seq += [self.tokenizer.mask_token_id]
      input_seq += [self.tokenizer.pad_token_id] * (self.tokenizer.max_length - len(input_seq))
      return input_seq,label

  def __len__(self):
    """Number of full batches; a trailing partial batch is not served."""
    return len(self.dataframe) // self.batch_size

  def __getitem__(self, idx):
    """Return batch `idx` as a pair of NumPy arrays (inputs, labels)."""
    batch = self.dataframe[idx * self.batch_size:(idx + 1) * self.batch_size]
    batch_inputs = []
    batch_labels = []
    for password in batch:
      input_seq, label = self.mask_and_tokens(password)
      batch_inputs.append(input_seq)
      batch_labels.append(label)
    return np.array(batch_inputs), np.array(batch_labels)

In [None]:
# Loader over the full password column, used below to inspect a sample batch.
dataloader = RNNDataloader(dataframe=df['password'], tokenizer=rnntokenizer)

In [None]:
# Password stored under index label 0
df.loc[0, "password"]

'123456'

In [None]:
# First batch as (inputs, labels); the masked position is drawn at random for each password
dataloader[0]

(array([[ 18,  19,  20, ..., 212, 212, 212],
        [ 18,  19,  20, ..., 212, 212, 212],
        [ 18,  19,  20, ..., 212, 212, 212],
        ...,
        [ 77,  66,  86, ..., 212, 212, 212],
        [ 78,  74,  68, ..., 212, 212, 212],
        [ 81,  83, 209, ..., 212, 212, 212]]),
 array([22, 21, 25, 83, 87, 74, 24, 76, 23, 19, 68, 79, 72, 90, 90, 70, 18,
        66, 90, 70, 18, 80, 17, 70, 72, 86, 85, 80, 70, 79, 79, 90, 70, 79,
        83, 83, 85, 87, 90, 20, 85, 83, 69, 66, 70, 80, 77, 20, 86, 79, 79,
        90, 70, 76, 70, 72, 90, 80, 90, 70, 73, 74, 74, 73, 66, 83, 77, 83,
        66, 84, 69, 23, 73, 84, 78, 85, 67, 77, 87, 74, 79, 25, 85, 73, 66,
        84, 80, 80, 86, 88, 81, 73, 83, 77, 77, 77, 77, 70, 70, 74]))

## Dividindo os dados em conjuntos de treino, validação e teste

In [None]:
from sklearn.model_selection import train_test_split
import numpy as np

def create_train_val_test_arrays(df, dataloader, tokenizer, test_size=0.2, val_size=0.5,):
    """Split `df` into train/validation/test parts and wrap each in a loader.

    Args:
        df: data accepted by `train_test_split` (the notebook passes the
            password Series).
        dataloader: callable invoked as `dataloader(part, tokenizer)` for each
            split, e.g. a loader class.
        tokenizer: passed through to `dataloader`.
        test_size: fraction of `df` held out from training.
        val_size: passed as `test_size` of the second split, i.e. the fraction
            of the held-out part that goes to the *test* loader; the rest goes
            to validation.
            NOTE(review): the name suggests the validation share - confirm
            which is intended (the two coincide at the default 0.5).

    Returns:
        Tuple `(train_loader, val_loader, test_loader)`.
    """
    # Both splits use a fixed random_state, so the partition is reproducible.
    train_part, holdout_part = train_test_split(df, test_size=test_size, random_state=42)
    val_part, test_part = train_test_split(holdout_part, test_size=val_size, random_state=42)

    return tuple(dataloader(part, tokenizer) for part in (train_part, val_part, test_part))

In [None]:
# Build one RNNDataloader per split of the password column.
train_dataloader, val_dataloader, test_dataloader = create_train_val_test_arrays(
    df=df["password"],
    dataloader=RNNDataloader,
    tokenizer=rnntokenizer,
)

# defining a model
TODO: o modelo é uma RNN, mas os rótulos de treino são os usados no treinamento do BERT. Opções:
- mudar o modelo abaixo para fazer fine-tuning do BERT
- mudar mask_tokens para gerar uma entrada e uma saída condizentes com um problema de RNN


In [None]:
#https://colab.research.google.com/drive/1mts5E3yAd1irLzS7Ei6UtwbG773C87DB?usp=sharing
# Next-character classifier: embedding -> SimpleRNN -> dense softmax over the
# whole vocabulary (characters plus special tokens).
model = Sequential([
    # Declare the input with an explicit Input layer instead of passing
    # `input_shape` to Embedding, which current Keras discourages.
    Input(shape=(rnntokenizer.max_length,)),
    Embedding(
        input_dim=rnntokenizer.vocab_size,
        output_dim=100),
    # return_sequences=False: only the final RNN state feeds the classifier.
    SimpleRNN(64, return_sequences=False),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dense(rnntokenizer.vocab_size, activation='softmax')
])

# Labels are integer token ids (not one-hot), hence the sparse loss.
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

  super().__init__(**kwargs)


In [None]:
# The loaders already yield complete batches, so `batch_size` is not passed:
# Keras expects it to be left unset when the data is a Sequence / PyDataset.
history = model.fit(
    train_dataloader,
    validation_data=val_dataloader,
    epochs=3,
    verbose=1)

Epoch 1/3


  self._warn_if_super_not_called()


[1m  496/54997[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m4:31[0m 5ms/step - accuracy: 0.0701 - loss: 3.8505

InvalidArgumentError: Graph execution error:

Detected at node PyFunc defined at (most recent call last):
<stack traces unavailable>
Detected at node PyFunc defined at (most recent call last):
<stack traces unavailable>
2 root error(s) found.
  (0) INVALID_ARGUMENT:  ValueError: low >= high
Traceback (most recent call last):

  File "/usr/local/lib/python3.12/dist-packages/tensorflow/python/ops/script_ops.py", line 269, in __call__
    ret = func(*args)
          ^^^^^^^^^^^

  File "/usr/local/lib/python3.12/dist-packages/tensorflow/python/autograph/impl/api.py", line 643, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^

  File "/usr/local/lib/python3.12/dist-packages/tensorflow/python/data/ops/from_generator_op.py", line 198, in generator_py_func
    values = next(generator_state.get_iterator(iterator_id))
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

  File "/usr/local/lib/python3.12/dist-packages/keras/src/trainers/data_adapters/py_dataset_adapter.py", line 248, in _finite_generator
    yield self._standardize_batch(self.py_dataset[i])
                                  ~~~~~~~~~~~~~~~^^^

  File "/tmp/ipython-input-3431407358.py", line 24, in __getitem__
    input_seq, label = self.mask_and_tokens(password)
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

  File "/tmp/ipython-input-3431407358.py", line 8, in mask_and_tokens
    i = np.random.randint(1, min(len(password),self.tokenizer.max_length))
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

  File "numpy/random/mtrand.pyx", line 798, in numpy.random.mtrand.RandomState.randint

  File "numpy/random/_bounded_integers.pyx", line 1334, in numpy.random._bounded_integers._rand_int64

ValueError: low >= high


	 [[{{node PyFunc}}]]
	 [[IteratorGetNext]]
	 [[IteratorGetNext/_4]]
  (1) INVALID_ARGUMENT:  ValueError: low >= high
Traceback (most recent call last):

  File "/usr/local/lib/python3.12/dist-packages/tensorflow/python/ops/script_ops.py", line 269, in __call__
    ret = func(*args)
          ^^^^^^^^^^^

  File "/usr/local/lib/python3.12/dist-packages/tensorflow/python/autograph/impl/api.py", line 643, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^

  File "/usr/local/lib/python3.12/dist-packages/tensorflow/python/data/ops/from_generator_op.py", line 198, in generator_py_func
    values = next(generator_state.get_iterator(iterator_id))
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

  File "/usr/local/lib/python3.12/dist-packages/keras/src/trainers/data_adapters/py_dataset_adapter.py", line 248, in _finite_generator
    yield self._standardize_batch(self.py_dataset[i])
                                  ~~~~~~~~~~~~~~~^^^

  File "/tmp/ipython-input-3431407358.py", line 24, in __getitem__
    input_seq, label = self.mask_and_tokens(password)
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

  File "/tmp/ipython-input-3431407358.py", line 8, in mask_and_tokens
    i = np.random.randint(1, min(len(password),self.tokenizer.max_length))
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

  File "numpy/random/mtrand.pyx", line 798, in numpy.random.mtrand.RandomState.randint

  File "numpy/random/_bounded_integers.pyx", line 1334, in numpy.random._bounded_integers._rand_int64

ValueError: low >= high


	 [[{{node PyFunc}}]]
	 [[IteratorGetNext]]
0 successful operations.
0 derived errors ignored. [Op:__inference_multi_step_on_iterator_2381]

## BERT

In [None]:
#https://colab.research.google.com/drive/1Suv_JhRhoYNOCHtrGQwqZQO18nMXHEfq?usp=sharing
def mask_tokens(inputs, tokenizer, mlm_probability=0.10, ignore_label=0):
    """Randomly mask token ids for masked-language-model training.

    Args:
        inputs: sequence (or array) of integer token ids; it is copied, the
            caller's object is not modified.
        tokenizer: object exposing `mask_token_id`, `cls_token_id` and
            `sep_token_id`; `pad_token_id` is used too when present.
        mlm_probability: independent probability of masking each position.
        ignore_label: value written to `labels` at positions that were not
            masked. The default 0 keeps the previous behaviour, but note that
            0 can also be a real token id; pass a value outside the vocabulary
            if the loss must tell the two apart.

    Returns:
        (masked_inputs, labels): NumPy arrays shaped like `inputs`.
        `masked_inputs` has the chosen positions replaced by the mask id;
        `labels` holds the original id at those positions and `ignore_label`
        everywhere else.
    """
    inputs = np.array(inputs)
    labels = np.copy(inputs)

    rand = np.random.rand(*inputs.shape)
    mask_arr = (rand < mlm_probability)

    # Special tokens are never masked. Padding is included here: masking a
    # padding position would ask the model to predict a filler token.
    special_tokens = [
        tokenizer.cls_token_id,
        tokenizer.sep_token_id,
        getattr(tokenizer, 'pad_token_id', None),
    ]
    for special_id in special_tokens:
        if special_id is None:
            continue
        mask_arr[inputs == special_id] = False

    inputs[mask_arr] = tokenizer.mask_token_id

    labels[~mask_arr] = ignore_label

    return inputs, labels

In [None]:
def create_train_val_test_datasets(df, tokenizer, test_size=0.10, val_size=0.15, batch_size=8,max=None):
    """Split `df` and turn each part into a batched tf.data dataset of masked ids.

    Args:
        df: DataFrame with a 'tokenized_password' column holding one sequence
            of token ids per row (presumably all of the same length, since
            they are stacked into a single tensor - confirm against the
            tokenization step).
        tokenizer: passed to `mask_tokens` for every row.
        test_size: fraction of `df` held out from training.
        val_size: passed as `test_size` of the second split, i.e. the fraction
            of the held-out part that goes to the *test* dataset; the rest
            goes to validation.
            NOTE(review): the name suggests the validation share - confirm.
        batch_size: batch size of the returned datasets.
        max: if truthy, only the first `max` rows of `df` are used.

    Returns:
        Tuple `(train_dataset, val_dataset, test_dataset)`; each yields
        `(input_ids, labels)` int32 batches.
    """
    if max:
        df = df.head(max)

    train_df, holdout_df = train_test_split(df, test_size=test_size, random_state=42)
    val_df, test_df = train_test_split(holdout_df, test_size=val_size, random_state=42)

    def build_dataset(frame):
        """Mask every tokenized password in `frame`, then shuffle and batch."""
        masked_pairs = [
            mask_tokens(tokens, tokenizer)
            for tokens in frame['tokenized_password'].tolist()
        ]
        input_ids = tf.constant([pair[0] for pair in masked_pairs], dtype=tf.int32)
        labels = tf.constant([pair[1] for pair in masked_pairs], dtype=tf.int32)

        slices = tf.data.Dataset.from_tensor_slices((input_ids, labels))
        # Shuffle buffer of 1000 elements, applied to every split alike.
        return slices.shuffle(1000).batch(batch_size)

    # Built in train, validation, test order.
    return tuple(build_dataset(part) for part in (train_df, val_df, test_df))

In [None]:
# NOTE(review): `dummy_tokenizer` is not defined in any cell visible above, and no visible cell
# adds the 'tokenized_password' column that create_train_val_test_datasets reads from `df` -
# confirm both exist before running this cell.
train_ds, val_ds, test_ds = create_train_val_test_datasets(df, dummy_tokenizer, batch_size=8)