In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

In [2]:
# === Parameters ===
# Fix PyTorch's global RNG first so that weight initialisation and DataLoader
# shuffling give the same results after "Restart Kernel -> Run All".
seed = 42
torch.manual_seed(seed)

csv_path = 'dataset/combolist.csv'  # input CSV holding the password list
max_sequence_length = 50            # sequences are truncated / zero-padded to this length
batch_size = 64                     # samples per DataLoader batch
embedding_dim = 64                  # width of each character embedding
hidden_dim = 128                    # LSTM hidden-state width
num_epochs = 10                     # number of passes over the training data

In [3]:
# Read the password column as raw text.
#  * dtype=str stops pandas from inferring numbers, which would strip leading
#    zeros from passwords such as "00123".
#  * keep_default_na=False stops pandas from treating literal passwords such as
#    "NA", "null" or "nan" as missing values (astype(str) would then turn every
#    one of them into the string 'nan').
df = pd.read_csv(csv_path, dtype={'String': str}, keep_default_na=False)
# fillna('') guards against any cell that is still missing after parsing.
passwords = df['String'].fillna('').astype(str).tolist()

In [4]:
# === Character set: a fixed base alphabet plus every character seen in the data ===
base_chars = set('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789!@#$%^&*()-_=+[]{}|;:,.<>?/\\')
# Set union of the base alphabet with all characters occurring in any password.
chars = base_chars | {char for password in passwords for char in password}

In [5]:
# === Tokenization ===
# Index 0 is reserved for padding; real characters get 1..N. Sorting makes the
# mapping identical on every run (iteration order of a set of strings is not
# stable across interpreter sessions).
char_to_idx = {char: idx for idx, char in enumerate(sorted(chars), start=1)}
char_to_idx['<PAD>'] = 0
# Token for unknown characters. At this point len(char_to_idx) == N + 1
# (characters + <PAD>), which is exactly the first unused index, so the indices
# stay contiguous and every index is < len(char_to_idx) once <UNK> is added.
char_to_idx['<UNK>'] = len(char_to_idx)

# Reverse lookup: index -> character / special token.
idx_to_char = {idx: char for char, idx in char_to_idx.items()}

In [6]:

# === Convert passwords to index sequences ===
def encode_password(password):
    """Translate ``password`` into a list of vocabulary indices.

    Each character is looked up in ``char_to_idx``; characters that are not in
    the mapping are encoded with the index stored under ``'<UNK>'``.
    """
    encoded = []
    for char in password:
        if char in char_to_idx:
            encoded.append(char_to_idx[char])
        else:
            encoded.append(char_to_idx['<UNK>'])
    return encoded

# Encode every password, then force a fixed length: append enough padding
# zeros to reach max_sequence_length and cut off anything beyond it.
sequences = [
    (encode_password(password) + [0] * max_sequence_length)[:max_sequence_length]
    for password in passwords
]

# Model inputs: one row of token indices per password.
X = torch.tensor(sequences, dtype=torch.long)


In [7]:
# Build the targets: each row is its input sequence shifted one position to the
# left, with a padding index (0) appended so the length stays the same.
y = torch.tensor(
    [[*seq[1:], 0] for seq in sequences],
    dtype=torch.long,
)

In [8]:
# === DataLoader ===
# Pair every input row with its target row ...
dataset = TensorDataset(X, y)
# ... and serve them as shuffled mini-batches of `batch_size` samples.
dataloader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True)


In [9]:
# === LSTM architecture ===
class PasswordLSTM(nn.Module):
    """Character-level next-token model: embedding -> stacked LSTM -> linear.

    Args:
        vocab_size: number of distinct token indices; used both as the number
            of embeddings and as the width of the output layer.
        embedding_dim: width of each token embedding.
        hidden_dim: width of the LSTM hidden state.
        num_layers: number of stacked LSTM layers. Defaults to 2, the value
            that was previously hard-coded.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers=2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        """Return one ``vocab_size``-wide logit vector per input position.

        ``x`` holds token indices with the batch dimension first (the LSTM is
        built with ``batch_first=True``). The LSTM's final hidden/cell state is
        discarded; only the per-step outputs are projected to logits.
        """
        x = self.embedding(x)
        x, _ = self.lstm(x)
        x = self.fc(x)
        return x

In [10]:
# Model initialisation
# Size the vocabulary from the largest index actually present in the mapping
# rather than from len(char_to_idx): the embedding and output layers must cover
# every index that can be fed to the model, even if the numbering has a gap.
vocab_size = max(char_to_idx.values()) + 1
model = PasswordLSTM(vocab_size, embedding_dim, hidden_dim)

In [11]:
# === Loss function and optimizer ===
# Adam over all model parameters with a learning rate of 1e-3.
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)
# Cross-entropy over the vocabulary logits. No ignore_index is passed, so
# positions whose target is the padding index 0 contribute to the loss as well.
criterion = nn.CrossEntropyLoss()

In [None]:
# === Model training ===
for epoch in range(num_epochs):
    # Running sum of the per-batch losses for this epoch.
    total_loss = 0
    for batch_x, batch_y in dataloader:
        optimizer.zero_grad()
        # Merge the batch and position axes so that every position is scored
        # as an independent classification over the vocabulary.
        logits = model(batch_x).view(-1, vocab_size)
        targets = batch_y.view(-1)
        loss = criterion(logits, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Report the mean batch loss of the epoch.
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss / len(dataloader)}")

Epoch 1/10, Loss: 0.6691526705456763


In [None]:
# Single-character substitution table used for "leet"-style rewriting.
# The table is not its own inverse: '6' and '8' both map to 'b', while 'b'
# maps to '8' only.
mutations = {
    # digit -> letter
    '0': 'o', '1': 'i', '2': 'z', '3': 'e', '4': 'a', '5': 's', '6': 'b', '7': 't', '8': 'b', '9': 'g',
    # letter -> digit. The literal used to list 'b' twice ('6', then '8'); a
    # dict keeps only the last value, so the shadowed 'b': '6' entry is dropped.
    'o': '0', 'i': '1', 'z': '2', 'e': '3', 'a': '4', 's': '5', 'b': '8', 't': '7', 'g': '9',
    # symbol -> letter or word
    '@': 'a', '$': 's', '!': 'i', '#': 'h', '&': 'and', '%': 'percent', '*': 'x', '+': 'plus', '=': 'equals'
}

In [None]:
# === Lexical substitutions such as "4ever" -> "forever" ===
# Built from (shorthand, expansion) pairs; the insertion order is the order in
# which the shorthands are tried.
complex_patterns = dict([
    ('4ever', 'forever'),
    ('2day', 'today'),
    ('b4', 'before'),
    ('l8r', 'later'),
    ('gr8', 'great'),
    ('4u', 'for you'),
    ('luv', 'love'),
    ('u', 'you'),
])

In [None]:
# === Pattern-mining function ===
def find_patterns(passwords, min_length=3, max_length=7, mutation_map=None, lexical_map=None):
    """Collect four frequency tables from a list of passwords.

    Args:
        passwords: re-iterable collection of password strings (it is traversed
            more than once).
        min_length: shortest substring length to count (inclusive).
        max_length: longest substring length to count (inclusive). The defaults
            3..7 reproduce the previously hard-coded ``range(3, 8)``.
        mutation_map: per-character substitution table; defaults to the
            module-level ``mutations``.
        lexical_map: ``{shorthand: expansion}`` table; defaults to the
            module-level ``complex_patterns``.

    Returns:
        A 4-tuple of dicts mapping a string to a count, in this order:

        * ``patterns`` - every substring of length ``min_length..max_length``.
        * ``mutation_patterns`` - rewritten passwords that differ from the
          original: one rewrite through ``mutation_map`` and one through a
          fixed a/s/e/o/i/t -> 4/5/3/0/1/7 substitution.
        * ``zigzag_patterns`` - passwords containing at least one upper-case
          and at least one lower-case character. Note that this is a
          mixed-case test; strict alternation is not checked.
        * ``lexical_patterns`` - passwords with a shorthand from
          ``lexical_map`` replaced by its expansion.
    """
    # Fall back to the notebook-level tables only when none are supplied
    # (an explicitly passed empty dict is respected).
    if mutation_map is None:
        mutation_map = mutations
    if lexical_map is None:
        lexical_map = complex_patterns

    # One translation table instead of six chained str.replace calls; the
    # result is the same because no replacement digit is itself a source letter.
    leet_table = str.maketrans('aseoit', '453017')

    patterns = {}
    mutation_patterns = {}
    zigzag_patterns = {}
    for password in passwords:
        # Frequently repeated substrings.
        for length in range(min_length, max_length + 1):
            for start in range(len(password) - length + 1):
                substring = password[start:start + length]
                patterns[substring] = patterns.get(substring, 0) + 1

        # Character-level mutation through the substitution table.
        mutated = ''.join(mutation_map.get(char, char) for char in password)
        if mutated != password:
            mutation_patterns[mutated] = mutation_patterns.get(mutated, 0) + 1

        # Fixed letter -> digit ("leet") rewrite.
        leet_mutations = password.translate(leet_table)
        if leet_mutations != password:
            mutation_patterns[leet_mutations] = mutation_patterns.get(leet_mutations, 0) + 1

        # "Zigzag" candidates: both letter cases are present.
        has_upper = any(char.isupper() for char in password)
        if has_upper and any(char.islower() for char in password):
            zigzag_patterns[password] = zigzag_patterns.get(password, 0) + 1

    # Shorthand expansion. The shorthand loop stays outermost so the insertion
    # order of the resulting dict is unchanged.
    lexical_patterns = {}
    for pattern, replacement in lexical_map.items():
        for password in passwords:
            if pattern in password:
                replaced = password.replace(pattern, replacement)
                lexical_patterns[replaced] = lexical_patterns.get(replaced, 0) + 1

    return patterns, mutation_patterns, zigzag_patterns, lexical_patterns

In [None]:

# === Extract the patterns ===
# find_patterns returns its four tables in the order
# (patterns, mutation_patterns, zigzag_patterns, lexical_patterns).
patterns, mutation_patterns, zigzag_patterns, lexical_patterns = find_patterns(passwords)

In [None]:
# === Save the results ===
import json

# Gather the four frequency tables under the keys used in the output file.
result = dict(
    frequent_patterns=patterns,
    mutation_patterns=mutation_patterns,
    zigzag_patterns=zigzag_patterns,
    lexical_patterns=lexical_patterns,
)

# Write everything as indented JSON into the working directory.
with open('password_patterns.json', 'w') as f:
    json.dump(result, fp=f, indent=4)

print("✅ Модель обучена и паттерны сохранены в 'password_patterns.json'")

✅ Модель обучена и паттерны сохранены в 'password_patterns.json'
