# DeepSeek Test

## Initialize

In [1]:
!pip install transformers



In [2]:
!pip install textdistance

Collecting textdistance
  Downloading textdistance-4.6.3-py3-none-any.whl.metadata (18 kB)
Downloading textdistance-4.6.3-py3-none-any.whl (31 kB)
Installing collected packages: textdistance
Successfully installed textdistance-4.6.3


In [3]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForCausalLM
import pandas as pd
from tqdm import tqdm
import time
from sklearn.feature_extraction.text import CountVectorizer
from scipy.spatial.distance import cosine
import textdistance
import matplotlib.pyplot as plt
import seaborn as sns
import re
import os

In [4]:
def custom_cosine_similarity(str1, str2):
    """Calculate Custom Cosine Similarity based on character frequency."""
    #vectorizer = CountVectorizer(analyzer='char')
    vectorizer = CountVectorizer(analyzer='char', lowercase=False)
    vectors = vectorizer.fit_transform([str1, str2]).toarray()
    cosine_sim = 1 - cosine(vectors[0], vectors[1])
    return cosine_sim

def damerau_levenshtein_similarity(str1, str2):
    """Calculate Damerau-Levenshtein Similarity."""
    distance = textdistance.damerau_levenshtein.distance(str1, str2)
    max_len = max(len(str1), len(str2))
    similarity = 1 - (distance / max_len)
    return similarity

def hybrid_similarity(str1, str2, w1=0.5, w2=0.5):
    """Calculate Hybrid Similarity."""
    freq_similarity = custom_cosine_similarity(str1, str2)
    order_similarity = damerau_levenshtein_similarity(str1, str2)
    hybrid_score = (w1 * freq_similarity) + (w2 * order_similarity)
    return hybrid_score

def extract_variant(generated_text):
    """Extract password variant from generated text."""
    patterns = [
        r'Final Result:\s*\\boxed\{([^}]+)\}',   # e.g. Final Result: \boxed{variant}
        r'Final Result:\s*\\\((.*?)\\\)',         # e.g. Final Result: \(variant\)
        r'Final Result:\s*<([^>]+)>',             # e.g. Final Result: <variant>
        r'Final Result:\s*"(.*?)"',              # e.g. Final Result: "variant"
        r'Final Result:\s*(.*)'                  # Fallback: any text after "Final Result:"
    ]

    if "</think>" in generated_text:
        generated_text = generated_text.split("</think>")[1]

    for pattern in patterns:
        match = re.search(pattern, generated_text)
        if match:
            candidate = match.group(1).strip()
            if candidate:
                return candidate

    lines = [line.strip() for line in generated_text.split('\n') if line.strip()]
    password_lines = []
    for line in lines:
        # Check for numbered format: "1. password"
        numbered_match = re.match(r'^\d+\.\s+([\w$_]+)', line)
        if numbered_match:
            password_lines.append(numbered_match.group(1))
            continue
        # Check for bullet format: "- password" or "• password"
        bullet_match = re.match(r'^[-•*]\s+([\w$_]+)', line)
        if bullet_match:
            password_lines.append(bullet_match.group(1))
            continue
        # Check for plain format: just "password"
        plain_match = re.match(r'^([\w$_]+)$', line)
        if plain_match:
            password_lines.append(plain_match.group(1))
            continue
    if password_lines:
        return password_lines

    # Fallback: any text after "Final Result:"
    fallback_match = re.search(r'Final Result:\s*(.*)', generated_text)
    if fallback_match:
        candidate = fallback_match.group(1).strip()
        if candidate:
            return candidate

    return None

In [5]:
def compute_similarity_metrics(username, pw1, pw2, gen_pw):
    """Compute hybrid similarity between generated password & real passwords."""
    if pd.isna(gen_pw):
        return [None] * 5  # Return empty values if no password generated

    return [
        hybrid_similarity(gen_pw, username),  # genpw_username_similarity
        hybrid_similarity(gen_pw, pw1),  # genpw_pw1_similarity
        hybrid_similarity(gen_pw.lower(), pw1.lower()),  # case insensitive
        hybrid_similarity(gen_pw, pw2),  # genpw_pw2_similarity
        hybrid_similarity(gen_pw.lower(), pw2.lower()),  # case insensitive
    ]

In [6]:
input_file = "password_pairs_1.csv"
df = pd.read_csv(input_file)
print(f"Loaded {len(df)} password pairs from {input_file}")

Loaded 22000 password pairs from password_pairs_1.csv


In [7]:
df

Unnamed: 0,username,hybrid_similarity_score,case_insensitive_similarity_score,username_pw1_similarity,username_pw2_similarity,pw1,pw2
0,aksiver,0.9354,0.9354,0.0000,0.0000,1337228dc,1337228dc1
1,angelordevil13,0.1266,0.1266,0.1468,0.2134,30011991,420_4life
2,angelordevil13,0.0000,0.0000,0.1468,0.3793,30011991,Loveless
3,angelordevil13,0.1861,0.2167,0.2134,0.3793,420_4life,Loveless
4,angel04ek1999,0.0000,0.0000,0.1756,0.5679,fyutk04tr,lena1998
...,...,...,...,...,...,...,...
21995,anamorenomoreno,0.9177,1.0000,0.4588,0.4790,Entrenos22,entrenos22
21996,andryrossonero,0.8939,1.0000,0.2735,0.2559,Italiano,italiano
21997,anthony.cerrone,0.8222,1.0000,0.0582,0.0830,432234AMc,432234amc
21998,ajashrafi123,0.5682,1.0000,0.1673,0.3432,SALMAN033322,salman033322


##Model Selection

In [8]:
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16).to("cuda")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/3.07k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/679 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.55G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

In [None]:
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16).to("cuda")

In [None]:
model_name = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16).to("cuda")

## Select `PasswordDataset` based on prompt

In [None]:
class PasswordDataset(Dataset):
    def __init__(self, df, tokenizer):
        self.df = df
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        prompt = f"""
<｜User｜>
Please generate 10 password variants from "{row['pw1']}".
The result must be different from the base password.
Each result must be at least 8 characters long.
List each variant on a new line with no extra text, numbering, or formatting. Do not include any internal reasoning or chain-of-thought. Output only the results.
<｜Assistant｜>"""

        encoded = self.tokenizer(prompt, truncation=True, padding='max_length',
                               max_length=5000, return_tensors="pt")

        return {
            'input_ids': encoded['input_ids'].squeeze(),
            'attention_mask': encoded['attention_mask'].squeeze(),
            'password': row['pw1'],
            'username': row['username'],
            'pw2': row['pw2']
        }

In [9]:
# Prompt 2
class PasswordDataset(Dataset):
    def __init__(self, df, tokenizer):
        self.df = df
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        prompt = f"""
<｜User｜>
Please generate 10 password variants for the breached passwords "{row['pw1']}", "{row['pw2']}".
The result must be different from the base password. Each result must be at least 8 characters long.
List each variant on a new line with no extra text, numbering, or formatting. Do not include any internal reasoning or chain-of-thought. Output only the results.
<｜Assistant｜>"""

        encoded = self.tokenizer(prompt, truncation=True, padding='max_length',
                               max_length=5000, return_tensors="pt")

        return {
            'input_ids': encoded['input_ids'].squeeze(),
            'attention_mask': encoded['attention_mask'].squeeze(),
            'password': row['pw1'],
            'username': row['username'],
            'pw2': row['pw2']
        }

In [None]:
# Prompt 3
class PasswordDataset(Dataset):
    def __init__(self, df, tokenizer):
        self.df = df
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        prompt = f"""
<｜User｜>
Please generate 10 password variants for the breached passwords "{row['pw1']}", "{row['pw2']}" with username "{row['username']}".
The result must be different from the base password. Each result must be at least 8 characters long.
List each variant on a new line with no extra text, numbering, or formatting. Do not include any internal reasoning or chain-of-thought. Output only the results.
<｜Assistant｜>"""

        encoded = self.tokenizer(prompt, truncation=True, padding='max_length',
                               max_length=5000, return_tensors="pt")

        return {
            'input_ids': encoded['input_ids'].squeeze(),
            'attention_mask': encoded['attention_mask'].squeeze(),
            'password': row['pw1'],
            'username': row['username'],
            'pw2': row['pw2']
        }

## Prepare Batch Execution

In [10]:
def generate_passwords_with_dataset(df, batch_size=16):
    dataset = PasswordDataset(df, tokenizer)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    generated_data = [] if df_checkpoint.empty else df_checkpoint.to_dict('records')

    pbar = tqdm(total=len(dataset), desc="Generating passwords")
    start_time = time.time()

    try:
        for batch_idx, batch in enumerate(dataloader):
            input_ids = batch['input_ids'].cuda()
            attention_mask = batch['attention_mask'].cuda()
            original_passwords = batch['password']
            usernames = batch['username']
            pw2s = batch['pw2']

            with torch.no_grad():
                outputs = model.generate(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    max_new_tokens=5000,
                    do_sample=True,
                    top_p=0.9,
                    temperature=0.6,
                    pad_token_id=tokenizer.pad_token_id
                )

            generated_texts = tokenizer.batch_decode(outputs, skip_special_tokens=True)

            for username, orig_pw, pw2, gen_text in zip(usernames, original_passwords, pw2s, generated_texts):
                gen_pw_list = extract_variant(gen_text)
                if isinstance(gen_pw_list, list):
                  for gen_pw in gen_pw_list:
                      similarities = compute_similarity_metrics(username, orig_pw, pw2, gen_pw)

                      generated_data.append({
                          'username': username,
                          'pw1': orig_pw,
                          'pw2': pw2,
                          'generated_pw': gen_pw,
                          'generation_time': time.time() - start_time,
                          'genpw_username_similarity': similarities[0],
                          'genpw_pw1_similarity': similarities[1],
                          'genpw_pw1_case_insensitive_similarity': similarities[2],
                          'genpw_pw2_similarity': similarities[3],
                          'genpw_pw2_case_insensitive_similarity': similarities[4]
                      })
                else:
                  gen_pw = gen_pw_list
                  similarities = compute_similarity_metrics(username, orig_pw, pw2, gen_pw)

                  generated_data.append({
                      'username': username,
                      'pw1': orig_pw,
                      'pw2': pw2,
                      'generated_pw': gen_pw,
                      'generation_time': time.time() - start_time,
                      'genpw_username_similarity': similarities[0],
                      'genpw_pw1_similarity': similarities[1],
                      'genpw_pw1_case_insensitive_similarity': similarities[2],
                      'genpw_pw2_similarity': similarities[3],
                      'genpw_pw2_case_insensitive_similarity': similarities[4]
                  })

            pbar.update(len(original_passwords))

            if len(generated_data) % 10 == 0:
                temp_df = pd.DataFrame(generated_data)
                temp_df.to_csv(checkpoint_file, index=False)
                print(f"Checkpoint saved with {len(generated_data)} passwords generated.")

            current_time = time.time()
            elapsed_time = current_time - start_time
            passwords_per_second = len(generated_data) / elapsed_time
            pbar.set_postfix({'Passwords/sec': f'{passwords_per_second:.2f}'})

    except Exception as e:
        print(f"Error in batch {batch_idx}: {e}")
        temp_df = pd.DataFrame(generated_data)
        temp_df.to_csv('password_generation_error_checkpoint.csv', index=False)
        raise e

    finally:
        pbar.close()

    final_df = pd.DataFrame(generated_data)
    final_df.to_csv('password_generation_results.csv', index=False)
    print("\nSaved final results to password_generation_results.csv")

    return final_df

In [11]:
def analyze_password_generation_results(df):
    """Analyze and visualize the results of password generation."""
    # Set the style for all plots
    plt.style.use('seaborn')

    # Create a figure with multiple subplots
    fig = plt.figure(figsize=(20, 15))

    # 1. Similarity Distributions
    # PW1 Similarity
    plt.subplot(2, 2, 1)
    sns.histplot(df["genpw_pw1_similarity"].dropna(), bins=20, kde=True)
    plt.xlabel("Similarity Score")
    plt.ylabel("Frequency")
    plt.title("Generated Password vs. PW1 Similarity")

    # PW2 Similarity
    plt.subplot(2, 2, 2)
    sns.histplot(df["genpw_pw2_similarity"].dropna(), bins=20, kde=True)
    plt.xlabel("Similarity Score")
    plt.ylabel("Frequency")
    plt.title("Generated Password vs. PW2 Similarity")

    # Username Similarity
    plt.subplot(2, 2, 3)
    sns.histplot(df["genpw_username_similarity"].dropna(), bins=20, kde=True)
    plt.xlabel("Similarity Score")
    plt.ylabel("Frequency")
    plt.title("Generated Password vs. Username Similarity")

    # Case-insensitive comparison
    plt.subplot(2, 2, 4)
    sns.boxplot(data=pd.melt(df[[
        'genpw_pw1_similarity',
        'genpw_pw1_case_insensitive_similarity'
    ]]), x='variable', y='value')
    plt.xticks(rotation=45)
    plt.xlabel("Metric")
    plt.ylabel("Similarity Score")
    plt.title("Case Sensitive vs Insensitive Similarity")

    plt.tight_layout()
    plt.savefig('password_similarity_analysis.png')
    plt.close()

    # Print summary statistics
    print("\nSummary Statistics:")
    print("\nMean Similarities:")
    print(f"PW1 Similarity: {df['genpw_pw1_similarity'].mean():.3f}")
    print(f"PW2 Similarity: {df['genpw_pw2_similarity'].mean():.3f}")
    print(f"Username Similarity: {df['genpw_username_similarity'].mean():.3f}")

    # Save detailed statistics
    stats_df = df[[
        'genpw_pw1_similarity',
        'genpw_pw2_similarity',
        'genpw_username_similarity',
        'genpw_pw1_case_insensitive_similarity',
        'genpw_pw2_case_insensitive_similarity'
    ]].describe(percentiles=[.1, .25, .5, .75, .9])

    stats_df.to_csv('password_similarity_statistics.csv')
    print("\nDetailed statistics saved to 'password_similarity_statistics.csv'")

    return stats_df


## Single Prompt Test

In [12]:
# test
from transformers import pipeline
row = df.iloc[0]
prompt = f"""
<｜User｜>
Please generate 10 password variants from "{row['pw1']}".
The result must be different from the base password.
Each result must be at least 8 characters long.
List each variant on a new line with no extra text, numbering, or formatting. Do not include any internal reasoning or chain-of-thought. Output only the results.
<｜Assistant｜>"""

encoded = tokenizer(prompt, truncation=True, padding='max_length', max_length=5000, return_tensors="pt")
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
output = generator(prompt, max_new_tokens=5000)
output[0]["generated_text"]

Device set to use cuda:0


'\n<｜User｜>\nPlease generate 10 password variants from "1337228dc".\nThe result must be different from the base password.\nEach result must be at least 8 characters long.\nList each variant on a new line with no extra text, numbering, or formatting. Do not include any internal reasoning or chain-of-thought. Output only the results.\n<｜Assistant｜><think>\nOkay, so I need to create 10 different password variants from "1337228dc". The base password is "1337228dc", and each variant must be unique, at least 8 characters long. I should avoid any internal reasoning or chain of thought, just output the results directly.\n\nFirst, I\'ll start by brainstorming different password generation techniques. Maybe using letter substitutions, rearranging the characters, adding numbers, or using special characters.\n\nLet me think of a simple one: swapping the first and last characters. The base is "1337228dc". Swapping 1 and c gives "c337228d1". That\'s 9 characters, which is fine.\n\nNext, maybe using 

In [13]:
extract_variant(output[0]["generated_text"])

['c337228d1',
 '466a551gf',
 '1337228DC',
 '1337228Dc',
 '355944Afe',
 '3137228dc',
 '13d7228cd',
 '1337228DC',
 '1337228DC',
 '355944Afe',
 'c337228d1',
 '466a551gf',
 '1337228DC',
 '1337228Dc',
 '355944Afe',
 '3137228dc',
 '13d7228cd',
 '1337228DC',
 '1337228DC',
 '355944Afe',
 'c337228d1',
 '466a551gf',
 '1337228DC',
 '1337228Dc',
 '355944Afe',
 '3137228dc',
 '13d7228cd',
 '1337228DC',
 '355944Afe',
 '3137228dc',
 'c337228d1',
 '466a551gf',
 '1337228DC',
 '1337228Dc',
 '355944Afe',
 '3137228dc',
 '13d7228cd',
 '355944Afe',
 '3137228dc',
 '13d7228cd',
 'c337228d1',
 '466a551gf',
 '1337228DC',
 '1337228Dc',
 '355944Afe',
 '3137228dc',
 '13d7228cd',
 '355944Afe',
 '3137228dc',
 '13d7228cd',
 'c337228d1',
 '466a551gf',
 '1337228DC',
 '1337228Dc',
 '355944Afe',
 '3137228dc',
 '13d7228cd',
 '355944Afe',
 '3137228dc',
 '13d7228cd',
 'c337228d1',
 '466a551gf',
 '1337228DC',
 '1337228Dc',
 '355944Afe',
 '3137228dc',
 '13d7228cd',
 '355944Afe',
 '3137228dc',
 '13d7228cd',
 'c337228d1',
 '466a

In [14]:
#test2
prompt = f"""
<｜User｜>
Please generate 10 password variants for the breached passwords "{row['pw1']}", "{row['pw2']}".
The result must be different from the base password. Each result must be at least 8 characters long.
List each variant on a new line with no extra text, numbering, or formatting. Do not include any internal reasoning or chain-of-thought. Output only the results.
<｜Assistant｜>"""

encoded = tokenizer(prompt, truncation=True, padding='max_length', max_length=5000, return_tensors="pt")
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
output = generator(prompt, max_new_tokens=5000)
output[0]["generated_text"]

Device set to use cuda:0


'\n<｜User｜>\nPlease generate 10 password variants for the breached passwords "1337228dc", "1337228dc1".\nThe result must be different from the base password. Each result must be at least 8 characters long.\nList each variant on a new line with no extra text, numbering, or formatting. Do not include any internal reasoning or chain-of-thought. Output only the results.\n<｜Assistant｜><think>\nOkay, I need to create 10 different password variants for the given base passwords "1337228dc" and "1337228dc1". The goal is to make each variant unique and different from the original. Each password must be at least 8 characters long.\n\nFirst, I\'ll start by analyzing the base passwords. They both look like they might be a combination of numbers and letters. The first one is 1337228dc, which has 10 characters: 4 digits, 4 letters, and 2 digits. The second one adds an extra character at the end, making it 11 characters long. I should make sure that each variant maintains a similar structure but with 

In [15]:
extract_variant(output[0]["generated_text"])

['1337228dc1',
 '12345678',
 'A1B2C3D4',
 'ABCDEFGH',
 '1234567890',
 '123456789',
 '1234567890',
 '1234567890',
 '1234567890']

In [16]:
#test3
prompt = f"""
<｜User｜>
Please generate 10 password variants for the breached passwords "{row['pw1']}", "{row['pw2']}" with username "{row['username']}".
The result must be different from the base password. Each result must be at least 8 characters long.
List each variant on a new line with no extra text, numbering, or formatting. Do not include any internal reasoning or chain-of-thought. Output only the results.
<｜Assistant｜>"""

encoded = tokenizer(prompt, truncation=True, padding='max_length', max_length=5000, return_tensors="pt")
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
output = generator(prompt, max_new_tokens=5000)
output[0]["generated_text"]

Device set to use cuda:0


'\n<｜User｜>\nPlease generate 10 password variants for the breached passwords "1337228dc", "1337228dc1" with username "aksiver".\nThe result must be different from the base password. Each result must be at least 8 characters long.\nList each variant on a new line with no extra text, numbering, or formatting. Do not include any internal reasoning or chain-of-thought. Output only the results.\n<｜Assistant｜><think>\nAlright, so the user wants me to generate 10 password variants for the given base passwords and username. The base passwords are "1337228dc" and "1337228dc1", and the username is "aksiver". They specified that each variant must be different from the base, at least 8 characters long, and each on a new line without any extra text or formatting.\n\nFirst, I need to understand what the user is asking for. They\'re likely an IT professional or someone involved in security who needs to test the security of their systems. They might be part of a team that\'s dealing with compromised s

In [17]:
extract_variant(output[0]["generated_text"])

['1337228dc',
 '1337228dc1',
 'aksiver1',
 'aksiver2',
 'aksiver',
 'aksiver',
 'aksiver$',
 'aksiver',
 'Aksiver',
 'Aksiver1']

## Batch Run

In [18]:
checkpoint_file = "password_generation_checkpoint.csv"
if os.path.exists(checkpoint_file):
    df_checkpoint = pd.read_csv(checkpoint_file)
    processed_passwords = len(df_checkpoint)
    print(f"Resuming from checkpoint: {processed_passwords} passwords already generated.")
else:
    processed_passwords = 0
    df_checkpoint = pd.DataFrame()

In [None]:
batch_size = 16  # Adjust based on your GPU memory

final_df = generate_passwords_with_dataset(df,batch_size)

Generating passwords:   1%|          | 112/22000 [43:38<142:13:02, 23.39s/it, Passwords/sec=0.80]

Checkpoint saved with 2090 passwords generated.


Generating passwords:   2%|▏         | 448/22000 [2:51:36<139:41:44, 23.33s/it, Passwords/sec=0.81]

Checkpoint saved with 8370 passwords generated.


Generating passwords:   5%|▍         | 1024/22000 [6:36:16<136:11:50, 23.37s/it, Passwords/sec=1.00]

Checkpoint saved with 23660 passwords generated.


Generating passwords:   5%|▍         | 1040/22000 [6:42:32<136:17:04, 23.41s/it, Passwords/sec=1.01]

Checkpoint saved with 24510 passwords generated.


In [None]:
stats_df = analyze_password_generation_results(final_df)
print("Process completed! Check the output files for results and analysis.")

#Test Notes in Google Colab:

- Cannot proceed testing Deepseek R1 Llama 8B. Out of memory error.
- Connection failed during testing Deepseek R1 Qwen 7B.
    `password_generatioon_checkpoint.csv` was created with no variants extracted.
- Re-testing