In [1]:
from itertools import product
import random
import string
import math
import time

from dask.distributed import Client
import dask.dataframe as dd

client = Client()

In [2]:
class Timer:
      
    def __enter__(self):
        self.start = time.time()
        
    def __exit__(self, *args, **kwargs):
        print('DURATION: {}'.format(time.time() - self.start))

## Generate `hash_occurences.txt` & `user_hash_occurences.txt`

In [3]:
HASH_OCCURENCE_COUNT = 10 ** 6
APPROX_USER_COUNT = 10 ** 2

all_hashes = product(string.digits + string.ascii_lowercase, repeat=32)

with open('./resources/hash_occurences.txt', 'w') as f:
    with open('./resources/user_hash_occurences.txt', 'w') as g:
        for i in range(HASH_OCCURENCE_COUNT):
            current_hash = ''.join(next(all_hashes))
            
            # -- all hashes
            f.write('{hash}:{count}\n'.format(
                hash=current_hash,
                count=random.randint(1, 10 ** 6)))       

            # -- user hashes
            if random.random() < APPROX_USER_COUNT / HASH_OCCURENCE_COUNT:
                g.write('user{id}:{hash}\n'.format(
                    id=str(i).rjust(int(math.log10(HASH_OCCURENCE_COUNT)), '0'),
                    hash=current_hash))                 

### `hash_occurences.txt` properties

In [4]:
!tail -n 5 resources/hash_occurences.txt

0000000000000000000000000000lfln:966514
0000000000000000000000000000lflo:720694
0000000000000000000000000000lflp:163569
0000000000000000000000000000lflq:612657
0000000000000000000000000000lflr:551880


In [5]:
!wc -l resources/hash_occurences.txt

1000000 resources/hash_occurences.txt


In [6]:
!ls -alh resources/hash_occurences.txt

-rw-rw-r-- 1 maciej maciej 39M lis 23 07:28 resources/hash_occurences.txt


###  `user_hash_occurences.txt` properties

In [7]:
!tail -n 5 resources/user_hash_occurences.txt 

user938739:0000000000000000000000000000k4c3
user940289:0000000000000000000000000000k5j5
user950030:0000000000000000000000000000kd1q
user953776:0000000000000000000000000000kfxs
user967790:0000000000000000000000000000kqr2


In [8]:
!wc -l resources/user_hash_occurences.txt

114 resources/user_hash_occurences.txt


In [9]:
!ls -alh resources/user_hash_occurences.txt

-rw-rw-r-- 1 maciej maciej 4,9K lis 23 07:28 resources/user_hash_occurences.txt


## Finding frequencies for our users hashes

In [10]:
with Timer():
    hash_occurences_df = dd.read_table(
        './resources/hash_occurences.txt', 
        sep=':',
        names=['hash', 'frequency'])
    hash_occurences_df = hash_occurences_df.set_index('hash')

DURATION: 1.6881103515625


In [11]:
hash_occurences_df.head()

Unnamed: 0_level_0,frequency
hash,Unnamed: 1_level_1
0,416519
1,935832
2,975667
3,648773
4,72207


In [12]:
with Timer():
    user_hash_occurences_df = dd.read_table(
        './resources/user_hash_occurences.txt', 
        sep=':',
        names=['username', 'hash'])
    user_hash_occurences_df = user_hash_occurences_df.set_index('hash')

DURATION: 0.11332082748413086


In [13]:
user_hash_occurences_df.head()

Unnamed: 0_level_0,username
hash,Unnamed: 1_level_1
0000000000000000000000000000021c,user002640
00000000000000000000000000000dy3,user018075
00000000000000000000000000000eyu,user019398
00000000000000000000000000000h7o,user022308
00000000000000000000000000000ha3,user022395


In [14]:
with Timer():
    user_frequency_df = user_hash_occurences_df.join(hash_occurences_df).compute()

DURATION: 1.8877551555633545


In [15]:
user_frequency_df.head(10)

Unnamed: 0_level_0,username,frequency
hash,Unnamed: 1_level_1,Unnamed: 2_level_1
0000000000000000000000000000021c,user002640,244220
00000000000000000000000000000dy3,user018075,720893
00000000000000000000000000000eyu,user019398,611635
00000000000000000000000000000h7o,user022308,44856
00000000000000000000000000000ha3,user022395,875788
00000000000000000000000000000i1q,user023390,162516
00000000000000000000000000000jsg,user025648,673542
00000000000000000000000000000kew,user026456,739087
00000000000000000000000000000lxh,user028421,216339
00000000000000000000000000000oeb,user031619,264530


In [16]:
user_frequency_df.to_csv('./resources/user_frequency.csv')

In [17]:
!head -n 10 ./resources/user_frequency.csv

hash,username,frequency
0000000000000000000000000000021c,user002640,244220
00000000000000000000000000000dy3,user018075,720893
00000000000000000000000000000eyu,user019398,611635
00000000000000000000000000000h7o,user022308,44856
00000000000000000000000000000ha3,user022395,875788
00000000000000000000000000000i1q,user023390,162516
00000000000000000000000000000jsg,user025648,673542
00000000000000000000000000000kew,user026456,739087
00000000000000000000000000000lxh,user028421,216339


## How to find the insecure password threshold frequency?

In [18]:
hash_occurences_df.frequency.describe().compute()

count    1000000.000000
mean      500408.352068
std       288412.939385
min            1.000000
25%       250691.750000
50%       500523.000000
75%       750107.000000
max      1000000.000000
dtype: float64

In [19]:
hash_occurences_df.frequency.quantile(q=0.5).compute()

500523.0