<a href="https://colab.research.google.com/github/fb89zila/md5_match_tester/blob/master/MD5_Tester.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# MD5 collision testing

The goal is to find the longest possible contiguous match between two md5-hashes by repeatedly hashing pairs of random strings and comparing the two resulting hashes.

### Code for testing

Matching characters are counted only from the start of the two hashes, up to the first position where they differ.

In [1]:
# importing libs
#from google.colab import drive
import sys
import threading
import multiprocessing as multiproc
from multiprocessing import Pool
import hashlib as hash
import random as rand
import pandas as pd
import string

# mount google drive for csv export
#drive.mount('/drive')
#csv_path = '/drive/My Drive/ColabNotebooks/out_files/MD5-Tester.csv'

# definitions
NUM_OF_THREADS = 10                                   # number of threads used for the calculation of the md5-hashes
NUM_OF_CORES = multiproc.cpu_count()                  # for multiprocessing
# Concatenate the two alphabets: `str.join` would insert all letters between
# every digit, so each letter would be drawn nine times as often as each digit.
CHARACTERS = string.ascii_letters + string.digits     # used characters for random strings
STRING_LENGTH1 = 10                                   # length of random string 1
STRING_LENGTH2 = 10                                   # length of random string 2
ROUNDS = 1000000                                      # number of pairs of strings to be compared
MIN_NUM_OF_MATCHES = 5                                # minimum of matches a pair of string needs to be added to the dataframe

The definitions can be adjusted  
> be aware that with string lengths of 10 and `ROUNDS = 100000000` (100 times the `ROUNDS` value set in the definitions cell) the execution takes around 17 minutes.  
Right now, using multithreading/multiprocessing makes no difference in execution time, because I could not find a reliable way to use the GPU for multiprocessing in Colab.

In [2]:
def rand_str(length: int) -> str:
  """
  Builds a random string out of the characters in `CHARACTERS`.

  Args:
    length: number of characters to draw (with replacement)
  Returns:
    the drawn characters joined into one string
  """
  drawn = rand.choices(CHARACTERS, k=length)
  return ''.join(drawn)

In [3]:
def test_md5():
  """
  Hashes two random strings with md5 and measures their common prefix.

  Returns:
    tuple: (match length, string 1, string 2, md5-hash 1, md5-hash 2)
    None: if the match length is smaller than `MIN_NUM_OF_MATCHES`
  """
  first = rand_str(STRING_LENGTH1)
  second = rand_str(STRING_LENGTH2)
  digest1 = hash.md5(first.encode('utf-8')).hexdigest()
  digest2 = hash.md5(second.encode('utf-8')).hexdigest()

  # walk both hex digests in lockstep and stop at the first differing character
  shared = 0
  for left, right in zip(digest1, digest2):
    if left != right:
      break
    shared += 1

  if shared < MIN_NUM_OF_MATCHES:
    return None
  return shared, first, second, digest1, digest2

In [4]:
def run_md5_tests(res):
  """
  Tests `ROUNDS` pairs of strings if they have (partially) matching md5-hashes.

  A progress line is printed whenever the completed whole-number percentage
  changes, and once more after the last round.

  Args:
    res - List used to append test results; every result of `test_md5`
          that is not `None` is appended to it
  """
  last_percent = -1

  for a in range(ROUNDS):
    # Integer arithmetic keeps the progress output correct when ROUNDS is not
    # a multiple of 100; a float modulo check would skip most percentages.
    percent = a * 100 // ROUNDS
    if percent != last_percent:
      last_percent = percent
      print(f'[progress: {percent}%]')

    test_result = test_md5()
    if test_result is not None:
      res.append(test_result)
  print('[progress: 100%]')

In [5]:
def threading_func(res):
  """
  Create threads to run multiple tests.

  Starts `NUM_OF_THREADS` daemon threads that each call `run_md5_tests`
  with the same list and waits until all of them have finished.

  Args:
    res - List used to append test results
  """
  # `Thread.setDaemon` is deprecated since Python 3.10; the `daemon`
  # constructor argument sets the same flag.
  jobs = [
    threading.Thread(target=run_md5_tests, args=(res,), daemon=True)
    for _ in range(NUM_OF_THREADS)
  ]

  for job in jobs:
    job.start()

  for job in jobs:
    job.join()

In [6]:
def _collect_md5_tests(_worker_index):
  """
  Worker task for `multiprocess_func`: runs one batch of tests into a fresh list.

  Args:
    _worker_index: unused; only present so the function can be mapped over a range
  Returns:
    list filled by `run_md5_tests` inside the worker process
  """
  # reseed so that a worker does not replay the random sequence of its parent
  rand.seed()
  batch = []
  run_md5_tests(batch)
  return batch


def multiprocess_func(res):
  """
  Create multiprocessing pool to run multiple tests.

  Runs `run_md5_tests` once per worker (`NUM_OF_CORES` batches in total) and
  copies the results of every batch into `res`. A list handed to a worker
  process is only a pickled copy, so each worker returns its own list instead
  of appending to `res` directly.

  NOTE(review): the worker function must be importable by the child processes;
  with the "spawn" start method this may not hold for functions defined in a
  notebook cell - confirm on the target platform.

  Args:
    res - List used to append test results
  """
  with Pool(NUM_OF_CORES) as p:
    # `map` blocks until every batch is finished and re-raises worker errors
    for batch in p.map(_collect_md5_tests, range(NUM_OF_CORES)):
      res.extend(batch)

In [7]:
# main method initializes random generator, starts tests and displays result
def main():
  """
  Seeds the random generator, runs the tests and displays the results
  sorted by match length (longest first).
  """
  rand.seed()

  column_names = ['matches', 'string 1', 'string 2', 'md5-hash 1', 'md5-hash 2']
  found = []

  # pick exactly one of the three execution modes
  run_md5_tests(found)
  # threading_func(found)
  # multiprocess_func(found)

  table = pd.DataFrame(found, columns=column_names).sort_values(by='matches', ascending=False)
  display(table)

  # saves the table as csv-file in my drive (needs authentication)
  #table.to_csv(csv_path, index=False)

In [8]:
# entry point: call main() only when this code runs as the top-level module
# (`__name__` is '__main__'), not when it is imported
if __name__ == '__main__':
    main()

[progress: 0%]
[progress: 1%]
[progress: 2%]
[progress: 3%]
[progress: 4%]
[progress: 5%]
[progress: 6%]
[progress: 7%]
[progress: 8%]
[progress: 9%]
[progress: 10%]
[progress: 11%]
[progress: 12%]
[progress: 13%]
[progress: 14%]
[progress: 15%]
[progress: 16%]
[progress: 17%]
[progress: 18%]
[progress: 19%]
[progress: 20%]
[progress: 21%]
[progress: 22%]
[progress: 23%]
[progress: 24%]
[progress: 25%]
[progress: 26%]
[progress: 27%]
[progress: 28%]
[progress: 29%]
[progress: 30%]
[progress: 31%]
[progress: 32%]
[progress: 33%]
[progress: 34%]
[progress: 35%]
[progress: 36%]
[progress: 37%]
[progress: 38%]
[progress: 39%]
[progress: 40%]
[progress: 41%]
[progress: 42%]
[progress: 43%]
[progress: 44%]
[progress: 45%]
[progress: 46%]
[progress: 47%]
[progress: 48%]
[progress: 49%]
[progress: 50%]
[progress: 51%]
[progress: 52%]
[progress: 53%]
[progress: 54%]
[progress: 55%]
[progress: 56%]
[progress: 57%]
[progress: 58%]
[progress: 59%]
[progress: 60%]
[progress: 61%]
[progress: 62%]
[p

Unnamed: 0,matches,string 1,string 2,md5-hash 1,md5-hash 2
0,5,DmSvyhBQsx,uwzXxilqHj,9e961969b9dd3bf8954320408a360398,9e961fc10ce39bc5b60a6d9839ff97c4
