## Finding the duplicates

In [1]:
from functools import reduce
import operator
import time
import numpy as np
import math

### Case when character order is not important

#### For example: "AABA" = "AAAB"

In [2]:
start_time = time.time()

#whole is a set which will contain all the unique hash values
#duplicates is a list which will contain all the duplicate lines from a passwords2.txt file

whole = set()
duplicates = []

with open("passwords2.txt", "rb") as FileObj:
    #reading the text line by line
    #in order not to overwhelm computer's memory
    for line in FileObj:
        #obtaining potentially large number by multiplying all the ascii values of characters in a line
        #dividing obtained large number (about 38 digits) by two large prime numbers
        #as a result we obtain numbers of 13 digit size
        multi = int(reduce(operator.mul, list(line[0:20]))/29996224275833/299962246079)
        if multi in whole:
            duplicates.append(multi)
        whole.add(multi)
        
end_time = time.time()
print("Elapsed time was %g seconds" % (end_time - start_time))

Elapsed time was 575.496 seconds


In [3]:
print('The number of uniqe lines:',len(whole))

The number of uniqe lines: 99985182


In [4]:
print('The number of lines which occured for the second or more times:',len(duplicates))

The number of lines which occured for the second or more times: 10014818


### Case when character order is important

#### "AABA" != "AAAB"

In [60]:
start_time = time.time()

#whole is a set which will contain all the unique hash values
#duplicates is a list which will contain all the duplicate lines from a passwords2.txt file

whole_2 = set()
duplicates_2 = []
with open("passwords2.txt", "rb") as FileObj:
    #reading the text line by line
    #in order not to overwhelm computer's memory
    for line in FileObj:
        #we take logarithm with base of index of each character
        #this makes the order to play a role
        #then all the logarithm values are multiplyed
        multi = reduce(operator.mul, [(math.log(x,i+2)) for i, x in enumerate(list(line[:20]))])
        if multi in whole_2:
            duplicates_2.append(multi)
        whole_2.add(multi)        
        
end_time = time.time()
print("Elapsed time was %g seconds" % (end_time - start_time))

Elapsed time was 1310.31 seconds


In [67]:
print('The number of uniqe lines:',len(whole_2))

The number of uniqe lines: 104187728


In [70]:
print('The number of lines which occured for the second or more times:',len(duplicates_2))

The number of lines which occured for the second or more times: 5812272


In [63]:
import collections
counter_2=collections.Counter(duplicates)

In [64]:
counter_2

Counter({1960271.4095823322: 1,
         1111871.6265538293: 1,
         571723.855669221: 1,
         1405443.804988404: 1,
         848584.8106628828: 1,
         1104244.6229939808: 1,
         1519521.8954594608: 1,
         664277.2164286156: 1,
         820588.4126778657: 1,
         1455613.5510637301: 1,
         995392.7347501746: 1,
         850580.1561724056: 1,
         604069.4635101893: 1,
         804678.8710576494: 1,
         762027.5577371006: 1,
         990138.6934767069: 1,
         1695758.8440318287: 1,
         952247.0133869377: 1,
         692655.6734823658: 1,
         480036.0253492041: 1,
         1506523.8492539788: 1,
         1177759.0700705322: 1,
         1024190.2874744713: 1,
         870736.106651471: 1,
         857348.6341142564: 1,
         699647.5996097889: 1,
         1513148.682729041: 1,
         894604.1719771057: 1,
         1359282.8409051578: 1,
         1174998.4807278155: 1,
         1048039.4751615932: 1,
         1384700.6770464843: 

As it can be seen from the counter dictionary, each of the duplicates occur only 1 time. It means that the procedure found overall about 5,812,272 * 2 number of duplicates in this txt file.
Changing the alorithm to be order dependent decreased the number of duplicates about 2 times. It shows that there are about 10 million lines of duplicates wich are in fact only duplicate because they have same characters (e.g. 'aba'='baa').