In [1]:
import time
# The following program is a Python implementation of the
# Rabin-Karp algorithm given in the CLRS book.
# This code can be found at <https://www.geeksforgeeks.org/rabin-karp-algorithm-for-pattern-searching/>
  
# d is the number of characters in the input alphabet
d = 256


def rbk(pat, txt, q):
    """Count the occurrences of ``pat`` in ``txt`` with Rabin-Karp hashing.

    A rolling hash of each length-``len(pat)`` window of ``txt`` is compared
    with the hash of ``pat``; only on a hash match are the characters
    compared, so hash collisions never produce false positives.
    Overlapping occurrences are counted separately.

    Args:
        pat: Pattern string to search for.
        txt: Text string to search in.
        q: Modulus for the rolling hash; a positive prime is expected.

    Returns:
        The number of start positions in ``txt`` at which ``pat`` occurs.
        Returns 0 when ``pat`` is empty or longer than ``txt``.
    """
    M = len(pat)
    N = len(txt)

    # No window of txt can hold the pattern (or there is no pattern at all).
    # Bailing out here also keeps the first-window hash below from indexing
    # past the end of txt.
    if M == 0 or M > N:
        return 0

    # h = d**(M-1) % q, the weight of the leading character of a window
    h = 1
    for _ in range(M - 1):
        h = (h * d) % q

    # Hash values of the pattern and of the first window of text
    p = 0
    t = 0
    for i in range(M):
        p = (d * p + ord(pat[i])) % q
        t = (d * t + ord(txt[i])) % q

    counter = 0
    last_start = N - M

    # Slide the window over the text one position at a time
    for i in range(last_start + 1):
        # Compare characters only when the hashes agree
        if p == t and txt[i:i + M] == pat:
            counter += 1

        # Roll the hash: drop the leading character, append the next one.
        # Python's % takes the sign of the modulus, so with a positive q
        # the result is already non-negative.
        if i < last_start:
            t = (d * (t - ord(txt[i]) * h) + ord(txt[i + M])) % q

    # This code is contributed by Bhavya Jain
    return counter

In [2]:
def main():
    """Report how many words of the input paragraph appear in each dataset file.

    Reads ``./dataset/0.txt`` through ``./dataset/9.txt`` and
    ``./dataset/paragraph.txt``, splits the paragraph into words, and for
    every word longer than four characters checks each dataset file with
    ``rbk``. Prints the elapsed time (tokenizing plus searching) followed by
    one count per dataset file. A word that occurs several times in the
    paragraph is counted each time.
    """
    # Load the dataset into a list; `with` closes every handle right away
    dataset = []
    for i in range(10):
        with open("./dataset/" + str(i) + ".txt", "r") as f:
            dataset.append(f.read())

    # Load the input file whose words are checked against the dataset
    with open("./dataset/paragraph.txt", "r") as wordFile:
        wordData = wordFile.read()

    timerStart = time.perf_counter()

    # split() breaks on any run of whitespace, so the final word is included
    # even when the text does not end with a space
    words = wordData.split()

    # One counter per dataset file: words of the paragraph found in it
    foundDocuments = [0] * len(dataset)
    for word in words:
        # Short words are skipped
        if len(word) <= 4:
            continue
        for index, checkPlagarism in enumerate(dataset):
            if rbk(word, checkPlagarism, 101) > 0:
                foundDocuments[index] += 1

    # Print the elapsed time and the per-file word counts
    timerEnd = time.perf_counter()
    print("Total time: " + str(timerEnd - timerStart))
    for i, val in enumerate(foundDocuments):
        print("File " + str(i) + " used " + str(val) + " many of the words used in the file you are comparing to")

In [3]:
# Run the word check; prints the elapsed time and one count per dataset file
main()

Total time: 0.12030990000000008
File 0 used 14 many of the words used in the file you are comparing to
File 1 used 11 many of the words used in the file you are comparing to
File 2 used 11 many of the words used in the file you are comparing to
File 3 used 12 many of the words used in the file you are comparing to
File 4 used 11 many of the words used in the file you are comparing to
File 5 used 11 many of the words used in the file you are comparing to
File 6 used 13 many of the words used in the file you are comparing to
File 7 used 11 many of the words used in the file you are comparing to
File 8 used 10 many of the words used in the file you are comparing to
File 9 used 12 many of the words used in the file you are comparing to
