In [11]:
from nltk.tokenize import sent_tokenize #make sure to install the corpus
from difflib import SequenceMatcher

'''
Compares `file1` and `file2` for similarity. Set the `threshold`
value to the level of similarity you want to detect (0.5 is a good start).
Then set the `maxLen` variable to around 30 to skip very short sentences
(only sentences longer than `maxLen` characters are compared).
This version supports txt and docx files; use the corresponding two
functions, or write your own for other file formats.
Note: This algorithm is not very efficient on large files.

Author: christopher.kullenberg@gmail.com
'''

# Settings:
# Similarity ratio (SequenceMatcher.ratio()) a sentence pair must exceed to be printed.
threshold = 0.5
# Despite the name this is a minimum: only sentences longer than this many
# characters are compared.
maxLen = 30
# Paths of the two files to compare; the values below are placeholders.
file1 = 'path/filename'
file2 = 'path/filename2'

def txtParser(fn):
    '''Read a UTF-8 encoded text file and return its contents.

    Input: path to a txt-file
    Output: text as string
    Raises: UnicodeError if the file is not valid UTF-8 (a hint is printed
    first); OSError if the file cannot be opened.
    '''
    try:
        # The context manager closes the file even when decoding fails.
        with open(fn, 'r', encoding="utf-8") as thefile:
            return thefile.read()
    except UnicodeError:
        print("Unicode error. Save as Unicode / UTF-8.")
        # Re-raise: there is no text to return, and falling through would
        # only fail afterwards with an unrelated UnboundLocalError.
        raise

def docxParser(fn):
    '''Input: docx-file
    Output: text as string, with one line per paragraph
    Note: Build docx from source
    https://github.com/python-openxml/python-docx'''
    # Imported inside the function so python-docx is only needed when a
    # docx file is actually parsed.
    from docx import Document
    document = Document(fn)
    # Join paragraphs with a newline: concatenating them directly fuses the
    # end of one paragraph onto the start of the next.
    return "\n".join(paragraph.text for paragraph in document.paragraphs)

In [12]:
def compareSent(document1, document2, threshold, maxLen):
    '''
    Print every sentence pair (one sentence from each text) whose similarity
    ratio is greater than the threshold.

    Input: Two texts as strings, threshold value as float, maxLen as int.
    Output: Prints sentences that are more similar than the threshold value for m.ratio().
    Filter: Excludes sentences of maxLen chars or fewer. (Recommended: 30)
    Note: Despite its name, maxLen acts as a minimum sentence length.
    '''
    # Apply the length filter once per document instead of re-checking the
    # second document's sentences on every outer iteration.
    sentences1 = [s for s in sent_tokenize(document1) if len(s) > maxLen]
    sentences2 = [x for x in sent_tokenize(document2) if len(x) > maxLen]
    for s in sentences1:
        for x in sentences2:
            m = SequenceMatcher(None, s, x)
            # real_quick_ratio() and quick_ratio() are cheap upper bounds on
            # ratio(); if either is not above the threshold, ratio() cannot
            # be, so the expensive computation is skipped.
            if m.real_quick_ratio() <= threshold or m.quick_ratio() <= threshold:
                continue
            ratio = m.ratio()
            if ratio > threshold:
                print("*" * 10)
                print("Similarity ratio: " + str(round(ratio, 2)))
                print("\nSentence in File1:\n\n\t" + s)
                print("\nSentence in File2:\n\n\t" + x)

In [None]:
# Read both configured files as UTF-8 text and print the sentence pairs whose
# similarity exceeds `threshold` (swap in docxParser for docx input).
compareSent(txtParser(file1), txtParser(file2), threshold, maxLen)