In [1]:
from collections import Counter
import math
from operator import itemgetter


def triplets(s: bytes):
    """Lazily yield every 3-byte window of ``s``.

    Surrounding whitespace is stripped from ``s`` and a single space is then
    added on each side, so the first and last windows capture the string
    boundaries. Each window is a ``memoryview`` slice of the padded bytes
    (no copy is made).
    """
    padded = b' ' + s.strip() + b' '
    view = memoryview(padded)
    window_count = len(padded) - 2
    yield from (view[start:start + 3] for start in range(window_count))
        

def all_triplets_from_many_lines(lines):
    """Yield the triplets of every line in ``lines``, one line after another."""
    for current_line in lines:
        for triplet in triplets(current_line):
            yield triplet
        

def load_file(filename):
    """Read ``filename`` in binary mode and map each line to its line index.

    Trailing ``\\r`` / ``\\n`` bytes are removed from every line. When the
    same line occurs more than once, the index of its last occurrence is kept.
    """
    line_to_index = {}
    with open(filename, 'rb') as file:
        for index, raw_line in enumerate(file):
            line_to_index[raw_line.rstrip(b'\r\n')] = index
    return line_to_index
    
    
def account_triplets(lines):
    """Count the triplets of all ``lines`` and scale the counts to (0, 1].

    Args:
        lines: iterable of bytes lines (iterating a dict yields its keys).

    Returns:
        A ``Counter`` mapping each triplet to its frequency divided by the
        frequency of the most common triplet. The Counter is empty when the
        input produces no triplets at all.
    """
    counts = Counter(all_triplets_from_many_lines(lines))
    if not counts:
        # Nothing to normalise; max() of an empty sequence would raise ValueError.
        return counts

    peak = max(counts.values())
    for key in counts:
        counts[key] /= peak  # Normalise by the maximum count
    return counts


def get_score(s: bytes):
    """Score ``s`` by how familiar its triplets are to the trained model.

    The score is the sum of the normalised frequencies (looked up in the
    module-level Counter ``c``) of all triplets of ``s``, divided by
    ``log(len(s) + 1)``. Dividing by the logarithm of the length gives
    longer strings an advantage.

    Returns:
        A float score; ``0.0`` for an empty ``s``, which has no triplets and
        would otherwise divide by ``log(1) == 0``.
    """
    if not s:
        return 0.0
    return sum(c[t] for t in triplets(s)) / math.log(len(s) + 1)

In [2]:
# Merge the older string dumps into one mapping; for a line present in several
# dumps the value from the dump loaded last wins.
old_file = {}
for dump_path in (
    '../stringdumps/stringdump_0_47_04.txt',
    '../stringdumps/stringdump_0_47_03.txt',
    '../stringdumps/stringdump_0_47_02.txt',
    '../stringdumps/stringdump_0_47_01.txt',
    '../stringdumps/stringdump_0_44_12.txt',
):
    old_file.update(load_file(dump_path))

# Train on the old files
c = account_triplets(old_file)

In [3]:
# Load the newer string dump whose lines are compared against the old dumps.
new_file = load_file('../stringdumps/stringdump_0_47_05.txt')

In [4]:
# Lines that appear only in the new dump, ordered from lowest to highest score.
diff = list(new_file.keys() - old_file.keys())
diff.sort(key=get_score)

In [5]:
# Show every newly added line next to its score.
for item in diff:
    score = get_score(item)
    print(item, score)

b'HAS_ANY_CAN_SWIM' 0.0945736140972517
b' problems with ' 0.4049707121470912
b'a friend' 0.5315027071271764
b'an acquaintance' 0.5638938998585207
b'I discussed my problems with my child.' 0.8246479558002962
b'I discussed my problems with my spouse.' 0.8340309141485386
b'I discussed my problems with somebody.' 0.8441903805443917
b'I discussed my problems with my friend.' 0.8468645865362693
b'I discussed my problems with my lover.' 0.895085421680098
b"discussing somebody's problems" 0.943172884388792
b"discussing a child's problems" 1.037606234755724
b"discussing a friend's problems" 1.0529545242598408
b'I discussed my problems with my mother.' 1.0647214350197935
b'I discussed my problems with an acquaintance.' 1.0658386581410095
b'I discussed my problems with my father.' 1.0735577340408542
b'I discussed my problems with my sibling.' 1.1087396398204195
b'a sibling' 1.1169979922602844
b"discussing a lover's problems" 1.1242588198672723
b'relation abstract chat' 1.150594803973084
b"discuss

In [6]:
# Score cutoff used when writing output2.txt: newly added lines scoring below
# this value are left out of the output.
threshold = 0.07

In [7]:
# `diff` is a list, so `line in diff` is a linear scan for every line of the
# new dump; a set makes each membership check constant-time.
added_lines = set(diff)

with open('output2.txt', 'wb') as output:
    # Walk the new dump's lines in the order of their stored line index.
    for line in sorted(new_file, key=new_file.get):
        # Only newly added lines are filtered: drop those scoring below the threshold.
        if line in added_lines and get_score(line) < threshold:
            continue

        output.write(line)
        output.write(b'\r\n')