In [1]:
from collections import Counter
from operator import itemgetter

In [2]:
def triplets(s: bytes):
    """Yield every 3-byte window of `s` after trimming and space-padding it.

    Leading and trailing whitespace is stripped from `s`, then one space byte
    is added on each side, so the first and last characters each take part in
    a window that includes a padding space. Windows are yielded as
    `memoryview` slices of the padded buffer, so no bytes are copied.

    An input that is empty after stripping yields nothing.
    """
    padded = b' ' + s.strip() + b' '
    view = memoryview(padded)
    window_count = len(padded) - 2
    yield from (view[start:start + 3] for start in range(window_count))

In [3]:
def all_triplets_from_many_lines(lines):
    """Yield the triplets of every line in `lines`, one line after another.

    Each element of `lines` is passed to `triplets`; the results are produced
    lazily, in input order.
    """
    for current_line in lines:
        for triplet in triplets(current_line):
            yield triplet

In [4]:
def load_file(filename):
    """Read `filename` in binary mode and map each line to its 0-based index.

    Trailing CR and LF bytes are removed from every line before it is used as
    a key. If the same line occurs more than once, the index of its last
    occurrence is the one kept.

    Returns a dict of ``bytes -> int``.
    """
    line_numbers = {}
    with open(filename, 'rb') as file:
        for index, raw_line in enumerate(file):
            line_numbers[raw_line.rstrip(b'\r\n')] = index
    return line_numbers

In [5]:
def account_triplets(lines):
    """Count triplet occurrences across `lines`, scaled so the maximum is 1.

    Every triplet produced by `all_triplets_from_many_lines(lines)` is
    counted, then each count is divided by the largest count, so the most
    frequent triplet maps to 1.0.

    Returns a `Counter`; looking up a triplet that was never seen gives 0.
    If `lines` produces no triplets at all (no lines, or only blank ones),
    an empty `Counter` is returned.
    """
    counts = Counter(all_triplets_from_many_lines(lines))
    if not counts:
        # Nothing to normalize; max() over an empty view would raise ValueError.
        return counts

    peak = max(counts.values())
    for key in counts:
        counts[key] /= peak  # Normalize by the maximum value
    return counts

In [6]:
# Old string dump: maps each line (trailing CR/LF removed) to its line index.
old_file = load_file('../stringdumps/stringdump_0_44_09.txt')

In [7]:
# New dump to be filtered: maps each line (trailing CR/LF removed) to its line index.
new_file = load_file('../dfrus-py/output.txt')

In [8]:
# Normalized triplet frequencies; get_score reads this global.
# Iterating the old_file dict yields its keys, i.e. the lines themselves.
c = account_triplets(old_file)  # Train on the old file

In [9]:
import math

def get_score(s: bytes):
    """Score `s` by the learned frequencies of its triplets.

    Sums the global counter `c` over every triplet of `s` and divides by
    ``log(len(s) + 1)``. Dividing by the logarithm instead of the length
    itself grows more slowly with length, which favors long strings.

    Returns 0.0 for an empty `s`.
    """
    if not s:
        # log(len(s) + 1) is log(1) == 0 here, which would divide by zero.
        return 0.0
    triplet_total = sum(c[t] for t in triplets(s))
    return triplet_total / math.log(len(s)+1)  # Favors long strings

In [10]:
# Lines that appear only in the new file, ordered from lowest to highest score.
diff = sorted(new_file.keys() - old_file.keys(), key=get_score)

In [11]:
# Print every added line next to its score; diff is sorted by score, so the
# lowest-scoring lines come first.
for item in diff:
    print(item, get_score(item))

b'p=' 0.0
b'\\f' 0.0
b'0g' 0.0
b'p+`' 0.0
b'`]q' 0.0
b' Xq' 0.0
b'@X' 0.0
b'`Kq' 0.0
b'Xx' 0.0
b'@a' 0.0
b'`Vq' 0.0
b'0Nq' 0.0
b'`Wg' 0.0
b'x]' 0.0
b'pKq' 0.0
b'Pv' 0.0
b'h0' 0.0
b',J' 0.0
b'P|G' 0.0
b'p:Q' 0.0
b'p[q' 0.0
b'tA' 0.0
b'0AA' 0.0
b'8w' 0.0
b'0,k' 0.0
b'0xG' 0.0
b'XK' 0.0
b'@dD' 0.0
b'0km' 0.0
b'0?k' 0.0
b'pYq' 0.0
b'PVp' 0.0
b'|H' 0.0
b'pdm' 0.0
b'P}G' 0.0
b'Xu' 0.0
b'Dt' 0.0
b'PMk' 0.0
b'pm' 0.0
b'|z' 0.0
b'0s' 0.0
b'P6' 0.0
b'pNq' 0.0
b'P\\' 0.0
b'db' 0.0
b'p!A' 0.0
b'p;k' 0.0
b'@tD' 0.0
b'0#A' 0.0
b'@_q' 0.0
b'dk' 0.0
b'XG' 0.0
b'p\\t' 0.0
b'`-k' 0.0
b'T"' 0.0
b' xG' 0.0
b'd1' 0.0
b'XL' 0.0
b' 4@' 0.0
b'k' 0.0
b'pVp' 0.0
b'\\!' 0.0
b'dp' 0.0
b'D&' 0.0
b' uG' 0.0
b'`3q' 0.0
b'0*M' 0.0
b'HV' 0.0
b'h[' 0.0
b'|#' 0.0
b'4Q' 0.0
b',O' 0.0
b'hw' 0.0
b't;' 0.0
b'h_' 0.0
b'`l' 0.0
b'pHk' 0.0
b'8W' 0.0
b"p'G" 0.0
b'tM' 0.0
b'h' 0.0
b'0f' 0.0
b'pXq' 0.0
b'H\\t' 0.0
b"0'k" 0.0
b'D4' 0.0
b't`' 0.0
b'xI' 0.0
b'4H' 0.0
b'p.A' 0.0
b' 5Q' 0.0
b'8E' 0.0
b' FQ' 0.0
b'tx' 0.0
b'xJ' 0.0
b',

b'forced to drink bloody water' 1.2828569879756364
b'sleeping without a proper room' 1.2845645008988207
b'being near to a ' 1.2877435144536062
b'a strengthening ' 1.2905302031523096
b'petitioning for citizenship' 1.2907549992908898
b'being granted residency' 1.2961581384344942
b'Does the' 1.3067313393679894
b' incorporated into your holdings.' 1.3080080040597553
b'giving somebody water' 1.3124792724248866
b'chaining up a creature' 1.3169984738132234
b'becoming a parent' 1.3219412741831187
b'being yelled at by an unhappy citizen' 1.3222180588325338
b'caging a creature' 1.333370782752728
b'gelding a creature' 1.3358421276223353
b'Expected unit file missing or corrupted: ' 1.3381210500476768
b'creating an artifact' 1.3426339551688191
b'the Surrender' 1.3442994654801406
b': Recenter on commander' 1.3454521582256573
b'receiving water' 1.3665586310851412
b'slaughtering an animal' 1.3771662531451687
b'forced to drink purulent water' 1.3808871746038238
b'This is common knowledge where you are 

In [12]:
# Score cutoff: newly added lines scoring below this are left out of output2.txt.
threshold = 0.04

In [13]:
# `diff` is a list, so `line in diff` is a linear scan per line; build a set
# once so each membership test is constant-time.
added_lines = frozenset(diff)

with open('output2.txt', 'wb') as output:
    # Dict values are line indices, so sorting by them restores the file order.
    for line, number in sorted(new_file.items(), key=itemgetter(1)):
        # Filter only newly added lines; lines also present in the old file are always kept.
        if line in added_lines and get_score(line) < threshold:
            continue

        output.write(line)
        output.write(b'\r\n')