In [1]:
from collections import Counter
import math
from operator import itemgetter


def triplets(s: bytes):
    """Yield every 3-byte window of ``s`` after stripping and space-padding it.

    Surrounding whitespace is stripped from ``s`` and a single space is put on
    each side, so the first and last windows include a boundary marker.
    Windows are yielded as ``memoryview`` slices of the padded buffer rather
    than as new ``bytes`` objects.
    """
    padded = b' ' + s.strip() + b' '
    view = memoryview(padded)
    # Slide the window by its end offset: ends 3..len(padded) cover all windows.
    for end in range(3, len(padded) + 1):
        yield view[end - 3:end]
        

def all_triplets_from_many_lines(lines):
    """Yield the triplets of every line in ``lines``, one line after another."""
    for line in lines:
        for triplet in triplets(line):
            yield triplet
        

def load_file(filename):
    """Read ``filename`` in binary mode and map each line to its line index.

    Trailing ``\\r`` / ``\\n`` bytes are removed from every line. When a line
    occurs more than once, the index of its last occurrence is stored.

    Returns:
        dict mapping ``bytes`` line content to its zero-based index in the file.
    """
    index_by_line = {}
    with open(filename, 'rb') as source:
        for position, raw_line in enumerate(source):
            index_by_line[raw_line.rstrip(b'\r\n')] = position
    return index_by_line
    
    
def account_triplets(lines):
    """Count the triplets of all ``lines`` and normalize the counts.

    Args:
        lines: iterable of ``bytes`` lines to collect triplets from.

    Returns:
        A ``Counter`` whose values are the triplet counts divided by the
        highest count, so the most frequent triplet has weight 1.0. If the
        input produces no triplets at all, an empty ``Counter`` is returned.
    """
    counts = Counter(all_triplets_from_many_lines(lines))
    if not counts:
        # max() of an empty sequence raises ValueError; nothing to normalize.
        return counts

    peak = max(counts.values())
    for key in counts:
        counts[key] /= peak  # Normalize by the maximum value
    return counts


def get_score(s: bytes):
    """Score ``s`` by the normalized frequencies of its triplets.

    The weights of all triplets of ``s`` are looked up in the module-level
    counter ``c`` (unknown triplets count as 0), summed, and divided by
    ``log(len(s) + 1)``.

    Returns:
        The score as a float; ``0.0`` for an empty ``s``.
    """
    if not s:
        # log(0 + 1) == 0 would make the division below raise
        # ZeroDivisionError; an empty string has no triplets, so it scores 0.
        return 0.0
    # Dividing by log(len + 1) instead of len gives an advantage to long strings.
    return sum(c[t] for t in triplets(s)) / math.log(len(s) + 1)

In [2]:
# Merge all previously processed dumps into one mapping; files are applied in
# the listed order, so a later file overrides the index of a repeated line.
old_file = {}
for dump_path in (
    '../stringdumps/stringdump_0_47_04.txt',
    '../stringdumps/stringdump_0_47_05.txt',
    '../stringdumps/stringdump_0_47_03.txt',
    '../stringdumps/stringdump_0_47_02.txt',
    '../stringdumps/stringdump_0_47_01.txt',
    '../stringdumps/stringdump_0_44_12.txt',
    '../stringdumps/stringdump_steam_50_01.txt',
    '../stringdumps/stringdump_steam_50_02.txt',
    '../stringdumps/stringdump_steam_50_05.txt',
    '../stringdumps/stringdump_steam_50_06.txt',
    '../stringdumps/stringdump_steam_50_08.txt',
    '../stringdumps/stringdump_steam_50_09.txt',
    '../stringdumps/stringdump_steam_50_10.txt',
    '../stringdumps/stringdump_steam_50_11.txt',
    '../stringdumps/stringdump_steam_50_12.txt',
):
    old_file.update(load_file(dump_path))

# Train the triplet model on the old files.
c = account_triplets(old_file)

In [3]:
# Load the unfiltered ("dirty") dump that is going to be cleaned.
new_file = load_file('../stringdumps/stringdump_steam_51_01_beta_12_dirty.txt')

In [4]:
# Lines present in the new dump but in none of the old ones, ordered by
# ascending score.
added_lines = new_file.keys() - old_file.keys()
diff = sorted(added_lines, key=get_score)

In [5]:
# Print every newly added line together with its score.
for item in diff:
    score = get_score(item)
    print(item, score)

b'\xe2\x89\xa1F\xc3\xbf' 0.0
b'&t\xc3\xa2' 0.0
b'QMx' 0.0
b'\xe2\x95\x90\xe2\x95\xa0L?\xe2\x81\xbf\xe2\x8c\x90\xc2\xb1\xe2\x95\xa5MbP?' 0.0
b'X\xce\xa3t' 0.0
b'C\xc2\xbaZ' 0.0
b'\xe2\x94\x94\xc2\xbdB' 0.0
b'ZA\xc2\xa3' 0.0
b'%h5' 0.0
b'\xce\xa37n' 0.0
b'\xc3\xb7L\xc3\xa1' 0.0
b'%f' 0.0
b'7\xc2\xb2x' 0.0
b'`\xc3\xbfB' 0.0
b'd\xc3\xa4,\xc3\xa9h' 0.0
b'\xe2\x89\xa1\xc2\xa2x' 0.0
b'\xc3\x89wI' 0.0
b'\xe2\x89\xa5\xe2\x95\x93g' 0.0
b'\xe2\x96\x88C' 0.0
b'\xe2\x96\x91Z\xc3\x9c' 0.0
b'\xe2\x95\x91\xc3\x89\xc2\xbaN' 0.0
b']\xc3\x9fI' 0.0
b'\xc3\xba\xe2\x94\x9cK' 0.0
b'+0x%llX' 0.0
b':\xe2\x96\x91\xc2\xa1A' 0.0
b'U\xe2\x95\xa3\xe2\x82\xa7' 0.0
b'r+b' 0.0
b'\xc3\x87q\xc3\xa1' 0.0
b'!\xe2\x8c\xa0B' 0.0
b'0Pg' 0.0
b'h' 0.0
b'\xc3\x89\xc3\x9cB' 0.0
b'\xc3\x87\xc3\xbbf' 0.0
b'\xce\x93\xe2\x95\xa4t' 0.0
b'M\xce\x93w' 0.0
b'ww' 0.0
b'SR' 0.0
b'\xc3\x9f\xc2\xabK' 0.0
b"'d\xc3\xa4" 0.0
b'0\xc2\xacK' 0.0
b'\xe2\x96\xa0G' 0.0
b'%u> ' 0.0
b'\xc3\x87\xcf\x83y' 0.0
b'\xc3\xa0\xe2\x95\x96u' 0.0
b'\xe2\x95\x9c\

b'Where will you jump?' 0.740573773653041
b'SAND_RED_RAMP_WITH_WALL_W_SE' 0.7417200887763538
b'SAND_BLACK_RAMP_WITH_WALL_NW' 0.7432002725496053
b'SAND_WHITE_RAMP_WITH_WALL_NW_SE' 0.7432312315647944
b'SAND_TAN_RAMP_WITH_WALL_NW_SW_SE' 0.7433425483322539
b'SAND_RED_RAMP_WITH_WALL_E_NW_SW' 0.7436147358122479
b'SAND_RED_RAMP_WITH_WALL_SE' 0.7439896298269643
b'SAND_WHITE_RAMP_WITH_WALL_N_S' 0.744145349263361
b'[PCG_LAYERING:EXPERIMENT_HUMANOID_PORTRAIT_HORN_SHORT_3]' 0.7443561340842717
b'SAND_WHITE_RAMP_WITH_WALL_S_E' 0.7444384352614954
b'SAND_WHITE_RAMP_WITH_WALL_S_W' 0.7445849782605626
b'SAND_WHITE_RAMP_WITH_WALL_E_SW' 0.7447324085485767
b'[PCG_LAYERING:EXPERIMENT_HUMANOID_PORTRAIT_HORN_SHORT_5]' 0.7447670627218362
b'[PCG_LAYERING:EXPERIMENT_HUMANOID_PORTRAIT_HORN_SHORT_2]' 0.7448903413131054
b'SAND_RED_RAMP_WITH_WALL_S_NW' 0.7449271536183988
b'[PCG_LAYERING:EXPERIMENT_HUMANOID_PORTRAIT_HORN_SHORT_4]' 0.7450547127681313
b'SAND_WHITE_RAMP_WITH_WALL_SW_SE' 0.7465868937300123
b'[PCG_LAYERING

In [6]:
# Score cut-off: newly added lines scoring below this value are dropped when
# the cleaned dump is written.
threshold = 0.0065

In [7]:
# Write the cleaned dump: keep the original line order and drop only newly
# added lines whose score is below the threshold.
new_only = set(diff)  # `diff` is a list; a set makes each membership test O(1)
with open('../stringdumps/stringdump_steam_51_01_beta_12.txt', 'wb') as output:
    for line, _number in sorted(new_file.items(), key=itemgetter(1)):
        # Filter out only the newly added lines; lines already known from the
        # old dumps are always kept.
        if line in new_only and get_score(line) < threshold:
            continue

        output.write(line)
        output.write(b'\n')