In [1]:
from collections import Counter
import math
from operator import itemgetter


def triplets(s: bytes):
    """Yield every 3-byte window of ``s`` after trimming and space-padding it.

    Surrounding whitespace is stripped from ``s`` and one space is then added
    on each side, so the first and last windows include a boundary space.
    Each window is a ``memoryview`` slice over the padded bytes object.

    An input that is empty after stripping yields nothing, because the padded
    value is only two bytes long.
    """
    padded = b' ' + s.strip() + b' '
    window_source = memoryview(padded)
    # Iterate over the exclusive end offset of each window.
    for stop in range(3, len(padded) + 1):
        yield window_source[stop - 3:stop]
        

def all_triplets_from_many_lines(lines):
    """Lazily produce the triplets of every item in ``lines`` as one stream.

    Each item is passed to ``triplets`` in turn and its windows are yielded
    in order before moving on to the next item.
    """
    for current_line in lines:
        for window in triplets(current_line):
            yield window
        

def load_file(filename):
    """Read ``filename`` in binary mode and map each line to its line index.

    Trailing ``\\r`` and ``\\n`` bytes are removed from every line before it is
    used as a key. When the same line occurs more than once, the stored index
    is that of its last occurrence.

    Returns:
        dict mapping ``bytes`` line content to a 0-based ``int`` index.
    """
    index_by_line = {}
    with open(filename, 'rb') as file:
        for position, raw_line in enumerate(file):
            index_by_line[raw_line.rstrip(b'\r\n')] = position
    return index_by_line
    
    
def account_triplets(lines):
    """Count triplets across ``lines`` and scale the counts by the largest one.

    Args:
        lines: Iterable of ``bytes`` values; each one is split into triplets
            through ``all_triplets_from_many_lines``.

    Returns:
        A ``Counter`` mapping each triplet to its count divided by the
        maximum count, so the most frequent triplet has weight 1.0. If no
        triplets were produced at all, the empty ``Counter`` is returned.
    """
    counts = Counter(all_triplets_from_many_lines(lines))
    if not counts:
        # max() raises ValueError on an empty sequence; with nothing counted
        # there is nothing to normalize.
        return counts

    peak = max(counts.values())
    for key in counts:
        counts[key] /= peak  # normalize by the maximum count
    return counts


def get_score(s: bytes):
    """Score ``s`` by the summed weights of its triplets, damped by its length.

    Weights are read from the module-level ``c`` (the counter produced by
    ``account_triplets``), so that name must be defined before this function
    is called. Triplets missing from ``c`` contribute 0.

    The sum is divided by ``log(len(s) + 1)`` rather than ``len(s)``, which
    gives longer strings an advantage.

    Returns:
        The score as a float. An empty ``s`` scores ``0.0``, because the
        logarithmic divisor would be zero for it.
    """
    if not s:
        # log(0 + 1) == 0, so the division below would raise ZeroDivisionError.
        return 0.0
    return sum(c[t] for t in triplets(s)) / math.log(len(s) + 1)

In [2]:
# Older string dumps used as the training corpus. The order only affects the
# stored line indices: a later file overwrites the index of any string it
# shares with an earlier one.
OLD_DUMP_PATHS = [
    '../stringdumps/stringdump_0_47_04.txt',
    '../stringdumps/stringdump_0_47_05.txt',
    '../stringdumps/stringdump_0_47_03.txt',
    '../stringdumps/stringdump_0_47_02.txt',
    '../stringdumps/stringdump_0_47_01.txt',
    '../stringdumps/stringdump_0_44_12.txt',
]

old_file = {}
for dump_path in OLD_DUMP_PATHS:
    old_file.update(load_file(dump_path))

c = account_triplets(old_file)  # train the triplet weights on the old files

In [3]:
# Load the new dump; the result maps each line to its index within the file.
new_file = load_file('../stringdumps/stringdump_steam_2022_12_06.txt')

In [4]:
# Strings that appear only in the new dump, ordered by ascending score.
diff = list(new_file.keys() - old_file.keys())
diff.sort(key=get_score)

In [5]:
# Print each newly added string next to its score.
for item in diff:
    item_score = get_score(item)
    print(item, item_score)

b'r+b' 0.0
b'p\xe2\x95\xa7' 0.0
b'h' 0.0
b'k' 0.0
b'P7' 0.0
b'SR' 0.0
b'C' 0.0011095520406759958
b'P' 0.0013869400508449948
b'HFEXP' 0.002360783269395778
b'GYPSUM' 0.002470190070994764
b'AB' 0.0028001975823936545
b'KIDNEY' 0.002865420482353926
b'HEC' 0.0033286561220279873
b'EYE' 0.0036060441321969857
b'FL' 0.0036752593268916714
b'Dabu' 0.004420186998405237
b'[FPS:' 0.005258108190926959
b'EPOP' 0.005375903106168532
b'%[R]' 0.005614832133109356
b'TAB' 0.0062412302288024765
b'KOGANUSAN' 0.006346160473879473
b'Audio' 0.007618891460322738
b'GIZZARD' 0.0089688789954643
b'SHRUB' 0.009443133077583113
b'HUNGRY' 0.009485529872619894
b'GUTS_E_2' 0.00971318536392799
b'GUTS_E_1' 0.00971318536392799
b'GUTS_W_1' 0.009800691538377789
b'GUTS_W_2' 0.009888197712827592
b'GUTS_N_1' 0.0102382224106268
b'GUTS_N_2' 0.010325728585076601
b'GUTS_NW_1' 0.010688270271797006
b'GUTS_NW_2' 0.01077177238329542
b'TUNNEL_0' 0.011900839725173033
b'SFX_VOLUME' 0.011947284430342762
b'BROOK_0' 0.01202014710732329
b'TOOLTIP

b'BEAST_SNAKE_TRUNK' 0.08867263018576639
b'SITE_FORT' 0.08867924241131576
b'SCROLLBAR_BOTTOM_SCROLLER' 0.08869684631311145
b'WORLD_EDGE_DESERT_EVIL_INLET_NWE' 0.08869783958504396
b'RIVER_MOUTH_NARROW_E' 0.08872996115132413
b'BEAST_SMALL_HUMANOID_EYE_ONE' 0.08873253481949954
b'IS_CRAFTED_ARTIFACT' 0.08876307278631936
b'ITEM_BUCKET_WOOD' 0.08883283332800852
b'GRASS_RAMP_WITH_WALL_N_W_SE' 0.08885916321681141
b'SAND_WHITE_FLOOR_1' 0.08887276928460733
b'TREE_LEAFLESS_TWIGS_SWE' 0.08887378836231868
b'BEAST_SMALL_QUADRUPED_BULKY_OCT' 0.08887511845814726
b'ITEM_FLASK_METAL' 0.08890069645507347
b'MAP_ATTACK' 0.08892307673322229
b'BEAST_FRONT_GRASP_EYE_ONE' 0.08893289913097734
b'GRASS_RAMP_WITH_WALL_NE' 0.08893428787788185
b'OVERLAY_RAMP_WITH_WALL_W' 0.08894133027872161
b'GRASS_RAMP_WITH_WALL_NW_SE' 0.08896461069063173
b'MULTILEVEL_RAMP_WITH_WALL_E' 0.08897456472748261
b'RIVER_MOUTH_NARROW_N' 0.08898257314036705
b'WORK_ORDERS_PRIORITY_UP' 0.08899478739344503
b'SAND_WHITE_FLOOR_2' 0.0890033685047

b'WORLD_EDGE_GLACIER_EVILSAV_THIN_CORNER_NW' 0.13055810973496615
b'SPEECH_MERCENARY_PROFESSION' 0.1305768093244443
b'BUTTON_PICTURE_BOX_SELECTED' 0.13057680932444432
b'TREE_CAP_WALL_THICK_SW' 0.13061310255048958
b'WORLD_EDGE_BEACH_STRAIGHT_N' 0.13063451007977991
b'TREE_OVERLEAVES_TRUNK_E' 0.1306789536164795
b'DESIGNATE_PRIORITY_UP' 0.13068755375223243
b'[C:6:0:0]Hungry' 0.13071909979214075
b'ITEM_STATUE_DAMAGE' 0.1307298193297457
b'STONE_RAMP_WITH_WALL_N_W_SE' 0.13074991159045107
b'TREE_OVERLEAVES_TRUNK_WE_AUTUMN' 0.1307607079936661
b'WORLD_EDGE_GLACIER_THICK_CORNER_NW' 0.13076373566692504
b'WORLD_EDGE_TUNDRA_GOOD_THIN_CORNER_SW' 0.13076741921456045
b'WORLD_EDGE_WETLAND_EVIL_THIN_CORNER_NW' 0.13078492283411233
b'WORK_DETAIL_PLANTERS' 0.13078985732696954
b'INTERFACE_BACKGROUND' 0.1307898573269696
b'TRAP_WEAPON_GENERIC_HAMMER' 0.13079256207763695
b'ZONE_INACTIVE_SELECTED_N_S' 0.130792562077637
b'STOCKPILE_ICON_FOOD' 0.13080198289119221
b'TOOL_GRAPHICS_CONTAINER_WOOD_LIQUID' 0.13080885479

b'HAULING_ADD_STOP' 0.19137401832313525
b'BLD_BRIDGE_METAL_RAISE_S_END_CENTER' 0.19138440731669862
b'WORLD_EDGE_GRASS_TEMP_GOOD_THINNING_N_E' 0.19139093110240937
b'BUTTON_PAINT_RECTANGLE_BORDER_INACTIVE' 0.19150649414995027
b'[PCG_LAYERING:BEAST_WORM_SHORT_EYE_THREE]' 0.19151609241263942
b'New bodywear' 0.19159986596082906
b'WORLD_EDGE_SAND_DESERT_RED_TINY_CORNER_SE' 0.19161897508382544
b'[PCG_LAYERING:BEAST_SMALL_HUMANOID_ANTENNAE]' 0.19163114746906296
b'BEAST_SMALL_WORM_LONG_WINGS_BAT_BACK' 0.19163612351057058
b'Vanilla Items Graphics' 0.19168852515156354
b'WORLD_EDGE_GRASS_TEMP_GOOD_THINNING_N_W' 0.19175578309525163
b'WORLD_EDGE_WATER_GOOD_THICK_CORNER_NW' 0.19176402462022055
b'[PCG_LAYERING:BEAST_WORM_LONG_EYE_TWO]' 0.19176890370619845
b'WORLD_EDGE_GRASS_TEMP_GOOD_THINNING_W_S' 0.19180790480851484
b'[PCG_LAYERING:BEAST_BIPEDAL_DINOSAUR_ANTENNAE]' 0.19181389937390786
b'Fresh raw hide' 0.19184115478408892
b'WORLD_EDGE_SAND_DESERT_WHITE_TINY_CORNER_SE' 0.19190541939049316
b'WORLD_EDGE

b'BUILDING_PLACEMENT_PRESSURE_PLATE_WATER_7_OFF' 0.4140560316039278
b'BUILDING_PLACEMENT_PRESSURE_PLATE_WATER_6_OFF' 0.4140560316039278
b'data/sound/tracks/forgotten_beast/FB_Full.ogg' 0.4140560316039279
b'Meager Study' 0.4142335130279895
b'Unnamed zoo' 0.41426804496450603
b'[C:4:0:1].Smashed open' 0.41440532724704626
b'BUILDING_PLACEMENT_PRESSURE_PLATE_WATER_5_OFF' 0.41440756492366426
b'BUILDING_PLACEMENT_PRESSURE_PLATE_WATER_4_OFF' 0.4144577839693409
b'BUILDING_PLACEMENT_PRESSURE_PLATE_WATER_3_OFF' 0.4146084411063709
b'BUILDING_PLACEMENT_PRESSURE_PLATE_WATER_1_OFF' 0.41465866015204744
b'BUILDING_PLACEMENT_PRESSURE_PLATE_WATER_2_OFF' 0.4150604125174607
b'Current amount: ' 0.4151187482562477
b'data/vanilla/vanilla_interface/' 0.4164148808657011
b'Weapon rack' 0.41658930782385156
b'Infamous brigand' 0.4173582314493907
b'Trade at depot' 0.4174071609828492
b'Abandon' 0.4176538806444561
b'Do not automate kiln' 0.4185780658441111
b'Pet owner' 0.41859608494154993
b'Very rare' 0.4185960849415

b'Carving fortifications through walls' 1.6979950938117772
b'These goods are too heavy and cheap for me to consider.' 1.698903696796145
b'Some [C:6:0:1]Refuse[C:7:0:0] can be put to use, for instance [C:6:0:1]Shells[C:7:0:0] can be turned into [C:6:0:1]Crafts[C:7:0:0]. ' 1.6999213717267745
b"The task also requires a [C:6:0:1]Barrel[C:7:0:0] from the [C:6:0:0]Carpenter's Workshop[C:7:0:0]. " 1.7023575471181691
b'Erase a portion of this stockpile.' 1.7029029249031442
b').  Using the first encountered: ' 1.707795717752714
b'Click to set this task as highest priority among all tasks everywhere.' 1.7095923063373355
b'Just the beginning' 1.7132658690082607
b'Noted protector of the weak' 1.7146933463077825
b'Spouse does the expelling' 1.7153368142270196
b'Note that some [C:2:0:0]Livestock[C:7:0:0] must eat vegetation to survive. ' 1.7158476621158496
b'Converting to chopping blueprint' 1.7175926964902966
b'Child does the expelling' 1.7205279230008708
b"Leave this menu and view this creature's 

In [6]:
# Score cutoff: newly added strings scoring below this are left out of the written output.
threshold = 0.0023

In [9]:
# `diff` is a list, so `line in diff` would be a linear scan for every line of
# the new dump; build a set once to make each membership check constant time.
added_lines = set(diff)

with open('../stringdumps/output2.txt', 'wb') as output:
    # Write lines ordered by their stored index in the new dump.
    for line, number in sorted(new_file.items(), key=itemgetter(1)):
        # Drop only newly added strings whose score is below the cutoff.
        if line in added_lines and get_score(line) < threshold:
            continue

        output.write(line)
        output.write(b'\n')