## Загружаем все

In [89]:
import math
import pickle
from itertools import accumulate
from collections import defaultdict

import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings("ignore")  # dateutil бросает тупые варнинги

from tqdm import tqdm
from dateutil.parser import parse

import ndd
import bz2
import lz4
import zlib
import lzma
import zstd
import brotli
import lz4.frame as lz4
from snappy import snappy
from zxcvbn import zxcvbn
from brotli import MODE_TEXT
from nltk import ngrams
from password_strength import PasswordStats

import xgboost as xgb

pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [2]:
def save_submission(predict):
    """
    Записываем итоговый файл
    """
    sub = pd.DataFrame(columns=['Id', 'Times'])
    sub['Id'] = range(len(predict))
    sub = sub.set_index('Id')
    sub['Times'] = list(map(int, predict))
    with open(f'sub_{id(predict)}.csv', 'w') as f:
        f.write(sub.to_csv())

In [3]:
def rmsle(y, y_pred):
    """
    Метрика
    """
    assert len(y) == len(y_pred)
    terms_to_sum = [(math.log(y_pred[i] + 1) - math.log(y[i] + 1)) ** 2.0 for i,pred in enumerate(y_pred)]
    return (sum(terms_to_sum) * (1.0/len(y))) ** 0.5

In [4]:
def add_features(df: pd.DataFrame, func, force=False):
    """
    Добавляем сгенерированные функцией func фичи в DataFrame
    :param force Игнорить существующие
    :param df: Датафрейм
    :param func: принимает строку и возвращает массив фичей
    """
    if not force and func.__name__ + "0" in df.columns:
        return

    print(func.__name__, end=' ')
    rows = []
    for w in tqdm(df.Password):
        rows += [func(str(w))]

    columns = zip(*rows)
    for i, c in enumerate(columns):
        df[func.__name__ + str(i)] = c

def load_words():
    """Загружаем словарь часто используемых слов"""
    df = pd.read_csv("data/freques.tsv", sep="\t", header=0)
    res = defaultdict(int)

    for i, r in df.iterrows():
        res[r.word.strip()] = int(r.freq)

    return res

In [88]:
def ngramer():
    '''Расчет частот нграмм'''
    d = defaultdict(int)
    for word in tqdm(train_df.Password.values):
        for n in range(2, len(word) + 1):
            for grama in ngrams(word, n):
                d[grama] += 1
    return d

In [5]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('Xtest.csv')

In [6]:
freques = load_words()

# Генерим фичи

In [7]:
train_df = train_df.replace({np.nan: ''})
train_df.Password = train_df.Password.apply(str)
train_df.astype({'Password': 'U20', 'Times': 'i4'})

test_df = test_df.replace({np.nan: ''})
test_df.Password = test_df.Password.apply(str)
test_df.astype({'Password': 'U20'})

Unnamed: 0,Id,Password
0,0,ThaisCunha
1,1,697775113
2,2,922a16922a
3,3,andy74
4,4,joemack
5,5,dmartin2448
6,6,utstar
7,7,phemsa
8,8,vovaest73
9,9,a08011979


In [8]:
length = lambda df: df.Password.astype(str).apply(len)

### Извлекаем всякие сущности

In [9]:
def is_date(line):
    try:
        parse(line)
        return [True]
    except:
        return [False]
    
def is_current_year(year):
    try:
        return [2025 >= int(year) >= 2007]
    except:
        return [False]

def is_birth_year(year):
    try:
        return [1945 < int(year) < 2010]
    except:
        return [False]
    
def is_year(line):
    try:
        return [2025 >= int(num) >= 1900]
    except:
        return [False]
    
def is_palindrome(line):
    return [line == line[::-1]]

def unique_length(line):
    return [len(set(line))]

def isotropness(line):
    return [(len(line) - len(set(line))) / (len(line) + 0.1)]

def repeats(line):
    res = 0
    for i, l in enumerate(line[:-1]):
        if line[i+1] == l:
            res += 1
    return [res]

def is_numeric(df):
    return [line.isdigit()]

def is_letters(line):
    return [line.isalpha()]
    
def get_frequency(word):
    return [freques[word]]

def is_upper(line):
    return [line.isupper()]

def is_lower(line):
    return [line.islower()]

def has_symbols(line):
    symbs = '''[!#$%&'()*+,-./[\]^_`{|}~"+'"]"'''
    return [bool(set(symbs) & set(line))]

def has_seq(line):  # TODO: shit
    for s in ['123', '345', '321', '654', 'asd', 'xyz', 'abc', 'qwe', 'rty', 'ewq', 'ytr', '0987']:
        if s in line:
            return [True]
    return [False]
    
add_features(train_df, unique_length)
add_features(train_df, get_frequency)
add_features(train_df, isotropness)
add_features(train_df, repeats)
add_features(train_df, is_year)
add_features(train_df, is_current_year)
add_features(train_df, is_birth_year)

add_features(train_df, is_letters)
add_features(train_df, is_lower)
add_features(train_df, is_upper)
add_features(train_df, has_symbols)
add_features(train_df, has_seq)
add_features(train_df, is_date)  # терпимая скорость


add_features(test_df, unique_length)
add_features(test_df, get_frequency)
add_features(test_df, isotropness)
add_features(test_df, repeats)
add_features(test_df, is_year)
add_features(test_df, is_current_year)
add_features(test_df, is_birth_year)

add_features(test_df, is_letters)
add_features(test_df, is_lower)
add_features(test_df, is_upper)
add_features(test_df, has_symbols)
add_features(test_df, has_seq)
add_features(test_df, is_date)  # терпимая скорость

  2%|▏         | 96375/4151496 [00:00<00:09, 421405.34it/s]

unique_length 

100%|██████████| 4151496/4151496 [00:07<00:00, 524896.01it/s]
  3%|▎         | 140787/4151496 [00:00<00:05, 705038.72it/s]

get_frequency 

100%|██████████| 4151496/4151496 [00:07<00:00, 587951.97it/s]
  3%|▎         | 108653/4151496 [00:00<00:07, 545780.98it/s]

isotropness 

100%|██████████| 4151496/4151496 [00:09<00:00, 441323.49it/s]
  2%|▏         | 90641/4151496 [00:00<00:08, 451753.91it/s]

repeats 

100%|██████████| 4151496/4151496 [00:10<00:00, 394057.21it/s]
  2%|▏         | 64800/4151496 [00:00<00:06, 647992.89it/s]

is_year 

100%|██████████| 4151496/4151496 [00:07<00:00, 573365.28it/s]
  2%|▏         | 79893/4151496 [00:00<00:10, 397390.09it/s]

is_current_year 

100%|██████████| 4151496/4151496 [00:11<00:00, 369345.78it/s]
  2%|▏         | 80098/4151496 [00:00<00:10, 397627.66it/s]

is_birth_year 

100%|██████████| 4151496/4151496 [00:11<00:00, 368189.30it/s]
  5%|▍         | 197563/4151496 [00:00<00:04, 986387.24it/s]

is_letters 

100%|██████████| 4151496/4151496 [00:04<00:00, 884022.10it/s] 
  2%|▏         | 97063/4151496 [00:00<00:04, 970621.67it/s]

is_lower 

100%|██████████| 4151496/4151496 [00:05<00:00, 776606.79it/s]
  5%|▍         | 195001/4151496 [00:00<00:04, 979153.07it/s]

is_upper 

100%|██████████| 4151496/4151496 [00:04<00:00, 833209.58it/s] 
  2%|▏         | 65359/4151496 [00:00<00:12, 325418.06it/s]

has_symbols 

100%|██████████| 4151496/4151496 [00:13<00:00, 303471.18it/s]
  3%|▎         | 129978/4151496 [00:00<00:06, 643542.59it/s]

has_seq 

100%|██████████| 4151496/4151496 [00:07<00:00, 571789.72it/s]
  0%|          | 6013/4151496 [00:00<02:17, 30077.73it/s]

is_date 

100%|██████████| 4151496/4151496 [02:23<00:00, 28895.31it/s]
 12%|█▏        | 122626/1037875 [00:00<00:01, 613673.68it/s]

unique_length 

100%|██████████| 1037875/1037875 [00:02<00:00, 481405.89it/s]
 14%|█▎        | 141766/1037875 [00:00<00:01, 713089.94it/s]

get_frequency 

100%|██████████| 1037875/1037875 [00:01<00:00, 556085.68it/s]
 11%|█         | 110021/1037875 [00:00<00:01, 547191.95it/s]

isotropness 

100%|██████████| 1037875/1037875 [00:02<00:00, 415906.95it/s]
  4%|▍         | 42906/1037875 [00:00<00:02, 429059.39it/s]

repeats 

100%|██████████| 1037875/1037875 [00:02<00:00, 364171.92it/s]
 11%|█▏        | 119263/1037875 [00:00<00:01, 568953.20it/s]

is_year 

100%|██████████| 1037875/1037875 [00:01<00:00, 575539.51it/s]
  8%|▊         | 86674/1037875 [00:00<00:02, 431388.86it/s]

is_current_year 

100%|██████████| 1037875/1037875 [00:02<00:00, 352262.66it/s]
  8%|▊         | 85395/1037875 [00:00<00:02, 426920.29it/s]

is_birth_year 

100%|██████████| 1037875/1037875 [00:02<00:00, 350637.11it/s]
 19%|█▉        | 201216/1037875 [00:00<00:00, 1002799.55it/s]

is_letters 

100%|██████████| 1037875/1037875 [00:01<00:00, 653464.65it/s]
 19%|█▊        | 193211/1037875 [00:00<00:00, 960856.69it/s]

is_lower 

100%|██████████| 1037875/1037875 [00:01<00:00, 771761.10it/s]
 19%|█▉        | 201206/1037875 [00:00<00:00, 998029.61it/s]

is_upper 

100%|██████████| 1037875/1037875 [00:01<00:00, 747468.65it/s]
  7%|▋         | 67938/1037875 [00:00<00:02, 339230.73it/s]

has_symbols 

100%|██████████| 1037875/1037875 [00:03<00:00, 277721.53it/s]
 13%|█▎        | 133208/1037875 [00:00<00:01, 669205.50it/s]

has_seq 

100%|██████████| 1037875/1037875 [00:02<00:00, 509499.09it/s]
  1%|          | 5622/1037875 [00:00<00:36, 28152.64it/s]

is_date 

100%|██████████| 1037875/1037875 [00:37<00:00, 27822.89it/s]


## Играемся со сжатием

In [10]:
def lzma_compression(line, line_repeats=50):
    line = (line + ' ') * line_repeats
    return [len(lzma.compress(line.encode('ascii'), preset=None))]

def zstd_compression(line, line_repeats=50):  # если качество больше 20 он сильно виснет
    line = (line + ' ') * line_repeats
    return [len(zstd.compress(line.encode('ascii'), 19))]

def brotli_compression(line, line_repeats=50):
    line = (line + ' ') * line_repeats
    res = len(brotli.compress(line.encode('utf8'), mode=MODE_TEXT, quality=5))  # если больше 6, то очень тормозит
    return [res]  

def compression_size(line, line_repeats=50):
    line = (line + ' ') * line_repeats
    
    res = []
    res += [len(zlib.compress(line.encode('ascii'), level=5))]
    res += [len(bz2.compress(line.encode('ascii'), compresslevel=9))]
    res += [len(snappy.compress(line.encode('ascii')))]
    res += [len(lz4.compress(line.encode('ascii'), compression_level=16))]
    
    return res

# add_features(train_df, lzma_compression)  # неподъемно медленно
add_features(train_df, compression_size)
add_features(train_df, zstd_compression)
add_features(train_df, brotli_compression)

add_features(test_df, compression_size)
add_features(test_df, zstd_compression)
add_features(test_df, brotli_compression)

  0%|          | 1887/4151496 [00:00<07:21, 9388.48it/s]

compression_size 

100%|██████████| 4151496/4151496 [07:28<00:00, 9263.13it/s]
  1%|          | 25435/4151496 [00:00<00:32, 126545.34it/s]

zstd_compression 

100%|██████████| 4151496/4151496 [00:32<00:00, 127030.09it/s]
  0%|          | 5622/4151496 [00:00<01:13, 56213.22it/s]

brotli_compression 

100%|██████████| 4151496/4151496 [01:20<00:00, 51270.00it/s]
  0%|          | 2321/1037875 [00:00<01:29, 11623.83it/s]

compression_size 

100%|██████████| 1037875/1037875 [01:36<00:00, 10715.31it/s]
  2%|▏         | 20911/1037875 [00:00<00:09, 105544.76it/s]

zstd_compression 

100%|██████████| 1037875/1037875 [00:09<00:00, 104023.75it/s]
  0%|          | 4975/1037875 [00:00<00:20, 49722.07it/s]

brotli_compression 

100%|██████████| 1037875/1037875 [00:21<00:00, 48685.51it/s]


## Тут всякие клавиатурные фичи

In [11]:
keyboard = [
    '1234567890-=',
    'qwertyuiop',
    'asdfghjkl;',
    'zxcvbnm,./',
]

shift_keyboard = [
    '!@#$%^&*()_+',
    'QWERTYUIOP{}|',
    'ASDFGHJKL:"',
    'ZXCVBNM<>?',
]

def keyboard_coords(symb):
    keyboard = [
        '1234567890-=',
        'qwertyuiop{}|',
        'asdfghjkl;',
        'zxcvbnm,./',
    ]
    
    for r_num, row in enumerate(keyboard):
        pos = row.find(symb)
        if pos != -1:
            return r_num, pos
        
    for r_num, row in enumerate(shift_keyboard):
        pos = row.find(symb)
        if pos != -1:
            return r_num, pos
    
    return None

def keyboard_dist(a, b):
    a = keyboard_coords(a)
    b = keyboard_coords(b)
    
    try:
        return ((a[0] - b[0])**2 + (a[1] - b[1])**2) ** 0.5
    except:
        return 7

def keyboard_complexity(line):
    keyboard = [
        '1234567890-=',
        'qwertyuiop',
        'asdfghjkl;',
        'zxcvbnm,./',
    ]
    
    if len(line) == 1:
        return [0]
    
    path = 0
    for i in range(len(line) - 1):
        path += keyboard_dist(line[i+1], line[i])
        
    return [path / (len(line)-1)]

add_features(train_df, keyboard_complexity)
add_features(test_df, keyboard_complexity)

  0%|          | 7395/4151496 [00:00<01:50, 37361.76it/s]

keyboard_complexity 

100%|██████████| 4151496/4151496 [01:53<00:00, 36434.47it/s]
  1%|          | 7540/1037875 [00:00<00:27, 37636.70it/s]

keyboard_complexity 

100%|██████████| 1037875/1037875 [00:28<00:00, 36104.65it/s]


## Тут всякие фичи со сложностью пароля

In [12]:
def to_numbers(line):
    return [ord(a) for a in line]

def to_diff(line):
    return np.diff(to_numbers(line))

def diff_repeats(line):
    return repeats(to_diff(line))

def zxc_stgrength(line):
    
    try:
        output = zxcvbn(line)

        res = []
        res += output['score']
        res += output['guesses']
        res += output['guesses_log10']
    except:
        return [0, 0, 0]
    
    return res

def ndd_entropy(line):
    try:
        res = ndd.entropy(to_numbers(line), k=90)
    except:
        res = 1.8  # отпизды
    return [res]

def entropy(string):
        prob = [float(string.count(c)) / len(string) for c in dict.fromkeys(list(string))]
        res = -sum([p * math.log(p) / math.log(2.0) for p in prob])
        return [res]

def strength(line):
    try:
        stats = PasswordStats(line)
        res = stats.strength()
    except:
        res = 0.5
    
    return [res]

add_features(train_df, entropy)
add_features(train_df, diff_repeats)
add_features(train_df, strength)


add_features(test_df, entropy)
add_features(test_df, diff_repeats)
add_features(test_df, strength)

add_features(train_df, ndd_entropy)  # Модная энтропия, но считается 20 мин, не стопается
add_features(test_df, ndd_entropy)  # Модная энтропия, но считается 20 мин, не стопается

# add_features(train_df, zxc_stgrength)  # too fucking long

  0%|          | 20474/4151496 [00:00<00:40, 101963.38it/s]

entropy 

100%|██████████| 4151496/4151496 [00:41<00:00, 100465.66it/s]
  0%|          | 13514/4151496 [00:00<01:01, 67373.96it/s]

diff_repeats 

100%|██████████| 4151496/4151496 [01:00<00:00, 68079.82it/s]
  1%|          | 30244/4151496 [00:00<00:27, 151792.09it/s]

strength 

100%|██████████| 4151496/4151496 [00:30<00:00, 137062.70it/s]
  2%|▏         | 20845/1037875 [00:00<00:09, 104305.43it/s]

entropy 

100%|██████████| 1037875/1037875 [00:10<00:00, 100629.67it/s]
  1%|          | 6402/1037875 [00:00<00:16, 64014.11it/s]

diff_repeats 

100%|██████████| 1037875/1037875 [00:16<00:00, 64459.77it/s]
  3%|▎         | 30366/1037875 [00:00<00:06, 151084.48it/s]

strength 

100%|██████████| 1037875/1037875 [00:07<00:00, 141378.02it/s]
  0%|          | 143/4151496 [00:00<48:23, 1429.54it/s]

ndd_entropy 

100%|██████████| 4151496/4151496 [51:18<00:00, 1348.74it/s] 
  0%|          | 283/1037875 [00:00<12:10, 1419.44it/s]

ndd_entropy 

100%|██████████| 1037875/1037875 [12:08<00:00, 1424.74it/s]


# Тут марковская цепь

In [5]:
class MarkovChain:
    def __init__(self, order):
        self.order = order
        
    def _destroy_leading_spaces(self, s):
        i = 0
        while s[i] == " ":
            i += 1
        return s[i:]
    
    def _destroy_new_line(self, s):
        if s[-1] == "\n":
            return s[:-1]+" "
        else:
            return s
        
    def _destroy_characters(self, s, to_exclude): 
        return [c for c in s if c not in to_exclude]

    def _destroy_characters_apart_from(self, s, to_keep): 
        return [c for c in s if c in to_keep]

    def _remove_multiple_spaces(self, line):
        new_line = []
        space = False
        for x in list(line):
            if ((x == " ") and (space == False)):
                new_line.append(x)
                space = True
            elif (x != " "):
                new_line.append(x)
                space = False
        return new_line
    
    def word_proba(self, word: str):
        probs = []
        if len(word) <= self.order:
            return [(1 / len(self.charset)) ** len(word)]
        else:
            for i in range(len(word) - self.order):
                comb = word[i: i + self.order]
                next_char = word[i + self.order]
                try:
                    proba = self._counts[tuple(comb)][next_char] / sum(self._counts[tuple(comb)].values())
                    probs.append(proba)
                except:
                    probs.append(0)

            return [list(accumulate(probs, lambda x, y: x * y))[-1]]
        
    def fit(self, filepath):
        with open(filepath, 'r') as f:
            self.input = f.readlines()
        maxorder = self.order
        
        for order in range(1, maxorder + 1):
            counts = {} 

            for j, line in enumerate(self.input):
                line = self._destroy_leading_spaces(line)
                line = self._destroy_new_line(line)
                
                chars = list(line)
                self.charset = set(line)

                line = self._destroy_characters_apart_from(line, self.charset)
                line = self._remove_multiple_spaces(line) 

                for i in tqdm(range(len(chars) - order)):
                    key_history = tuple(chars[i:i + order])
                    key_current = chars[i + order]
                    
                    if key_history in counts:
                        if key_current in counts[key_history]:
                            (counts[key_history])[key_current] += 1
                        else:
                            (counts[key_history])[key_current] = 1
                    else:
                        counts[key_history] = {}
                        (counts[key_history])[key_current] = 1

            print("Done loading order " + str(order) + ".")

        self._counts = counts

In [14]:
mc = MarkovChain(order=2)
mc.fit('input.txt')

100%|██████████| 38647753/38647753 [00:37<00:00, 1017105.39it/s]


Done loading order 1.


100%|██████████| 38647752/38647752 [00:44<00:00, 873047.59it/s]


Done loading order 2.


In [15]:
add_features(train_df, mc.word_proba)
add_features(test_df, mc.word_proba)

  0%|          | 5165/4151496 [00:00<01:20, 51645.74it/s]

word_proba 

100%|██████████| 4151496/4151496 [01:25<00:00, 48369.85it/s]
  1%|          | 10147/1037875 [00:00<00:20, 50878.63it/s]

word_proba 

100%|██████████| 1037875/1037875 [00:21<00:00, 48213.35it/s]


In [90]:
with open('test_df.pickle', 'wb') as f:
    pickle.dump(test_df, f)

with open('train_df.pickle', 'wb') as f:
    pickle.dump(train_df, f)

# Тестим

In [6]:
with open('train_df.pickle', 'rb') as f:
    train_df = pickle.load(f)

with open('test_df.pickle', 'rb') as f:
    test_df = pickle.load(f)

In [39]:
X_train = train_df.drop(['Password', 'Times',], axis=1)
y_train = train_df['Times']

In [40]:
X_test = test_df.drop(['Password', 'Id',], axis=1)

In [88]:
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.3)

In [41]:
X_train = X_train[['word_proba0', 'grams_probs', 'ndd_entropy0', 'brotli_compression0', 'get_frequency0', 'is_letters0', 'is_date0', 'keyboard_complexity0', 'compression_size0', 'compression_size1', 'compression_size2', 'is_lower0', 'diff_repeats0', 'unique_length0', 'isotropness0']]
X_test = X_test[['word_proba0', 'grams_probs', 'ndd_entropy0', 'brotli_compression0', 'get_frequency0', 'is_letters0', 'is_date0', 'keyboard_complexity0', 'compression_size0', 'compression_size1', 'compression_size2', 'is_lower0', 'diff_repeats0', 'unique_length0', 'isotropness0']]

In [43]:
dtrain = xgb.DMatrix(X_train, np.log(y_train + 1))
dtest = xgb.DMatrix(X_test)

In [67]:
params = {
 'colsample_bytree': 0.7,
 'eta': 0.5,
 'gamma': 0.78,
 'max_depth': 10,
 'min_child_weight': 3,
 'subsample': 0.9,
 'objective':'reg:squarederror',
 'eval_metric': 'rmse'
}

num_rounds = 40
watchlist  = [(dtrain,'train'),]

In [68]:
xgb_model = xgb.train(params, dtrain, num_rounds, watchlist)

[0]	train-rmse:0.366899
[1]	train-rmse:0.300918
[2]	train-rmse:0.291658
[3]	train-rmse:0.290497
[4]	train-rmse:0.289618
[5]	train-rmse:0.287956
[6]	train-rmse:0.28695
[7]	train-rmse:0.286753
[8]	train-rmse:0.285336
[9]	train-rmse:0.284559
[10]	train-rmse:0.283434
[11]	train-rmse:0.28303
[12]	train-rmse:0.282541
[13]	train-rmse:0.282328
[14]	train-rmse:0.282118
[15]	train-rmse:0.282074
[16]	train-rmse:0.281721
[17]	train-rmse:0.281176
[18]	train-rmse:0.281127
[19]	train-rmse:0.280974
[20]	train-rmse:0.280936
[21]	train-rmse:0.280843
[22]	train-rmse:0.280798
[23]	train-rmse:0.280652
[24]	train-rmse:0.280602
[25]	train-rmse:0.280441
[26]	train-rmse:0.280379
[27]	train-rmse:0.2803
[28]	train-rmse:0.280208
[29]	train-rmse:0.279962
[30]	train-rmse:0.279715
[31]	train-rmse:0.279704
[32]	train-rmse:0.279676
[33]	train-rmse:0.279203
[34]	train-rmse:0.279145
[35]	train-rmse:0.279016
[36]	train-rmse:0.278693
[37]	train-rmse:0.278199
[38]	train-rmse:0.278117
[39]	train-rmse:0.278096


In [69]:
xgb_pred = np.exp(xgb_model.predict(dtest) - 1)

In [70]:
save_submission(list(map(round, xgb_pred)))

In [36]:
grams_probs = []

In [37]:
combs = sum(d.values())
for word in tqdm(test_df.Password.values):
    probs = []
    for n in range(2, len(word) + 1):
        prob = []
        for grama in ngrams(word, n):
            if grama in d:
                prob.append(d[grama] / combs)
            else:
                prob.append(0)
        probs.append([*accumulate(prob, lambda x, y: x * y)][-1])
    grams_probs.append(max(probs) if probs else 0)

100%|██████████| 1037875/1037875 [00:59<00:00, 17440.09it/s]


In [38]:
test_df['grams_probs'] = grams_probs