In [1]:
import os
import io
import numpy as np
import random
import re

In [2]:
def unique(arr, dic=None):
    """Collect the distinct leaf values of a (possibly nested) list.

    Parameters
    ----------
    arr : iterable
        Values to scan. Elements that are ``list`` instances are descended
        into recursively; every other element is treated as a leaf value and
        must be hashable.
    dic : dict, optional
        Accumulator mapping each leaf value to the number of times it has
        been seen. It is updated in place, so a caller may pass its own dict
        to obtain the counts. A fresh dict is created when omitted.

    Returns
    -------
    numpy.ndarray
        One-dimensional array holding the keys of ``dic``.
    """
    if dic is None:
        dic = {}
    for el in arr:
        if isinstance(el, list):
            unique(el, dic)
        else:
            dic[el] = dic.get(el, 0) + 1
    # Materialise the keys first: on Python 3 ``dict.keys()`` is a view and
    # ``np.array(view)`` would produce a 0-d object array instead of a vector.
    return np.array(list(dic))

Классификация будет происходить по следующей формуле:
$$p(c\mid d,\lambda)=\frac
{\exp\sum_i^{n \times k}{\lambda_i}f_i\left(c,d\right )}
{\sum_{\tilde{c}\in C}{\exp\sum_i^{n \times k}{\lambda_i}f_i\left(\tilde{c},d\right )}}$$

In [3]:
def predict(x, weights, y_patterns):
    """Return the class probability distribution for one sample.

    Parameters
    ----------
    x : iterable of int
        Row indices (feature ids) active in the sample.
    weights : numpy.ndarray
        2-D weight matrix; ``weights.shape[1]`` is the number of classes.
    y_patterns : numpy.ndarray
        Matrix indexed the same way as ``weights``; each selected weight row
        is multiplied element-wise by the matching row of this matrix.

    Returns
    -------
    numpy.ndarray
        Vector of length ``weights.shape[1]`` that sums to 1.
    """
    n_classes = weights.shape[1]

    # Start every class from the log of a uniform prior.
    probas = np.ones(n_classes) * np.log(1.0 / n_classes)

    # Accumulate the masked weights of every active feature.
    for xi in x:
        probas += weights[xi] * y_patterns[xi]

    # Softmax over the scores scaled by the number of classes. Subtracting the
    # maximum first leaves the result mathematically unchanged but keeps
    # np.exp from overflowing to inf (which would turn the output into NaN).
    probas = probas / n_classes
    probas = np.exp(probas - np.max(probas))
    return probas / np.sum(probas)

Задачу будем решать с помощью максимизации логарифма функции правдоподобия:
$$\log p(C|D,\lambda)
=\sum_{(c,d)\in(C,D)}\log p(c|d,\lambda)
=\sum_{(c,d)\in(C,D)}\log\frac
{\exp\sum_i^{n \times k}{\lambda_i}f_i\left(c,d\right )}
{\sum_{\tilde{c}\in C}{\exp\sum_i^{n \times k}{\lambda_i}f_i\left(\tilde{c},d\right )}}$$

Соответственно, компоненты градиента — это частные производные логарифма правдоподобия по весам $\lambda_i$:

$$\frac{\partial\log p(C|D,\lambda)}{\partial\lambda_i}=
\sum_{(c,d)\in(C,D)}{f_i(c,d)}-
\sum_{d\in D}{\sum_{c\in C}{p(c|d,\lambda)f_i(c,d)}}$$

In [4]:
def fit(X, y, f_count, c_count, alpha=0.85, max_iter=100, tol=0.00001, random_state=None, verbose=1):
    """Train the maximum-entropy classifier with stochastic gradient updates.

    Parameters
    ----------
    X : sequence of sequences of int
        For each sample, the ids of its active features. Every id is used as
        a row index and must lie in ``range(f_count)``.
    y : sequence of int
        Class id of each sample; used as a column index, so it must lie in
        ``range(c_count)``.
    f_count : int
        Number of distinct feature ids (rows of the returned matrices).
    c_count : int
        Number of distinct classes (columns of the returned matrices).
    alpha : float
        Base of the learning-rate decay; the rate is
        ``alpha ** (updates_so_far // n_samples)``.
    max_iter : int
        Upper bound on the number of passes over the data.
    tol : float
        Training stops once the log-likelihood improves by less than this
        amount between two consecutive passes.
    random_state : int, optional
        Seed for the ``random`` module, which shuffles the sample order.
    verbose : int or bool
        When truthy, a progress line is printed for every pass.

    Returns
    -------
    (weights, feature_patterns)
        ``weights`` is a float ``(f_count, c_count)`` matrix.
        ``feature_patterns`` is a 0/1 matrix of the same shape whose entry
        ``[f, c]`` is 1 when feature ``f`` occurs in a sample of class ``c``.

    Notes
    -----
    Both matrices are dense, so memory grows as ``f_count * c_count``.
    """
    # Local import: progress output is written without a trailing newline in
    # a way that works on both Python 2 and Python 3.
    import sys

    n_samples = len(X)
    if random_state is not None:
        random.seed(random_state)

    # Indicator (conditional feature) matrix. It only holds 0/1, so one byte
    # per cell is enough; ``np.int`` no longer exists in current NumPy.
    feature_patterns = np.zeros((f_count, c_count), dtype=np.int8)

    # Weights of the indicators.
    weights = np.zeros((f_count, c_count))

    # Mark every (feature, class) pair observed in the training data.
    for i in range(n_samples):
        for xi in X[i]:
            feature_patterns[xi, y[i]] = 1

    # Print roughly 20 progress dots per pass; never let the step reach 0,
    # which would make the modulo below fail for fewer than 20 samples.
    progress_step = max(1, n_samples // 20)

    prev_logl = 0.
    iter_num = 0
    all_iter = 0
    # At most max_iter passes over the data.
    for iter_num in range(max_iter):
        if verbose:
            sys.stdout.write('Start iteration #%d\t' % iter_num)

        logl = 0.
        ncorrect = 0

        # Visiting the samples in random order noticeably helps SGD converge.
        order = random.sample(range(n_samples), n_samples)
        for iter_sample, i in enumerate(order, 1):
            if verbose and iter_sample % progress_step == 0:
                sys.stdout.write('. ')

            all_iter += 1
            # Integer division on purpose: the rate drops by a factor of
            # alpha once per n_samples updates, not continuously.
            eta = alpha ** (all_iter // n_samples)

            probas = predict(X[i], weights, feature_patterns)

            # Track accuracy for the verbose report only.
            if np.argmax(probas) == y[i]:
                ncorrect += 1
            # Log-likelihood contribution, scaled by the number of features.
            logl += np.log(probas[y[i]]) / f_count

            # Gradient step for every active feature: move each class weight
            # by (observed indicator - predicted probability) * eta.
            for xi in X[i]:
                expected = np.zeros(c_count)
                if feature_patterns[xi, y[i]] == 1:
                    expected[y[i]] = 1.0
                weights[xi] -= (probas - expected) * eta
        if verbose:
            # "Loss" here is the change in log-likelihood since the last pass.
            print('\tAccuracy: %.5f, Loss: %.8f'
                  % (float(ncorrect) / max(n_samples, 1), logl - prev_logl))
        if iter_num > 0:
            if prev_logl > logl:
                print('there is model diverging')
                break
            if (logl - prev_logl) < tol:
                break
        prev_logl = logl
    print(iter_num)
    return weights, feature_patterns

In [5]:
# Patterns used by read_and_tokenize. Raw string literals keep the backslashes
# intact without relying on Python passing unknown escapes such as '\d' or
# '\%' through unchanged (newer interpreters warn about those).
# Any single digit.
digits_regex = re.compile(r'\d')
# Punctuation characters that are stripped from the text.
punc_regex = re.compile(r'[\%\(\)\-\/\:\;\<\>\«\»\,]')
# A period followed by whitespace; the period is captured so the caller can
# re-insert it surrounded by spaces.
delim_regex = re.compile(r'([\.])\s+')

In [6]:
def read_and_tokenize(foldername):
    '''
    Read every file below ``foldername`` and tokenize its text.

    Files are opened as UTF-8. Lines shorter than 50 characters (line ending
    included) are skipped. The remaining lines are lower-cased, every digit
    is replaced with ``0``, the characters matched by ``punc_regex`` are
    removed, a period followed by whitespace becomes a separate token, and
    the result is split on whitespace.

    Returns a tuple ``(tokenized_text, word2index, index2word)``:
    the list of all tokens in reading order, a dict mapping each distinct
    token to its integer id, and the list of tokens ordered by id.
    '''
    word2index = {}
    index2word = []
    tokenized_text = []
    # Walk the folder that was actually requested rather than a fixed path.
    for path, subdirs, files in os.walk(foldername):
        # Sorting makes the traversal order, and therefore the word ids,
        # independent of the order in which the file system lists entries.
        subdirs.sort()
        for name in sorted(files):
            filename = os.path.join(path, name)
            with io.open(filename, 'r', encoding='utf-8') as data_file:
                for line in data_file:
                    if len(line) < 50:
                        continue
                    text = digits_regex.sub(u'0', line.lower())
                    text = punc_regex.sub(u'', text)
                    text = delim_regex.sub(r' \1 ', text)
                    # str.split() without arguments never yields empty
                    # strings, so no extra emptiness check is required.
                    for word in text.split():
                        if word not in word2index:
                            # Ids are assigned in order of first appearance.
                            word2index[word] = len(index2word)
                            index2word.append(word)
                        tokenized_text.append(word)
    return tokenized_text, word2index, index2word

In [7]:
def generate_train(tokenized_text, word2index, context_len=4):
    '''
    Build next-word training pairs from a token sequence.

    For the token at position ``i`` the features are the ids of up to
    ``context_len`` tokens immediately preceding it and the target is the id
    of the token itself. Positions with no preceding tokens (the first one)
    produce no sample.

    tokenized_text -- list of tokens
    word2index     -- dict mapping every token to its integer id
    context_len    -- maximum number of preceding tokens used as context

    Returns ``(X, y)``: a list of context id lists and the list of target ids.
    Progress is printed every 10000 tokens, followed by ``end``.
    '''
    X = []
    y = []
    for i, y_word in enumerate(tokenized_text):
        # Clamp the window start at 0 so early positions get a shorter context.
        context = tokenized_text[max(0, i - context_len):i]
        if context:
            X.append([word2index[x_word] for x_word in context])
            y.append(word2index[y_word])
        if i % 10000 == 0:
            # Single-argument print() behaves the same on Python 2 and 3.
            print('i = %d' % i)
    print('end')
    return X, y

In [8]:
# Tokenize the corpus; besides the token list this also yields the
# word -> id dict and the id -> word list.
tokenized_text, word2index, index2word = read_and_tokenize('data')      

In [9]:
# Vocabulary size; reused below as both the feature count and the class count.
unique_words = len(index2word)
# Single-argument print() produces the same text on Python 2 and Python 3.
print('all words: %d' % len(tokenized_text))
print('all unique words %d' % unique_words)

all words: 287632
all unique words 43030


In [10]:
# Maximum number of preceding tokens used as features for the next word.
context_len = 4
# X holds the context word ids of each sample, y the id of the word to predict.
X,y = generate_train(tokenized_text, word2index,context_len=context_len)

i = 0
i = 10000
i = 20000
i = 30000
i = 40000
i = 50000
i = 60000
i = 70000
i = 80000
i = 90000
i = 100000
i = 110000
i = 120000
i = 130000
i = 140000
i = 150000
i = 160000
i = 170000
i = 180000
i = 190000
i = 200000
i = 210000
i = 220000
i = 230000
i = 240000
i = 250000
i = 260000
i = 270000
i = 280000
end


In [11]:
# Words serve as both features and classes, so fit allocates dense matrices of
# shape (unique_words, unique_words). In the recorded run this cell stopped
# with a MemoryError.
weights, patterns = fit(X, y,unique_words,unique_words,random_state=241)

MemoryError: 

In [None]:
# Greedy text generation: start from a two-word context and repeatedly append
# the most probable next word, keeping at most context_len words of context.
test = [word2index[u'экономическая'], word2index[u'ситуация']]
for step in range(10):
    pred = predict(test, weights, patterns)
    # Cast to a plain int so the context list holds ordinary Python integers.
    index = int(np.argmax(pred))
    test.append(index)
    if len(test) > context_len:
        del test[0]
    # Predicted word followed by the updated context, on one line; a
    # single-argument print() works on both Python 2 and Python 3.
    print(u'%s %s' % (index2word[index], test))