In [1]:
import gensim
import pandas as pd

In [2]:
# Load the two input tables; both paths are relative to the notebook's working directory.
# Going by the file names, `df` holds the judicial orders and `code` the criminal-code articles.
df = pd.read_csv('data/judicial_orders.csv')
code = pd.read_csv('data/criminal_code.csv')

In [3]:
# Turn the serialized `lemmas` / `labels` cells back into Python objects by evaluating each cell
# (presumably list literals written out by an earlier preprocessing step — TODO confirm).
# NOTE(review): eval() executes whatever expression a cell contains, so these CSV files must come
# from a trusted source. If the cells are plain literals, ast.literal_eval is the safer parser.
df['lemmas'] = df['lemmas'].map(eval)
df['labels'] = df['labels'].map(eval)
code['lemmas'] = code['lemmas'].map(eval)

## Модель, обученная на судебных решениях
Метка документа (label) — первая из статей, указанных судьёй в решении (`labels[0]`).

In [9]:
def td(row):
    """Wrap one row of `df` into a TaggedDocument tagged with its first label only."""
    words = row['lemmas']
    tags = [row['labels'][0]]
    return gensim.models.doc2vec.TaggedDocument(words=words, tags=tags)


# One TaggedDocument per judicial decision, in row order.
documents = df.apply(td, axis=1).tolist()

In [10]:
# The model is constructed with alpha == min_alpha == 0.025, but the train() call below passes
# its own schedule (start_alpha=0.025, end_alpha=0.02) together with epochs=25.
# NOTE(review): with those train() arguments the learning rate is presumably not fixed during
# training — confirm against the gensim documentation for the installed version.
model = gensim.models.doc2vec.Doc2Vec(
    alpha=0.025,
    min_alpha=0.025,
)
model.build_vocab(documents)
model.train(
    documents,
    total_examples=model.corpus_count,
    epochs=25,
    start_alpha=0.025,
    end_alpha=0.02,
)

Векторы, построенные doc2vec для каждой из меток:

In [11]:
# Tag -> trained document vector, pairing the model's tag list with its vector matrix row by row.
doc_vectors = {
    tag: vec
    for tag, vec in zip(model.docvecs.offset2doctag, model.docvecs.vectors_docs)
}

In [12]:
# Attach the trained vector to every article whose number was used as a tag;
# articles that never occurred as a tag get None.
code['vectors'] = code['number'].map(
    lambda number: None if number not in doc_vectors else list(doc_vectors[number])
)

In [13]:
# Save the articles that received a vector, without the bulky `lemmas` column.
(
    code
    .dropna(subset=['vectors'])
    .drop(columns=['lemmas'])
    .to_csv('results/doc2vec.csv', index=False)
)

In [14]:
# Infer a vector for every decision from its lemmas (25 inference steps), stored as a plain list.
df['vectors'] = df['lemmas'].map(
    lambda lemmas: list(model.infer_vector(lemmas, steps=25))
)

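Best of one score — для каждого судебного решения ставится 1, если ближайшая по косинусному расстоянию метка модели совпадает с одной из статей, указанных судьёй в решении, иначе 0. Итоговое значение метрики — среднее по всем решениям.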

In [18]:
from scipy.spatial.distance import cosine


def find_nearest(vector, vectors=None):
    """Return the tag whose vector has the smallest cosine distance to `vector`.

    Args:
        vector: query vector (any 1-D sequence accepted by scipy's `cosine`).
        vectors: optional mapping of tag -> vector to search. When omitted, the
            notebook-level `doc_vectors` mapping is used, which is what the
            existing one-argument calls rely on.

    Returns:
        The key of the closest entry. On ties the first key in iteration order wins.

    Raises:
        ValueError: if the mapping to search is empty.
    """
    # Resolve the global lazily so the function always sees the most recent `doc_vectors`.
    candidates = doc_vectors if vectors is None else vectors
    if not candidates:
        raise ValueError('find_nearest: no candidate vectors to compare against')
    return min(candidates, key=lambda tag: cosine(vector, candidates[tag]))

In [19]:
# Best-of-one score: predict the nearest tag for every decision, mark a hit when that tag is
# among the decision's labels, then report the share of hits.
df['nearest'] = df['vectors'].map(find_nearest)
df['res'] = df.apply(
    lambda decision: int(decision['nearest'] in decision['labels']),
    axis=1,
)
df['res'].sum() / len(df)

0.46147386101196725

## Модель, обученная только на статьях УК РФ

In [9]:
def td(row):
    """Wrap one row of `code` into a TaggedDocument tagged with the article number."""
    words = row['lemmas']
    tags = [row['number']]
    return gensim.models.doc2vec.TaggedDocument(words=words, tags=tags)


# One TaggedDocument per criminal-code article, in row order.
documents = code.apply(td, axis=1).tolist()

In [12]:
# The model is constructed with alpha == min_alpha == 0.025, but the train() call below passes
# its own schedule (start_alpha=0.025, end_alpha=0.02) together with epochs=10.
# NOTE(review): with those train() arguments the learning rate is presumably not fixed during
# training — confirm against the gensim documentation for the installed version.
model = gensim.models.doc2vec.Doc2Vec(
    alpha=0.025,
    min_alpha=0.025,
)
model.build_vocab(documents)
model.train(
    documents,
    total_examples=model.corpus_count,
    epochs=10,
    start_alpha=0.025,
    end_alpha=0.02,
)

In [27]:
# Tag -> trained document vector, pairing the model's tag list with its vector matrix row by row.
doc_vectors = {
    tag: vec
    for tag, vec in zip(model.docvecs.offset2doctag, model.docvecs.vectors_docs)
}

In [28]:
# Infer a vector for every decision with the model trained on the criminal code.
# NOTE(review): this cell writes to `vector` (singular) and uses infer_vector's default settings,
# while the other sections write `vectors` with steps=25 — confirm the difference is intended.
df['vector'] = df['lemmas'].map(lambda lemmas: model.infer_vector(lemmas))

Вычисление метрики best of one

In [61]:
from scipy.spatial.distance import cosine


def find_nearest(vector, vectors=None):
    """Return the tag whose vector has the smallest cosine distance to `vector`.

    Args:
        vector: query vector (any 1-D sequence accepted by scipy's `cosine`).
        vectors: optional mapping of tag -> vector to search. When omitted, the
            notebook-level `doc_vectors` mapping is used, which is what the
            existing one-argument calls rely on.

    Returns:
        The key of the closest entry. On ties the first key in iteration order wins.

    Raises:
        ValueError: if the mapping to search is empty.
    """
    # Resolve the global lazily so the function always sees the most recent `doc_vectors`.
    candidates = doc_vectors if vectors is None else vectors
    if not candidates:
        raise ValueError('find_nearest: no candidate vectors to compare against')
    return min(candidates, key=lambda tag: cosine(vector, candidates[tag]))

In [40]:
# Predicted article for every decision: the tag find_nearest returns for its inferred vector.
df['nearest'] = df['vector'].map(find_nearest)

In [43]:
# 1 when the predicted article is among the decision's labels, otherwise 0.
df['res'] = df.apply(
    lambda decision: int(decision['nearest'] in decision['labels']),
    axis=1,
)

In [44]:
# Best-of-one score: share of decisions with a hit.
df['res'].sum() / len(df)

0.012597102666386731

## Модель, обученная на судебных решениях с несколькими метками
Каждый набор меток (labels) - статьи УК, встречающиеся в решении судьи

In [20]:
def td(row):
    """Wrap one row of `df` into a TaggedDocument tagged with all of its labels."""
    words = row['lemmas']
    tags = row['labels']
    return gensim.models.doc2vec.TaggedDocument(words=words, tags=tags)


# One TaggedDocument per judicial decision, in row order.
documents = df.apply(td, axis=1).tolist()

In [21]:
# The model is constructed with alpha == min_alpha == 0.025, but the train() call below passes
# its own schedule (start_alpha=0.025, end_alpha=0.02) together with epochs=25.
# NOTE(review): with those train() arguments the learning rate is presumably not fixed during
# training — confirm against the gensim documentation for the installed version.
model = gensim.models.doc2vec.Doc2Vec(
    alpha=0.025,
    min_alpha=0.025,
)
model.build_vocab(documents)
model.train(
    documents,
    total_examples=model.corpus_count,
    epochs=25,
    start_alpha=0.025,
    end_alpha=0.02,
)

In [22]:
# Tag -> trained document vector, pairing the model's tag list with its vector matrix row by row.
doc_vectors = {
    tag: vec
    for tag, vec in zip(model.docvecs.offset2doctag, model.docvecs.vectors_docs)
}

In [23]:
# Attach the trained vector to every article whose number was used as a tag;
# articles that never occurred as a tag get None.
code['vectors'] = code['number'].map(
    lambda number: None if number not in doc_vectors else list(doc_vectors[number])
)

In [24]:
# Save the articles that received a vector, without the bulky `lemmas` column.
(
    code
    .dropna(subset=['vectors'])
    .drop(columns=['lemmas'])
    .to_csv('results/doc2vec_multilabels.csv', index=False)
)

In [25]:
# Infer a vector for every decision from its lemmas (25 inference steps), stored as a plain list.
df['vectors'] = df['lemmas'].map(
    lambda lemmas: list(model.infer_vector(lemmas, steps=25))
)

In [26]:
# Best-of-one score: predict the nearest tag for every decision, mark a hit when that tag is
# among the decision's labels, then report the share of hits.
df['nearest'] = df['vectors'].map(find_nearest)
df['res'] = df.apply(
    lambda decision: int(decision['nearest'] in decision['labels']),
    axis=1,
)
df['res'].sum() / len(df)

0.452865840856603

## Модель, обученная на судебных решениях с несколькими метками и выделением устойчивых выражений
Устойчивое выражение = часто встречающиеся друг за другом слова. Использован gensim Phraser, обученный на корпусе русского языка.

In [61]:
# Load a previously saved gensim Phraser from disk (path relative to the working directory).
# NOTE(review): gensim's load() is presumably pickle-based — only load model files from a
# trusted source.
phraser = gensim.models.phrases.Phraser.load('data/opencorpora/phraser.model')

In [62]:
def td(row):
    """Wrap one row of `df` into a TaggedDocument: phrase-merged lemmas, all labels as tags."""
    words = phraser[row['lemmas']]  # detect phrases in the text
    tags = row['labels']
    return gensim.models.doc2vec.TaggedDocument(words=words, tags=tags)


# One TaggedDocument per judicial decision, in row order.
documents = df.apply(td, axis=1).tolist()

In [63]:
# The model is constructed with alpha == min_alpha == 0.025, but the train() call below passes
# its own schedule (start_alpha=0.025, end_alpha=0.02) together with epochs=25.
# NOTE(review): with those train() arguments the learning rate is presumably not fixed during
# training — confirm against the gensim documentation for the installed version.
model = gensim.models.doc2vec.Doc2Vec(
    alpha=0.025,
    min_alpha=0.025,
)
model.build_vocab(documents)
model.train(
    documents,
    total_examples=model.corpus_count,
    epochs=25,
    start_alpha=0.025,
    end_alpha=0.02,
)

In [64]:
# Tag -> trained document vector, pairing the model's tag list with its vector matrix row by row.
doc_vectors = {
    tag: vec
    for tag, vec in zip(model.docvecs.offset2doctag, model.docvecs.vectors_docs)
}

In [65]:
# Attach the trained vector to every article whose number was used as a tag;
# articles that never occurred as a tag get None.
code['vectors'] = code['number'].map(
    lambda number: None if number not in doc_vectors else list(doc_vectors[number])
)

In [66]:
# Save the articles that received a vector, without the bulky `lemmas` column.
(
    code
    .dropna(subset=['vectors'])
    .drop(columns=['lemmas'])
    .to_csv('results/doc2vec_multilabels_phraser.csv', index=False)
)

In [67]:
from scipy.spatial.distance import cosine


def find_nearest(vector, vectors=None):
    """Return the tag whose vector has the smallest cosine distance to `vector`.

    Args:
        vector: query vector (any 1-D sequence accepted by scipy's `cosine`).
        vectors: optional mapping of tag -> vector to search. When omitted, the
            notebook-level `doc_vectors` mapping is used, which is what the
            existing one-argument calls rely on.

    Returns:
        The key of the closest entry. On ties the first key in iteration order wins.

    Raises:
        ValueError: if the mapping to search is empty.
    """
    # Resolve the global lazily so the function always sees the most recent `doc_vectors`.
    candidates = doc_vectors if vectors is None else vectors
    if not candidates:
        raise ValueError('find_nearest: no candidate vectors to compare against')
    return min(candidates, key=lambda tag: cosine(vector, candidates[tag]))

In [68]:
# Infer a vector for every decision from its phrase-merged lemmas (25 inference steps),
# stored as a plain list.
df['vectors'] = df['lemmas'].map(
    lambda lemmas: list(model.infer_vector(phraser[lemmas], steps=25))
)

In [69]:
# Predicted article for every decision: the tag find_nearest returns for its inferred vector.
df['nearest'] = df['vectors'].map(find_nearest)

In [70]:
# 1 when the predicted article is among the decision's labels, otherwise 0.
df['res'] = df.apply(
    lambda decision: int(decision['nearest'] in decision['labels']),
    axis=1,
)

In [71]:
# Best-of-one score: share of decisions with a hit.
df['res'].sum() / len(df)

0.3837917279025824

### Создание объединенного файла с векторами УК и предсказанными векторами судебных решений

In [72]:
# Mark where each row comes from so the two tables stay distinguishable once they are
# concatenated: 1 for rows of `code`, 0 for rows of `df`.
code['is_code'] = 1
df['is_code'] = 0

In [75]:
# Combined export: every article that has a vector plus the first 500 decisions, with the
# decision `title` column renamed to `name` so both parts share the same three columns.
pd.concat(
    [
        code.dropna(subset=['vectors'])[['name', 'is_code', 'vectors']],
        df.iloc[:500][['title', 'is_code', 'vectors']].rename(columns={'title': 'name'}),
    ]
).to_csv('results/doc2vec_final.csv', index=False)