In [1]:
import pandas as pd
import numpy as np
from pymystem3 import Mystem
from many_stop_words import get_stop_words
import gensim
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.spatial.distance import cosine
%matplotlib inline

# Russian stop-word list from many_stop_words.
# NOTE(review): none of the visible cells read this name — confirm it is still needed.
ru_stopwords = get_stop_words('ru')

In [2]:
df = pd.read_csv('data/judicial_orders_lemmas.csv') # Parallel corpus (text -> articles)
code = pd.read_csv('data/criminal_code.csv') # Criminal Code of the Russian Federation
dictionary = gensim.corpora.Dictionary.load('data/opencorpora/opencorpora.dict') # Known Russian words (from opencorpora)

In [3]:
# The 'lemmas' column is read from CSV as text; turn every cell back into a Python object.
# NOTE(review): eval executes whatever expression the CSV cell contains. Only run this on
# trusted files, or switch to ast.literal_eval once the stored format is confirmed to be
# plain literals.
for frame in (df, code):
    frame['lemmas'] = frame['lemmas'].map(eval)

In [4]:
# Convert each lemma list into the dictionary's bag-of-words representation.
for frame in (code, df):
    frame['bow'] = frame['lemmas'].map(dictionary.doc2bow)

In [6]:
# Load a previously saved gensim TF-IDF model from disk (it is not trained in this notebook).
tf_idf = gensim.models.TfidfModel.load('data/opencorpora/tfidf/opencorpora_tfidf.model')

In [7]:
def map_vector(vector, size):
    """Expand a sparse vector into a dense 1-D NumPy array.

    Parameters
    ----------
    vector : iterable of (index, value) pairs
        Sparse representation; each pair sets one position of the result.
        When an index occurs more than once, the last pair wins.
    size : int
        Length of the dense array to allocate.

    Returns
    -------
    numpy.ndarray
        Float array of length ``size``, zero everywhere except at the
        listed indices.
    """
    dense = np.zeros(size)
    for pair in vector:
        position, weight = pair
        dense[position] = weight
    return dense

In [8]:
# Every dense vector has one slot per dictionary entry.
size = len(dictionary)


def _dense_tfidf(bow):
    """Weight a bag-of-words with the TF-IDF model and expand it to a dense array."""
    return map_vector(tf_idf[bow], size)


code['tfidf'] = code['bow'].map(_dense_tfidf)
df['tfidf'] = df['bow'].map(_dense_tfidf)

Расстояния между каждым делом и каждой статьёй УК РФ:

In [None]:
# Pairwise cosine distance: distances[i, j] is the distance between the i-th court
# decision in df and the j-th article in code.
# Both columns are materialised as lists once, up front. The original rebuilt the article
# list on every iteration of the outer loop, which is loop-invariant work.
text_vectors = df['tfidf'].tolist()
article_vectors = code['tfidf'].tolist()

distances = np.zeros((len(text_vectors), len(article_vectors)))

for i, text in enumerate(text_vectors):
    for j, article in enumerate(article_vectors):
        distances[i, j] = cosine(text, article)

In [None]:
# Heatmap of the full distance matrix on a tall canvas.
heatmap_title = 'Cosine distance между содержательной частью судебного решения и статьями УК РФ'

fig, ax = plt.subplots(figsize=(15, 30))
ax.set_title(heatmap_title)
ax = sns.heatmap(data=distances, ax=ax)

Аномально близкие ко многим делам статьи:

In [None]:
# Show number and name for the articles whose index labels fall in 290..300
# (.loc slicing is label-based and includes both ends).
suspect_articles = code.loc[290:300][['number', 'name']]
article_lines = [
    '{number}:\t{name}'.format(**article)
    for _, article in suspect_articles.iterrows()
]
print('\n'.join(article_lines))

In [None]:
# For every court decision keep the `num_predictions` articles with the smallest distance.
num_predictions = 5

# argsort yields positions along the article axis of `distances`, so look the numbers up
# positionally in a plain array rather than through index labels (code.loc), which only
# worked while `code` kept its default 0..n-1 index.
article_numbers = code['number'].values

# Build the lists directly: no str()/eval() round trip and no row-by-row df.loc writes.
# .tolist() converts NumPy scalars to plain Python values, so the CSV export stays clean.
predicted = [
    article_numbers[row.argsort()[:num_predictions]].tolist()
    for row in distances
]
df['predicted'] = pd.Series(predicted, index=df.index)

# 'labels' is read from CSV as text. Parse only cells that are still strings so the cell
# can be re-run without failing on already-parsed lists.
# NOTE(review): eval executes whatever expression the CSV cell contains. Only run this on
# trusted files, or switch to ast.literal_eval once the stored format is confirmed.
df['labels'] = df['labels'].map(
    lambda value: eval(value) if isinstance(value, str) else value
)

In [None]:
def intersection(row):
    """Count the distinct values present in both row['labels'] and row['predicted']."""
    shared = set(row['labels']) & set(row['predicted'])
    return len(shared)

# Apply intersection() to every row (axis=1) and store the per-row count.
df['intersection'] = df.apply(intersection, axis=1)

In [None]:
def has_intersection(row):
    """Return 1 if row['labels'] and row['predicted'] share at least one value, else 0."""
    shared = set(row['labels']) & set(row['predicted'])
    return 1 if shared else 0
# Apply has_intersection() to every row (axis=1): 1 when any true label was predicted, else 0.
df['is_right'] = df.apply(has_intersection, axis=1)

In [None]:
# Share of rows whose 'is_right' flag is 1 (the mean of a 0/1 column).
df['is_right'].mean()

In [None]:
# Write the evaluation columns to CSV, then preview the first 15 rows of the same selection.
report_columns = ['title', 'labels', 'predicted', 'is_right', 'intersection']
report = df[report_columns]
report.to_csv('results/tf_idf.csv', index=False)
report.head(15)