# Кластеризация статей Уголовного кодекса РФ

In [342]:
from lxml import etree
import pandas as pd
import re
import numpy as np
import nltk
from nltk.stem.snowball import SnowballStemmer

from sklearn.feature_extraction.text import TfidfVectorizer

import plotly
import plotly.graph_objs as go

plotly.offline.init_notebook_mode(connected=True)

Читаем XML файл с УК РФ и записываем в dataframe `articles` все статьи кодекса

In [343]:
# Hand lxml the path itself: the parser then reads the file as bytes and
# honours the encoding declared in the XML prolog, instead of depending on the
# platform's default text encoding for the Cyrillic content.
tree = etree.parse('RFCriminalCode.xml')

In [344]:
# Top-level element of the parsed document; the extraction loop iterates over its children.
root = tree.getroot()

In [345]:
# Collect one plain dict per article and build the DataFrame once at the end.
# Appending to a DataFrame inside the loop copies the whole frame on every
# iteration, and DataFrame.append no longer exists in pandas 2.x.
article_columns = ['number', 'section', 'section_name', 'chapter', 'chapter_name', 'name', 'body']
records = []

# The loops descend four levels from the root: part -> section -> chapter -> article.
# Iterating an lxml element yields its children, like the deprecated getchildren().
for part in root:
    for section in part:
        for chapter in section:
            for article in chapter:
                records.append({'number': article.get('number'),
                                'section': section.get('number'),
                                'section_name': section.get('name'),
                                'chapter': chapter.get('number'),
                                'chapter_name': chapter.get('name'),
                                'name': article.get('name'),
                                # text of the article's first child element
                                'body': article[0].text,
                                })

articles = pd.DataFrame(records, columns=article_columns)

In [346]:
# Discard every row that has a missing value in any column.
# The original index labels are kept, so the index may contain gaps afterwards.
articles = articles.dropna()

## Стемминг и токенизация текстов статей

In [347]:
# Russian Snowball stemmer shared by tokenize_and_stem.
# ignore_stopwords=True: words on NLTK's stop-word list are returned unstemmed.
stemmer = SnowballStemmer('russian', ignore_stopwords=True)

In [348]:
def tokenize_and_stem(text):
    """Split ``text`` into word tokens and return their stems.

    The text is first split into sentences and then into word tokens with
    NLTK. Only tokens containing at least one character from the ranges
    ``а-я`` / ``А-Я`` are kept (this drops numbers and punctuation), and each
    kept token is passed through the module-level ``stemmer``.

    Returns a list of stems in the order the tokens appear in ``text``.
    """
    stems = []
    for sentence in nltk.sent_tokenize(text):
        for token in nltk.word_tokenize(sentence):
            # Note: the letters ё/Ё lie outside these ranges, so a token must
            # contain some other Cyrillic letter to pass the filter.
            if re.search('[а-яА-Я]', token):
                stems.append(stemmer.stem(token))
    return stems

In [349]:
# Store the list of stems of every article body in a new column.
articles['stemmed_tokens'] = articles.body.map(tokenize_and_stem)

In [350]:
# Show the body of the first remaining article. Positional access is used
# because dropna() keeps the original index labels, so label 0 is not
# guaranteed to exist.
articles['body'].iloc[0]

'1. Уголовное законодательство Российской Федерации состоит из настоящего Кодекса. Новые законы, предусматривающие уголовную ответственность, подлежат включению в настоящий Кодекс. 2. Настоящий Кодекс основывается на Конституции Российской Федерации и общепризнанных принципах и нормах международного права.'

In [351]:
# Preview the first rows of the frame.
articles.head()

Unnamed: 0,number,section,section_name,chapter,chapter_name,name,body,stemmed_tokens
0,1,I,УГОЛОВНЫЙ ЗАКОН,1,ЗАДАЧИ И ПРИНЦИПЫ УГОЛОВНОГО КОДЕКСА РОССИЙСКО...,Уголовное законодательство Российской Федерации,1. Уголовное законодательство Российской Федер...,"[уголовн, законодательств, российск, федерац, ..."
1,2,I,УГОЛОВНЫЙ ЗАКОН,1,ЗАДАЧИ И ПРИНЦИПЫ УГОЛОВНОГО КОДЕКСА РОССИЙСКО...,Задачи Уголовного кодекса Российской Федерации,1. Задачами настоящего Кодекса являются: охран...,"[задач, настоя, кодекс, явля, охра, прав, и, с..."
2,3,I,УГОЛОВНЫЙ ЗАКОН,1,ЗАДАЧИ И ПРИНЦИПЫ УГОЛОВНОГО КОДЕКСА РОССИЙСКО...,Принцип законности,"1. Преступность деяния, а также его наказуемос...","[преступн, деян, а, такж, его, наказуем, и, ин..."
3,4,I,УГОЛОВНЫЙ ЗАКОН,1,ЗАДАЧИ И ПРИНЦИПЫ УГОЛОВНОГО КОДЕКСА РОССИЙСКО...,Принцип равенства граждан перед законом,"Лица, совершившие преступления, равны перед за...","[лиц, соверш, преступлен, равн, перед, закон, ..."
4,5,I,УГОЛОВНЫЙ ЗАКОН,1,ЗАДАЧИ И ПРИНЦИПЫ УГОЛОВНОГО КОДЕКСА РОССИЙСКО...,Принцип вины,1. Лицо подлежит уголовной ответственности тол...,"[лиц, подлеж, уголовн, ответствен, только, за,..."


## TF-IDF and Cosine similarity

Посчитаем tf-idf для наших документов, отбрасывая термины с экстремальной документной частотой: оставляем только те, что встречаются в 20–80 % документов (`min_df=0.2`, `max_df=0.8`)

In [404]:
# TF-IDF over 1- to 3-grams of stems produced by tokenize_and_stem.
# max_df / min_df drop terms occurring in more than 80% or fewer than 20% of the documents.
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, # max_features=200 is left disabled
                                   min_df=0.2, stop_words=nltk.corpus.stopwords.words('russian'),
                                   use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))

%time tfidf_matrix = tfidf_vectorizer.fit_transform(articles.body) # fit the vectorizer on the article bodies

# (number of documents, number of retained terms)
print(tfidf_matrix.shape)


CPU times: user 7.18 s, sys: 44.5 ms, total: 7.22 s
Wall time: 7.25 s
(469, 213)


In [392]:
# Vocabulary terms in column order of tfidf_matrix.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name on older releases.
_feature_names = getattr(tfidf_vectorizer, 'get_feature_names_out', None) or tfidf_vectorizer.get_feature_names
terms = list(_feature_names())

In [395]:
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine distance between all articles (square matrix, one row/column per article).
dist = 1 - cosine_similarity(tfidf_matrix)

### Different clustering

In [396]:
from sklearn.cluster import KMeans
num_clusters = 12 # Количество разделов уголовного кодекса
%time articles['km_cluster'] = KMeans(n_clusters=num_clusters).fit(tfidf_matrix).labels_.tolist()

CPU times: user 2.32 s, sys: 11.2 ms, total: 2.33 s
Wall time: 2.44 s


In [357]:
from sklearn.cluster import DBSCAN
%time articles['db_cluster'] = DBSCAN(metric='precomputed').fit_predict(tfidf_matrix).tolist()

CPU times: user 3.79 ms, sys: 4.01 ms, total: 7.8 ms
Wall time: 12.5 ms


In [358]:
from sklearn.cluster import AgglomerativeClustering
%time articles['agg_cluster'] = AgglomerativeClustering(n_clusters=12).fit(tfidf_matrix.toarray()).labels_.tolist()

CPU times: user 28.4 ms, sys: 4.55 ms, total: 33 ms
Wall time: 92.1 ms


In [359]:
from sklearn.cluster import SpectralClustering
%time articles['spec_cluster'] = SpectralClustering().fit(tfidf_matrix).labels_.tolist()

CPU times: user 144 ms, sys: 146 ms, total: 291 ms
Wall time: 288 ms


## Multidimensional scaling

In [398]:
from sklearn.manifold import MDS

# Embed the articles in 2-D from the precomputed cosine-distance matrix `dist`.
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=42)
%time pos2 = mds.fit_transform(dist)
# Same embedding in 3-D; note that `mds` is rebound to the 3-D model here.
mds = MDS(n_components=3, dissimilarity='precomputed', random_state=42)
%time pos3 = mds.fit_transform(dist) 

In [399]:
# Keep each article's MDS coordinates as a list per row ([x, y] and [x, y, z]).
articles['2d_pos'] = pos2.tolist()
articles['3d_pos'] = pos3.tolist()

In [400]:
def textfunc(row):
    """Build the HTML hover label for one article row.

    Reads ``row['km_cluster']``, ``row['number']`` and ``row['name']`` and
    returns them joined with ``<br>`` line breaks. ``number`` and ``name``
    must already be strings; the cluster id is converted with ``str``.
    """
    pieces = [
        'Кластер: ',
        str(row['km_cluster']),
        '<br>Номер статьи: ',
        row['number'],
        '<br>',
        row['name'],
    ]
    return ''.join(pieces)

# 2-D MDS scatter coloured by KMeans cluster.
# A plain list and dict replace the deprecated go.Data / go.Marker wrappers;
# plotly accepts them directly for `data` and `marker`.
data = [
    go.Scatter(x=articles['2d_pos'].map(lambda p: p[0]),
               y=articles['2d_pos'].map(lambda p: p[1]),
               mode='markers',
               marker=dict(size=8, color=articles['km_cluster'].astype(float), colorscale='Jet'),
               text=articles.apply(textfunc, axis=1),
               showlegend=False,
               hoverinfo='text'),
]

figure = go.Figure(data=data, layout=go.Layout(
    title='УК РФ TF-IDF KMeans(n_clusters=12)'))
plotly.offline.iplot(figure)

In [401]:
# 3-D MDS scatter coloured by KMeans cluster.
# A plain list and dict replace the deprecated go.Data / go.Marker wrappers.
data = [
    go.Scatter3d(x=articles['3d_pos'].map(lambda p: p[0]),
                 y=articles['3d_pos'].map(lambda p: p[1]),
                 z=articles['3d_pos'].map(lambda p: p[2]),
                 mode='markers',
                 marker=dict(size=3, color=articles['km_cluster'], colorscale='Jet'),
                 text=articles.apply(textfunc, axis=1),
                 hoverinfo='text'),
]

figure = go.Figure(data=data, layout=go.Layout(
    title='УК РФ TF-IDF KMeans(n_clusters=12)'))
plotly.offline.iplot(figure)


In [402]:
def sectiontextfunc(row):
    """Build the HTML hover label for one article row, including its section.

    Produces the KMeans cluster id, article number and article name followed
    by a line with the section number and section name, separated by ``<br>``.
    All fields except ``km_cluster`` must already be strings.
    """
    pieces = [
        'Кластер: ',
        str(row['km_cluster']),
        '<br>Номер статьи: ',
        row['number'],
        '<br>',
        row['name'],
        '<br>Раздел ',
        row['section'],
        ' ',
        row['section_name'],
    ]
    return ''.join(pieces)
    
# 2-D MDS scatter coloured by the section each article belongs to.
# Section labels are turned into integer category codes for the colour scale.
# A plain list and dict replace the deprecated go.Data / go.Marker wrappers.
data = [
    go.Scatter(x=articles['2d_pos'].map(lambda p: p[0]),
               y=articles['2d_pos'].map(lambda p: p[1]),
               mode='markers',
               marker=dict(size=8, color=articles['section'].astype('category').cat.codes, colorscale='Jet'),
               text=articles.apply(sectiontextfunc, axis=1),
               showlegend=False,
               hoverinfo='text'),
]

figure = go.Figure(data=data, layout=go.Layout(
    title='УК РФ TF-IDF Разделы кодекса'))
plotly.offline.iplot(figure)

In [375]:
# 3-D MDS scatter coloured by section. The points are coloured by section, so
# the title and hover text now describe sections (the original reused the
# KMeans title and hover function).
# A plain list and dict replace the deprecated go.Data / go.Marker wrappers.
data = [
    go.Scatter3d(x=articles['3d_pos'].map(lambda p: p[0]),
                 y=articles['3d_pos'].map(lambda p: p[1]),
                 z=articles['3d_pos'].map(lambda p: p[2]),
                 mode='markers',
                 marker=dict(size=3, color=articles['section'].astype('category').cat.codes, colorscale='Jet'),
                 text=articles.apply(sectiontextfunc, axis=1),
                 hoverinfo='text'),
]

figure = go.Figure(data=data, layout=go.Layout(
    title='УК РФ TF-IDF Разделы кодекса'))
plotly.offline.iplot(figure)

In [154]:
import plotly.figure_factory as ff
from scipy.cluster.hierarchy import ward

# Dendrogram of all articles built with Ward linkage on the dense TF-IDF
# vectors; leaves are labelled with the article names.
fig = ff.create_dendrogram(tfidf_matrix.toarray(), linkagefun=ward,labels=articles['name'].tolist())
plotly.offline.iplot(fig)

## t-SNE

In [366]:
from sklearn.manifold import TSNE
%time articles['2d_tsne'] = TSNE(perplexity=7).fit_transform(tfidf_matrix.toarray()).tolist()
%time articles['3d_tsne'] = TSNE(n_components=3).fit_transform(tfidf_matrix.toarray()).tolist()

CPU times: user 3.59 s, sys: 470 ms, total: 4.06 s
Wall time: 4.23 s
CPU times: user 21.6 s, sys: 942 ms, total: 22.5 s
Wall time: 23 s


In [367]:
# 2-D t-SNE scatter coloured by agglomerative cluster.
# The hover label shows the same clustering used for the colours (the original
# hover text reported the KMeans cluster instead).
agg_hover_text = ('Кластер: ' + articles['agg_cluster'].astype(str)
                  + '<br>Номер статьи: ' + articles['number']
                  + '<br>' + articles['name'])

# A plain list and dict replace the deprecated go.Data / go.Marker wrappers.
data = [
    go.Scatter(x=articles['2d_tsne'].map(lambda p: p[0]),
               y=articles['2d_tsne'].map(lambda p: p[1]),
               mode='markers',
               marker=dict(size=10, color=articles['agg_cluster'], colorscale='Jet'),
               text=agg_hover_text,
               hoverinfo='text'),
]
layout = go.Layout(title='УК РФ TF-IDF t-SNE Agglomerative Clustering')
figure = go.Figure(data=data, layout=layout)
plotly.offline.iplot(figure)

In [386]:
# 3-D t-SNE scatter coloured by agglomerative cluster.
# The points are coloured by `agg_cluster`, so the title and hover label now
# name that clustering (the original title said "Spectral clustering" and the
# hover text reported the KMeans cluster).
agg_hover_text = ('Кластер: ' + articles['agg_cluster'].astype(str)
                  + '<br>Номер статьи: ' + articles['number']
                  + '<br>' + articles['name'])

# A plain list and dict replace the deprecated go.Data / go.Marker wrappers.
data = [
    go.Scatter3d(x=articles['3d_tsne'].map(lambda p: p[0]),
                 y=articles['3d_tsne'].map(lambda p: p[1]),
                 z=articles['3d_tsne'].map(lambda p: p[2]),
                 mode='markers',
                 marker=dict(size=3, color=articles['agg_cluster'], colorscale='Jet'),
                 text=agg_hover_text,
                 hoverinfo='text'),
]

figure = go.Figure(data=data, layout=go.Layout(
    title='УК РФ TF-IDF t-SNE Agglomerative Clustering'))
plotly.offline.iplot(figure)

In [387]:
# 2-D t-SNE scatter coloured by the section each article belongs to.
# A plain list and dict replace the deprecated go.Data / go.Marker wrappers.
data = [
    go.Scatter(x=articles['2d_tsne'].map(lambda p: p[0]),
               y=articles['2d_tsne'].map(lambda p: p[1]),
               mode='markers',
               marker=dict(size=10, color=articles['section'].astype('category').cat.codes, colorscale='Jet'),
               text=articles.apply(sectiontextfunc, axis=1),
               hoverinfo='text'),
]
layout = go.Layout(title='УК РФ TF-IDF t-SNE Разделы кодекса')
figure = go.Figure(data=data, layout=layout)
plotly.offline.iplot(figure)

In [388]:
# 3-D t-SNE scatter coloured by section. The title now matches the colouring
# (the original said "Spectral clustering" although no spectral labels are plotted).
# A plain list and dict replace the deprecated go.Data / go.Marker wrappers.
data = [
    go.Scatter3d(x=articles['3d_tsne'].map(lambda p: p[0]),
                 y=articles['3d_tsne'].map(lambda p: p[1]),
                 z=articles['3d_tsne'].map(lambda p: p[2]),
                 mode='markers',
                 marker=dict(size=3, color=articles['section'].astype('category').cat.codes, colorscale='Jet'),
                 text=articles.apply(sectiontextfunc, axis=1),
                 hoverinfo='text'),
]

figure = go.Figure(data=data, layout=go.Layout(
    title='УК РФ TF-IDF t-SNE Разделы кодекса'))
plotly.offline.iplot(figure)

## Latent Dirichlet Allocation (LDA)

Возможно, стоило бы удалить все имена собственные из текстов статей. Однако стоит заметить, что, кроме `российская федерация`, формулировок с именами собственными natasha не находит:

In [None]:
# from natasha import LocationExtractor
# extractor = LocationExtractor()
# for text in articles['body']:
#     matches = extractor(text)
#     for match in matches:
#         print(match.span, match.fact)

In [217]:
import string
def strip_proppers(text):
    """Return ``text`` rebuilt from its all-lowercase tokens only.

    The text is split into sentences and word tokens with NLTK, and every
    token for which ``str.islower()`` is false is dropped. Besides capitalised
    names this also removes the capitalised first word of each sentence,
    numbers and punctuation-only tokens. The surviving tokens are joined with
    single spaces, except that a token starting with an apostrophe (or equal
    to a punctuation character) is attached to the previous token without a
    space.
    """
    lowercase_tokens = []
    for sentence in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sentence):
            if word.islower():
                lowercase_tokens.append(word)

    pieces = []
    for tok in lowercase_tokens:
        attach_to_previous = tok.startswith("'") or tok in string.punctuation
        pieces.append(tok if attach_to_previous else " " + tok)
    return ''.join(pieces).strip()

In [219]:
from nltk.tag import pos_tag

def strip_proppers_POS(text):
    """Return the whitespace-separated words of ``text`` that are not tagged as proper nouns.

    Words are tagged with NLTK's ``pos_tag``; those tagged ``NNP`` or ``NNPS``
    are removed and the remaining words are returned as a list.
    NOTE(review): ``pos_tag`` is called with its default language — confirm the
    tags are meaningful for Russian text before relying on this function.
    """
    proper_noun_tags = ('NNP', 'NNPS')
    kept_words = []
    for word, tag in pos_tag(text.split()):
        if tag not in proper_noun_tags:
            kept_words.append(word)
    return kept_words

In [222]:
from gensim import corpora, models, similarities 

#remove proper names
%time preprocess = [strip_proppers(doc) for doc in articles['body']]

#tokenize
%time tokenized_text = [tokenize_and_stem(text) for text in preprocess]

#remove stop words
%time texts = [[word for word in text if word not in nltk.corpus.stopwords.words('russian')] for text in tokenized_text]

CPU times: user 1.31 s, sys: 27.3 ms, total: 1.34 s
Wall time: 1.4 s
CPU times: user 6.08 s, sys: 91.3 ms, total: 6.17 s
Wall time: 6.26 s
CPU times: user 13.1 s, sys: 844 ms, total: 14 s
Wall time: 14.2 s


In [224]:
# Map every stem to an integer id.
dictionary = corpora.Dictionary(texts)
# Drop stems that occur in more than 80% of the documents
# (no_below=1 keeps everything that appears in at least one document).
dictionary.filter_extremes(no_below=1, no_above=0.8)

# Bag-of-words representation of each document over the filtered dictionary.
corpus = [dictionary.doc2bow(text) for text in texts]


In [228]:
%time lda = models.LdaModel(corpus, num_topics=5, id2word=dictionary, update_every=5, chunksize=10000, passes=100)

CPU times: user 3min 23s, sys: 1.42 s, total: 3min 24s
Wall time: 3min 26s


In [245]:
# Show the five highest-weighted stems of each topic as formatted strings.
lda.show_topics(formatted=True, num_words=5)

[(0,
  '0.042*"преступлен" + 0.035*"наказан" + 0.025*"стат" + 0.023*"част" + 0.020*"совершен"'),
 (1,
  '0.057*"размер" + 0.049*"лет" + 0.032*"тысяч" + 0.031*"работ" + 0.030*"ин"'),
 (2,
  '0.032*"лет" + 0.032*"размер" + 0.022*"определен" + 0.022*"занима" + 0.021*"ин"'),
 (3,
  '0.067*"лет" + 0.053*"занима" + 0.052*"определен" + 0.037*"свобод" + 0.029*"прав"'),
 (4,
  '0.018*"служб" + 0.017*"лет" + 0.016*"работ" + 0.015*"воен" + 0.013*"государствен"')]