In [29]:
import re
import gzip
import json
from glob import glob

import pandas as pd
from bs4 import BeautifulSoup
from pymystem3 import Mystem
from many_stop_words import get_stop_words
# Pandas display option controlling how wide a rendered column may be.
# NOTE(review): a width of 1 is unusually small; older pandas releases used -1
# to mean "no limit" - confirm the intended value.
pd.set_option('display.max_colwidth', 1)

In [2]:
# Load the criminal-code table; its 'article_number' column is used later in
# this notebook to keep only the article numbers that are known.
criminal_code = pd.read_csv('tmp/criminal_code.csv')

In [4]:
# The dump is gzip-compressed JSON lines: every line is decoded as one JSON
# document and becomes one row of the frame.
with gzip.GzipFile('data/criminal_code/court_orders.json.gz', 'r') as f:
    df = pd.DataFrame([json.loads(line) for line in f])

In [5]:
# 'title' and 'body' hold indexable containers; keep only their first element.
for column in ('title', 'body'):
    df[column] = df[column].map(lambda values: values[0])

In [7]:
def parse_table(table_body):
    """Parse an HTML metadata table into a ``{label: value}`` dict.

    For every ``<tr>`` in ``table_body`` the text of its first ``<td>`` is
    used as the key and the text of its first ``<th>`` as the value.

    Rows that lack either cell are skipped: ``find`` returns ``None`` for a
    missing tag, and a single malformed row should not abort parsing of the
    whole dataset with an ``AttributeError``.

    Args:
        table_body: HTML markup containing the table rows.

    Returns:
        dict mapping the ``<td>`` text to the ``<th>`` text of each complete
        row; empty when no complete row is found.
    """
    data = {}
    for row in BeautifulSoup(table_body, 'lxml').find_all('tr'):
        key_cell = row.find('td')
        value_cell = row.find('th')
        if key_cell is None or value_cell is None:
            # Incomplete row (no key or no value cell): nothing to record.
            continue
        data[key_cell.get_text()] = value_cell.get_text()
    return data

Выкидываем записи с пустой таблицей метаданных:

In [8]:
# Rows whose 'table' entry has length zero carry no metadata; drop them by
# index label.
has_empty_table = df['table'].map(len) == 0
df = df.drop(df[has_empty_table].index)

Парсим таблицу метаданных:

In [9]:
# Parse the first element of every 'table' entry into a dict of metadata.
df['table'] = df['table'].map(lambda tables: parse_table(tables[0]))
# Turn the dict keys into columns, attach them to the frame and discard the
# now-redundant dict column.
metadata_columns = df['table'].apply(pd.Series)
df = df.join(metadata_columns).drop(columns='table')

Делим текст каждого решения на вводную, описательную и резолютивную части и оставляем только те решения, в которых выделяются ровно три части:

In [63]:
# Section markers of a court order ("установил:", "приговорил:",
# "постановил:"), matched case-insensitively and tolerating a single
# whitespace character between letters and around the colon.
parts_pattern = re.compile(r'\s?у\s?с\s?т\s?а\s?н\s?о\s?в\s?и\s?л\s?:\s?|\s?п\s?р\s?и\s?г\s?о\s?в\s?о\s?р\s?и\s?л\s?:\s?|\s?п\s?о\s?с\s?т\s?а\s?н\s?о\s?в\s?и\s?л\s?:\s?', flags=re.IGNORECASE | re.UNICODE)


def split_parts(text):
    """Split ``text`` on the section markers.

    Returns a tuple with the pieces between markers, each stripped of
    leading and trailing whitespace. A text without any marker yields a
    one-element tuple.
    """
    return tuple(piece.strip() for piece in parts_pattern.split(text))

In [64]:
# Split every order body on the section markers; each cell becomes a tuple of
# stripped text pieces.
df['parts'] = df['body'].map(split_parts)

In [66]:
# Keep only the orders that were split into exactly three pieces, then
# renumber the rows from zero.
has_three_parts = df['parts'].map(len) == 3
df = df.drop(df[~has_three_parts].index).reset_index(drop=True)

In [72]:
# Spread the three-element 'parts' tuples over separate columns.
# The `.str` accessor is no longer iterable in current pandas (iteration was
# deprecated and later removed), so tuple-unpacking it raises TypeError;
# positional `.str[i]` access yields the same three Series.
df['intro'] = df['parts'].str[0]
df['case'] = df['parts'].str[1]
df['result'] = df['parts'].str[2]

In [75]:
# The tuple column has been spread into 'intro', 'case' and 'result' above,
# so it is no longer needed.
df = df.drop(columns=['parts'])

Из решения суда выделяем номера статей кодекса:

In [79]:
# An article reference is one of the keywords "статьей", "ст." or "статьи"
# followed by a number with an optional fractional part; the number is the
# second capture group.
pattern = re.compile(r'(статьей|ст\.|статьи)\s*(\d+\.?\d*)', flags=re.IGNORECASE | re.UNICODE)
# For every 'result' text collect the referenced numbers as floats.
df['labels'] = df['result'].map(
    lambda text: [float(number) for _keyword, number in pattern.findall(text)]
)

In [82]:
# Discard orders in which no article number was found.
has_labels = df['labels'].map(len) != 0
df = df[has_labels]

Оставляем только те статьи кодекса, которые мы знаем:

In [83]:
# Build the lookup of known article numbers once. Rebuilding the list for
# every single label (as a per-element `.tolist()` would) makes the filter
# needlessly slow; a set also gives constant-time membership tests.
known_article_numbers = set(criminal_code['article_number'].tolist())


def only_known(arr):
    """Return the items of ``arr`` found in ``criminal_code['article_number']``.

    The relative order of the kept items is preserved; the result is a new
    list (empty when nothing matches).
    """
    return [number for number in arr if number in known_article_numbers]


df['labels'] = df['labels'].map(only_known)

In [84]:
# Discard orders whose label list is empty.
has_labels = df['labels'].map(len) != 0
df = df[has_labels]

In [85]:
# Serialize each label list as a single space-separated string.
df.loc[:, 'labels'] = df['labels'].map(
    lambda numbers: ' '.join(str(number) for number in numbers)
)

In [87]:
# Map the Russian metadata column names to English ones.
column_names = {
    'Категория': 'category',
    'Дата': 'date',
    'Регион': 'region',
    'Суд': 'court',
    'Судья': 'judge',
}
# Columns kept in the final dataset, in output order.
output_columns = [
    'title', 'date', 'category', 'region', 'court', 'judge',
    'body', 'intro', 'case', 'result', 'labels',
]
df = df.rename(columns=column_names)[output_columns]

In [88]:
# Report how many court orders remain after filtering.
print('Количество судебных решений ', len(df))

Количество судебных решений  25696


In [89]:
# Shared Mystem instance used by lemm_and_filter.
# NOTE(review): entire_input=False presumably makes lemmatize() return only
# word lemmas without separators - confirm against the pymystem3 docs.
m = Mystem(entire_input=False)
# Russian stop-word collection used to filter lemmas.
ru_stopwords= get_stop_words('ru')

In [90]:
def lemm_and_filter(text):
    """Lemmatize ``text`` and return the retained lemmas as one string.

    Lemmas come from the module-level ``m`` lemmatizer. A lemma is kept when
    it is not in ``ru_stopwords`` and is longer than three characters. The
    kept lemmas are joined with single spaces.
    """
    kept_lemmas = [
        lemma
        for lemma in m.lemmatize(text)
        if lemma not in ru_stopwords and len(lemma) > 3
    ]
    return ' '.join(kept_lemmas)

In [None]:
# Lemmatize and filter the 'case' text of every order, one row at a time.
df['lemmas'] = df['case'].map(lemm_and_filter)

In [None]:
# Persist the prepared dataset as a gzip-compressed CSV, without the index.
df.to_csv('tmp/criminal_code_court_orders.csv.gz', index=False, compression='gzip')