In [149]:
import re
import gzip
import json
from collections import Counter

import pandas as pd
from pymystem3 import Mystem
from many_stop_words import get_stop_words

In [4]:
# Every line of the gzip archive is parsed as one standalone JSON document.
with gzip.GzipFile("./data/civil_code/raw_documents.json.gz", 'r') as f:
    data = [json.loads(line) for line in f]

In [132]:
# Machine names of the document fields that are carried over into the table.
target_names = [
    "case_doc_kind",
    "case_id",
    "case_user_doc_number_rewrite",
    "case_doc_instance",
    "case_doc_vnkod",
    "case_common_doc_court",
    "case_doc_subject_number",
    "case_doc_subject_rf",
    "case_court_type_cat",
    "case_document_text"
]

# One record per document: {field comment: field value}, restricted to the
# fields whose machine name is listed in target_names.
res = pd.DataFrame([
    {
        field["comment"]: field["value"]
        for field in doc["fields"]
        if field["name"] in target_names
    }
    for doc in data
])

# Keep only rows matching all four categorical values below and having a
# non-null document text.
res = res[
    res["Вид судопроизводства"].eq("Гражданское дело")
    & res["Инстанция"].eq("Первая инстанция")
    & res["Уровень суда"].eq("Районный, городской, межрайонный суд")
    & res['Код субъекта РФ'].eq('64')
    & res["Текст документа"].notna()
]

# Collapse every run of whitespace in the document text to a single space.
res['Текст документа'] = res['Текст документа'].map(lambda raw_text: " ".join(raw_text.split()))

# Drop columns that hold a single distinct value after filtering, then
# renumber the rows from zero.
res = res.loc[:, [len(res[col].unique()) > 1 for col in res.columns]].reset_index(drop=True)

In [133]:
# Separator between the sections of a document: an optional "суд"/"судья"
# prefix, then one of five verbs (whose letters may be separated by single
# whitespace characters), then one or more colons. Matching is
# case-insensitive.
#
# The fragments are raw strings so that `\s` reaches the regex engine as
# written; in a normal string literal `\s` is an invalid escape sequence and
# triggers a DeprecationWarning/SyntaxWarning on current Python versions.
#
# NOTE(review): inside the character class `[\s|,]` the `|` is a literal pipe,
# not alternation — confirm that matching "|" after the prefix is intended.
parts_pattern = re.compile(
    r"(?:суд[ья]*[\s|,]*)?"
    r"(?:\s?у\s?с\s?т\s?а\s?н\s?о\s?в\s?и\s?л|"
    r"\s?п\s?р\s?и\s?г\s?о\s?в\s?о\s?р\s?и\s?л|"
    r"\s?п\s?о\s?с\s?т\s?а\s?н\s?о\s?в\s?и\s?л|"
    r"\s?р\s?е\s?ш\s?и\s?л|"
    r"о\s?п\s?р\s?е\s?д\s?е\s?л\s?и\s?л)"
    r"(?:\s*:)+",
    flags=re.UNICODE | re.IGNORECASE,
)


def split_parts(text):
    """Split ``text`` at every match of ``parts_pattern``.

    Returns a tuple with the pieces between the matches, in order, each
    stripped of leading and trailing whitespace.
    """
    return tuple(piece.strip() for piece in parts_pattern.split(text))

In [150]:
# (token, count) pairs over all whitespace-separated tokens of all document
# texts, most frequent first.
most_common_words = Counter(
    token
    for document_text in res['Текст документа']
    for token in document_text.split()
).most_common()

In [134]:
# Cut each document text into its stripped sections (one tuple per row).
res['parts'] = res['Текст документа'].apply(split_parts)

In [138]:
# Keep only documents that were split into exactly three sections.
three_part_mask = res['parts'].map(lambda parts: len(parts) == 3)
res = res[three_part_mask].reset_index(drop=True)

In [142]:
# Spread the three sections of every `parts` tuple over separate columns.
# Positional `.str[i]` is used instead of unpacking `res['parts'].str`:
# iterating over the `.str` accessor is deprecated in pandas 1.x and raises
# TypeError in pandas 2.x.
res['intro'] = res['parts'].str[0]
res['case'] = res['parts'].str[1]
res['result'] = res['parts'].str[2]

In [144]:
# The helper column of tuples is no longer needed.
res = res.drop('parts', axis='columns')

In [157]:
def lemm_and_filter(text, min_len=3, stopwords=get_stop_words("ru"), stem=Mystem(entire_input=False)):
    """Lemmatize ``text`` and return the retained lemmas joined by single spaces.

    A lemma is retained only if it is not contained in ``stopwords`` and its
    length is strictly greater than ``min_len``.

    The default ``stopwords`` and ``stem`` objects are created once, when the
    function is defined, and are shared by every call that does not override
    them.
    """
    kept_lemmas = [
        lemma
        for lemma in stem.lemmatize(text)
        if lemma not in stopwords and len(lemma) > min_len
    ]
    return ' '.join(kept_lemmas)

In [None]:
# Lemmatized, filtered version of the `case` section of every document.
res['lemmas'] = res['case'].apply(lemm_and_filter)

In [None]:
# Write the frame as a gzip-compressed CSV; the row index is written too,
# because `index` is left at its default.
res.to_csv(path_or_buf='tmp/civil_code_court_orders.csv.gz', compression='gzip')