In [1]:
import pandas as pd
import re
from pymystem3 import Mystem
from many_stop_words import get_stop_words

In [2]:
# Load the gzip-compressed CSV of court orders into the working frame.
df = pd.read_csv(
    filepath_or_buffer='data/civil_code/court_orders.csv.gz',
    compression='gzip',
)

In [3]:
# Placeholder tokens to blank out of the raw text: "<данные изъяты>", "<адрес>",
# the literal date mask "ДД.ММ.ГГГГ", "ФИО" followed by digits, and "---".
# NOTE(review): these look like anonymisation markers - confirm against the data source.
patt = re.compile(
    r'((<данные изъяты>)|(<адрес>)|(ДД\.ММ\.ГГГГ)|(ФИО\d+)|(---))'
)

In [4]:
# Replace every placeholder token matched by `patt` with a single space.
# The cleaned text is stored under 'lemmas' for now; the column is overwritten
# with real lemmas further down.
df['lemmas'] = df['text'].map(lambda raw_text: patt.sub(' ', raw_text))

In [5]:
def split_body(text, split_pattern):
    """Lower-case ``text``, repair letter-spaced keywords and split it.

    Rulings sometimes print the keywords "решил" / "определил" with the
    letters spaced out ("р е ш и л"). Those variants are collapsed back into
    the plain word so that ``split_pattern`` can match them.

    Args:
        text: document text.
        split_pattern: string or compiled regex passed to ``re.split``.

    Returns:
        The list produced by ``re.split`` on the normalised, lower-cased
        text. If ``split_pattern`` contains capture groups, the captured
        text is included in the list (standard ``re.split`` behaviour).
    """
    def fix_keywords(text):
        text = text.lower()
        for keyword in ('решил', 'определил'):
            # Allow at most one whitespace character between adjacent letters.
            # The pattern is assembled from a raw string: writing '\s' inside a
            # normal string literal is an invalid escape sequence and triggers
            # a Deprecation/SyntaxWarning on modern Python.
            spaced_variant = r'\s?'.join(keyword)
            text = re.sub(spaced_variant, keyword, text)
        return text
    return re.split(split_pattern, fix_keywords(text))

In [6]:
## bad code warning
# Split each document at the "суд(ья) определил:" / "суд(ья) решил:" marker.
# NOTE(review): inside the character class `[\s|,]` the `|` is a literal pipe,
# not an alternation - probably unintended, but kept as-is so matching does not change.
split_pattern = re.compile(r'суд[ья]*[\s|,]+(определил|решил):')
df['parts'] = df['lemmas'].map(lambda doc: split_body(doc, split_pattern))

# The pattern has one capture group, so a document with exactly one marker
# splits into three pieces: [text before, keyword, text after]. Only those
# documents are kept; anything with zero or several markers is dropped by the
# inner join below.
single_marker_parts = df.loc[df['parts'].map(len) == 3, 'parts']

# Build the two columns directly instead of `.apply(pd.Series)`, which creates
# one Series per row and does not yield named columns when nothing matches.
res = pd.DataFrame({
    'description': single_marker_parts.map(lambda parts: parts[0]),
    'resolution': single_marker_parts.map(lambda parts: parts[-1]),
})
df = df.join(res, how='inner').reset_index(drop=True)

In [7]:
# A digit followed by one or more digits/spaces, immediately before "руб" plus
# at least one of the letters л/е/й/я. Only the numeric part is captured.
rubles_pattern = re.compile(r"(\d[\d ]+)руб[лейя]+")

# For each resolution, collect every captured amount as an int after removing
# all whitespace from the captured digits.
df['money'] = df['resolution'].map(
    lambda text: [
        int(''.join(amount.split()))
        for amount in rubles_pattern.findall(text)
    ]
)

In [8]:
# The intermediate split result is no longer needed.
df = df.drop(columns='parts')

In [9]:
# Matches "статьей" / "ст." / "статьи", optional whitespace, then a number
# with an optional fractional part. Group 1 is the word, group 2 the number.
art_pattern = re.compile(r"(статьей|ст\.|статьи)\s*(\d+\.?\d*)")

# Keep only the number (second group) of every article reference.
df['descr_articles'] = df['description'].map(
    lambda text: [number for _word, number in art_pattern.findall(text)]
)

In [10]:
# Blank out the article references, then collapse all whitespace runs to single spaces.
df['description'] = df['description'].map(
    lambda text: ' '.join(art_pattern.sub(' ', text).split())
)

In [11]:
def lemm_and_filter(text, min_len=3, stopwords=get_stop_words('ru'), stem=Mystem(entire_input=False)):
    """Lemmatize ``text`` and return the surviving lemmas joined by spaces.

    A lemma survives when it is not in ``stopwords`` and is strictly longer
    than ``min_len`` characters. The default ``stopwords`` and ``stem``
    objects are created once, when the function is defined, and are reused
    by every call that does not override them.
    """
    kept_lemmas = [
        lemma
        for lemma in stem.lemmatize(text)
        if lemma not in stopwords and len(lemma) > min_len
    ]
    return ' '.join(kept_lemmas)

In [12]:
# Overwrite 'lemmas' with the lemmatized, stop-word-filtered description text.
df['lemmas'] = df['description'].apply(lemm_and_filter)

In [34]:
# Select the columns to export, in output order.
res = df.loc[:, [
    'text',
    'description',
    'resolution',
    'lemmas',
    'result',
    'money',
    'descr_articles',
]]

In [36]:
# Write the selected columns as a gzip-compressed CSV (the index is written too).
res.to_csv(
    path_or_buf='tmp/civil_code_court_orders.csv.gz',
    compression='gzip',
)