In [1]:
import gensim
from pathlib import Path
import json
import csv
import re
from itertools import chain
from nltk.tokenize import wordpunct_tokenize, sent_tokenize
import random
from pymystem3 import Mystem

# Module-wide Mystem instance; lemmatize() rebinds this name to a fresh
# Mystem() when a call fails with BrokenPipeError.
_m = Mystem()

# Patterns for entity-like spans, each paired with a replacement code
# (space-padded placeholder string).

# URLs: scheme-prefixed, "www."-prefixed, or bare "domain.tld/" forms.
URL_REGEX = re.compile(r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?\xab\xbb\u201c\u201d\u2018\u2019]))')
URL_CODE = ' _URL_ '
# Amounts: digits, optional 0-2 decimal digits, then "р"/"руб" with an optional
# dot, followed by one non-word character (which is part of the match).
MONEY_REGEX = re.compile(r'\d+([\.\,]\d{,2})?(\ ?р(уб)?\.?)[^\w]')
MONEY_CODE = ' _MONEY_ '
# Dates written as D.M.Y with an optional " г." suffix. Both separators must be
# literal dots: an unescaped "." here used to match any character, so plain
# decimals such as "12.345678" were recognised as dates.
DATE_REGEX = re.compile(r'\d{1,2}\.\d{1,2}\.\d{2,4}( г\.?)?')
DATE_CODE = ' _DATE_ '
# Clock times written as two digits, a colon, two digits.
TIME_REGEX = re.compile(r'\d\d:\d\d')
TIME_CODE = ' _TIME_ '
# Runs of three or more dashes or underscores.
ANON_REGEX = re.compile(r'(-{3,}|_{3,})')
ANON_CODE = ' _ANON_ '
# Phone numbers: optional +7/7/8 prefix, optional 3-digit area code (with or
# without parentheses), then a 3-2-2 digit group with optional separators.
PHONE_REGEX = re.compile(r'((\+?7|8)[ \-] ?)?((\(\d{3}\))|(\d{3}))?([ \-])?(\d{3}[\- ]?\d{2}[\- ]?\d{2})', re.S)
PHONE_CODE = ' _PHONE_ '
# E-mail addresses: local part, "@", domain containing at least one dot.
EMAIL_REGEX = re.compile(r'[A-Za-z0-9\.\+_-]+@[A-Za-z0-9\._-]+\.[a-zA-Z]*')
EMAIL_CODE = ' _EMAIL_ '

# Matches an opening quote tag of the form "[QUOTE=...]" (everything up to the
# first closing bracket).
QUOTE_REGEX = re.compile(r'\[QUOTE=[^\]]+\]')


# Runs of three or more '.', '!' or '?' characters (in any mix).
DOTS_REGEX = re.compile(r'[\.\!\?]{3,}')
# Two or more consecutive ')' characters (presumably text smileys, per the name).
SMILES_CLOSE_REGEX = re.compile(r'\){2,}')
# Two or more consecutive '(' characters (presumably text smileys, per the name).
SMILES_OPEN_REGEX = re.compile(r'\({2,}')



# Root of the corpus; scraped inputs live under "raw", and the sentence file
# for each source is written next to it in the root.
DATA_FOLDER = Path('/data/legal_ner')
RAW_DATA_FOLDER = DATA_FOLDER.joinpath('raw')

# Forum dumps (JSON lines) and their sentence files.
yuristforum_data = RAW_DATA_FOLDER.joinpath('yuristforum.jl')
yuristforum_sentences = DATA_FOLDER.joinpath('yuristforum.txt')

zonazakona_data = RAW_DATA_FOLDER.joinpath('zonazakona.jl')
zonazakona_sentences = DATA_FOLDER.joinpath('zonazakona.txt')

legalforum_data = RAW_DATA_FOLDER.joinpath('legalforum.jl')
legalforum_sentences = DATA_FOLDER.joinpath('legalforum.txt')

find_people_data = RAW_DATA_FOLDER.joinpath('find_people.jl')
find_people_sentences = DATA_FOLDER.joinpath('find_people.txt')

# VK exports (CSV) and their sentence files.
vk_flat_data = RAW_DATA_FOLDER.joinpath('vkdata_flat.csv')
vk_flat_sentences = DATA_FOLDER.joinpath('vkdata_flat.txt')

vk_ads_data = RAW_DATA_FOLDER.joinpath('vkdata_ads.csv')
vk_ads_sentences = DATA_FOLDER.joinpath('vkdata_ads.txt')


# corpus_by_lines_data = DATA_FOLDER / 'corpus_by_lines.txt'
# corpus_by_posts_data = DATA_FOLDER / 'corpus_by_posts.txt'


In [2]:
def lemmatize(s, max_retries=3):
    """Lemmatize ``s`` with the shared Mystem instance, recovering from a broken pipe.

    When the call fails with ``BrokenPipeError`` the module-level ``_m`` is
    replaced by a fresh ``Mystem()`` and the call is repeated.

    Args:
        s: Text passed to ``Mystem.lemmatize``.
        max_retries: How many times to recreate the analyzer and retry before
            giving up. Previously the retry was an unbounded recursion, so a
            persistent failure kept spawning analyzers until ``RecursionError``.

    Returns:
        Whatever ``Mystem.lemmatize`` returns for ``s``.

    Raises:
        BrokenPipeError: If the call still fails after ``max_retries`` restarts.
    """
    global _m
    for attempt in range(max_retries + 1):
        try:
            return _m.lemmatize(s)
        except BrokenPipeError:
            if attempt >= max_retries:
                # Out of retries: surface the original error to the caller.
                raise
            _m = Mystem()

In [5]:
def split_lines(raw_lines):
    """Split documents into unique sentences, undoing splits after abbreviations.

    Each text line is passed to ``sent_tokenize``. A fragment that ends with a
    known abbreviation (``ул.``, ``д.``, ``руб.`` ...) or with initials
    (``И.И.``) is treated as a false sentence boundary and is merged with the
    fragments that follow it. Fragments starting with ``'Сообщение от'`` are
    dropped.

    Args:
        raw_lines: Iterable of documents, each an iterable of text lines.

    Returns:
        List of sentences in first-seen order, without duplicates.

    Side effects:
        Prints the number of emitted sentences that were assembled from
        merged fragments.
    """
    ABRIDGED_SPLIT = re.compile(r'[^\w](ул|д|кв|г|п|ст|пр|см|тел|т\.е|\d*м|стр|мкр|\d{4}г|корп|т\.ч|зам|т\.к|им|руб|коп|адм|обл|тыс|эт|\d-?к|к\.у|\d*т\.р|сан|\d*к|\d*м)\.$')
    INITIALS_SPLIT = re.compile(r'[^\w][А-Я]\.[А-Я]\.$')
    sentences = []
    sentences_set = set()
    merged = 0

    def emit(sent):
        # Keep first occurrence only; the set gives O(1) duplicate checks.
        if sent not in sentences_set:
            sentences.append(sent)
            sentences_set.add(sent)

    for lines in raw_lines:
        for line in lines:
            pending = []  # fragments waiting for the rest of their sentence
            for s in sent_tokenize(line):
                s = s.strip()
                if s.startswith('Сообщение от'):
                    continue
                if ABRIDGED_SPLIT.search(s) or INITIALS_SPLIT.search(s):
                    pending.append(s)
                    continue
                if pending:
                    merged += 1
                pending.append(s)
                # Fragments are stripped, so join them with a single space;
                # plain concatenation produced e.g. "ул.Ленина".
                emit(' '.join(pending))
                pending = []
            if pending:
                # The line ended on an abbreviation/initials: emit what was
                # collected instead of silently dropping it.
                emit(' '.join(pending))
    print(merged)
    return sentences

In [25]:
%%time

raw_lines = []
with find_people_data.open() as f:
    for line in f:
        text = json.loads(line)['text']
        raw_lines.append([text])

sentences = split_lines(raw_lines)

with find_people_sentences.open('w') as f:
    for s in sentences:
        print(s, file=f)

320
CPU times: user 258 ms, sys: 47 µs, total: 258 ms
Wall time: 258 ms


In [6]:
%%time

raw_lines = []
with vk_flat_data.open() as f:
    cr = csv.reader(f)
    for cell in cr:
        cell = cell[0].strip()
        if cell:
            raw_lines.append([' '.join(line.strip() for line in cell.splitlines() if line.strip())])

sentences = split_lines(raw_lines)

with vk_flat_sentences.open('w') as f:
    for s in sentences:
        print(s, file=f)

10088
CPU times: user 6.44 s, sys: 20 ms, total: 6.46 s
Wall time: 6.46 s


In [22]:
%%time

tags = re.compile(r'\ ?#\w+')
numbers = re.compile(r'^\d+\.? *')

raw_lines = []
with vk_ads_data.open() as f:
    cr = csv.reader(f)
    for cell in cr:
        if cell:
            cell = cell[0].strip()
        if cell:
            raw_lines.append([' '.join(numbers.sub('', tags.sub('', line)).strip() for line in cell.splitlines() if line.strip())])

sentences = split_lines(raw_lines)

with vk_ads_sentences.open('w') as f:
    for s in sentences:
        print(s, file=f)

233
CPU times: user 135 ms, sys: 0 ns, total: 135 ms
Wall time: 139 ms


In [7]:
%%time

raw_lines = []
with yuristforum_data.open() as f:
    for line in f:
        j = json.loads(line)
        raw_lines.append([line.strip() for line in j['answer'].splitlines() if line.strip()])

sentences = split_lines(raw_lines)

with corpus_sentences.open('w') as f:
    for s in sentences:
        print(s, file=f)

37182


NameError: name 'corpus_sentences' is not defined

In [5]:
%%time

raw_lines = []
with legalforum_data.open() as f:
    for line in f:
        j = json.loads(line)
        raw_lines.append(j['text'].splitlines())
        
sentences = split_lines(raw_lines)

with legalforum_sentences.open('w') as f:
    for s in sentences:
        print(s, file=f)

29426
CPU times: user 26.9 s, sys: 196 ms, total: 27.1 s
Wall time: 27.1 s


In [6]:
%%time

raw_lines = []
with zonazakona_data.open() as f:
    for line in f:
        j = json.loads(line)
        raw_lines.append(j['text'].splitlines())
        
sentences = split_lines(raw_lines)

with zonazakona_sentences.open('w') as f:
    for s in sentences:
        print(s, file=f)

291937
CPU times: user 5min 1s, sys: 6.38 s, total: 5min 7s
Wall time: 5min 10s
