In [1]:
import json
import re
from nltk.tokenize import wordpunct_tokenize
import csv
import numpy as np
from emoji import UNICODE_EMOJI
import dateutil.parser
import html
from datetime import datetime
import pytz

# DATA_INPUT_FILE = '/data/NER/VectorX/documents_description_objects_100K.json'
DATA_INPUT_FILE = '/data/NER/VectorX/documents_description_objects_100K_random.json'
DATA_ALL_OUTPUT_FILE = '/data/NER/VectorX/dataset.csv'
DATA_TOLOKA_FILE_TEMPLATE = '/data/NER/VectorX/toloka_{}.tsv'

In [2]:
san_dquoter = re.compile(r"(«|»|“|”|``|')")
san_tokenizer = re.compile(r"[\w']+|[^\w ]")


def tokenize(s):
    return san_tokenizer.findall(san_dquoter.sub('"', s))

In [3]:
def has_emoji(s):
    for emoji in UNICODE_EMOJI:
        if emoji in s:
            return True
    return False

_bad_beginnings = ['[id', 'Фото', 'Смотрите', 'Скачать', 'Оригинал взят']
_bad_substrings = ['!!', '...']
def check(t: str) -> bool:
    for bb in _bad_beginnings:           
        if t.startswith(bb):
            return False
    for bs in _bad_substrings:
        if bs in t:
            return False
    
    if has_emoji(t):
        return False
    
    return t.count('(') == t.count(')')

In [19]:
%%time
lines = []
data = []
NumberInt = re.compile(r'NumberInt\((\d+)\)')
ISODate = re.compile(r'ISODate\(("[^"]+")\)')

quoter = re.compile(r'& ?[qQ]uot ?;')
amper = re.compile(r'& ?amp ?; ?')
words_re = re.compile(r'\w+')

photo_re = re.compile(r'[Фф]ото: [\.a-zA-Z0-9\-]+\.[a-zA-Z]+')

counter = 0

descr_set = set()

with open(DATA_INPUT_FILE) as f:
    for line in f:
        line = line.rstrip()        
        m1 = NumberInt.findall(line)
        if m1:
            line = NumberInt.sub(m1[0], line)
        m2 = ISODate.findall(line)
        if m2:
            line = ISODate.sub(m2[0], line)
            
        lines.append(line)
        if line == '}':
#             print('\n'.join(lines))
            j_orig = json.loads('\n'.join(lines))
            pub_date = j_orig['rss_pubDate']
        
            assert len(j_orig) == 2
            j = j_orig['rss']            
            assert len(j) == 1
            j = j['channel']
            assert len(j) == 1
            j = j['item']
            
            raw_description = j['description']
            guid = j['guid']
            objects = j['objects'].get('object', [])

            
            if raw_description and isinstance(raw_description, str):
                # assert isinstance(description, str), f'"{description}" is not str in object {j_orig}'
                description = html.unescape(quoter.sub('&quot;', amper.sub('&', photo_re.sub('', raw_description))))
                
                words_only = tuple(words_re.findall(description.lower()))
                if words_only not in descr_set:
                    descr_set.add(words_only)
                    tokenized = tokenize(description)
                    description_tokenized = ' '.join(tokenized)
                    data.append([guid, description, description_tokenized, len(tokenized), pub_date, objects])            
                    counter += 1

            lines = []
print(f'Read {counter} articles')

Read 98968 articles
CPU times: user 26.5 s, sys: 597 ms, total: 27.1 s
Wall time: 27 s


In [20]:
%%time
with open(DATA_ALL_OUTPUT_FILE, 'w') as fw:
    cw = csv.writer(fw)
    cw.writerow(['guid', 'descriptions', 'description_tokenized', 'token_count', 'pub_date', 'objects'])
    for row in data:
        cw.writerow(row)       

CPU times: user 7.55 s, sys: 536 ms, total: 8.08 s
Wall time: 10.4 s


In [21]:
%%time

# target_date = datetime(2017, 8, 1).replace(tzinfo=pytz.UTC).date()
target_date = datetime(2017, 8, 31).replace(tzinfo=pytz.UTC).date()
toloka_file = DATA_TOLOKA_FILE_TEMPLATE.format(target_date)

print(f'Writing to file {toloka_file}')

c = 0
with open(toloka_file, 'w') as fw:
    cw = csv.writer(fw, delimiter='\t')
    cw.writerow(['INPUT:guid', 'INPUT:orig', 'INPUT:input'])
    for guid, description, description_tokenized, token_count, pub_date, objects in data:        
        row_date = dateutil.parser.parse(pub_date).date()
        if 40 < token_count < 1750 and row_date == target_date:
            c += 1
            cw.writerow([guid, description, description_tokenized])
print(f'wrote {c} articles')

Writing to file /data/NER/VectorX/toloka_2017-08-31.tsv
wrote 257 articles
CPU times: user 7.15 s, sys: 19.6 ms, total: 7.17 s
Wall time: 7.16 s


In [17]:
max([d[4] for d in data])

'2017-08-31T23:45:16.000+0000'

In [83]:
len([d for d in ds if d < datetime(2017, 8, 5).replace(tzinfo=pytz.UTC)])

NameError: name 'ds' is not defined

In [84]:
import matplotlib.pyplot as plt
plt.hist(ds)
plt.show()

NameError: name 'ds' is not defined

In [85]:
plt.hist(L[L<1750], bins=100)
plt.show()

NameError: name 'L' is not defined