In [8]:
def tree_to_comment_list(comments):
    """Flatten a comment forest into its top-level comments and all nested replies.

    Entries that contain the key 'error' are dropped, and their replies are
    never visited. Every other entry must provide a 'replies' list.

    Returns:
        (roots, replies): roots are the error-free top-level comments in input
        order; replies are the error-free descendants, in the order the
        depth-first walk (last pending comment first) discovers them.
    """
    roots = [node for node in comments if 'error' not in node]
    pending = list(roots)
    collected = []

    while pending:
        node = pending.pop()
        if 'error' in node:
            continue
        children = node['replies']
        collected.extend(children)
        pending.extend(children)

    return roots, [node for node in collected if 'error' not in node]

In [9]:
# Earlier variant that loaded the full pipeline: nlp = spacy.load('en_core_web_sm')
# NOTE(review): en_core_web_sm is imported but not referenced in the visible
# cells; the pipeline below is loaded by name instead.
import en_core_web_sm
import spacy

# Load the 'en_core_web_sm' pipeline with the 'tok2vec' and 'parser' components excluded.
# NOTE(review): ner() in this notebook reads token.lemma_; in spaCy v3 English
# pipelines the tagger is documented to listen to the shared tok2vec — confirm that
# excluding 'tok2vec' does not degrade the POS tags the lemmatizer relies on.
nlp = spacy.load('en_core_web_sm', exclude=['tok2vec', 'parser'])

In [10]:
# Functions for NER processing
import glob
import json
import os
import re

# Entity labels that ner() replaces with [[LABEL]] markers: every label
# reported by the loaded pipeline's 'ner' component.
accept_labels = set(nlp.get_pipe('ner').labels)
# Alternative settings kept from earlier experiments:
# accept_labels = {'PERSON'}
# accept_labels = set()

# Matches every run of whitespace. Written as a raw string so that \s reaches
# the regex engine instead of being an invalid Python string escape.
p = re.compile(r'\s+')


def trim(sentence: str) -> str:
    """Return sentence with each whitespace run (spaces, tabs, newlines) replaced by one space.

    Leading and trailing whitespace is collapsed to a single space, not removed.
    """
    return p.sub(' ', sentence)

def ner(sentences):
    """Mask named entities in each sentence and lemmatise the remaining tokens.

    Each sentence is whitespace-normalised with trim() and run through the
    module-level nlp pipeline. An entity whose label is in accept_labels is
    collapsed into a single marker; every other token is replaced by its
    lower-cased lemma. Tokens of one sentence are joined with single spaces.

    Returns:
        (sentence_list, joint_sentence_list), both aligned with sentences:
        the first writes entities as "[[LABEL]]", the second as
        "[[LABEL:::lower-cased entity text]]".
    """
    tagged_sentences = []
    joint_sentences = []
    cleaned = [trim(sentence) for sentence in sentences]

    for doc in nlp.pipe(cleaned, batch_size=1000):
        # Look entities up by the index of their first token.
        entity_starting_at = {ent.start: ent for ent in doc.ents}
        tagged, joint = [], []

        position = 0
        token_count = len(doc)
        while position < token_count:
            ent = entity_starting_at.get(position)
            if ent is not None and ent.label_ in accept_labels:
                tagged.append(f'[[{ent.label_}]]')
                joint.append(f'[[{ent.label_}:::{ent.text.lower()}]]')
                # Jump past every token covered by the entity.
                position = ent.end
            else:
                lemma = doc[position].lemma_.lower()
                tagged.append(lemma)
                joint.append(lemma)
                position += 1

        tagged_sentences.append(' '.join(tagged))
        joint_sentences.append(' '.join(joint))

    return tagged_sentences, joint_sentences

In [12]:
# Run NER over the comment texts of every article's comments.json and write
# the tagged sentences next to it, one sentence per line.

import glob
import json
import os

url_set = set()

for dir_path in glob.glob('/Users/iijima.s.ad/git/article-extractor/articles/fox/military/*/'):
    info_path = os.path.join(dir_path, 'info.json')
    if not os.path.exists(info_path):
        continue

    with open(info_path, 'r') as f:
        url = json.load(f)['URL']

    # Record the URL before the "already processed" check below, so that a
    # second directory holding the same article is still skipped on a re-run.
    if url in url_set:
        continue
    url_set.add(url)

    # Skip directories whose output already exists. The marker has to be a file
    # this loop writes: the previous check looked for 'NER_roots.json', which no
    # visible cell creates, so processed directories were never skipped.
    if os.path.exists(os.path.join(dir_path, 'NER_roots.txt')):
        continue

    with open(os.path.join(dir_path, 'comments.json'), 'r') as f:
        roots, replies = tree_to_comment_list(json.load(f))

    ner_roots, joint_roots = ner([root['context'] for root in roots])
    ner_replies, joint_replies = ner([reply['context'] for reply in replies])

    # The *_roots files hold top-level comments only; *_comments adds the replies.
    file_pairs = [
        ('NER_roots.txt', ner_roots),
        ('joint_roots.txt', joint_roots),
        ('NER_comments.txt', ner_roots + ner_replies),
        ('joint_comments.txt', joint_roots + joint_replies),
    ]
    for base_name, sentences in file_pairs:
        with open(os.path.join(dir_path, base_name), 'w') as f:
            # Terminate every line, including the last one: these files are later
            # concatenated, and an unterminated last line would be glued to the
            # first sentence of the next file.
            f.writelines(f'{sentence}\n' for sentence in sentences)


In [49]:
# Concatenate every NER_roots.txt under the fox articles into one training file.
# -name matches the exact file name and -print0 / -0 keep unusual paths intact.
# `awk 1` re-emits each line newline-terminated, so the last sentence of a file
# that lacks a final newline is not glued to the first sentence of the next file.
!find /Users/iijima.s.ad/git/article-extractor/articles/fox/ -name 'NER_roots.txt' -print0 | xargs -0 awk 1 > /Users/iijima.s.ad/git/workspace/articles/fox/ner_train3.txt

In [3]:
# Re-encode insert_train.txt in place with nkf (-w selects UTF-8 output,
# --overwrite replaces the original file).
# NOTE(review): no visible cell produces insert_train.txt — confirm where it comes from.
!nkf -w --overwrite /Users/iijima.s.ad/git/workspace/articles/fox/insert_train.txt

In [3]:
# Count the lines containing 'context' across all comments.json files
# (wc prints lines, words and bytes).
!find /Users/iijima.s.ad/git/article-extractor/articles/fox | grep 'comments.json$' | xargs cat | grep 'context' | wc
# Count the comments.json files themselves (one path per line).
!find /Users/iijima.s.ad/git/article-extractor/articles/fox | grep 'comments.json$' | wc

  632240 15709992 102484322
    2748    2748  399642


In [75]:
%%bash
# For every joint_roots.txt, write a sibling insert_roots.txt in which each
# "[[LABEL:::entity text]]" marker becomes "[[LABEL]] entity text".
# Paths are passed NUL-delimited and quoted so they are never word-split.
find /Users/iijima.s.ad/git/article-extractor/articles/fox -name 'joint_roots.txt' -print0 |
while IFS= read -r -d '' path
do
    # Only the trailing file name is swapped; directory names are left untouched.
    sed -E 's@:::([^]]+)\]\]@\]\] \1@g' "${path}" > "${path%joint_roots.txt}insert_roots.txt"
done

In [11]:
# Keep only the rows of insert_article.vec that split into exactly 101
# space-separated fields and write them to insert_article2.vec under a new
# "<row count> 100" first line.
with open('./models/wordvec/insert_article.vec', 'r') as input_file, \
        open('./models/wordvec/insert_article2.vec', 'w') as output_file:
    # The first line of the input is discarded.
    next(input_file, None)
    # 100 separating spaces <=> 101 fields.
    tmp_list = [row for row in input_file if row.strip().count(' ') == 100]
    output_file.write(f'{len(tmp_list)} 100\n')
    output_file.writelines(tmp_list)

In [13]:
# Sanity check: print how many rows after the first line of insert_article2.vec
# do not split into exactly 101 space-separated fields.
with open('./models/wordvec/insert_article2.vec', 'r') as input_file:
    next(input_file, None)
    print(sum(1 for row in input_file if row.strip().count(' ') != 100))

0


In [66]:
# Split test_insert.txt into test.txt with one row per PERSON tag.
#
# Each row of test_insert.txt (four tab-separated fields, of which the 2nd and
# 3rd are kept as aspect and senti) is paired with the matching line among the
# first 30 lines of joint_roots.txt. For every [[PERSON:::...]] marker in that
# line one row is emitted in which this marker becomes [[PERSON]] and every
# other marker is replaced by its entity text. Lines without a PERSON marker
# are emitted once with all markers replaced by their entity text.
import re

sentences = []
with open('/Users/iijima.s.ad/git/JASen/datasets/fox/test_insert.txt', 'r') as input_file, open('/Users/iijima.s.ad/git/article-extractor/articles/fox/drones/biden-admin-plan-counter-threats-drones/joint_roots.txt', 'r') as sentence_file:
    sentence_id = 0

    for line, sentence in zip(input_file.readlines(), sentence_file.readlines()[:30]):
        _, aspect, senti, __ = line.strip().split('\t')
        # Strip once and use the stripped text everywhere: the match offsets
        # below must refer to the same string that is sliced with them.
        sentence = sentence.strip()
        for m in re.finditer(r'\[\[PERSON:::.*?\]\]', sentence):
            start, end = m.span()
            tmp_sentence = (
                re.sub(r'\[\[[^:]+:::(.*?)\]\]', r'\1', sentence[:start])
                + '[[PERSON]]'
                + re.sub(r'\[\[[^:]+:::(.*?)\]\]', r'\1', sentence[end:])
            )
            sentences.append(f'{sentence_id}\t{aspect}\t{senti}\t{tmp_sentence.strip()}\n')
            sentence_id += 1
        if '[[PERSON:::' not in sentence:
            tmp_sentence = re.sub(r'\[\[[^:]+:::(.*?)\]\]', r'\1', sentence)
            # Always terminate the row explicitly instead of relying on the
            # input line's own newline, which the last line of a file may lack.
            sentences.append(f'{sentence_id}\t{aspect}\t{senti}\t{tmp_sentence.strip()}\n')
            sentence_id += 1

with open('/Users/iijima.s.ad/git/JASen/datasets/fox/test.txt', 'w') as f:
    f.writelines(sentences)

In [97]:
# Turn joint-format lines into sentences that each carry exactly one tag.
def replace_to_split_sentences(lines):
    """Expand joint-format lines into one sentence per entity marker.

    For every "[[LABEL:::text]]" marker found in a line, one sentence is
    produced in which that marker is reduced to "[[LABEL]]" and every other
    marker is replaced by its entity text. Lines without any marker produce
    nothing. Line endings are carried over unchanged from the input.

    Returns the produced sentences as a list, in input order.
    """
    marker = re.compile(r'\[\[([^:]+):::.*?\]\]')

    def strip_markers(text):
        # Replace every marker in text by the entity text it carries.
        return re.sub(r'\[\[[^:]+:::(.*?)\]\]', r'\1', text)

    return [
        strip_markers(line[:found.start()]) + '[[' + found.group(1) + ']]' + strip_markers(line[found.end():])
        for line in lines
        for found in marker.finditer(line)
    ]

In [86]:
# Remove a trailing 's from a word.
def trim_s(word):
    """Return word without its trailing "'s"; other words are returned unchanged."""
    if word.endswith("'s"):
        return word[:-2]
    return word

In [103]:
# Rewrite every article_joint.txt as an article_tag.txt with one tag per sentence.

import glob

for path in glob.glob('/Users/iijima.s.ad/git/article-extractor/articles/fox/*/*/article_joint.txt'):
    tag_path = path.replace('article_joint.txt', 'article_tag.txt')
    with open(tag_path, 'w') as f, open(path, 'r') as joint:
        # Terminate every input line first: a last line without '\n' would
        # otherwise have all of its tagged variants written back to back on
        # a single output line.
        joint_lines = [line if line.endswith('\n') else line + '\n' for line in joint]
        f.writelines(replace_to_split_sentences(joint_lines))

In [101]:
# Convert joint_train.txt into tag_train.txt with one tag per sentence.

with open('./articles/fox/joint_train.txt', 'r') as train_file, open('./articles/fox/tag_train.txt', 'w') as f:
    # Terminate every input line first: a last line without '\n' would
    # otherwise have all of its tagged variants written back to back on
    # a single output line.
    train_lines = [line if line.endswith('\n') else line + '\n' for line in train_file]
    f.writelines(replace_to_split_sentences(train_lines))