In [1]:
import json
import os
import re
import subprocess
from tqdm import tqdm

In [2]:
AUTOPHRASE_PATH = '/Users/rishimasand/Documents/school/college/research/text_mining/AutoPhrase'

In [3]:
def model2segmented_text_path(model_name):
    return os.path.join(AUTOPHRASE_PATH, model_name, 'segmentation.txt')

In [4]:
def train_autophrase(text_to_seg, model):
    if os.path.exists(model2segmented_text_path(model)):
        return

    os.environ['RAW_TRAIN'] =  os.path.abspath(text_to_seg)
    os.environ['MODEL'] = model
    mycwd = os.getcwd()
    print(os.getcwd())
    os.chdir(AUTOPHRASE_PATH)
    print(os.getcwd())
    proc = subprocess.Popen('bash auto_phrase.sh'.split(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    for line in proc.stdout:
        print(line)
    os.chdir(mycwd)

In [5]:
def segment(text_to_seg, model):
    os.environ['TEXT_TO_SEG'] = os.path.abspath(text_to_seg)   # tell autophrase with abspath
    os.environ['MODEL'] = model
    mycwd = os.getcwd()
    os.chdir(AUTOPHRASE_PATH)
    proc = subprocess.Popen('bash phrasal_segmentation.sh'.split(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    for line in proc.stdout:
        print(line)
    os.chdir(mycwd)

In [6]:
def remove_marker(text):
    return re.sub('</?phrase>', '', text)

In [7]:
def get_line_count(inFile):
    count = -1
    for count, line in enumerate(open(inFile, 'r')):
        pass
    count += 1
    return count

In [8]:
def condenseSpace(s):
    return re.sub('([\s])+', '\g<1>', s)

In [9]:
def validate_nps(nps, words_original):
    validated_nps = []
    for np in sorted(nps, key=lambda x:x['st']):
        st = np['st']
        ed = np['ed']
        token_span = words_original[st:ed]
        # 'A polynomial time algorithm for the Lambek calculus with brackets of  bounded order'
        if ' '.join(token_span).strip() != np['text'].strip():
            print(' '.join(token_span))
            print(np)
            return validated_nps
        validated_nps.append(np)
    return nps

In [10]:
all_ap_tokens = []
all_nps = []
def write_to_json(inFile, outFile, originalFile):
    with open(inFile, 'r') as fin, open(outFile, 'w') as fout, open(originalFile, 'r') as fOriginal:
        total = get_line_count(inFile)

        cnt = 0
        data = []
        for i, (line, line_original) in tqdm(enumerate(zip(fin, fOriginal)), total=total):
            text = line.strip()

            tokens = text.split(' ')
            original_tokens = line_original.split()
            all_ap_tokens.extend(original_tokens)
            clean_tokens = condenseSpace(remove_marker(text)).split(' ')
            nps = []
            for idx, token in enumerate(tokens):
                if '<phrase>' in token:
                    if token.startswith('<phrase>'):
                        span = {'st': idx}
                    else:
                        span = {}
                elif '</phrase>' in token:
                    if token.endswith('</phrase>'):
                        try:
                            if span:
                                span['ed'] = idx + 1
                                span['text'] = ' '.join(clean_tokens[span['st']:span['ed']])
                                nps.append(span)
                            span = {}
                        except Exception as e:
                            ipdb.set_trace()
                            print(e)
                    else:
                        span = {}
            if nps:
                nps_v = validate_nps(nps, original_tokens)
                if nps_v != nps:
                    ipdb.set_trace()
                nps = nps_v
            fout.write(json.dumps(nps))
            all_nps.extend(nps)
            fout.write('\n')
    with open('data/arxiv_abstracts_10000_autophrase_raw.txt', 'w') as f:
        f.write('\n'.join(list(set(all_ap_tokens))))
        f.close()
    with open('data/arxiv_abstracts_10000_autophrase_nps.txt', 'w') as f:
        f.write('\n'.join(list(set([x['text'] for x in all_nps]))))
        f.close()

In [11]:
data_dir = 'data'
input_file_name = 'arxiv_abstracts_10000.txt'
extensionless_input_file_name = input_file_name.split('.')[0]
input_file_path = f'{data_dir}/{input_file_name}'
tokenized_text_autophrase_file_path = f'{data_dir}/{extensionless_input_file_name}_autophrase.json'

autophrase_model_name = extensionless_input_file_name

train_autophrase(input_file_path, autophrase_model_name)
segment(input_file_path, autophrase_model_name)

write_to_json(model2segmented_text_path(autophrase_model_name), tokenized_text_autophrase_file_path, input_file_path)

b'\x1b[32m===Compilation===\x1b[m\n'
b'\x1b[32m===Tokenization===\x1b[m\n'
b'Current step: Tokenizing input file...\x1b[0K\r\n'
b'real\t0m2.143s\n'
b'user\t0m12.244s\n'
b'sys\t0m0.467s\n'
b'Detected Language: EN\x1b[0K\n'
b'\x1b[32m===Part-Of-Speech Tagging===\x1b[m\n'
b'Current step: Splitting files...\x1b[0K\rCurrent step: Tagging...\x1b[0K\rCurrent step: Merging...\x1b[0K\r\n'
b'\x1b[32m===Phrasal Segmentation===\x1b[m\n'
b'=== Current Settings ===\n'
b'Segmentation Model Path = arxiv_abstracts_10000/segmentation.model\n'
b'After the phrasal segmentation, only following phrases will be highlighted with <phrase> and </phrase>\n'
b'\tQ(multi-word phrases) >= 0.500000\n'
b'\tQ(single-word phrases) >= 0.800000\n'
b'POS guided model loaded.\n'
b'# of loaded patterns = 19370\n'
b'# of loaded truth patterns = 47857\n'
b'POS transition matrix loaded\n'
b'Phrasal segmentation finished.\n'
b'   # of total highlighted quality phrases = 204754\n'
b'   # of total processed sentences = 68231\n'
b

  3%|▎         | 321/10000 [00:00<00:03, 3204.90it/s]

b'\n'
b'real\t0m1.836s\n'
b'user\t0m3.542s\n'
b'sys\t0m0.223s\n'


100%|██████████| 10000/10000 [00:02<00:00, 3349.27it/s]
