In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# @Date    : 2019-03-04 17:31:29
# @Author  : Bruce Bai (guangtong.bai@wisc.edu)

import os

import pandas as pd
import re

In [17]:
# Directory containing the filtered documents; it is listed with os.listdir
# and its files are opened by gen_feature_label_doc.
FILTERED_DOC_DIR = '../filtered_documents/'
# Longest candidate, in words: gen_feature_label_doc generates examples of
# every length from 1 up to and including this value.
MAX_EXAMPLE_LEN = 3
# Names of the feature columns. Together with a leading 'example' column they
# define the column order of every feature DataFrame built in this notebook.
FEATURE_LIST = [
    'all_lowercase',
    
    # Punctuation / capitalisation features computed from the padded window.
    'surrounded_by_paren',
    'has_left_comma',
    'has_right_comma',
    'has_left_period',
    'has_right_period',
    'first_last_word_capital',
    'surrounding_word_capital',
    
    # NOTE(review): the entries below are declared here, but
    # gen_feature_label_example_len only contains placeholder comments for
    # them, so these columns are never filled in by that function.
    'prefix_in_whitelist',
    'prefix_in_blacklist',
    'suffix_in_whitelist',
    'suffix_in_blacklist',
    'end_with_prime_s',
    'tf',
    # NOTE(review): the matching placeholder comment in
    # gen_feature_label_example_len says "idf", not "df" - confirm which one
    # is intended.
    'df',
    'tf-idf'
]

In [3]:
def brackets_matching(example_padded, lbrace, rbrace):
    """Return 1 if the padded window looks enclosed by ``lbrace``/``rbrace``, else 0.

    ``example_padded`` is a list of tokens made of two padding tokens, the
    candidate words, and two more padding tokens, so the number of candidate
    words is ``len(example_padded) - 4``.

    The decision is taken in three steps:

    1. Find the highest index in ``[n_words - 1, n_words + 1]`` whose token
       contains ``lbrace`` and the lowest index in ``[2, 4]`` whose token
       contains ``rbrace``.
    2. Tentatively accept when the opener sits at index <= 2 and the closer
       sits at index >= ``n_words + 1``.
    3. Reject when a token at index ``n_words - 1`` or ``n_words`` contains
       ``rbrace`` at or after the opener, or when a token at index 4 or 3
       contains ``lbrace`` at or before the closer.

    NOTE(review): the opener window depends on the example length while the
    closer window is fixed at indices 2-4; this looks asymmetric and may only
    be what was intended for one-word examples - confirm with the author.

    Args:
        example_padded: list of string tokens (padding + words + padding).
        lbrace: opening symbol to look for, e.g. ``'('`` or ``'{'``.
        rbrace: closing symbol to look for, e.g. ``')'`` or ``'}'``.

    Returns:
        int: 1 when the window is considered enclosed, otherwise 0.
    """
    total = len(example_padded)
    n_words = total - 4

    # Last (highest) position in the opener window that contains lbrace;
    # -1 means "not found".
    opener_at = max(
        (pos for pos in range(n_words - 1, n_words + 2)
         if lbrace in example_padded[pos]),
        default=-1,
    )

    # First (lowest) position among indices 2..4 that contains rbrace;
    # `total` means "not found".
    closer_at = min(
        (pos for pos in (2, 3, 4) if rbrace in example_padded[pos]),
        default=total,
    )

    enclosed = (-1 < opener_at <= 2) and (n_words + 1 <= closer_at < total)
    result = 1 if enclosed else 0

    # A closing symbol at or after the opener, seen in the left part of the
    # window, cancels the match.
    if any(rbrace in example_padded[pos] and pos >= opener_at
           for pos in range(n_words - 1, n_words + 1)):
        result = 0

    # An opening symbol at or before the closer, seen at index 4 or 3,
    # cancels the match as well.
    if any(lbrace in example_padded[pos] and pos <= closer_at
           for pos in (4, 3)):
        result = 0

    return result

In [4]:
def has_surrounded_symbol(example_padded, pos, symbol):
    """Tell whether a boundary token of the padded window ends with ``symbol``.

    ``example_padded`` is a list of tokens made of two padding tokens, the
    candidate words, and two more padding tokens.

    Args:
        example_padded: list of string tokens (padding + words + padding).
        pos: ``'left'`` inspects the token at index 1 (the padding token
            immediately before the first candidate word); any other value
            inspects the token at index ``example_len + 1`` (the last
            candidate word).
        symbol: single character to compare against the token's last
            character, e.g. ``','`` or ``'.'``.

    Returns:
        int: 1 if the inspected token's last character equals ``symbol``,
        otherwise 0. An empty token yields 0.
    """
    example_len = len(example_padded) - 4
    token_index = 1 if pos == 'left' else example_len + 1
    token = example_padded[token_index]
    # Slice rather than index: splitting text on single spaces produces empty
    # tokens for consecutive spaces, and ''[-1] would raise IndexError.
    return 1 if token[-1:] == symbol else 0

In [20]:
def gen_feature_label_example_len(text, example_len):
    """Build feature and label rows for every ``example_len``-word window of ``text``.

    ``text`` is split on single spaces and a window of ``example_len`` words
    is slid over it one token at a time. Each window is taken together with
    two tokens of context on either side, so the first and last two tokens of
    ``text`` only ever serve as context.

    Rows are collected in plain lists and each DataFrame is built once at the
    end; growing a DataFrame row by row is quadratic, and ``DataFrame.append``
    no longer exists in pandas >= 2.0.

    Args:
        text: document text, already padded with two tokens at each end.
        example_len: number of words in each candidate example.

    Returns:
        tuple: ``(X_len, y_len)`` where ``X_len`` has the columns
        ``['example'] + FEATURE_LIST`` (features that are not computed yet
        are left as missing values) and ``y_len`` has the columns
        ``['example', 'is_person_name']``. Both frames have one row per
        window, in document order.
    """
    feature_columns = ['example'] + FEATURE_LIST
    label_columns = ['example', 'is_person_name']

    # A token "starts with a capital" when its first ASCII letter is
    # upper-case; leading non-letters such as '(' or '{' are skipped.
    # NOTE(review): '.' does not match a newline, so a token that ends with
    # '\n' never matches this pattern - confirm that is acceptable.
    capital_start = re.compile('[^a-zA-Z]*[A-Z].*')
    # Matches a non-empty token that contains no upper-case ASCII letter.
    no_uppercase = re.compile('[^A-Z]+')

    parts = text.split(' ')
    feature_rows = []
    label_rows = []

    # Start at 2 and stop early enough to leave two context tokens per side.
    for index in range(2, len(parts) - example_len - 1):
        # ========================================================================
        # "example_padded" has the following form:                              ||
        # [pad_0, pad_1, word_1, ..., word_n,                 pad_-2, pad_-1]   ||
        #  0      1      2            len-3 (example_len+1)   len-2   len-1     ||
        # ========================================================================
        example_padded = parts[index-2:index+example_len+2]
        words = example_padded[2:2+example_len]
        example = ' '.join(words)
        feature_dict = {'example': example}

        # generate "surrounded_by_paren" feature
        feature_dict['surrounded_by_paren'] = brackets_matching(example_padded, '(', ')')

        # generate the comma / period features on both sides of the example
        feature_dict['has_left_comma'] = has_surrounded_symbol(example_padded, 'left', ',')
        feature_dict['has_right_comma'] = has_surrounded_symbol(example_padded, 'right', ',')
        feature_dict['has_left_period'] = has_surrounded_symbol(example_padded, 'left', '.')
        feature_dict['has_right_period'] = has_surrounded_symbol(example_padded, 'right', '.')

        # generate "first_last_word_capital" feature
        first_capital = capital_start.fullmatch(example_padded[2])
        last_capital = capital_start.fullmatch(example_padded[1+example_len])
        feature_dict['first_last_word_capital'] = 1 if (first_capital and last_capital) else 0

        # generate "surrounding_word_capital" feature
        left_capital = capital_start.fullmatch(example_padded[1])
        right_capital = capital_start.fullmatch(example_padded[2+example_len])
        feature_dict['surrounding_word_capital'] = 1 if (left_capital or right_capital) else 0

        # generate "all_lowercase" feature: 1 only if no word contains an
        # upper-case letter (an empty word counts as "not lowercase")
        feature_dict['all_lowercase'] = (
            1 if all(no_uppercase.fullmatch(word) is not None for word in words) else 0
        )

        # TODO: the remaining entries of FEATURE_LIST (prefix/suffix
        # white-/blacklists, "end_with_prime_s", "tf", "df"/"idf", "tf-idf")
        # are not computed yet.

        feature_rows.append(feature_dict)

        # generate label: person names are marked with curly braces in the text
        label = brackets_matching(example_padded, '{', '}')
        label_rows.append({'example': example, 'is_person_name': label})

    X_len = pd.DataFrame(feature_rows, columns=feature_columns)
    y_len = pd.DataFrame(label_rows, columns=label_columns)
    return X_len, y_len

In [13]:
def gen_feature_label_doc(doc_name):
    """Build the feature and label frames for one document.

    The document is read from ``FILTERED_DOC_DIR``; its first two lines (title
    and empty line) are skipped, the remaining lines are joined with spaces,
    and the text is padded with ``'. .'`` at both ends so that the first and
    last real words have two context tokens on each side. Examples of every
    length from 1 to ``MAX_EXAMPLE_LEN`` are then generated.

    Args:
        doc_name: file name of the document inside ``FILTERED_DOC_DIR``.

    Returns:
        tuple: ``(X_doc, y_doc)`` - the per-length feature frames and label
        frames concatenated in order of increasing example length, each with
        a fresh 0..n-1 index.
    """
    # Context manager so the file handle is closed even if reading fails.
    with open(os.path.join(FILTERED_DOC_DIR, doc_name), 'r') as doc:
        lines = doc.readlines()
    text = ' '.join(lines[2:]) # skip the title and empty line
    text = '. . ' + text + ' . .' # pad with '. .' at both ends

    X_frames = []
    y_frames = []
    for example_len in range(1, MAX_EXAMPLE_LEN+1):
        X_len, y_len = gen_feature_label_example_len(text, example_len)
        X_frames.append(X_len)
        y_frames.append(y_len)

    # pd.concat raises on an empty list (e.g. MAX_EXAMPLE_LEN < 1), so fall
    # back to empty frames that still carry the expected columns.
    if not X_frames:
        return (pd.DataFrame(columns=(['example'] + FEATURE_LIST)),
                pd.DataFrame(columns=['example', 'is_person_name']))

    # One concat per frame instead of repeated DataFrame.append, which was
    # removed in pandas 2.0 and copied the whole frame on every call.
    X_doc = pd.concat(X_frames, ignore_index=True)
    y_doc = pd.concat(y_frames, ignore_index=True)
    return X_doc, y_doc

In [21]:
# Documents are named "<number>.<ext>"; sort them numerically, not lexically.
doc_list = sorted(os.listdir(FILTERED_DOC_DIR), key = lambda x: int(x.split('.')[0]))

# Collect the per-document frames and concatenate once at the end;
# DataFrame.append was removed in pandas 2.0 and was quadratic in a loop.
X_frames = []
y_frames = []
# NOTE(review): only the document at position 16 is processed here, which
# looks like a debugging subset - widen the slice to build the full data set.
for doc_name in doc_list[16:17]:
    X_doc, y_doc = gen_feature_label_doc(doc_name)
    X_frames.append(X_doc)
    y_frames.append(y_doc)

# pd.concat raises on an empty list, so keep empty frames with the expected
# columns when the slice selects no document.
if X_frames:
    X = pd.concat(X_frames, ignore_index=True)
    y = pd.concat(y_frames, ignore_index=True)
else:
    X = pd.DataFrame(columns=(['example'] + FEATURE_LIST))
    y = pd.DataFrame(columns=['example', 'is_person_name'])

with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(X.head(1000), y.head(1000))

                     example all_lowercase surrounded_by_paren has_left_comma  \
0                     {Henry             0                   0              0   
1                      Hill}             0                   0              0   
2                      might             1                   0              0   
3                         be             1                   0              0   
4                          a             1                   0              0   
5                      small             1                   0              0   
6                       time             1                   0              0   
7                  gangster,             1                   0              0   
8                        who             1                   0              1   
9                        may             1                   0              0   
10                      have             1                   0              0   
11                     taken