In [6]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# @Date    : 2019-03-04 17:31:29
# @Author  : Bruce Bai (guangtong.bai@wisc.edu)

import os

import pandas as pd

In [7]:
# Directory holding the filtered input documents; the trailing slash matters
# because document names are concatenated directly onto this string.
FILTERED_DOC_DIR = '../filtered_documents/'

# Candidate examples are runs of 1..MAX_EXAMPLE_LEN consecutive words.
MAX_EXAMPLE_LEN = 3

# Feature column names, in the order they appear in the feature frame.
# The groupings below are inferred from the names only.
# NOTE(review): the placeholder comments in gen_feature_label_example_len
# mention an "idf" feature while this list has 'df' -- confirm which is meant.
FEATURE_LIST = [
    # punctuation next to the example
    'has_left_paren', 'has_right_paren',
    'has_left_comma', 'has_right_comma',
    'has_left_period', 'has_right_period',
    # capitalisation
    'first_last_word_capital', 'surrounding_word_capital', 'all_lowercase',
    # prefix / suffix word lists
    'prefix_in_whitelist', 'prefix_in_blacklist',
    'suffix_in_whitelist', 'suffix_in_blacklist',
    # word ending
    'end_with_prime_s',
    # frequency statistics
    'tf', 'df', 'tf-idf',
]

In [36]:
def _label_example(example_padded, example_len):
    """Return 1 if the example sits inside a ``{...}`` marked span, else 0.

    ``example_padded`` is the example with two context tokens on each side:

        [pad_0, pad_1, word_1, ..., word_n, pad_-2, pad_-1]
         0      1      2            n+1     n+2     n+3      (n = example_len)

    The label is 1 when all of the following hold:

    * the right-most '{' found in the left padding or the example's words is
      at or before the first word;
    * the left-most '}' found in the example's words or the right padding is
      at or after the last word;
    * no '}' appears in the left padding at or after that '{' (the span would
      have closed before the example starts);
    * no '{' appears in the right padding at or before that '}' (the closing
      brace would belong to a later span).
    """
    first_word = 2
    last_word = example_len + 1
    padded_len = len(example_padded)

    # Right-most '{' over left padding + every word of the example.
    left_brace_max_index = -1
    for position in range(0, last_word + 1):
        if '{' in example_padded[position]:
            left_brace_max_index = position

    # Left-most '}' over every word of the example + right padding
    # (scanned right-to-left so the last assignment is the smallest index).
    right_brace_min_index = padded_len
    for position in range(padded_len - 1, first_word - 1, -1):
        if '}' in example_padded[position]:
            right_brace_min_index = position

    if not (-1 < left_brace_max_index <= first_word
            and last_word <= right_brace_min_index < padded_len):
        return 0

    # A '}' in the left padding after the opening brace closes the span early.
    for position in range(0, first_word):
        if '}' in example_padded[position] and position >= left_brace_max_index:
            return 0

    # A '{' in the right padding before the closing brace starts a new span.
    for position in range(last_word + 1, padded_len):
        if '{' in example_padded[position] and position <= right_brace_min_index:
            return 0

    return 1


def gen_feature_label_example_len(text, example_len):
    """Build feature and label frames for every ``example_len``-word example.

    ``text`` is split on single spaces and a window of ``example_len`` words
    plus two context tokens on each side is slid over the tokens one step at
    a time. Texts with fewer than ``example_len + 4`` tokens yield no examples.

    Parameters:
        text: the document text; callers are expected to pad both ends with
            two tokens so that the first/last real words get a full window.
        example_len: number of words per example.

    Returns:
        (X_len, y_len): two DataFrames with one row per example.
        X_len has columns ``['example'] + FEATURE_LIST``; features that are
        not computed yet are left as NaN.
        y_len has columns ``['example', 'is_person_name']`` (label 0 or 1).

    Changes from the previous version:
        * the brace-scan index ranges used for labelling were only correct
          for ``example_len == 1``; they are now derived from the window
          layout, so longer examples also inspect the padding tokens;
        * rows are collected in lists and each DataFrame is built once,
          instead of calling ``DataFrame.append`` per row (quadratic, and
          removed in pandas 2.0).
    """
    feature_columns = ['example'] + FEATURE_LIST
    label_columns = ['example', 'is_person_name']
    feature_rows = []
    label_rows = []

    parts = text.split(' ')
    padded_len = example_len + 4  # two context tokens on each side

    for start in range(len(parts) - padded_len + 1):
        # ========================================================================
        # "example_padded" has the following form:                              ||
        # [pad_0, pad_1, word_1, ..., word_n,                 pad_-2, pad_-1]   ||
        #  0      1      2            len-3 (example_len+1)   len-2   len-1     ||
        # ========================================================================
        example_padded = parts[start:start + padded_len]
        example = ' '.join(example_padded[2:2 + example_len])
        feature_dict = {'example': example}

        # "has_left_paren": '(' in the token just before the example or in
        # the example's first word.
        if '(' in example_padded[1] or '(' in example_padded[2]:
            feature_dict['has_left_paren'] = 1
        else:
            feature_dict['has_left_paren'] = 0

        # TODO: the remaining features are not generated yet:
        #   has_right_paren, has_left_comma, has_right_comma,
        #   has_left_period, has_right_period, first_last_word_capital,
        #   surrounding_word_capital, all_lowercase, prefix_in_whitelist,
        #   prefix_in_blacklist, suffix_in_whitelist, suffix_in_blacklist,
        #   end_with_prime_s, tf, idf, tf-idf

        feature_rows.append(feature_dict)
        label_rows.append({
            'example': example,
            'is_person_name': _label_example(example_padded, example_len),
        })

    X_len = pd.DataFrame(feature_rows, columns=feature_columns)
    y_len = pd.DataFrame(label_rows, columns=label_columns)
    return X_len, y_len

In [37]:
def gen_feature_label_doc(doc_name):
    """Generate feature and label frames for one document.

    Reads ``FILTERED_DOC_DIR + doc_name``, drops the first two lines (title
    and empty line), joins the remaining lines with spaces, pads the text
    with '. .' at both ends, and collects the examples of every length from
    1 to ``MAX_EXAMPLE_LEN`` via ``gen_feature_label_example_len``.

    Parameters:
        doc_name: file name of the document inside ``FILTERED_DOC_DIR``.

    Returns:
        (X_doc, y_doc): the per-length feature frames and label frames, each
        concatenated in order of increasing example length with a fresh index.

    Changes from the previous version: the file is closed deterministically
    via ``with``, and the frames are combined with a single ``pd.concat``
    instead of ``DataFrame.append`` (removed in pandas 2.0).
    """
    with open(FILTERED_DOC_DIR + doc_name, 'r') as doc:
        body_lines = doc.readlines()[2:]  # skip the title and empty line
    text = '. . ' + ' '.join(body_lines) + ' . .'  # pad with '. .' at both ends

    X_frames = []
    y_frames = []
    for example_len in range(1, MAX_EXAMPLE_LEN + 1):
        X_len, y_len = gen_feature_label_example_len(text, example_len)
        X_frames.append(X_len)
        y_frames.append(y_len)

    if not X_frames:
        # MAX_EXAMPLE_LEN < 1: pd.concat rejects an empty list, so return
        # empty frames with the expected columns instead.
        return (pd.DataFrame(columns=(['example'] + FEATURE_LIST)),
                pd.DataFrame(columns=['example', 'is_person_name']))

    X_doc = pd.concat(X_frames, ignore_index=True)
    y_doc = pd.concat(y_frames, ignore_index=True)
    return X_doc, y_doc

In [45]:
# Documents are named "<number>.<ext>"; sort them numerically. Entries whose
# name does not start with a number (e.g. ".ipynb_checkpoints") are skipped,
# since int('') would otherwise raise ValueError in the sort key.
doc_list = sorted(
    (name for name in os.listdir(FILTERED_DOC_DIR) if name.split('.')[0].isdigit()),
    key=lambda name: int(name.split('.')[0]),
)

# Collect per-document frames and concatenate once; DataFrame.append was
# removed in pandas 2.0 and is quadratic when called in a loop.
X_frames = []
y_frames = []
# NOTE(review): only the 10th document (slice 9:10) is processed -- this looks
# like a debugging subset; widen the slice to cover the full corpus.
for doc_name in doc_list[9:10]:
    X_doc, y_doc = gen_feature_label_doc(doc_name)
    X_frames.append(X_doc)
    y_frames.append(y_doc)

if X_frames:
    X = pd.concat(X_frames, ignore_index=True)
    y = pd.concat(y_frames, ignore_index=True)
else:
    # Nothing selected (fewer than 10 documents): keep empty, well-formed frames.
    X = pd.DataFrame(columns=(['example'] + FEATURE_LIST))
    y = pd.DataFrame(columns=['example', 'is_person_name'])

with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    # Printed separately so the output is the two tables themselves rather
    # than a tuple repr when run under Python 2.
    print(X.head(1000))
    print(y.head(1000))

(                        example has_left_paren has_right_paren has_left_comma  \
0                             A              0             NaN            NaN   
1                      nameless              0             NaN            NaN   
2                         first              0             NaN            NaN   
3                        person              0             NaN            NaN   
4                      narrator              0             NaN            NaN   
5                      ({Edward              1             NaN            NaN   
6                      Norton})              1             NaN            NaN   
7                       attends              0             NaN            NaN   
8                       support              0             NaN            NaN   
9                        groups              0             NaN            NaN   
10                           in              0             NaN            NaN   
11                      att