# Feature Engineering Script

In [1]:
import pandas as pd
import nltk

In [7]:
# Fetch every NLTK resource this notebook relies on so it runs on a fresh machine.
# 'punkt' backs nltk.word_tokenize, which apply_pos_and_ner calls below;
# the other three back nltk.pos_tag and nltk.chunk.ne_chunk.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\638658\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\638658\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\638658\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\words.zip.


True

In [3]:
# Location of the preprocessed SQuAD dataframe, relative to this notebook.
data_path = "../data/SQuAD_Processed_DF.h5"

In [4]:
# Load the dataframe stored under the 'df' key of the HDF5 file.
df = pd.read_hdf(data_path, key='df')

## Apply Part-of-Speech (POS) tagging and Named Entity Recognition (NER) to all text in the dataframe

In [5]:
def apply_pos_and_ner(text_list):
    """
    Run part-of-speech (POS) tagging and named entity recognition (NER)
    chunking over every entry of ``text_list``.

    Each entry is tokenized, tagged and chunked on its own, so the tagger
    only ever sees the tokens of that single entry, not its neighbours.

    Args:
        text_list: A list of strings (already-tokenized words) to process

    Returns:
        A list holding one ``nltk.chunk.ne_chunk`` result per entry,
        in the same order as ``text_list``
    """
    def _tag_and_chunk(entry):
        # tokenize -> POS-tag -> NE-chunk, for this entry only
        pos_tagged = nltk.pos_tag(nltk.word_tokenize(entry))
        return nltk.chunk.ne_chunk(pos_tagged)

    return [_tag_and_chunk(entry) for entry in text_list]

In [8]:
# Replace each text column with its POS/NER-tagged form, one column at a time.
# NOTE: the columns are overwritten in place, so this cell expects a freshly
# loaded `df`; running it a second time would feed the tagged output back
# into apply_pos_and_ner.
for column in ('question', 'context', 'text'):
    df[column] = df[column].apply(apply_pos_and_ner)

## Verify that POS and NER were applied

In [12]:
# Spot check: display the 'context' entry selected by key 0 to inspect the tagged output.
df['context'][0]

[Tree('S', [('architectur', 'NN')]),
 Tree('S', [(',', ',')]),
 Tree('S', [('school', 'NN')]),
 Tree('S', [('cathol', 'NN')]),
 Tree('S', [('charact', 'NN')]),
 Tree('S', [('.', '.')]),
 Tree('S', [('atop', 'NN')]),
 Tree('S', [('main', 'JJ')]),
 Tree('S', [('build', 'NN')]),
 Tree('S', [("'s", 'POS')]),
 Tree('S', [('gold', 'NN')]),
 Tree('S', [('dome', 'NN')]),
 Tree('S', [('golden', 'JJ')]),
 Tree('S', [('statu', 'NN')]),
 Tree('S', [('virgin', 'NN')]),
 Tree('S', [('mari', 'NN')]),
 Tree('S', [('.', '.')]),
 Tree('S', [('immedi', 'NN')]),
 Tree('S', [('front', 'NN')]),
 Tree('S', [('main', 'JJ')]),
 Tree('S', [('build', 'NN')]),
 Tree('S', [('face', 'NN')]),
 Tree('S', [(',', ',')]),
 Tree('S', [('copper', 'NN')]),
 Tree('S', [('statu', 'NN')]),
 Tree('S', [('christ', 'NN')]),
 Tree('S', [('arm', 'NN')]),
 Tree('S', [('uprais', 'NN')]),
 Tree('S', [('legend', 'NN')]),
 Tree('S', [('``', '``')]),
 Tree('S', [('venit', 'NN')]),
 Tree('S', [('ad', 'NN')]),
 Tree('S', [('omn', 'NN')]),

In [13]:
# Persist the engineered dataframe under the same 'df' key used when loading.
# The tagged columns hold Python objects, which is why PyTables reports
# that it has to pickle them.
df.to_hdf(
    '../data/SQuAD_Engineered_DF.h5',
    key='df',
)

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block1_values] [items->Index(['index', 'question', 'context', 'text'], dtype='object')]

  df.to_hdf('../data/SQuAD_Engineered_DF.h5', key = 'df')
