# Data Preprocessing
First we need to process given training and test datasets to a format that is more friendly to the machine learning framework. Let's start by understanding the raw inputs.

## Load libraries

In [1]:
import sys,os
import random
print(sys.path)
if ".." not in sys.path:
    sys.path.append("..")

['', '/home/hayley/miniconda3/envs/fastai/lib/python36.zip', '/home/hayley/miniconda3/envs/fastai/lib/python3.6', '/home/hayley/miniconda3/envs/fastai/lib/python3.6/lib-dynload', '/home/hayley/miniconda3/envs/fastai/lib/python3.6/site-packages', '/home/hayley/miniconda3/envs/fastai/lib/python3.6/site-packages/defusedxml-0.5.0-py3.6.egg', '/home/hayley/miniconda3/envs/fastai/lib/python3.6/site-packages/IPython/extensions', '/home/hayley/.ipython']


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats

# sklearn imports
import sklearn
from sklearn.metrics import make_scorer
# from sklearn.cross_validation import cross_val_score
from sklearn.model_selection import RandomizedSearchCV, cross_val_predict

# sklearn_crfsuite imports
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

# pytorch imports 
import torch
import torch.autograd as autograd
from torch.utils.data import Dataset, DataLoader

from torch import Tensor
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# train logging
import logging
from tqdm import trange
# import .utils as my_utils
# from nlp_utils.model_evaluate import evaluate
# from nlp_utils import model_utils
# set a random seed
torch.manual_seed(10);

# model saving and inspection
import joblib
import eli5

import pdb 

# auto-reloads
%reload_ext autoreload
%autoreload 2

In [4]:
print(f"sklearn version: {sklearn.__version__}")
print(f"pytorch version: {torch.__version__}")
# make sure we are using pytorch > 0.4.0

sklearn version: 0.20.0
pytorch version: 0.4.1


In [56]:
print(sys.path)

['', '/home/hayley/miniconda3/envs/fastai/lib/python36.zip', '/home/hayley/miniconda3/envs/fastai/lib/python3.6', '/home/hayley/miniconda3/envs/fastai/lib/python3.6/lib-dynload', '/home/hayley/miniconda3/envs/fastai/lib/python3.6/site-packages', '/home/hayley/miniconda3/envs/fastai/lib/python3.6/site-packages/defusedxml-0.5.0-py3.6.egg', '/home/hayley/miniconda3/envs/fastai/lib/python3.6/site-packages/IPython/extensions', '/home/hayley/.ipython', '..']


In [59]:
def nprint(*args):
    print("="*80)
    for arg in args:
        print(arg)
def test_nprint():
    nprint("hi", "hey")
test_nprint()

hi
hey


## Create datasets from the raw files
We first collect sentences and labels from the raw files while tokenizing each sentence.
For each sentence, we replace `<e1>` and `</e1>` with `E1_START` and `E1_END` tags, and `<e2>` and `</e2>` with `E2_START` and `E2_END`.  

In [6]:
# Define globals
## Note space at the head or the tail
E1_START = "E1_START "
E1_END = " E1_END "
E2_START = "E2_START "
E2_END = " E2_END"

In [79]:
from os.path import dirname, abspath, join

# Set up POS tagger and Tokenizer
work_dir = !pwd
work_dir = work_dir[0]
postagger_path = '/home/hayley/Workspace/Class/IE/Relation-Classification/stanford/stanford-postagger-2017-06-09'
print(postagger_path)
print(work_dir)

/home/hayley/Workspace/Class/IE/Relation-Classification/stanford/stanford-postagger-2017-06-09
/home/hayley/Workspace/Class/IE/HW2/notebooks


In [8]:
os.environ['CLASSPATH'] = postagger_path

In [8]:
# Alternatively use Stanford coreNLP
# from nltk.tokenize.stanford import StanfordTokenizer
# from nltk.parse.corenlp import CoreNLPParser
# parser = CoreNLPParser()

# test
# list(parser.tokenize('what ?'))

In [95]:
# Read train and test text files
DATA_DIR = '../data'
TRAIN_FPATH = abspath(join(DATA_DIR, 'SemEval2010_task8_training', 'TRAIN_FILE.TXT'))
TEST_FPATH = abspath(join(DATA_DIR, 'SemEval2010_task8_testing', 'TEST_FILE.TXT'))

PROC_DATA_DIR = abspath(join(DATA_DIR, 'Processed'))

In [97]:
FPATH = TRAIN_FPATH #TEST_FPATH
verbose = True
def collect_sents_labels(fpath, verbose=False):
    """
    Create a 2D sentences matrix and label vector from the input txt file 
    Args:
    - fpath (str): path to the input data file (.txt)
    - verbose (bool): True to show progress
    
    Returns:
    - X (np.array or list): sentences collected from the input file
        Each row is a sentence.  A sentence is a list of words.
    - y (np.array or list): labels corresponding to the type of relation marked
        at each sentence
    """
        
    sentences = []
    y = []
    for i,line in enumerate(open(fpath, 'r')):
        line = line.rstrip()
    #     if i < 10:
    #         print(f"{i}: {line}")

        if (i%4 == 0):
            line = line.split("\t")[-1] # grab just the strings (ignore sentence index)
            line = line[1:-1] # remove quotation marks

            if i<2:
                print(line)

            # Replace XML tags
            line = line.replace("<e1>", E1_START).replace("</e1>", E1_END)
            line = line.replace("<e2>", E2_START).replace("</e2>", E2_END)

            # Tokenize the string
            line = StanfordTokenizer().tokenize(line)
            sentences.append(line)

            if i<2:
                print(sentences[-1])

        elif (i%4 == 1):
            y.append(line)
            
        if verbose and i%100==0:
            print(i)
    assert (len(sentences) == len(y))
    assert (len(y) == 8000)
    
    return (sentences, y)

In [101]:
def collect_sents(fpath, verbose=False):
    """
    Create a 2D array for sentences for input data without labels.
    Args:
    - fpath (str): path to the input data file (.txt)
    - verbose (bool): True to show progress
    
    Returns:
    - X (np.array or list): sentences collected from the input file
        Each row is a sentence.  A sentence is a list of words.
    """
    sentences = []
    for i,line in enumerate(open(fpath, 'r')):
        # grab just the strings (ignore sentence index)
        line = line.rstrip().split("\t")[-1]
        # remove quotation marks
        line = line[1:-1] 

        # Replace XML tags
        line = line.replace("<e1>", E1_START).replace("</e1>", E1_END)
        line = line.replace("<e2>", E2_START).replace("</e2>", E2_END)

        # Tokenize the string
        line = StanfordTokenizer().tokenize(line)
        sentences.append(line)
#         print(line)
#         pdb.set_trace()

        if i<2: print(sentences[-1]);
            
        if verbose and i%300==0: print(i);
    
    return sentences
    

In [102]:
# train_dev_sents, train_dev_labels = collect_sents_labels(TRAIN_FPATH)
TEST_FPATH = "/home/hayley/Workspace/Class/IE/HW2/data/SemEval2010_task8_testing/TEST_FILE.txt"
test_sents = collect_sents(TEST_FPATH)

['The', 'most', 'common', 'E1_START', 'audits', 'E1_END', 'were', 'about', 'E2_START', 'waste', 'E2_END', 'and', 'recycling', '.']
['The', 'E1_START', 'company', 'E1_END', 'fabricates', 'plastic', 'E2_START', 'chairs', 'E2_END', '.']


joblib.dump(train_dev_sents, '../data/Processed/train_val_sents.pkl')
joblib.dump(train_dev_labels, '../data/Processed/train_val_labels.pkl')

### Let's create a relation to index dictionary

In [66]:
labelset = np.sort(np.unique(y))

# Move 'Other' to the end of labelset
o_idx = np.argwhere(labelset=='Other')
labelset = np.append(np.delete(labelset, o_idx),['Other'])
nprint(labelset, len(labelset))


['Cause-Effect(e1,e2)' 'Cause-Effect(e2,e1)' 'Component-Whole(e1,e2)'
 'Component-Whole(e2,e1)' 'Content-Container(e1,e2)'
 'Content-Container(e2,e1)' 'Entity-Destination(e1,e2)'
 'Entity-Destination(e2,e1)' 'Entity-Origin(e1,e2)' 'Entity-Origin(e2,e1)'
 'Instrument-Agency(e1,e2)' 'Instrument-Agency(e2,e1)'
 'Member-Collection(e1,e2)' 'Member-Collection(e2,e1)'
 'Message-Topic(e1,e2)' 'Message-Topic(e2,e1)' 'Product-Producer(e1,e2)'
 'Product-Producer(e2,e1)' 'Other']
19


In [69]:
print(f"Unique relations:\n {labelset}")

Unique relations:
 ['Cause-Effect(e1,e2)' 'Cause-Effect(e2,e1)' 'Component-Whole(e1,e2)'
 'Component-Whole(e2,e1)' 'Content-Container(e1,e2)'
 'Content-Container(e2,e1)' 'Entity-Destination(e1,e2)'
 'Entity-Destination(e2,e1)' 'Entity-Origin(e1,e2)' 'Entity-Origin(e2,e1)'
 'Instrument-Agency(e1,e2)' 'Instrument-Agency(e2,e1)'
 'Member-Collection(e1,e2)' 'Member-Collection(e2,e1)'
 'Message-Topic(e1,e2)' 'Message-Topic(e2,e1)' 'Product-Producer(e1,e2)'
 'Product-Producer(e2,e1)' 'Other']


In [72]:
rel2idx = {rel:i for i,rel in enumerate(labelset)}
print(rel2idx)
                                        

{'Cause-Effect(e1,e2)': 0, 'Cause-Effect(e2,e1)': 1, 'Component-Whole(e1,e2)': 2, 'Component-Whole(e2,e1)': 3, 'Content-Container(e1,e2)': 4, 'Content-Container(e2,e1)': 5, 'Entity-Destination(e1,e2)': 6, 'Entity-Destination(e2,e1)': 7, 'Entity-Origin(e1,e2)': 8, 'Entity-Origin(e2,e1)': 9, 'Instrument-Agency(e1,e2)': 10, 'Instrument-Agency(e2,e1)': 11, 'Member-Collection(e1,e2)': 12, 'Member-Collection(e2,e1)': 13, 'Message-Topic(e1,e2)': 14, 'Message-Topic(e2,e1)': 15, 'Product-Producer(e1,e2)': 16, 'Product-Producer(e2,e1)': 17, 'Other': 18}


In [75]:
train_val_sents = sentences
train_val_labels = y #string labels
train_val_y = [rel2idx[rel] for rel in train_val_labels]
nprint(train_val_labels[:5])
nprint(train_val_y[:5])

['Component-Whole(e2,e1)', 'Other', 'Instrument-Agency(e2,e1)', 'Other', 'Member-Collection(e1,e2)']
[3, 18, 11, 18, 12]


In [76]:
assert(len(train_val_labels) == len(train_val_y))

Prepare test dataset

In [None]:
test_sents = collect_sents(TEST_FPATH, True)

The system as described above has its greatest application in an arrayed <e1>configuration</e1> of antenna <e2>elements</e2>.
['The', 'system', 'as', 'described', 'above', 'has', 'its', 'greatest', 'application', 'in', 'an', 'arrayed', 'E1_START', 'configuration', 'E1_END', 'of', 'antenna', 'E2_START', 'elements', 'E2_END', '.']
0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000
8100
8200
8300
8400
8500
8600
8700
8800
8900
9000
9100
9200
9300
9400
9500
9600
9700
9800
9900
10000
10100
10200
10300
10400
10500
10600
10700
10800
10900
11000
11100
11200
11300
11400
11500
11600
11700
11800
11900
12000
12100
12200
12300
12400
12500
12600
12700
12800
12900
1

In [89]:
test_sents[:10]


[['The',
  'system',
  'as',
  'described',
  'above',
  'has',
  'its',
  'greatest',
  'application',
  'in',
  'an',
  'arrayed',
  'E1_START',
  'configuration',
  'E1_END',
  'of',
  'antenna',
  'E2_START',
  'elements',
  'E2_END',
  '.'],
 [],
 ['omment'],
 ['nstrument-Agency', '-LRB-', 'e2', ',', 'e1'],
 ['A',
  'misty',
  'E1_START',
  'ridge',
  'E1_END',
  'uprises',
  'from',
  'the',
  'E2_START',
  'surge',
  'E2_END',
  '.'],
 [],
 ['omment'],
 ['the'],
 ['The',
  'current',
  'view',
  'is',
  'that',
  'the',
  'chronic',
  'E1_START',
  'inflammation',
  'E1_END',
  'in',
  'the',
  'distal',
  'part',
  'of',
  'the',
  'stomach',
  'caused',
  'by',
  'Helicobacter',
  'pylori',
  'E2_START',
  'infection',
  'E2_END',
  'results',
  'in',
  'an',
  'increased',
  'acid',
  'production',
  'from',
  'the',
  'non-infected',
  'upper',
  'corpus',
  'region',
  'of',
  'the',
  'stomach',
  '.'],
 []]

In [85]:
# save results
# joblib.dump(train_dev_sents, os.path.join(DATA_PROCDIR, 'train_dev_sents.pkl'))
# joblib.dump(train_dev_labels, os.path.join(DATA_PROCDIR, 'train_dev_labels.pkl'))
joblib.dump(train_dev_y, os.path.join(DATA_PROCDIR, 'train_dev_y.pkl'))

joblib.dump(test_sents, os.path.join(DATA_PROCDIR, 'test_sents.pkl'))

The StanfordTokenizer will be deprecated in version 3.2.5.
Please use [91mnltk.parse.corenlp.CoreNLPParser[0m instead.'
  """Entry point for launching an IPython kernel.


['The',
 'system',
 'as',
 'described',
 'above',
 'has',
 'its',
 'greatest',
 'application',
 'in',
 'an',
 'arrayed',
 'E1_START',
 'configuration',
 'E1_END',
 'of',
 'antenna',
 'E2_START',
 'elements',
 'E2_END',
 '.']

In [None]:
# split train_dev to train and dev dataset
train_dev_indices = np.arange(len(train_dev_set


## Features 
Now that we have the sentences and relations (labels) extracted from the input text files, the next step is to generate features with which we will create the data matrices for training and testing. I will generate three different kinds of features for the purpose of my experiments. 

1. Linguistic Features
    - Part of Speech tags (POS)
    - Word Embeddings
    - WordNet tags
    - Shortest Dependency Path (SDP)
    - Grammar Relation tags (GR)

2. Word Positional Indicitors
Replace <e1></e1> with E1_START and E1_END.  Similarly, replace <e1></e1> with E1_START and E1_END

3. Word Positional Embedding

    1. Linguistic Features

    2. Word Positional Indicators
We replace <e1></e1> with E1_START and E1_END, and replace <e1></e1> with E1_START and E1_END.



In [48]:
#code for replacing <e1> -> e1_start, etc


    3. Lastly, we use word positional embedding to encode the words between the two entities.  
For example:

<img/>



In [None]:
#code for word positional embedding