In [143]:
import os
import time
import sys
import csv
import pandas as pd
import numpy as np
import string
import ast
from IPython.display import display

from shutil import copyfile
from google.colab import drive
# Mount Google Drive under /content/drive; force_remount=True remounts even if a mount already exists.
drive.mount('/content/drive', force_remount=True)

# Raise the csv module's per-field size limit to sys.maxsize (the call returns the previous limit).
csv.field_size_limit(sys.maxsize)

Mounted at /content/drive


9223372036854775807

In [0]:
### File configurations
# Project root on the mounted drive; data and embedding folders are resolved beneath it.
# NOTE(review): functions later in this notebook read `embed_store` and `cache_dir`,
# which are not assigned in any cell visible here - confirm they are defined before use.
drive_dir = "/content/drive/My Drive/W266_Project/"
data_src = os.path.join(drive_dir,"ontonotes")
embed_src = os.path.join(drive_dir,"embeddings")

# File names, resolved against data_src and embed_src respectively.
train_file = 'onto.train.ner'
embedding_file = "glove.6B.50d.txt"

### Preprocessing Parameters
# Placeholder entries added to the word vocabulary for unknown and padding words.
UNK_WORD = "<UNK-WORD>"
PAD_WORD = "<PAD-WORD>"

# Placeholder entries added to the character vocabulary for unknown and padding characters.
UNK_CHAR = "<UNK-CHAR>"
PAD_CHAR = "<PAD-CHAR>"

# max number of words in a sentence, pad to this length; pad_truncate cuts longer sentences and prints a notice
SENTENCE_WIDTH = 256
# max number of characters in a word, pad to this length, will truncate if word is too long
WORD_WIDTH = 52
# symbols to map padding to
CHAR_PAD_SYMBOL = PAD_CHAR
LABEL_PAD_SYMBOL = 'O'
CASE_PAD_SYMBOL = 'other'


In [145]:
# Print the name of every entry in the project's data folder.
for file in os.listdir(os.path.join(drive_dir, 'data')):
  print(file)

TEST (1).CSV


In [0]:
# Load the tab-separated annotation file from the data folder into a DataFrame.
z = pd.read_csv(os.path.join(drive_dir, 'data', 'TEST (1).CSV'), sep="\t")

In [147]:
# Distinct label values; the recorded output includes NaN and a ' B-GPE' entry with a leading space.
z.label.unique()

array(['B-PERSON', 'I-PERSON', 'O', 'B-DATE', 'I-DATE', 'B-TITLE',
       'B-ORG', 'I-TITLE', nan, 'I-ORG', 'B-PERCENT', 'I-PERCENT',
       'B-GPE', 'I-GPE', 'B-PRODUCT', 'I-PRODUCT', 'B-CARDINAL',
       'B-MONEY', 'I-MONEY', 'B-LAW', 'I-LAW', 'B-NORP', 'I-NORP',
       ' B-GPE'], dtype=object)

In [148]:
# (rows, columns) of the frame as loaded, blank separator rows included.
z.shape

(10935, 4)

In [149]:
# Preview the first rows of the loaded frame.
z.head()

Unnamed: 0,token,company,director,label
0,Jeffrey,AMAZON.COM INC,1.0,B-PERSON
1,P.,AMAZON.COM INC,1.0,I-PERSON
2,Bezos,AMAZON.COM INC,1.0,I-PERSON
3,",",AMAZON.COM INC,1.0,O
4,age,AMAZON.COM INC,1.0,B-DATE


In [0]:
def checkPrior(blah):
  '''
  Reports whether a single cell value is missing.

  blah - a scalar cell value, e.g. the token of the previous row

  returns True when the value is NaN, None or another pandas missing marker, False otherwise
  '''
  # pd.isna compares by value; an identity test against a NaN constant misses NaN
  # objects that are not that exact singleton, and np.NaN no longer exists in NumPy 2.0
  return bool(pd.isna(blah))

# NOTE: run once per fresh load of z - after the blank rows are dropped a second run
# would no longer find any sentence boundary.
# see if prior row was a newline (blank separator rows have a missing token)
z['prior'] = z.token.shift(1).isna()
# drop empty rows; copy so the column assignment below acts on an independent frame
# instead of a slice of the original (avoids SettingWithCopyWarning)
z = z.loc[z.token.notna()].copy()
# running count of sentence starts gives every token its sentence id
z['phrase'] = z.prior.cumsum()

In [151]:
# Preview the frame again, now with the 'prior' and 'phrase' columns.
z.head()

Unnamed: 0,token,company,director,label,prior,phrase
0,Jeffrey,AMAZON.COM INC,1.0,B-PERSON,True,1
1,P.,AMAZON.COM INC,1.0,I-PERSON,False,1
2,Bezos,AMAZON.COM INC,1.0,I-PERSON,False,1
3,",",AMAZON.COM INC,1.0,O,False,1
4,age,AMAZON.COM INC,1.0,B-DATE,False,1


In [152]:
# Non-null value count of every column, grouped by label.
z.groupby('label').agg('count')

Unnamed: 0_level_0,token,company,director,prior,phrase
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
B-GPE,1,1,1,1,1
B-CARDINAL,2,2,2,2,2
B-DATE,336,336,336,336,336
B-GPE,45,45,45,45,45
B-LAW,1,1,1,1,1
B-MONEY,1,1,1,1,1
B-NORP,1,1,1,1,1
B-ORG,671,671,671,671,671
B-PERCENT,2,2,2,2,2
B-PERSON,211,211,211,211,211


In [153]:
# Number of rows labelled exactly 'B-TITLE'.
len(z[z.label == 'B-TITLE'])

511

In [154]:
# Summary statistics of the annotated sample.
# unique() keeps NaN as a value, so the two counts below include it if any is present.
print(f"Number of directors: {len(z.director.unique())}")
print(f"Number of companies: {len(z.company.unique())}")
# 'phrase' is a running sentence id, so its maximum is the sentence count.
print(f"Number of sentences: {z.phrase.max()}")
print(f"Number of unique tokens: {len(z.token.unique())}")
print(f"Average number of tokens per sentence: {z.groupby('phrase').size().mean()}")
# Only B- tags are counted, i.e. one per entity mention rather than one per token.
print(f"Instances of ORG label:{len(z[z.label == 'B-ORG'])}")
print(f"Instances of TITLE label: {len(z[z.label == 'B-TITLE'])}")

Number of directors: 50
Number of companies: 11
Number of sentences: 400
Number of unique tokens: 1705
Average number of tokens per sentence: 26.34
Instances of ORG label:671
Instances of TITLE label: 511


# Validate that the file transforms into the format expected by the model

In [0]:
# consider using tf.nn.embedding_lookup instead
# or maybe nltk.tokenize


def get_casing_ix(word):
  '''
  Classifies the casing of a word.

  The string predicates are tried in the order title, lower, upper, numeric;
  the first one that holds decides the category, anything else is 'other'.

  returns the matching index from case_to_ix
  '''
  for case_name in ('title', 'lower', 'upper', 'numeric'):
    # resolves to word.istitle(), word.islower(), word.isupper(), word.isnumeric()
    if getattr(word, 'is' + case_name)():
      return case_to_ix[case_name]
  return case_to_ix['other']

def get_word_ix(word):
  '''
  Looks up the embedding index of a word.

  The word is lowercased before the lookup in word_to_ix; a word that is not
  in the vocabulary maps to the index of UNK_WORD.

  returns word_ix
  '''
  found = word_to_ix.get(word.lower())
  return word_to_ix[UNK_WORD] if found is None else found

def get_char_ix(char):
  '''
  Looks up the index of a character in char_to_ix.

  returns the index of UNK_CHAR when the character is not in the vocabulary
  '''
  found = char_to_ix.get(char)
  return char_to_ix[UNK_CHAR] if found is None else found
  
def create_character_embeddings(words_df):
  '''
  Optional function to create pre-trained character embeddings from averaged word embeddings.  In the model we generate them from a uniform random distribution and train.

  words_df - pd.DataFrame indexed by word, one row of embedding values per word

  returns dict mapping each character to the average (rounded to 6 decimals) of the
  vectors of the words containing it; a character occurring k times in a word
  contributes that word's vector k times
  '''
  totals = {}
  counts = {}
  for word_vec in words_df.reset_index().values:
    # after reset_index the word is in position 0 and its vector follows
    vector = word_vec[1:].astype(float)
    for char in word_vec[0]:
      if char in totals:
        # build a new array: `vector` may be shared by several characters
        totals[char] = totals[char] + vector
        counts[char] += 1
      else:
        totals[char] = vector
        counts[char] = 1

  # previously the averages were computed and then discarded; hand them back
  return {char: np.round(totals[char] / counts[char], 6) for char in totals}
    
def initialize_word_embeddings(file_name, use_cache=True, debug=True, save_cache=True):
  '''
  Loads pre-trained word embeddings and builds the word <-> index lookups.

  file_name - embedding text file inside embed_src; also the key used in the HDF cache
  use_cache - try to read the parsed table from the HDF store at embed_store first
  debug - print cache status messages
  save_cache - after parsing the text file, write the table to the HDF store

  NOTE(review): embed_store is read from module scope and is not assigned in the
  cells visible here; it must exist before use_cache or save_cache can work.

  returns (words, word2ix, ix2word) where words is a float np.ndarray with one row
  per vocabulary entry (UNK_WORD and PAD_WORD rows are all zeros) and the two dicts
  map word -> row index and row index -> word
  '''
  words = None

  if use_cache:
    try:
      if debug:
        print("Attempting to load from cache")
      with pd.HDFStore(embed_store, 'r') as store:
        words = store[file_name]
      if debug:
        print("Loaded successfully")
    except Exception as err:
      # best effort: fall back to parsing the text file, but say why the cache was unusable
      words = None
      if debug:
        print(f"Cache loading failed: {err!r}")

  if words is None:
    # read the file that was asked for, not the module-level default
    words = pd.read_csv(os.path.join(embed_src, file_name), sep=" ", index_col=0, header=None, quoting=csv.QUOTE_NONE)
    # some embeddings come back with word == NaN
    # (presumably read_csv's default NA parsing of tokens such as 'null' - verify)
    words = words[~words.index.isnull()]
    # add entries for special tokens
    zero_row = [0] * len(words.columns)
    words.loc[UNK_WORD] = zero_row
    words.loc[PAD_WORD] = zero_row
    if save_cache:
      with pd.HDFStore(embed_store, 'a') as store:
        store[file_name] = words

  word2ix = {word:i for i,word in enumerate(words.index)}
  ix2word = dict(enumerate(words.index))
  words = words.to_numpy().astype(float)

  return words, word2ix, ix2word

def initialize_character_embeddings(vocab=string.printable):
  '''
  Builds the character vocabulary and its index lookups.

  vocab - iterable of characters to include (defaults to string.printable)

  returns (characters, char2ix, ix2char): the character list with UNK_CHAR and
  PAD_CHAR appended, and the two dicts mapping character -> index and index -> character
  '''
  # honour the caller's vocabulary; it used to be ignored in favour of string.printable
  characters = list(vocab)
  characters += [UNK_CHAR, PAD_CHAR]
  char2ix = {ch:i for i, ch in enumerate(characters)}
  ix2char = dict(enumerate(characters))

  return characters, char2ix, ix2char

def initialize_case_embeddings(vocab=['upper','lower','title','numeric','other']):
  '''
  Builds one-hot casing embeddings and their index lookups.

  vocab - iterable of casing category names

  returns (cases, case2ix, ix2case): a 2-d array holding one one-hot row per
  category, and the two dicts mapping name -> index and index -> name
  '''
  case2ix = dict((name, position) for position, name in enumerate(vocab))
  width = len(case2ix)

  def _one_hot(position):
    row = np.zeros(width)
    row[position] = 1
    return row

  cases = np.array([_one_hot(position) for position in case2ix.values()])
  ix2case = {position: name for name, position in case2ix.items()}

  return cases, case2ix, ix2case

  
def initialize_labels(file_name):
  '''
  Collects the label inventory from a tab-separated annotation file in data_src.

  file_name - file with the columns token, pos, tree, BIO (no header row)

  returns (label_list, label2ix, ix2label): the distinct non-missing BIO values in
  order of first appearance, and the two dicts mapping label -> index and index -> label
  '''
  raw = pd.read_csv(
    os.path.join(data_src, file_name),
    sep="\t",
    quoting=csv.QUOTE_NONE,
    header=None,
    skip_blank_lines=False,
    engine='python',
    names=['token', 'pos', 'tree', 'BIO'],
  )
  # blank separator rows carry no label
  tagged = raw.dropna(subset=['BIO'])
  label_list = tagged.BIO.unique()

  label2ix = {}
  ix2label = {}
  for position, label in enumerate(label_list):
    label2ix[label] = position
    ix2label[position] = label
  return label_list, label2ix, ix2label

In [0]:
# load embeddings and format
# Parse the embedding text file directly; the HDF cache is neither read nor written here.
words, word_to_ix, ix_to_word = initialize_word_embeddings(embedding_file, use_cache=False, save_cache=False)
# Character vocabulary with the unknown/padding symbols appended.
characters, char_to_ix, ix_to_char = initialize_character_embeddings()
# One-hot casing features.
cases, case_to_ix, ix_to_case = initialize_case_embeddings()
# The label inventory is taken from the training file.
labels, label_to_ix, ix_to_label = initialize_labels(train_file)

In [0]:
def checkPrior(blah):
  '''
  Reports whether a single cell value is missing.

  blah - a scalar cell value, e.g. the token of the previous row

  returns True when the value is NaN, None or another pandas missing marker, False otherwise
  '''
  # a blank row can surface as None or as NaN depending on the parser and on
  # Series.shift's fill value; pd.isna recognises both, `is None` only the first
  return bool(pd.isna(blah))
  
def phrase2char(w_vec):
  '''
  Expands a sequence of word indexes into a 2d array of character indexes.

  Each word index is decoded through ix_to_word and spelled out with get_char_ix;
  PAD_WORD and UNK_WORD entries contribute no characters, so their rows consist
  of padding only.

  w_vec - an iterable of word indexes

  returns np.ndarray of size (len(w_vec), WORD_WIDTH)
  '''
  def _char_ixs(w_ix):
    if w_ix in (word_to_ix[PAD_WORD], word_to_ix[UNK_WORD]):
      return np.array([])
    return np.array([get_char_ix(char) for char in ix_to_word[w_ix]])

  rows = [_char_ixs(w_ix) for w_ix in w_vec]
  # NOTE(review): pad_sequences is not imported in the cells visible here - confirm where it comes from
  return pad_sequences(rows, value=char_to_ix[PAD_CHAR], maxlen=WORD_WIDTH, padding='post')

def pad_truncate(x,width,pad_token):
  '''
  Forces a 1-d sequence of indexes to exactly `width` entries.

  x - sequence of indexes (word, case or label indexes)
  width - target length
  pad_token - index appended to sequences shorter than width

  returns np.ndarray of length width
  '''
  if len(x) > width:
    # report sizes only: x may hold case or label indexes, which must not be decoded as words
    print(f"Truncating input of length {len(x)} to {width}")
    # cut to the requested width (not a fixed 256) so np.pad never receives a negative pad
    x = x[:width]
  return np.pad(x, pad_width=(0, width-len(x)), mode='constant', constant_values=pad_token)

def verbosity(str, verbose):
  '''
  Prints str when verbose is truthy; does nothing otherwise.
  '''
  if not verbose:
    return
  print(str)

def preprocess_data(file_name, use_cache=True, debug=True):
  '''
  Prepares data for model.  It can be used for both training and test data.

  file_name - tab-separated file inside data_src with the columns token, pos, tree, BIO;
              blank lines separate sentences
  use_cache - load the four .npy arrays from cache_dir when the word array exists there
  debug - print progress and timing messages

  NOTE(review): cache_dir is read from module scope and is not assigned in the
  cells visible here; it must exist before this function is called.

  returns list of four np.ndarray with one row per sentence:
    [word indexes       (n, SENTENCE_WIDTH),
     character indexes  (n, SENTENCE_WIDTH, WORD_WIDTH),
     case indexes       (n, SENTENCE_WIDTH),
     label indexes      (n, SENTENCE_WIDTH, 1)]
  '''
  clean_name = os.path.join(cache_dir, file_name.replace(".", "_"))
  loaded = False
  phrase_vectors = None

  if use_cache and os.path.exists(clean_name+"word.npy"):
    verbosity("Attempting to load from cache", debug)
    try:
      # NOTE(review): allow_pickle=True can run code from a tampered file; keep cache_dir on trusted storage
      phrase_vectors = [np.load(clean_name+part+".npy", allow_pickle=True) for part in ("word", "char", "case", "label")]
      loaded = True
      verbosity("Loaded successfully", debug)
    except Exception as err:
      # best effort: rebuild from the raw file, but say why the cache was unusable
      verbosity(f"Loading failed: {err!r}", debug)
      phrase_vectors = None
      loaded = False

  if not loaded:
    verbosity(f"Loading raw data file to process labels: {file_name}", debug)
    checkpoint = time.time()
    data = pd.read_csv(os.path.join(data_src, file_name), sep="\t",  quoting=csv.QUOTE_NONE, header=None, skip_blank_lines=False, engine='python', names =['token', 'pos', 'tree', 'BIO'])
    verbosity(f"Parsed data loaded: {time.time()-checkpoint} s", debug)

    # see if prior row was a newline; isna() catches both None and NaN, whichever
    # the parser used for the blank row
    data['prior'] = data.token.shift(1).isna()
    # drop empty rows; copy so the assignments below act on an independent frame
    data = data.loc[data.token.notna()].copy()
    # running count of sentence starts gives every token its sentence id
    data['phrase'] = data.prior.cumsum()

    verbosity("Processing data into phrase vectors", debug)
    verbosity("Step 1: Translating to indexes", debug)
    checkpoint = time.time()
    data['word_ix'] = data.token.apply(get_word_ix)
    data['case_ix'] = data.token.apply(get_casing_ix)
    # raises KeyError for a label that is not in label_to_ix
    data['label_ix'] = data.BIO.apply(lambda x: label_to_ix[x])
    verbosity(f"Step 1: Translated to indexes complete: {time.time()-checkpoint} s", debug)

    verbosity("Step 2: Creating phrase vectors", debug)
    verbosity("Step 2a: Aggregating phrases", debug)
    checkpoint = time.time()
    phrase_vectors = data.groupby('phrase').agg({'word_ix': list, 'case_ix': list, 'label_ix': list})
    verbosity(f"Step 2a: {time.time()-checkpoint} s", debug)

    verbosity("Step 2b: Padding word vectors", debug)
    checkpoint = time.time()
    phrase_vectors['word_vector'] = phrase_vectors.word_ix.apply(lambda x: pad_truncate(x, SENTENCE_WIDTH, word_to_ix[PAD_WORD]))
    verbosity(f"Step 2b: {time.time()-checkpoint} s", debug)

    verbosity("Step 2c: Creating and padding character vectors", debug)
    checkpoint = time.time()
    phrase_vectors['char_vector'] = phrase_vectors.word_vector.apply(lambda x: phrase2char(x))
    verbosity(f"Step 2c: {time.time()-checkpoint} s", debug)

    verbosity(f"Step 2d: Padding case vectors", debug)
    checkpoint = time.time()
    phrase_vectors['case_vector'] = phrase_vectors.case_ix.apply(lambda x: pad_truncate(x, SENTENCE_WIDTH, case_to_ix[CASE_PAD_SYMBOL]))
    verbosity(f"Step 2d: {time.time()-checkpoint}", debug)

    verbosity("Step 2e: Padding label vectors", debug)
    checkpoint = time.time()
    # trailing axis of size 1 is added to every label vector
    phrase_vectors['label_vector'] = phrase_vectors.label_ix.apply(lambda x: np.expand_dims(pad_truncate(x, SENTENCE_WIDTH, label_to_ix[LABEL_PAD_SYMBOL]), -1))
    verbosity(f"Step 2e: {time.time()-checkpoint} s", debug)

    verbosity("Saving data to disk", debug)
    checkpoint = time.time()
    # keep only the padded vectors, in the order word, char, case, label
    phrase_vectors = phrase_vectors.drop(columns=['word_ix', 'case_ix', 'label_ix'])
    phrase_vectors = phrase_vectors.to_numpy()
    phrase_vectors = [np.stack(phrase_vectors[:,column]) for column in range(4)]

    # saving in multi parts because training data causes a memory error
    np.save(clean_name+'word', phrase_vectors[0], allow_pickle=True)
    np.save(clean_name+'char', phrase_vectors[1], allow_pickle=True)
    np.save(clean_name+'case', phrase_vectors[2], allow_pickle=True)
    np.save(clean_name+'label', phrase_vectors[3], allow_pickle=True)

    verbosity(f"Saved to disk: {time.time()-checkpoint} s", debug)

  return phrase_vectors