# End-to-end NLP pipeline
Steps:
1. Load data
2. Preprocess data
   - Tokenize
   - Vectorize
   - Zero-pad sequences
   - Split data into training and test sets
3. Build and train model
4. Evaluate model

In [7]:
import os
import pandas as pd

os.environ['KERAS_BACKEND']='tensorflow'

### Load data

In [8]:
# Open categories.txt from the dataset directory in text/read mode.
# NOTE(review): the handle is neither read nor closed in the visible cells,
# and `data` is rebound later by the zero-padding cell — confirm whether this
# cell is still needed; if it is, read it inside a `with` block.
data = open('../data/enron_with_categories/categories.txt', mode='r')

In [27]:
def load_files_in_directory(source_directory, file_ext):
    """Read every file in a directory whose name ends with ``file_ext``.

    File names are expected to look like ``<integer><file_ext>`` (for
    example ``10425.cats``); the integer before the first dot is the key.

    Args:
        source_directory: Path of the directory to scan (not recursive).
        file_ext: File-name suffix to select, e.g. ``'.cats'``.

    Returns:
        dict mapping the integer file id to that file's full text content.

    Raises:
        ValueError: If a matching file name does not start with an integer.
    """
    categories = {}
    # Scan the directory that was passed in rather than a notebook-level
    # global, so the function works for any caller-supplied path.
    for file_name in os.listdir(source_directory):
        # Match on the suffix so names that merely contain the extension
        # text somewhere in the middle are not picked up.
        if not file_name.endswith(file_ext):
            continue
        key = int(file_name.split('.')[0])
        # The context manager closes each handle as soon as it is read;
        # otherwise hundreds of descriptors would stay open at once.
        with open(os.path.join(source_directory, file_name), mode='r') as contents:
            categories[key] = contents.read()

    return categories


# Resolve the dataset folder relative to the current working directory.
project_dir = os.getcwd()
source_dir = os.path.join(project_dir, '../data/enron_with_categories/1')
# Load all '.cats' files, keyed by the integer prefix of each file name.
categories = load_files_in_directory(source_dir, '.cats')
# Sanity checks against values hard-coded for this particular folder:
# the expected number of files and the exact content of file 10425.
assert len(categories) == 834
assert isinstance(categories, dict)
assert categories[10425] == """1,1,1
2,6,1
2,13,1
3,3,1
"""

In [50]:
from collections import OrderedDict

def sort_dictionary_by_keys(dictionary):
    """Return an ``OrderedDict`` with the items of ``dictionary`` in ascending key order.

    The input mapping is not modified.
    """
    ordered = OrderedDict()
    for key in sorted(dictionary):
        ordered[key] = dictionary[key]
    return ordered

# Sort the loaded categories by file id and check that no entry was lost.
sorted_categories = sort_dictionary_by_keys(categories)
assert len(sorted_categories) == len(categories)
# NOTE(review): this re-checks the unsorted `categories` dict rather than
# `sorted_categories` — confirm which one was meant.
assert categories[10425] == """1,1,1
2,6,1
2,13,1
3,3,1
"""
# Spot-check the ordering on the first three keys only.
sorted_categories_keys = list(sorted_categories.keys())
assert sorted_categories_keys[1] > sorted_categories_keys[0]
assert sorted_categories_keys[2] > sorted_categories_keys[1]


In [161]:
def simplify_categories(categories):
    """Reduce each raw category string to the text before its first comma.

    This is the deliberately simple, down-sampled labelling used for the
    first iteration: only the leading field of each entry is kept.

    Args:
        categories: Mapping of key -> raw category string.

    Returns:
        A new dict with the same keys and the shortened string as value.
    """
    return {
        file_id: raw_label.partition(',')[0]
        for file_id, raw_label in categories.items()
    }

# Collapse every raw category string to its leading field and check that
# each file still has exactly one label.
simple_categories = simplify_categories(categories)
assert len(simple_categories) == len(categories)

In [162]:
# Rebuild the labels from disk: load the '.cats' files, simplify them, then
# sort by file id. Note that `sorted_categories` now holds the simplified
# labels, replacing the value bound by the earlier sorting cell.
categories = load_files_in_directory(source_dir, '.cats')
simple_categories = simplify_categories(categories)
sorted_categories = sort_dictionary_by_keys(simple_categories)

# Load the '.txt' files from the same folder and sort them the same way so
# both mappings iterate in the same key order.
emails = load_files_in_directory(source_dir, '.txt')
sorted_emails = sort_dictionary_by_keys(emails)

In [163]:
def get_data_and_labels(data, labels):
    """Flatten two key-aligned mappings into parallel lists.

    Both mappings must have the same length and yield the same keys in the
    same iteration order (e.g. both sorted by key beforehand).

    Args:
        data: Mapping of key -> data item.
        labels: Mapping of key -> label, aligned with ``data``.

    Returns:
        Tuple ``(data_list, labels_list, filenames_list)`` where position
        ``i`` of each list belongs to the same key; ``filenames_list``
        holds the keys themselves.

    Raises:
        ValueError: If the mappings differ in length, or if their keys do
            not line up position by position.
    """
    if len(data) != len(labels):
        raise ValueError('data and labels are of differing length')

    data_list = []
    labels_list = []
    filenames_list = []

    # One lock-step pass over both mappings; this avoids re-materialising
    # the full item lists on every iteration (which was quadratic).
    for (data_key, item), (label_key, label) in zip(data.items(), labels.items()):
        if data_key != label_key:
            raise ValueError('data and labels are not sorted in sequence')
        data_list.append(item)
        labels_list.append(label)
        filenames_list.append(data_key)

    return (data_list, labels_list, filenames_list)

# Turn the two key-sorted mappings into parallel lists (texts, labels, file
# ids) and confirm that all three have the same length.
emails_list, categories_list, filenames_list = get_data_and_labels(sorted_emails, sorted_categories)
assert len(emails_list) == len(categories_list)
assert len(emails_list) == len(filenames_list)

### Preprocess data

In [164]:
from keras.preprocessing.text import Tokenizer

def fit_tokenizer(texts, MAX_NB_WORDS=20000):
    """Create a Keras ``Tokenizer`` and fit it on ``texts``.

    Args:
        texts: The texts passed to ``Tokenizer.fit_on_texts``.
        MAX_NB_WORDS: Value passed to the tokenizer as ``num_words``.

    Returns:
        The fitted ``Tokenizer`` instance.
    """
    # The filter set is limited to double quote, single quote and backslash.
    fitted = Tokenizer(filters='\"\'\\', num_words=MAX_NB_WORDS)
    fitted.fit_on_texts(texts)
    return fitted

# Fit the tokenizer on all emails and check that it counted every document.
tokenizer = fit_tokenizer(emails_list)
assert tokenizer.document_count == len (emails_list)

In [165]:
def convert_texts_to_sequences(texts, fitted_tokenizer=None):
    """Convert ``texts`` to sequences using a fitted tokenizer.

    Args:
        texts: The texts to convert. These are what gets converted; the
            notebook-level ``emails_list`` is no longer used implicitly.
        fitted_tokenizer: Object exposing ``texts_to_sequences``. When
            omitted, the notebook-level ``tokenizer`` is used, which keeps
            existing single-argument calls working.

    Returns:
        Whatever ``texts_to_sequences(texts)`` returns for the tokenizer.
    """
    if fitted_tokenizer is None:
        # Fall back to the tokenizer fitted earlier in the notebook.
        fitted_tokenizer = tokenizer
    return fitted_tokenizer.texts_to_sequences(texts)

# Convert every email to its token-id sequence; one sequence per email.
sequences = convert_texts_to_sequences(emails_list)
assert len(sequences) == len(emails_list)
# Spot check: the first id of the first sequence should equal the index of
# the first whitespace-separated word of the first email, lower-cased.
# NOTE(review): presumably relies on the tokenizer lower-casing its input and
# on that word falling inside the vocabulary limit — confirm.
first_word_in_first_email = emails_list[0].split()[0].lower()
assert sequences[0][0] == tokenizer.word_index[first_word_in_first_email]

In [168]:
from keras.preprocessing import sequence 

def zeropad_data(sequences, MAX_SEQUENCE_LENGTH=1000):
    """Bring all sequences to a common length with Keras ``pad_sequences``.

    Args:
        sequences: The sequences to pad.
        MAX_SEQUENCE_LENGTH: Passed to ``pad_sequences`` as ``maxlen``; all
            other options are left at the library defaults.

    Returns:
        The result of ``pad_sequences`` for the given input.
    """
    return sequence.pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# Cap on the padded length used for this run (overrides the function default).
SEQUENCE_LENGTH_LIMIT = 500
data = zeropad_data(sequences, SEQUENCE_LENGTH_LIMIT)
# Check the array that was just bound to `data`: one row per input sequence,
# each exactly SEQUENCE_LENGTH_LIMIT wide. (The check previously referenced a
# name that is never defined in this notebook, which raised NameError.)
assert data.shape == (len(sequences), SEQUENCE_LENGTH_LIMIT)