# End-to-end NLP pipeline
Steps:
1. Load data
2. Preprocess data
   - Tokenize
   - Vectorize
   - Zero-pad sequences
   - Split data into training and validation sets
3. Build and train model
4. Evaluate model

In [1]:
import os
import pandas as pd

# Select the TensorFlow backend via the environment; this is set before the
# first `keras` import further down so that Keras picks it up when it loads.
os.environ['KERAS_BACKEND']='tensorflow'

### Load data

In [2]:
def load_files_in_directory(source_directory, file_ext):
    """Read every file in a directory whose name ends with ``file_ext``.

    File names are expected to look like ``<integer id><file_ext>`` (for
    example ``10425.cats``); the integer before the first dot becomes the key.

    Args:
        source_directory: Path of the directory to scan (not recursive).
        file_ext: File-name suffix to select, e.g. ``'.cats'`` or ``'.txt'``.

    Returns:
        dict mapping each integer id to the full text content of its file.

    Raises:
        ValueError: If a selected file name does not start with an integer.
        OSError: If the directory or one of the files cannot be read.
    """
    categories = {}
    # Scan the directory supplied by the caller rather than any global path.
    for file_name in os.listdir(source_directory):
        # Match on the suffix so that names which merely contain the
        # extension (backups, editor swap files) are ignored.
        if not file_name.endswith(file_ext):
            continue
        key = int(file_name.split('.')[0])
        # The context manager closes each handle as soon as it has been read.
        with open(os.path.join(source_directory, file_name), mode='r') as contents:
            categories[key] = contents.read()

    return categories


# Resolve the dataset folder relative to the current working directory.
project_dir = os.getcwd()
source_dir = os.path.join(project_dir, '../data/enron_with_categories/1')
# Load every `.cats` file, keyed by the integer id taken from its file name.
categories = load_files_in_directory(source_dir, '.cats')
# Sanity checks: expected number of files, container type, one known record.
assert len(categories) == 834
assert isinstance(categories, dict)
assert categories[10425] == """1,1,1
2,6,1
2,13,1
3,3,1
"""

In [3]:
from collections import OrderedDict

def sort_dictionary_by_keys(dictionary):
    """Return an ``OrderedDict`` with the same entries in ascending key order.

    Args:
        dictionary: Mapping whose keys can be compared with each other.

    Returns:
        A new ``OrderedDict``; the input mapping is left untouched.
    """
    ordered = OrderedDict()
    for key in sorted(dictionary):
        ordered[key] = dictionary[key]
    return ordered

sorted_categories = sort_dictionary_by_keys(categories)
# Sorting must neither drop nor add entries.
assert len(sorted_categories) == len(categories)
# Check the sorted copy (not the input dict) still holds the known record.
assert sorted_categories[10425] == """1,1,1
2,6,1
2,13,1
3,3,1
"""
sorted_categories_keys = list(sorted_categories.keys())
# Verify the whole key sequence is ascending, not only its first three items.
assert sorted_categories_keys == sorted(sorted_categories_keys)


In [4]:
def simplify_categories(categories):
    """Reduce each email's category records to its distinct first-field labels.

    A simple/downsampled implementation of category labelling for the first
    iteration: every line of a record is comma separated and only the first
    field is kept.

    Args:
        categories: dict mapping an email id to the raw text of its category
            file, one comma-separated record per line.

    Returns:
        dict with the same keys; each value is the sorted list of distinct
        first fields (as strings). Blank lines contribute nothing, so an
        empty record yields an empty list.
    """
    simple_categories = {}
    for key, category in categories.items():
        # Split each record once and keep only its first field; strip() drops
        # stray whitespace such as a trailing carriage return.
        first_fields = (line.split(',', 1)[0].strip() for line in category.split('\n'))
        # Sorting the de-duplicated set gives the same label order on every
        # run, independent of string hash randomisation.
        simple_categories[key] = sorted({field for field in first_fields if field})

    return simple_categories

# Collapse each email's raw category records to its distinct first-field labels.
simple_categories = simplify_categories(categories)
# Every email must still have an entry after simplification.
assert len(simple_categories) == len(categories)

In [5]:
# Rebuild the labels from disk: load the `.cats` files, simplify them, then
# order the result by email id.
categories = load_files_in_directory(source_dir, '.cats')
simple_categories = simplify_categories(categories)
sorted_categories = sort_dictionary_by_keys(simple_categories)

# Load the `.txt` files from the same folder and order them by the same ids,
# so that emails and labels can be paired by position.
emails = load_files_in_directory(source_dir, '.txt')
sorted_emails = sort_dictionary_by_keys(emails)

In [6]:
def get_data_and_labels(data, labels):
    """Flatten two key-aligned dicts into parallel lists.

    Args:
        data: dict mapping an id to a sample, iterated in the desired order.
        labels: dict mapping the same ids, in the same order, to their labels.

    Returns:
        Tuple ``(data_list, labels_list, filenames_list)`` where position
        ``i`` of each list belongs to the same id; ``filenames_list`` holds
        the ids themselves.

    Raises:
        ValueError: If the dicts differ in length, or if the keys found at
            the same position differ. (``ValueError`` subclasses
            ``Exception``, so ``except Exception`` handlers still catch it.)
    """
    if len(data) != len(labels):
        raise ValueError('data and labels are of differing length')

    data_list = []
    labels_list = []
    filenames_list = []

    # Walk both mappings in lockstep: a single pass that pairs each sample
    # with its label and checks the keys agree as it goes.
    for (data_key, sample), (label_key, label) in zip(data.items(), labels.items()):
        if data_key != label_key:
            raise ValueError('data and labels are not sorted in sequence')
        data_list.append(sample)
        labels_list.append(label)
        filenames_list.append(data_key)

    return (data_list, labels_list, filenames_list)

# Pair the sorted emails with their sorted labels; the call raises if the two
# dicts differ in length or their keys do not line up position by position.
emails_list, categories_list, filenames_list = get_data_and_labels(sorted_emails, sorted_categories)
# The three returned lists must be parallel.
assert len(emails_list) == len(categories_list)
assert len(emails_list) == len(filenames_list)

### Preprocess data

In [7]:
from keras.preprocessing.text import Tokenizer

def fit_tokenizer(texts, MAX_NB_WORDS=10000):
    """Create a Keras ``Tokenizer`` and fit it on ``texts``.

    Args:
        texts: List of raw text documents to build the vocabulary from.
        MAX_NB_WORDS: Value passed to the tokenizer as ``num_words``.

    Returns:
        The fitted ``Tokenizer``. Its ``filters`` setting contains only the
        double-quote, single-quote and backslash characters.
    """
    text_tokenizer = Tokenizer(filters='\"\'\\', num_words=MAX_NB_WORDS)
    text_tokenizer.fit_on_texts(texts)
    return text_tokenizer

# Fit the notebook-wide tokenizer on all emails (default vocabulary limit).
tokenizer = fit_tokenizer(emails_list)
# The tokenizer should report one document per email it was fitted on.
assert tokenizer.document_count == len (emails_list)

Using TensorFlow backend.


In [8]:
def convert_texts_to_sequences(texts, fitted_tokenizer=None):
    """Turn raw texts into lists of integer word indices.

    Args:
        texts: List of text documents to encode.
        fitted_tokenizer: Object exposing ``texts_to_sequences`` (for example
            a fitted Keras ``Tokenizer``). When omitted, the notebook-level
            ``tokenizer`` is used, so existing one-argument calls still work.

    Returns:
        Whatever ``texts_to_sequences`` returns for ``texts`` (one sequence
        per input text).
    """
    if fitted_tokenizer is None:
        # Fall back to the shared tokenizer; it is looked up at call time.
        fitted_tokenizer = tokenizer
    return fitted_tokenizer.texts_to_sequences(texts)

# Encode every email as a sequence of word indices.
sequences = convert_texts_to_sequences(emails_list)
assert len(sequences) == len(emails_list)
# Spot check: the first index of the first sequence must be the vocabulary
# index of the first whitespace-separated word (lower-cased) of the first email.
first_word_in_first_email = emails_list[0].split()[0].lower()
assert sequences[0][0] == tokenizer.word_index[first_word_in_first_email]

In [9]:
from keras.preprocessing import sequence 

def zeropad_data(sequences, MAX_SEQUENCE_LENGTH=1000):
    """Bring all sequences to a common length with Keras ``pad_sequences``.

    Args:
        sequences: List of integer sequences.
        MAX_SEQUENCE_LENGTH: Target length, forwarded as ``maxlen``.

    Returns:
        The result of ``pad_sequences``; all other padding and truncation
        options keep their Keras defaults.
    """
    return sequence.pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# Use 500 tokens per email here, overriding zeropad_data's default of 1000.
MAX_SEQUENCE_LENGTH=500
data = zeropad_data(sequences, MAX_SEQUENCE_LENGTH)
# One row per email, each exactly MAX_SEQUENCE_LENGTH entries long.
assert data.shape == (len(sequences), MAX_SEQUENCE_LENGTH)

In [10]:
from keras.utils import np_utils
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer

def convert_labels_to_categorical_vector(y):
    """Encode per-sample label lists with a fresh ``MultiLabelBinarizer``.

    Args:
        y: Iterable of label collections, one per sample.

    Returns:
        The matrix produced by ``MultiLabelBinarizer.fit_transform``.
    """
    binarizer = MultiLabelBinarizer()
    return binarizer.fit_transform(y)

labels = convert_labels_to_categorical_vector(categories_list)
# The binarizer emits one column per distinct label seen across all emails,
# so compare against that count rather than the longest single label list
# (the two only coincide when some email carries every label).
assert labels.shape == (len(categories_list), len(set().union(*categories_list)))

In [11]:
from sklearn.model_selection import train_test_split

# Hold out part of the data for validation. random_state=0 fixes the shuffle;
# no test_size is given, so scikit-learn's default split proportion applies.
X_train, X_val, y_train, y_val = train_test_split(data, labels, random_state=0)

In [12]:
# Summing the binary label matrix down its rows (axis=0) counts, per label
# column, how many emails in the split carry that label.
print('Number of emails in each category in training set:')
print(y_train.sum(axis=0))
print('Number of emails in each category in validation set:')
print(y_val.sum(axis=0))

Number of emails in each category in training set:
[625 503 603 134]
Number of emails in each category in validation set:
[209 174 201  44]


### Build and train model

In [13]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding

def build_lstm_classifier(max_sequence_length, no_of_output_labels,
                          vocabulary_size=10000, embedding_dim=32, lstm_units=100):
    """Build and compile an Embedding -> LSTM -> Dense multi-label classifier.

    Args:
        max_sequence_length: Length of each padded input sequence.
        no_of_output_labels: Number of units in the sigmoid output layer, one
            per label column.
        vocabulary_size: ``input_dim`` of the embedding layer. It should
            cover the word-index range produced by the tokenizer (this
            notebook tokenizes with a limit of 10000 by default).
        embedding_dim: Size of each word embedding vector.
        lstm_units: Number of units in the LSTM layer.

    Returns:
        The compiled Keras ``Sequential`` model (binary cross-entropy loss,
        Adam optimizer, accuracy metric). The layer summary is printed as a
        side effect.
    """
    model = Sequential()
    model.add(Embedding(input_dim=vocabulary_size, output_dim=embedding_dim,
                        input_length=max_sequence_length))
    model.add(LSTM(lstm_units))
    # Sigmoid (not softmax): each label is predicted independently.
    model.add(Dense(no_of_output_labels, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # summary() prints the table itself and returns None, so wrapping it in
    # print() would only add a stray "None" line to the output.
    model.summary()
    return model
    
# The output layer needs one unit per column of the label matrix; the length
# of the longest single label list only matches that by coincidence.
model = build_lstm_classifier(MAX_SEQUENCE_LENGTH, labels.shape[1])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 32)           320000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_1 (Dense)              (None, 4)                 404       
Total params: 373,604
Trainable params: 373,604
Non-trainable params: 0
_________________________________________________________________
None


In [14]:
# Train for 3 epochs in mini-batches of 64; the held-out split is passed as
# validation_data so it is evaluated alongside training.
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=3, batch_size=64)

Train on 625 samples, validate on 209 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1200abdd8>

### Evaluate model

In [38]:
from sklearn.metrics import precision_score

def calculate_precision_score(y_true, y_predicted):
    """Micro-averaged precision of rounded predictions against ``y_true``.

    Args:
        y_true: Ground-truth binary label matrix.
        y_predicted: Predicted scores; they are rounded before scoring.

    Returns:
        The value returned by ``precision_score(..., average='micro')``.
    """
    hard_predictions = y_predicted.round()
    return precision_score(y_true, hard_predictions, average='micro')

# Score the model on the validation split. `predicted` holds the model's raw
# outputs; rounding to 0/1 happens inside calculate_precision_score.
expected = y_val
predicted = model.predict(X_val)
precision = calculate_precision_score(expected, predicted)
# Acceptance threshold for this notebook run.
assert precision > 0.9

In [35]:
from sklearn.metrics import recall_score

def calculate_recall_score(y_true, y_predicted):
    """Micro-averaged recall of rounded predictions against ``y_true``.

    Args:
        y_true: Ground-truth binary label matrix.
        y_predicted: Predicted scores; they are rounded before scoring.

    Returns:
        The value returned by ``recall_score(..., average='micro')``.
    """
    hard_predictions = y_predicted.round()
    return recall_score(y_true, hard_predictions, average='micro')

# Reuses `expected` and `predicted` computed in the precision cell.
recall = calculate_recall_score(expected, predicted)
# Acceptance threshold for this notebook run.
assert recall > 0.9