In [1]:
# Basic Modules for data and text processing
import pandas as pd
import numpy as np
import string
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from sklearn.model_selection import train_test_split

# Keras Modules 
from keras.preprocessing.text import Tokenizer  # This tokenizes the text
from keras.preprocessing.sequence import pad_sequences # This equalises the input we want to give
from keras.models import Sequential # We will build sequential models only
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation # All of the layers of our model
from keras.layers.embeddings import Embedding # How to build our word vectors

# Plotly
import plotly.offline as py
import plotly.graph_objs as go
py.init_notebook_mode(connected=True)

Using TensorFlow backend.


## Data Loading and Preprocessing

In [2]:
# Load the tab-separated e-mail dump. The path is a raw string so the Windows
# backslashes are taken literally ('\D' and '\p' are invalid escape sequences in
# a normal string literal and trigger a warning on recent Python versions).
# NOTE(review): error_bad_lines is deprecated in newer pandas releases in favour
# of on_bad_lines='skip' -- switch when the pandas version is upgraded.
df = pd.read_csv(r'D:\Data\processed.csv', delimiter="\t", error_bad_lines=False)

In [3]:
# Show the first row to check which columns were parsed from the file
print(df.head(1))

   Unnamed: 0  Unnamed: 0.1                                   To Subject  \
0           0             0  frozenset({'tim.belden@enron.com'})     NaN   

                content     user  labeled  rep  
0  Here is our forecast  allen-p    False    0  


In [4]:
# Cleaning the DataFrame: discard every column except the message text and its
# target, then expose the target ('rep') under the name 'label'
df = df.drop(
    ['Unnamed: 0', 'Unnamed: 0.1', 'Subject', 'user', 'labeled', 'To'],
    axis=1,
)
df = df.rename(columns={'rep': 'label'})
print(df.shape)

(100000, 2)


In [5]:
# Text Normalising Function

# Lazily-built NLTK resources shared by every clean_text() call, so the stop-word
# corpus is read and the stemmer constructed once instead of once per row.
_CLEAN_TEXT_RESOURCES = {}

# Ordered (pattern, replacement) rules applied by clean_text(). Order matters:
# contractions are expanded before the bare apostrophe rule strips what is left,
# and whitespace is collapsed last.
_CLEAN_TEXT_SUBSTITUTIONS = (
    # Inside this character class "+-=" is parsed as a range ('+' through '='),
    # which is why ':' survives long enough to reach its own rule further down.
    (r"[^A-Za-z0-9^,!.\/'+-=]", " "),
    (r"what's", "what is "),
    (r"\'s", " "),
    (r"\'ve", " have "),
    (r"n't", " not "),
    (r"i'm", "i am "),
    (r"\'re", " are "),
    (r"\'d", " would "),
    (r"\'ll", " will "),
    (r",", " "),
    (r"\.", " "),
    (r"!", " ! "),
    (r"\/", " "),
    (r"\^", " ^ "),
    (r"\+", " + "),
    (r"\-", " - "),
    (r"\=", " = "),
    (r"'", " "),
    (r"(\d+)(k)", r"\g<1>000"),
    (r":", " : "),
    (r" e g ", " eg "),
    (r" b g ", " bg "),
    (r" u s ", " american "),
    # NOTE(review): "\0" is a NUL character in a regex, and NULs have already
    # been replaced by the first rule, so this rule can never match. It was
    # presumably meant to be "0s" (e.g. "1990s" -> "1990") -- confirm intent
    # before changing it.
    (r"\0s", "0"),
    (r" 9 11 ", "911"),
    (r"e - mail", "email"),
    (r"j k", "jk"),
    (r"\s{2,}", " "),
)


def _clean_text_resources():
    """Return the cached (stop-word set, stemmer) pair, building it on first use."""
    if not _CLEAN_TEXT_RESOURCES:
        # Build both before storing either, so a failure leaves the cache empty
        # and the next call simply retries.
        stops = set(stopwords.words("english"))
        stemmer = SnowballStemmer('english')
        _CLEAN_TEXT_RESOURCES.update(stops=stops, stemmer=stemmer)
    return _CLEAN_TEXT_RESOURCES["stops"], _CLEAN_TEXT_RESOURCES["stemmer"]


def clean_text(text):
    """Normalise one message body for tokenisation.

    Steps, in order: lower-case and split on whitespace; drop English stop
    words and words shorter than three characters; apply the regex rules in
    ``_CLEAN_TEXT_SUBSTITUTIONS`` (contraction expansion, punctuation spacing
    or removal, a few hand-written fix-ups); Snowball-stem every remaining
    token.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces.
    """
    # The previous first step, text.translate(string.punctuation), was removed.
    # str.translate() expects a mapping indexed by code point, so passing the
    # 32-character punctuation string removed no punctuation at all; it only
    # rewrote control characters (tab -> '*', newline -> '+', carriage
    # return -> '.'), gluing words together across line breaks. Punctuation is
    # handled by the regex rules instead.
    stops, stemmer = _clean_text_resources()

    ## Convert words to lower case, split them, and remove stop words
    ## (commonly used words such as "is" and "was") and very short words
    words = [w for w in text.lower().split() if w not in stops and len(w) >= 3]
    text = " ".join(words)

    ## Contraction expansion and punctuation handling
    for pattern, replacement in _CLEAN_TEXT_SUBSTITUTIONS:
        text = re.sub(pattern, replacement, text)

    ## Stemming
    return " ".join(stemmer.stem(word) for word in text.split())

In [6]:
# Normalise the message bodies, printing the frame's shape before and after
print(df.shape)

# Rows containing any missing value (NaN) are discarded first
df = df.dropna()

# Run every remaining message through the text cleaning function
df['content'] = df['content'].map(clean_text)

print(df.shape)

(100000, 2)
(99598, 2)


In [7]:
## Creating Sequences
vocabulary_size = 20000

# Build the word index from the cleaned messages
tokenizer = Tokenizer(num_words=vocabulary_size)
tokenizer.fit_on_texts(df['content'])

# Encode each message as a list of word indices, then bring every list to a
# fixed length of 50 so the messages form one rectangular array
sequences = tokenizer.texts_to_sequences(df['content'])
data = pad_sequences(sequences, maxlen=50)

In [8]:
# Now splitting training and testing data

print("Reached here")

# Hold out 25% of the rows for testing. The seed is fixed so the split -- and
# every metric computed from it -- is the same on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    data, df['label'], test_size=.25, random_state=42)
print("X_train {} \n y_train {}".format(X_train.shape,y_train.shape))

Reached here
X_train (74698, 50) 
 y_train (74698,)


## Neural Network Architecture

In [9]:
# Defining the Model

def create_model():
    """Build the (uncompiled) Embedding -> Conv1D -> MaxPooling1D -> LSTM classifier.

    The network expects sequences of 50 word indices drawn from a vocabulary of
    ``vocabulary_size`` entries (a notebook-level variable) and ends in a single
    sigmoid unit. The layer summary is printed as a side effect.

    Returns
    -------
    keras.models.Sequential
        The assembled model; the caller is responsible for compiling it.
    """
    cnn_model = Sequential()
    # 100-dimensional word vectors for each of the 50 positions
    cnn_model.add(Embedding(vocabulary_size, 100, input_length=50))
    cnn_model.add(Dropout(0.2))
    # 64 filters over windows of 5 consecutive positions
    cnn_model.add(Conv1D(64, 5, activation='relu'))
    cnn_model.add(MaxPooling1D(pool_size = 4))
    cnn_model.add(LSTM(100))
    cnn_model.add(Dense(1, activation='sigmoid'))
    # summary() prints the table itself and returns None, so it must not be
    # wrapped in print() (that emitted a stray "None" after the table).
    cnn_model.summary()
    return cnn_model

In [10]:
# Build the network, then configure it for training as a binary classifier:
# cross-entropy loss, the Adam optimiser, and accuracy reported as the metric

cnn_model = create_model()
cnn_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 50, 100)           2000000   
_________________________________________________________________
dropout_1 (Dropout)          (None, 50, 100)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 46, 64)            32064     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 11, 64)            0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               66000     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 2,098,165
Trainable params: 2,098,165
Non-trainable params: 0
_________________________________________________________________


In [11]:
# Fit and Train the Model

## HyperParameters
batch_size = 8000
num_epochs = 15

# Size of the validation hold-out. It used to be sliced with batch_size, which
# meant tuning the batch size silently changed the validation split as well;
# it is now its own setting (same value as before).
validation_size = 8000

# Making validation set: the first validation_size training rows are held out,
# the remainder is used for fitting
X_valid, y_valid = X_train[:validation_size], y_train[:validation_size]
X_train2, y_train2 = X_train[validation_size:], y_train[validation_size:]

cnn_model.fit(X_train2,y_train2, validation_data=(X_valid,y_valid), batch_size=batch_size,epochs=num_epochs)

Train on 66698 samples, validate on 8000 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x1aca1f498d0>

In [13]:
# Measure performance on the held-out test split

# The model was compiled with one loss and one metric (accuracy), so index 1 of
# the evaluate() result is the accuracy
scores = cnn_model.evaluate(
    X_test,
    y_test,
    verbose=0,
)
print("Test Accuracy: ", scores[1])

Test Accuracy:  0.9366666666666666
