## Text Classification

###  Develop an n-gram CNN Model for Sentiment Analysis

##### preparation data from dataset

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os

In [2]:
df= pd.read_csv(r"D:\NLP\Deep_Learning_in_NLP\phishing_detection\Phishing_Email.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,Email Text,Email Type
0,0,"re : 6 . 1100 , disc : uniformitarianism , re ...",Safe Email
1,1,the other side of * galicismos * * galicismo *...,Safe Email
2,2,re : equistar deal tickets are you still avail...,Safe Email
3,3,\nHello I am your hot lil horny toy.\n I am...,Phishing Email
4,4,software at incredibly low prices ( 86 % lower...,Phishing Email


In [3]:
df = df.dropna()
print(df.isna().sum())

Unnamed: 0    0
Email Text    0
Email Type    0
dtype: int64


In [4]:
email_type_counts = df['Email Type'].value_counts()
print(email_type_counts)

Email Type
Safe Email        11322
Phishing Email     7312
Name: count, dtype: int64


In [5]:
Safe_Email = df[df["Email Type"]== "Safe Email"]
Phishing_Email = df[df["Email Type"]== "Phishing Email"]
Safe_Email = Safe_Email.sample(Phishing_Email.shape[0])

In [6]:
Data= pd.concat([Safe_Email, Phishing_Email], ignore_index = True)
Data.head()

Unnamed: 0.1,Unnamed: 0,Email Text,Email Type
0,13046,"re : thursday visit frank , we shall have abou...",Safe Email
1,5009,\nhttp://www.nationalreview.com/hanson/hanson0...,Safe Email
2,2525,\nI stand corrected --- I'd thought I'd seen a...,Safe Email
3,10628,\n There was a server bug on the backup disco...,Safe Email
4,14668,"organizational changes in support of enron  ,...",Safe Email


#### convert the above Data similar to the book data

In [None]:
# Create a directory to store .txt files
clean_dir = r"D:\NLP\Deep_Learning_in_NLP\Clean"
phishing_dir = r"D:\NLP\Deep_Learning_in_NLP\Phishing"
os.makedirs(clean_dir, exist_ok=True)
os.makedirs(phishing_dir, exist_ok=True)

# Iterate through the column and write to .txt files
for i in range(len(Data)):

    #print(i)
    if Data['Email Type'][i] == 'Safe Email':

        # Define the file name for the .txt file
        filename = os.path.join(clean_dir, f"record_{Data['Unnamed: 0'][i]}.txt")
    
        # Write the record to the .txt file
        with open(filename, 'w', encoding="utf-8") as txt_file:
            txt_file.write(str(Data['Email Text'][i]))
    else:
        # Define the file name for the .txt file
        filename = os.path.join(phishing_dir, f"record_{Data['Unnamed: 0'][i]}.txt")

        # Write the record to the .txt file
        with open(filename, 'w', encoding="utf-8") as txt_file:
           txt_file.write(str(Data['Email Text'][i])) 

#### Data Preparation

In [1]:
import string
import re
from os import listdir
from nltk.corpus import stopwords
from pickle import dump


# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, 'r', encoding="utf-8")
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text

# turn a doc into clean tokens
def clean_doc(doc):
    # split into tokens by white space
    tokens = doc.split()
    # prepare regex for char filtering
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
    # remove punctuation from each word
    tokens = [re_punc.sub('', w) for w in tokens]
    # remove remaining tokens that are not alphabetic
    tokens = [word for word in tokens if word.isalpha()]
    # filter out stop words
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if not w in stop_words]
    # filter out short tokens
    tokens = [word for word in tokens if len(word) > 1]
    tokens = ' '.join(tokens)
    return tokens


# load all docs in a directory
def process_docs(directory, is_train):
    documents = list()
    # walk through all files in the folder
    for filename in listdir(directory):
        # skip any reviews in the test set
        if is_train and filename.startswith('record_8') or filename.startswith('record_9'):
            continue
        if not is_train and not(filename.startswith('record_8') or filename.startswith('record_9')):
            continue
        # create the full path of the file to open
        path = directory + '/' + filename
        # load the doc
        doc = load_doc(path)
        # clean doc
        tokens = clean_doc(doc)
        # add to list
        documents.append(tokens)
    return documents


# load and clean a dataset
def load_clean_dataset(is_train):
    # load documents
    phishing = process_docs(r"D:\NLP\Deep_Learning_in_NLP\phishing_detection\Phishing", is_train)
    clean = process_docs(r"D:\NLP\Deep_Learning_in_NLP\phishing_detection\Clean", is_train)
    docs = phishing + clean
    # prepare labels
    labels = [0 for _ in range(len(phishing))] + [1 for _ in range(len(clean))]
    return docs, labels


# save a dataset to file
def save_dataset(dataset, filename):
    dump(dataset, open(filename, 'wb'))
    print('Saved: %s' % filename)

# load and clean all Emails
train_docs, ytrain = load_clean_dataset(True)
test_docs, ytest = load_clean_dataset(False)
# save training datasets
save_dataset([train_docs, ytrain], 'train.pkl')
save_dataset([test_docs, ytest], 'test.pkl')


Saved: train.pkl
Saved: test.pkl


#### Develop Multichannel Model

In [4]:
from pickle import load
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import plot_model
import numpy as np
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers import Embedding
from keras.layers import Conv1D
from keras.layers import MaxPooling1D
from keras.layers import concatenate

# load a clean dataset
def load_dataset(filename):
    return load(open(filename, 'rb'))

# fit a tokenizer
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

# calculate the maximum document length
def max_length(lines):
    return max([len(s.split()) for s in lines])

# encode a list of lines
def encode_text(tokenizer, lines, length):
    # integer encode
    encoded = tokenizer.texts_to_sequences(lines)
    # pad encoded sequences
    padded = pad_sequences(encoded, maxlen=length, padding='post')
    return padded

# define the model
def define_model(length, vocab_size):
    # channel 1
    inputs1 = Input(shape=(length,))
    embedding1 = Embedding(vocab_size, 100)(inputs1)
    conv1 = Conv1D(filters=32, kernel_size=4, activation='relu')(embedding1)
    drop1 = Dropout(0.5)(conv1)
    pool1 = MaxPooling1D(pool_size=2)(drop1)
    flat1 = Flatten()(pool1)
    # channel 2
    inputs2 = Input(shape=(length,))
    embedding2 = Embedding(vocab_size, 100)(inputs2)
    conv2 = Conv1D(filters=32, kernel_size=6, activation='relu')(embedding2)
    drop2 = Dropout(0.5)(conv2)
    pool2 = MaxPooling1D(pool_size=2)(drop2)
    flat2 = Flatten()(pool2)
    # channel 3
    inputs3 = Input(shape=(length,))
    embedding3 = Embedding(vocab_size, 100)(inputs3)
    conv3 = Conv1D(filters=32, kernel_size=8, activation='relu')(embedding3)
    drop3 = Dropout(0.5)(conv3)
    pool3 = MaxPooling1D(pool_size=2)(drop3)
    flat3 = Flatten()(pool3)
    # merge
    merged = concatenate([flat1, flat2, flat3])
    # interpretation
    dense1 = Dense(10, activation='relu')(merged)
    outputs = Dense(1, activation='sigmoid')(dense1)
    model = Model(inputs=[inputs1, inputs2, inputs3], outputs=outputs)
    # compile
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # summarize
    model.summary()
    plot_model(model, show_shapes=True, to_file='model.png')
    return model

# load training dataset
trainLines, trainLabels = load_dataset('train.pkl')
# create tokenizer
tokenizer = create_tokenizer(trainLines)
# calculate max document length
length = max_length(trainLines)
print('Max document length: %d' % length)
# calculate vocabulary size
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary size: %d' % vocab_size)
# encode data
trainX = encode_text(tokenizer, trainLines, length)
# define model
model = define_model(length, vocab_size)
# fit model
model.fit([trainX,trainX,trainX], np.array(trainLabels), epochs=7, batch_size=16)
# save the model
model.save('model.h5')

Max document length: 9785
Vocabulary size: 77727
Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_7 (InputLayer)        [(None, 9785)]               0         []                            
                                                                                                  
 input_8 (InputLayer)        [(None, 9785)]               0         []                            
                                                                                                  
 input_9 (InputLayer)        [(None, 9785)]               0         []                            
                                                                                                  
 embedding_6 (Embedding)     (None, 9785, 100)            7772700   ['input_7[0][0]']             
                                           

  saving_api.save_model(


#### Evaluate Model

In [7]:
from pickle import load
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model

# load a clean dataset
def load_dataset(filename):
    return load(open(filename, 'rb'))

# fit a tokenizer
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

# calculate the maximum document length
def max_length(lines):
    return max([len(s.split()) for s in lines])

# encode a list of lines
def encode_text(tokenizer, lines, length):
    # integer encode
    encoded = tokenizer.texts_to_sequences(lines)
    # pad encoded sequences
    padded = pad_sequences(encoded, maxlen=length, padding='post')
    return padded

# load datasets
trainLines, trainLabels = load_dataset('train.pkl')
testLines, testLabels = load_dataset('test.pkl')
# create tokenizer
tokenizer = create_tokenizer(trainLines)
# calculate max document length
length = max_length(trainLines)
print('Max document length: %d' % length)
# calculate vocabulary size
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary size: %d' % vocab_size)
# encode data
trainX = encode_text(tokenizer, trainLines, length)
testX = encode_text(tokenizer, testLines, length)
# load the model
model = load_model('model.h5')
# evaluate model on training dataset
_, acc = model.evaluate([trainX,trainX,trainX], np.array(trainLabels), verbose=0)
print('Train Accuracy: %.2f' % (acc*100))
# evaluate model on test dataset dataset
_, acc = model.evaluate([testX,testX,testX], np.array(testLabels), verbose=0)
print('Test Accuracy: %.2f' % (acc*100))

Max document length: 9785
Vocabulary size: 77727
Train Accuracy: 99.17
Test Accuracy: 94.82
