## Text Classification

### Develop an Embedding + CNN Model for Sentiment Analysis

##### preparation data from dataset

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os

In [2]:
df= pd.read_csv(r"D:\NLP\Deep_Learning_in_NLP\phishing_detection\Phishing_Email.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,Email Text,Email Type
0,0,"re : 6 . 1100 , disc : uniformitarianism , re ...",Safe Email
1,1,the other side of * galicismos * * galicismo *...,Safe Email
2,2,re : equistar deal tickets are you still avail...,Safe Email
3,3,\nHello I am your hot lil horny toy.\n I am...,Phishing Email
4,4,software at incredibly low prices ( 86 % lower...,Phishing Email


In [3]:
df = df.dropna()
print(df.isna().sum())

Unnamed: 0    0
Email Text    0
Email Type    0
dtype: int64


In [4]:
email_type_counts = df['Email Type'].value_counts()
print(email_type_counts)

Email Type
Safe Email        11322
Phishing Email     7312
Name: count, dtype: int64


In [5]:
Safe_Email = df[df["Email Type"]== "Safe Email"]
Phishing_Email = df[df["Email Type"]== "Phishing Email"]
Safe_Email = Safe_Email.sample(Phishing_Email.shape[0])

In [6]:
Data= pd.concat([Safe_Email, Phishing_Email], ignore_index = True)
Data.head()

Unnamed: 0.1,Unnamed: 0,Email Text,Email Type
0,17160,"On Wed, 28 Aug 2002, Matthew Cline wrote:> The...",Safe Email
1,11682,empty,Safe Email
2,14453,"On Mon, 2 Sep 2002, Adam L. Beberg wrote:> Bat...",Safe Email
3,3395,URL: http://jeremy.zawodny.com/blog/archives/0...,Safe Email
4,12583,sum : recursos para el espanol ( spanish resou...,Safe Email


#### convert the above Data similar to the book data

In [None]:
# Create a directory to store .txt files
clean_dir = r"D:\NLP\Deep_Learning_in_NLP\Clean"
phishing_dir = r"D:\NLP\Deep_Learning_in_NLP\Phishing"
os.makedirs(clean_dir, exist_ok=True)
os.makedirs(phishing_dir, exist_ok=True)

# Iterate through the column and write to .txt files
for i in range(len(Data)):

    #print(i)
    if Data['Email Type'][i] == 'Safe Email':

        # Define the file name for the .txt file
        filename = os.path.join(clean_dir, f"record_{Data['Unnamed: 0'][i]}.txt")
    
        # Write the record to the .txt file
        with open(filename, 'w', encoding="utf-8") as txt_file:
            txt_file.write(str(Data['Email Text'][i]))
    else:
        # Define the file name for the .txt file
        filename = os.path.join(phishing_dir, f"record_{Data['Unnamed: 0'][i]}.txt")

        # Write the record to the .txt file
        with open(filename, 'w', encoding="utf-8") as txt_file:
           txt_file.write(str(Data['Email Text'][i])) 

#### Data Preparation

In [8]:
import string
import re
from os import listdir
from collections import Counter
from nltk.corpus import stopwords


############ Loading and Cleaning Emails

# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, 'r', encoding="utf-8")
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text

# turn a doc into clean tokens
def clean_doc(doc):
    # split into tokens by white space
    tokens = doc.split()
    # prepare regex for char filtering
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
    # remove punctuation from each word
    tokens = [re_punc.sub('', w) for w in tokens]
    # remove remaining tokens that are not alphabetic
    tokens = [word for word in tokens if word.isalpha()]
    # filter out stop words
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if not w in stop_words]
    # filter out short tokens
    tokens = [word for word in tokens if len(word) > 1]
    return tokens


# load doc and add to vocab
def add_doc_to_vocab(filename, vocab):
    # load doc
    doc = load_doc(filename)
    # clean doc
    tokens = clean_doc(doc)
    # update counts
    vocab.update(tokens)

# load all docs in a directory
def process_docs(directory, vocab):
    # walk through all files in the folder
    for filename in listdir(directory):
        # skip any reviews in the test set
        if filename.startswith('record_8') or filename.startswith('record_9'):
            continue
        # create the full path of the file to open
        path = directory + '/' + filename
        # add doc to vocab
        add_doc_to_vocab(path, vocab)

def save_list(lines, filename):
    # convert lines to a single blob of text
    data = '\n'.join(lines)
    # open file
    file = open(filename, 'w')
    # write text
    file.write(data)
    # close file
    file.close()



################### Define a Vocabulary

vocab = Counter()
# add all docs to vocab
process_docs(r"D:\NLP\Deep_Learning_in_NLP\phishing_detection\Clean", vocab)
process_docs(r"D:\NLP\Deep_Learning_in_NLP\phishing_detection\Phishing", vocab)
# print the size of the vocab
print(len(vocab))
# keep tokens with a min occurrence
min_occurane = 2
tokens = [k for k,c in vocab.items() if c >= min_occurane]
print(len(tokens))
# save tokens to a vocabulary file
save_list(tokens, 'vocab.txt')

87350
42264


#### Train CNN With Embedding Layer

In [9]:
import string
import re
from os import listdir
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import plot_model
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers import Conv1D
from keras.layers import MaxPooling1D

# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, 'r', encoding="latin-1")
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text

# turn a doc into clean tokens
def clean_doc(doc, vocab):
    # split into tokens by white space
    tokens = doc.split()
    # prepare regex for char filtering
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
    # remove punctuation from each word    
    tokens = [re_punc.sub('', w) for w in tokens]
    # filter out tokens not in vocab
    tokens = [w for w in tokens if w in vocab]
    tokens = ' '.join(tokens)
    return tokens


# load all docs in a directory
def process_docs(directory, vocab, is_train):
    documents = list()
    # walk through all files in the folder
    for filename in listdir(directory):
        # skip any reviews in the test set
        if is_train and filename.startswith('record_8') or filename.startswith('record_9'):
            continue
        if not is_train and not (filename.startswith('record_8') or filename.startswith('record_9')):
            continue
        # create the full path of the file to open
        path = directory + '/' + filename
        # load the doc
        doc = load_doc(path)
        # clean doc
        tokens = clean_doc(doc, vocab)
        # add to list
        documents.append(tokens)
    return documents

# load and clean a dataset
def load_clean_dataset(vocab, is_train):
    # load documents
    phishing = process_docs(r"D:\NLP\Deep_Learning_in_NLP\phishing_detection\Phishing", vocab, is_train)
    clean = process_docs(r"D:\NLP\Deep_Learning_in_NLP\phishing_detection\Clean", vocab, is_train)
    docs = phishing + clean
    # prepare labels
    labels = [0 for _ in range(len(phishing))] + [1 for _ in range(len(clean))]
    return docs, labels
    
# fit a tokenizer
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer


# integer encode and pad documents
def encode_docs(tokenizer, max_length, docs):
    # integer encode
    encoded = tokenizer.texts_to_sequences(docs)
    # pad sequences
    padded = pad_sequences(encoded, maxlen=max_length, padding='post')
    return padded


# define the model
def define_model(vocab_size, max_length):
    # define network
    model = Sequential()
    model.add(Embedding(vocab_size, 100, input_length=max_length))
    model.add(Conv1D(filters=32, kernel_size=8, activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Flatten())
    model.add(Dense(10, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    # compile network
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # summarize defined model
    model.summary()
    plot_model(model, to_file='model.png', show_shapes=True)
    return model


# load the vocabulary
vocab_filename = 'vocab.txt'
vocab = load_doc(vocab_filename)
vocab = vocab.split()
vocab = set(vocab)
# load all reviews
train_docs, ytrain = load_clean_dataset(vocab, True)
# create the tokenizer
tokenizer = create_tokenizer(train_docs)
# define vocabulary size
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary size: %d' % vocab_size)
# calculate the maximum sequence length
max_length = max([len(s.split()) for s in train_docs])
print('Maximum length: %d' % max_length)
# encode data
Xtrain = encode_docs(tokenizer, max_length, train_docs)
# define model
model = define_model(vocab_size, max_length)
# fit network
model.fit(Xtrain, np.array(ytrain), epochs=10, verbose=2)
# save the model
model.save('model.h5')

Vocabulary size: 36730
Maximum length: 9683
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 9683, 100)         3673000   
                                                                 
 conv1d (Conv1D)             (None, 9676, 32)          25632     
                                                                 
 max_pooling1d (MaxPooling1  (None, 4838, 32)          0         
 D)                                                              
                                                                 
 flatten (Flatten)           (None, 154816)            0         
                                                                 
 dense (Dense)               (None, 10)                1548170   
                                                                 
 dense_1 (Dense)             (None, 1)                 11        
            

  saving_api.save_model(


#### Evaluate Model

In [10]:
## making a prediction on new text data

import string
import re
from os import listdir
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model

# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, 'r', encoding="latin-1")
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text

# turn a doc into clean tokens
def clean_doc(doc, vocab):
    # split into tokens by white space
    tokens = doc.split()
    # prepare regex for char filtering
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
    # remove punctuation from each word    
    tokens = [re_punc.sub('', w) for w in tokens]
    # filter out tokens not in vocab
    tokens = [w for w in tokens if w in vocab]
    tokens = ' '.join(tokens)
    return tokens


# load all docs in a directory
def process_docs(directory, vocab, is_train):
    documents = list()
    # walk through all files in the folder
    for filename in listdir(directory):
        # skip any reviews in the test set
        if is_train and filename.startswith('record_8') or filename.startswith('record_9'):
            continue
        if not is_train and not (filename.startswith('record_8') or filename.startswith('record_9')):
            continue
        # create the full path of the file to open
        path = directory + '/' + filename
        # load the doc
        doc = load_doc(path)
        # clean doc
        tokens = clean_doc(doc, vocab)
        # add to list
        documents.append(tokens)
    return documents

# load and clean a dataset
def load_clean_dataset(vocab, is_train):
    # load documents
    phishing = process_docs(r"D:\NLP\Deep_Learning_in_NLP\phishing_detection\Phishing", vocab, is_train)
    clean = process_docs(r"D:\NLP\Deep_Learning_in_NLP\phishing_detection\Clean", vocab, is_train)
    docs = phishing + clean
    # prepare labels
    labels = [0 for _ in range(len(phishing))] + [1 for _ in range(len(clean))]
    return docs, labels
    
# fit a tokenizer
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

# integer encode and pad documents
def encode_docs(tokenizer, max_length, docs):
    # integer encode
    encoded = tokenizer.texts_to_sequences(docs)
    # pad sequences
    padded = pad_sequences(encoded, maxlen=max_length, padding='post')
    return padded


# classify a review as negative or positive
def predict_sentiment(review, vocab, tokenizer, max_length, model):
    # clean review
    line = clean_doc(review, vocab)
    # encode and pad review
    padded = encode_docs(tokenizer, max_length, [line])
    # predict sentiment
    yhat = model.predict(padded, verbose=0)
    # retrieve predicted percentage and label
    percent_pos = yhat[0,0]
    if round(percent_pos) == 0:
        return (1-percent_pos), 'Phishing'
    return percent_pos, 'safe'


# load the vocabulary
vocab_filename = 'vocab.txt'
vocab = load_doc(vocab_filename)
vocab = vocab.split()
vocab = set(vocab)
# load all reviews
train_docs, ytrain = load_clean_dataset(vocab, True)
test_docs, ytest = load_clean_dataset(vocab, False)
# create the tokenizer
tokenizer = create_tokenizer(train_docs)
# define vocabulary size
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary size: %d' % vocab_size)
# calculate the maximum sequence length
max_length = max([len(s.split()) for s in train_docs])
print('Maximum length: %d' % max_length)
# encode data
Xtrain = encode_docs(tokenizer, max_length, train_docs)
Xtest = encode_docs(tokenizer, max_length, test_docs)
# load the model
model = load_model('model.h5')
# evaluate model on training dataset
_, acc = model.evaluate(Xtrain, np.array(ytrain), verbose=0)
print('Train Accuracy: %.2f' % (acc*100))
# evaluate model on test dataset
_, acc = model.evaluate(Xtest, np.array(ytest), verbose=0)
print('Test Accuracy: %.2f' % (acc*100))
# test positive text
text = '''guaranteed satisfaction , cheapest pr ) escription drug / s on the net !'''
percent, sentiment = predict_sentiment(text, vocab, tokenizer, max_length, model)
print('Review: [%s]\nSentiment: %s (%.3f%%)' % (text, sentiment, percent*100))
# test negative text
text = '''re : 6 . 933 , misc : english only , a footnote on banning of german a brief comment on on banning german in the us midwest during wartime : kurt vonnegut 's semi-autobiographical _ slapstick _ mentions how his german - speaking family self-censored the german out of their speech , music , etc . loren billings billings @ princeton . edu billings @ pucc . bitnet'''
percent, sentiment = predict_sentiment(text, vocab, tokenizer, max_length, model)
print('Review: [%s]\nSentiment: %s (%.3f%%)' % (text, sentiment, percent*100))

Vocabulary size: 36730
Maximum length: 9683
Train Accuracy: 99.00
Test Accuracy: 95.16
Review: [guaranteed satisfaction , cheapest pr ) escription drug / s on the net !]
Sentiment: Phishing (82.319%)
Review: [re : 6 . 933 , misc : english only , a footnote on banning of german a brief comment on on banning german in the us midwest during wartime : kurt vonnegut 's semi-autobiographical _ slapstick _ mentions how his german - speaking family self-censored the german out of their speech , music , etc . loren billings billings @ princeton . edu billings @ pucc . bitnet]
Sentiment: safe (100.000%)
