In [85]:

import os

import nltk
import numpy as np
import tensorflow as tf
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Dense, LSTM,Dropout, Activation,Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

stop_words = set(stopwords.words('english'))

In [2]:
def seperate_sentences(data):
    '''
    Break a whole text into a list of sentences.

    data is the raw text as a single string; the result is whatever
    nltk's sent_tokenize returns for it.
    '''
    sentences = sent_tokenize(data)
    return sentences

In [3]:
def Sentence_to_POS(all_sentences):
    '''
    Convert sentences into sequences of part-of-speech (POS) tags.

    For every sentence the text is tokenized into words and punctuation,
    English stop words are dropped, the remaining tokens are POS-tagged and
    only the tags are kept.

    all_sentences : iterable of sentence strings.
    Returns a list holding, for each input sentence (in order), the list of
    POS-tag strings of its remaining tokens.
    '''
    data = []
    for sentence in all_sentences:
        # Word tokenizer: finds the words and punctuation in a string.
        wordsList = nltk.word_tokenize(sentence)

        # Remove stop words. NLTK's English stop-word list is lowercase, so the
        # token is lowercased for the lookup; otherwise capitalised stop words
        # such as "The" or "I" would slip through the filter.
        wordsList = [w for w in wordsList if w.lower() not in stop_words]

        # POS-tag the remaining tokens; each item is a (token, tag) pair and
        # only the tag is kept.
        tagged = nltk.pos_tag(wordsList)
        data.append([tag for _, tag in tagged])

    return data

In [13]:
def Fianl_X_and_Y(novel_POS,novel_label,number_of_sentenes_in_one):
    '''
    Generate more training data by breaking each novel into several smaller
    "novels" of number_of_sentenes_in_one consecutive sentences each.

    Trailing sentences that do not fill a complete chunk are ignored, so a
    novel shorter than the chunk size contributes nothing.

    novel_POS  : list of novels, each a list of sentences.
    novel_label: list of labels; novel_label[i] belongs to novel_POS[i].
    number_of_sentenes_in_one: positive chunk size, in sentences.

    Returns (X, Y): X is the list of chunks, Y the label of the novel each
    chunk was cut from.

    Raises ValueError if the chunk size is smaller than 1 (a non-positive
    size would otherwise never advance and loop forever).
    '''
    if number_of_sentenes_in_one < 1:
        raise ValueError('number_of_sentenes_in_one must be a positive integer')

    X=[]
    Y=[]
    for i, novel in enumerate(novel_POS):
        # Start offsets of every chunk that fits completely inside the novel.
        last_start = len(novel) - number_of_sentenes_in_one
        for start in range(0, last_start + 1, number_of_sentenes_in_one):
            X.append(novel[start:start + number_of_sentenes_in_one])
            Y.append(novel_label[i])
    return X,Y

In [5]:
def Get_X_and_Y_In_POS_Form(path_of_dataset_directory,number_of_sentences_in_one):
    '''
    Build the samples X and labels Y from a directory of text files.

    Every regular file in path_of_dataset_directory is read, split into
    sentences, converted to POS-tag sequences and finally cut into chunks of
    number_of_sentences_in_one sentences by Fianl_X_and_Y.

    The label of a file is the character at index 8 of its file name, so
    every file name must be at least 9 characters long.

    Returns the (X, Y) pair produced by Fianl_X_and_Y.
    '''
    novel_POS=[]
    novel_label=[]
    # os.listdir returns entries in arbitrary order; sorting keeps the sample
    # order (and therefore any later positional train/validation split) stable.
    for filename in sorted(os.listdir(path_of_dataset_directory)):
        # Build the path from the argument instead of a hard-coded directory.
        full_path = os.path.join(path_of_dataset_directory, filename)
        if not os.path.isfile(full_path):
            # Sub-directories cannot be opened as text files; skip them.
            continue
        with open(full_path, 'r') as f:
            data = f.read().replace('"\n"','').replace('\n',' ').replace('- ','')
        all_sentences = seperate_sentences(data)
        novel_POS.append(Sentence_to_POS(all_sentences))
        novel_label.append(filename[8])
    return Fianl_X_and_Y(novel_POS,novel_label,number_of_sentences_in_one)


In [6]:
def tag_to_index_dictionary(X):
    '''
    Create a dictionary mapping every POS tag found in the dataset to an
    integer index, because the network works with numbers.

    X is a list of documents, each a list of sentences, each a list of tags.
    Index 0 is reserved for the padding entry '-PAD-'; the tags get the
    indices 1..n in sorted order, so the mapping is identical on every run
    (set iteration order for strings can change between interpreter runs).

    Returns the tag -> index dictionary, including '-PAD-'.
    '''
    tags = {word for doc in X for sentence in doc for word in sentence}
    tag2index = {t: i + 1 for i, t in enumerate(sorted(tags))}
    tag2index['-PAD-'] = 0
    return tag2index

In [7]:
def convert_tag_to_sequence_numbers(X, tag2index=None):
    '''
    Replace every POS tag in X by its integer index.

    X is a list of documents, each a list of sentences, each a list of tags;
    the result has the same nesting with integers in place of tags.

    tag2index is an optional tag -> index dictionary. When it is omitted the
    dictionary is built from X itself with tag_to_index_dictionary (the
    original behaviour). Passing one allows the mapping built on one dataset
    to be reused on another; a tag missing from it raises KeyError.
    '''
    if tag2index is None:
        tag2index = tag_to_index_dictionary(X)
    return [
        [[tag2index[word] for word in sentence] for sentence in doc]
        for doc in X
    ]

In [8]:
def pad_zeros_to_sequence(X,max_length):
    '''
    Make every sentence sequence exactly max_length entries long.

    Each document in X (a list of integer sequences) is handed to
    pad_sequences with padding='post', i.e. zeros are appended at the end of
    shorter sequences. The result is a list with one padded array per document.
    According to the paper, 15 is the best value for max_length.
    '''
    return [
        pad_sequences(document, maxlen=max_length, padding='post')
        for document in X
    ]

In [9]:
def Encode_Labels(Y):
    '''
    Encode the labels, i.e. assign an integer to every distinct label.

    A fresh LabelEncoder is fitted on Y and the transformed labels are returned.
    '''
    encoder = LabelEncoder()
    encoded = encoder.fit_transform(Y)
    return encoded

In [102]:
# Read the dataset directory and cut every file into chunks of 100 sentences (in POS-tag form).
X,Y = Get_X_and_Y_In_POS_Form('dataset/three_author_dataset',100)

In [103]:
# Number of chunks per label, to check how balanced the classes are.
np.unique(Y,return_counts=True)

(array(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
        'N'], dtype='<U1'),
 array([ 95, 264, 191, 125, 180,  72, 100, 159,  89, 104,  79,  47,  60,
         74]))

In [106]:
# Vocabulary size: the distinct POS tags found in X plus the '-PAD-' entry (index 0).
DICTIONARY_LENGTH = len(tag_to_index_dictionary(X))
# Length every sentence sequence is padded to (passed as max_length to pad_zeros_to_sequence).
MAX_LENGTH = 30

In [107]:
# Replace each POS-tag string by its integer index.
# NOTE: X is rebound here, so re-running this cell alone would re-index the integers; reload X first.
X = convert_tag_to_sequence_numbers(X)

In [108]:
# Pad every sentence sequence to MAX_LENGTH entries; X becomes a list with one array per document.
X = pad_zeros_to_sequence(X,MAX_LENGTH)

In [109]:
# Stack the per-document arrays into a single NumPy array.
X = np.array(X)

In [110]:
# Turn the label characters into integer class ids.
Y = Encode_Labels(Y) 

In [123]:
print(X.shape)
# Flatten each document's (sentences, tags) grid into one long sequence.
# The sizes are taken from X itself rather than hard-coded (previously 1639 and 100*30),
# so the cell keeps working when the dataset, chunk size or MAX_LENGTH changes.
X_train = X.reshape(X.shape[0], -1)
   

(1639, 100, 30)


In [125]:
# One-hot encode the integer class ids, as required by the categorical cross-entropy loss.
# (to_categorical is imported in the import cell at the top of the notebook.)
y_binary = to_categorical(Y)
y_binary.shape

(1639, 14)

In [None]:
# Sequence length and number of classes are read from the prepared arrays
# instead of being hard-coded (previously 3000 and 14).
sequence_length = X_train.shape[1]
number_of_classes = y_binary.shape[1]

model = Sequential()
model.add(Embedding(DICTIONARY_LENGTH, 100, input_length=sequence_length))
model.add(LSTM(100, recurrent_dropout=0.2,return_sequences=True,activation='relu'))
model.add(Bidirectional(LSTM(100)))
model.add(Dense(number_of_classes, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='nadam', metrics=['accuracy'])

# validation_split holds out the LAST 20% of the rows, and the rows are ordered
# file by file, so without shuffling the validation set would come from only the
# last few files. Shuffle once with a fixed seed so the split is mixed and repeatable.
shuffled_rows = np.random.RandomState(0).permutation(X_train.shape[0])
model.fit(X_train[shuffled_rows], y_binary[shuffled_rows], validation_split=0.2, epochs=100)

Train on 1311 samples, validate on 328 samples
Epoch 1/100


(21, 80, 15)