In [1]:
import numpy as np
import pandas as pd
import keras
import keras.backend as K
from keras.layers import Dense, GlobalAveragePooling1D, Embedding
from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [31]:
df = pd.read_csv(r'C:\Users\Sabeeha\Desktop\project\spooky\train.csv',index_col='id')
test = pd.read_csv(r'C:\Users\Sabeeha\Desktop\project\spooky\test.csv',index_col='id')
submission = pd.read_csv(r'C:\Users\Sabeeha\Desktop\project\spooky\sample_submission.csv')
#df = pd.concat([train,test], axis=0,ignore_index=True)

In [32]:
df.head()

Unnamed: 0_level_0,text,author
id,Unnamed: 1_level_1,Unnamed: 2_level_1
id26305,"This process, however, afforded me no means of...",EAP
id17569,It never once occurred to me that the fumbling...,HPL
id11008,"In his left hand was a gold snuff box, from wh...",EAP
id27763,How lovely is spring As we looked from Windsor...,MWS
id12958,"Finding nothing else, not even gold, the Super...",HPL


In [33]:
df.shape

(19579, 2)

In [34]:
 #Split data into train and test
train_size = int(len(df) * .8)
print ("Train size: %d" % train_size)
print ("Test size: %d" % (len(df) - train_size))

Train size: 15663
Test size: 3916


In [35]:
X_train = df['text'][:train_size]
Y_train = df['author'][:train_size]

X_test = df['text'][train_size:]
Y_test = df['author'][train_size:]

In [36]:
# tokenize text
from keras.preprocessing.text import Tokenizer
tokenize = Tokenizer(num_words=1000, char_level=False)
tokenize.fit_on_texts(list(X_train)+list(X_test)) # only fit on train

x_train = tokenize.texts_to_matrix(X_train)
x_test = tokenize.texts_to_matrix(X_test)

In [39]:
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
encoder = LabelEncoder()
encoder.fit(Y_train)
y_train = encoder.transform(Y_train)
y_test = encoder.transform(Y_test)

In [41]:
# Binarize the label for neural net
from keras import utils 
y_train = utils.to_categorical(y_train, num_classes=3)
y_test = utils.to_categorical(y_test, num_classes=3)

In [42]:
# Inspect the dimenstions of our training and test data (this is helpful to debug)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

x_train shape: (15663, 1000)
x_test shape: (3916, 1000)
y_train shape: (15663, 3)
y_test shape: (3916, 3)


In [43]:
# ANN model
from keras.layers import Dense, Activation, Dropout
from keras.layers.normalization import BatchNormalization

# Build the model
model = Sequential()
model.add(Dense(300, input_shape=(1000,)))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(Dense(300, activation='relu'))
model.add(Dropout(0.3))
model.add(BatchNormalization())

# output layer
model.add(Dense(3))
model.add(Activation('softmax'))

# compile
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [None]:
# embeding model
model = Sequential()
model.add(Embedding(input_dim=input_dim, output_dim=embedding_dims))
model.add(GlobalAveragePooling1D())
model.add(Dense(3, activation='softmax'))

model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

In [47]:
history = model.fit(x_train, y_train,
                    batch_size=32,
                    epochs=10,
                    verbose=1,
                    validation_split=0.1)

Train on 14096 samples, validate on 1567 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [48]:
# Evaluate the accuracy of our trained model
score = model.evaluate(x_test, y_test,
                       batch_size=32, 
                       verbose=1)

print('Test score:', score[0])
print('Test accuracy:', score[1])

Test score: 0.8123261775741538
Test accuracy: 0.7132278084754944


In [53]:
# log loss
from sklearn.metrics import log_loss
y_pred = model.predict_proba(x_test)
log_loss(y_test,y_pred)

0.8123261694903681

In [49]:
# Here's how to generate a prediction on individual examples
text_labels = encoder.classes_ 

for i in range(10):
    prediction = model.predict(np.array([x_test[i]]))
    predicted_label = text_labels[np.argmax(prediction)]
    print(test.iloc[i][:50], "...")
    print('Actual label:' + Y_test.iloc[i])
    print("Predicted label: " + predicted_label + "\n")

text    Still, as I urged our leaving Ireland with suc...
Name: id02310, dtype: object ...
Actual label:EAP
Predicted label: MWS

text    If a fire wanted fanning, it could readily be ...
Name: id24541, dtype: object ...
Actual label:EAP
Predicted label: EAP

text    And when they had broken down the frail door t...
Name: id00134, dtype: object ...
Actual label:HPL
Predicted label: HPL

text    While I was thinking how I should possibly man...
Name: id27757, dtype: object ...
Actual label:EAP
Predicted label: MWS

text    I am not sure to what limit his knowledge may ...
Name: id04081, dtype: object ...
Actual label:HPL
Predicted label: HPL

text    "The thick and peculiar mist, or smoke, which ...
Name: id27337, dtype: object ...
Actual label:HPL
Predicted label: EAP

text    That which is not matter, is not at all unless...
Name: id24265, dtype: object ...
Actual label:EAP
Predicted label: EAP

text    I sought for repose although I did not hope fo...
Name: id25917, dtype: object ...

# Embedding layer

Embedding layer creates embedding vectors out of the input words, similarly like word2vec or precalculated glove would do.

__input_dim__ : the vocabulary size. This is how many unique words are represented in your corpus.

__output_dim__ : the desired dimension of the word vector. For example, if output_dim = 100, then every word will be mapped onto a vector with 100 elements.

__input_length__ : the length of your sequences. For example, if your data consists of sentences, then this variable represents how many words there are in a sentence. As disparate sentences typically contain different number of words, it is usually required to pad your sequences such that all sentences are of equal length. The keras.preprocessing.pad_sequence method can be used for this

In [86]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences 
from keras.utils import to_categorical

#texts = ['This is a text','This is not a text']

texts = [
   # Positive Reviews

    'This is an excellent movie',
    'The move was fantastic I like it',
    'You should watch it is brilliant',
    'Exceptionally good',
    'Wonderfully directed and executed I like it',
    'Its a fantastic series',
    'Never watched such a brillent movie',
    'It is a Wonderful movie',

    # Negtive Reviews

    "horrible acting",
    'waste of money',
    'pathetic picture',
    'It was very boring',
    'I did not like the movie',
    'The movie was horrible',
    'I will not recommend',
    'The acting is pathe']


#num_words is tne number of unique words in the sequence, if there's more top count words are taken
tokenizer = Tokenizer(num_words=10)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
tokenizer.word_index

{'movie': 1,
 'it': 2,
 'is': 3,
 'the': 4,
 'i': 5,
 'was': 6,
 'like': 7,
 'a': 8,
 'fantastic': 9,
 'horrible': 10,
 'acting': 11,
 'not': 12,
 'this': 13,
 'an': 14,
 'excellent': 15,
 'move': 16,
 'you': 17,
 'should': 18,
 'watch': 19,
 'brilliant': 20,
 'exceptionally': 21,
 'good': 22,
 'wonderfully': 23,
 'directed': 24,
 'and': 25,
 'executed': 26,
 'its': 27,
 'series': 28,
 'never': 29,
 'watched': 30,
 'such': 31,
 'brillent': 32,
 'wonderful': 33,
 'waste': 34,
 'of': 35,
 'money': 36,
 'pathetic': 37,
 'picture': 38,
 'very': 39,
 'boring': 40,
 'did': 41,
 'will': 42,
 'recommend': 43,
 'pathe': 44}

In [87]:
# define vocabulary size (total number of unique word in corpus)
input_dim = len(tokenizer.word_index) + 1
input_dim

45

In [88]:
# input length --> maximum length of input documents
max_length = max([len(s.split()) for s in texts])
max_length

7

In [90]:
# We add padding to make all the vectors of same length
data = pad_sequences(sequences, max_length)
print('Shape of data tensor:', data.shape)
print(data)

Shape of data tensor: (16, 7)
[[0 0 0 0 0 3 1]
 [0 4 6 9 5 7 2]
 [0 0 0 0 0 2 3]
 [0 0 0 0 0 0 0]
 [0 0 0 0 5 7 2]
 [0 0 0 0 0 8 9]
 [0 0 0 0 0 8 1]
 [0 0 0 2 3 8 1]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 2 6]
 [0 0 0 5 7 4 1]
 [0 0 0 0 4 1 6]
 [0 0 0 0 0 0 5]
 [0 0 0 0 0 4 3]]


In [89]:
embedding_vecor_length = 4

In [91]:
from keras.models import Sequential
from keras.layers import Embedding

model = Sequential()
model.add(Embedding(input_dim=top_words, output_dim=embedding_vecor_length, input_length=max_length,mask_zero=True))
model.compile(optimizer='adam', loss='categorical_crossentropy')

output_array = model.predict(data)

In [92]:
output_array

array([[[-0.00111023, -0.01840474,  0.02020403, -0.02580997],
        [-0.00111023, -0.01840474,  0.02020403, -0.02580997],
        [-0.00111023, -0.01840474,  0.02020403, -0.02580997],
        [-0.00111023, -0.01840474,  0.02020403, -0.02580997],
        [-0.00111023, -0.01840474,  0.02020403, -0.02580997],
        [-0.00634675,  0.03606481,  0.04974747,  0.03115224],
        [-0.0340276 , -0.03345232, -0.01004336, -0.02263137]],

       [[-0.00111023, -0.01840474,  0.02020403, -0.02580997],
        [-0.01352594, -0.01442224, -0.02554727,  0.04339388],
        [ 0.02604628,  0.02247224,  0.01502315, -0.03371266],
        [-0.0202619 , -0.03797339, -0.02806621, -0.03407003],
        [ 0.03944676,  0.00259546, -0.00703113, -0.01197033],
        [-0.03315636, -0.00247096,  0.02489186,  0.00848497],
        [-0.0041375 , -0.03498287,  0.02002242, -0.03391895]],

       [[-0.00111023, -0.01840474,  0.02020403, -0.02580997],
        [-0.00111023, -0.01840474,  0.02020403, -0.02580997],
    