In [1]:
%matplotlib inline
import os
import zipfile
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
from scipy.spatial.distance import cdist
from sklearn.model_selection import train_test_split

In [2]:
from keras.models import Sequential
from keras.layers import Dense, GRU, Embedding
from keras.optimizers import Adam
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [3]:
# Display the version string of the imported TensorFlow package
tf.__version__

'1.3.0'

In [4]:
# --- Locations on disk ---
# Paths are built relative to BASE_DIR.
BASE_DIR = './'
TEXT_DATA_DIR = '{}Data/'.format(BASE_DIR)

# --- Input format ---
# When true, load_data discards the first line of every file it reads.
Header = True

# --- Train/validation split ---
RANDOM_SEED = 42          # passed to train_test_split as random_state
VALIDATION_SPLIT = 0.1    # passed to train_test_split as test_size

# Load Data

In [5]:
# read data
def load_data(file_name,_type):
    """Read a comma-separated text file from ``TEXT_DATA_DIR``.

    Lines are split by hand on their leading commas (no CSV quoting rules are
    applied), so the trailing text column may itself contain commas. The first
    column is ignored and every single-quote character is removed from the
    text. When the module-level ``Header`` flag is true the first line of the
    file is skipped. Empty or whitespace-only lines are ignored.

    Args:
        file_name: Name of the file inside ``TEXT_DATA_DIR``.
        _type: ``'train'`` for ``<ignored>,<label>,<text>`` rows; any other
            value for ``<ignored>,<text>`` rows.

    Returns:
        ``(x, y)`` -- the list of texts and the list of labels (kept as
        strings) when ``_type == 'train'``; otherwise only the list of
        texts ``x``.

    Raises:
        ValueError: If a non-blank line has fewer columns than expected.
    """
    is_train = _type == 'train'
    x = []
    y = []

    with open(os.path.join(TEXT_DATA_DIR, file_name), "r", encoding='utf-8') as f:
        if Header:
            # The None default keeps an empty file from raising StopIteration.
            next(f, None)
        for line in f:
            line = line.rstrip('\n')
            if not line.strip():
                # A blank line (e.g. a stray newline at end of file) holds no
                # record and would otherwise break the unpacking below.
                continue
            if is_train:
                _, temp_y, temp_x = line.split(',', 2)
                y.append(temp_y)
            else:
                _, temp_x = line.split(',', 1)
            x.append(temp_x.replace("'", ""))

    if is_train:
        return x, y
    return x
            
# Labelled texts: x holds the texts, y the labels as an int8 array
x, y = load_data('train.csv', 'train')
y = np.asarray(y, dtype='int8')

# Split the labelled data into train and validation sets, stratified on the
# label; the labels handed to the split are converted to int32
data_train, data_val, labels_train, labels_val = train_test_split(
    x,
    y.astype('int32'),
    test_size=VALIDATION_SPLIT,
    random_state=RANDOM_SEED,
    stratify=y,
)

# Unlabelled texts read from the test file
test = load_data('test.csv', 'test')

In [6]:
# Dictionary size given to the tokenizer and fixed length of an encoded sentence
MAX_NB_WORDS = 10000
MAX_SEQUENCE_LENGTH = 40

print('Original sentence:',data_train[0])

# Build the dictionary from the training texts only
tokenizer = Tokenizer(
    num_words=MAX_NB_WORDS,
    filters='#$%&()*+-/:;<=>@[\\]^{|}~\t\n,.!"',
)
tokenizer.fit_on_texts(data_train)

# Replace words with their indexes from our dictionary
X_train, X_val = [
    tokenizer.texts_to_sequences(texts) for texts in (data_train, data_val)
]

print("Sentence in indexes:\n ", X_train[0])

# Bring every sentence to MAX_SEQUENCE_LENGTH indexes
X_train, X_val = [
    pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    for sequences in (X_train, X_val)
]

print("Sentence fitted to max length:\n", X_train[0])


Original sentence: @user   bihday greg t
Sentence in indexes:
  [1, 61, 9833, 651]
Sentence fitted to max length:
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    1   61 9833  651]


In [7]:
# Look up the word "is" in three tokenizer tables, in this order:
# word_counts, word_index, word_docs
for table in (tokenizer.word_counts, tokenizer.word_index, tokenizer.word_docs):
    print(table['is'])

3759
11
3458


## Embeddings

In [8]:
# Location of the embedding file (a text member inside a zip archive)
EMBEDDINGS_DIR= BASE_DIR + 'Embeddings'
ZIP_FILE='glove.6B.zip'
EMBEDDINGS_FILE='glove.6B.50d.txt'

# Width of the embedding matrix; must match the vector length stored in
# EMBEDDINGS_FILE (the "50d" file is used here)
EMBEDDING_DIM = 50

# Keep only the words whose index is below the tokenizer's num_words.
# The bound is read from the tokenizer rather than written as a literal so it
# always agrees with the number of rows of the embedding matrix built from it.
first = {k: v for k, v in tokenizer.word_index.items() if v < tokenizer.num_words}

# Load embeddings: each line holds a word followed by its coefficients
embeddings = {}
with zipfile.ZipFile(os.path.join(EMBEDDINGS_DIR, ZIP_FILE)) as myzip:
    with myzip.open(EMBEDDINGS_FILE) as f:
        for line in f:
            # The archive member yields bytes; only the word needs decoding
            values = line.split()
            if not values:
                # Skip empty lines instead of failing on values[0]
                continue
            word = values[0].decode('UTF-8')
            embeddings[word] = np.asarray(values[1:], dtype='float32')

print("Number of words with vector representation:", len(embeddings))

Number of words with vector representation: 400000


In [9]:
# Build the embedding matrix: one row per dictionary index, initialised to zero

embeddings_matrix = np.zeros((tokenizer.num_words, EMBEDDING_DIM))
for word, i in first.items():
    embedding_vector = embeddings.get(word)
    if embedding_vector is None:
        # No pre-trained vector for this word: its row stays all-zero
        continue
    embeddings_matrix[i] = embedding_vector

## Recurrent Neural Network

In [10]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Embedding
from keras.layers import SimpleRNN
from keras.callbacks import ModelCheckpoint, EarlyStopping, TensorBoard

Name = "SimpleRNN"

# Embedding layer initialisation: weights come from embeddings_matrix and are
# frozen (trainable=False)
embedding_layer = Embedding(
    tokenizer.num_words,
    EMBEDDING_DIM,
    weights=[embeddings_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)

# Layer stack: embedding -> SimpleRNN(100) -> Dense(1) -> sigmoid
model = Sequential([
    embedding_layer,
    SimpleRNN(100),
    Dense(1),
    Activation('sigmoid'),
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)


In [11]:
# epochs is spelled out because the recorded run trained for 10 epochs and the
# default value of this argument differs between Keras releases;
# validation_data is passed as the documented (inputs, targets) tuple.
model.fit(X_train, labels_train, epochs=10, validation_data=(X_val, labels_val))

Train on 28765 samples, validate on 3197 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fe4cd5c3550>