# Using Randomized search CV with Keras

The objective of this kernel is to use Randomized Search CV to test different parameters and architectures for a GRU model.  

This kernel is inspired by:
* Miha Skalic's kernel: LSTM is all you need! well, maybe embeddings also.
* Shujian Liu's discussion topic: 3 Methods to combine embeddings
* SRK's kernel: A look at different embeddings.!

## Import of libraries

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib as mp
import matplotlib.pyplot as plt
import os
from sklearn.metrics import confusion_matrix,f1_score
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV,cross_validate,train_test_split,StratifiedKFold
from datetime import datetime

In [None]:
# Import of the Keras libraries
from keras.models import Sequential
from keras.layers import *
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.wrappers.scikit_learn import KerasClassifier

## Data upload

In [None]:
# Load the training and test sets from comma-separated CSV files
train=pd.read_csv("../input/train.csv",sep=',')
test=pd.read_csv("../input/test.csv",sep=',')

##  Preprocessing

In [None]:
# Preprocessing parameters
vocabulary_size = 50000  # number of words kept by the tokenizer (passed as num_words)
vec_size = 70  # length every encoded question is brought to (passed as maxlen)
emb_size = 300  # number of coefficients per embedding vector

In [None]:
# Fit the tokenizer on the training questions, keeping at most vocabulary_size words (num_words)
tokenizer = Tokenizer(num_words=vocabulary_size)
tokenizer.fit_on_texts(train.question_text)
# Transform each question_text into a sequence of integer word indices
sequences = tokenizer.texts_to_sequences(train.question_text)

In [None]:
# Look up the index assigned to the word 'trump' in the tokenizer's word_index
tokenizer.word_index.get('trump')

In [None]:
# The tokenizer also exposes word counts; here, the count recorded for 'trump'
tokenizer.word_counts.get('trump')

In [None]:
# Let's look at the encoded form of the first question
sequences[0]

In [None]:
# pad_sequences brings every question sequence to length vec_size (maxlen)
X = pad_sequences(sequences, maxlen=vec_size)

In [None]:
# Inspect the padded sequence of the first question
X[0]

## Embedding

In [None]:
# get_emb_index is a function that takes a file path and returns the embedding index
def get_emb_index(filepath):
    """Parse a text embedding file into a ``{word: vector}`` dictionary.

    Every kept line is split on single spaces: the first field is the word
    and the remaining fields are converted to a float32 NumPy array.

    Lines of 100 characters or fewer are skipped (presumably to drop the
    short header line some embedding files start with -- TODO confirm).

    Parameters
    ----------
    filepath : str
        Path of the embedding text file.

    Returns
    -------
    dict
        Mapping from word to its float32 coefficient array. If a word occurs
        on several lines, the last occurrence wins.
    """
    embeddings_index = {}
    # `with` guarantees the handle is closed even when a line fails to parse;
    # errors='ignore' drops bytes that cannot be decoded.
    with open(filepath, errors='ignore') as emb_file:
        for line in emb_file:
            if len(line) <= 100:
                continue
            # the first value of the line is the word, the rest are the coefs
            word, *coefs = line.split(" ")
            embeddings_index[word] = np.asarray(coefs, dtype='float32')
    return embeddings_index

In [None]:
# get_emb_matrix is a function that takes an embedding index and returns a coef matrix for the Tokenizer corpus
def get_emb_matrix(embeddings_index):
    """Build the embedding weight matrix for the tokenizer vocabulary.

    Rows are first drawn from a normal distribution whose mean and standard
    deviation are those of all coefficients in ``embeddings_index``. The row
    of every tokenizer word whose index is below ``vocabulary_size`` and that
    is present in ``embeddings_index`` is then replaced by its vector.
    Finally the whole matrix is standardised with that same mean and std.

    Reads the module-level ``tokenizer``, ``vocabulary_size`` and ``emb_size``.

    Parameters
    ----------
    embeddings_index : dict
        Mapping from word to coefficient array, as built by ``get_emb_index``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(vocabulary_size, emb_size)``.
    """
    # np.stack expects a real sequence; recent NumPy rejects a dict view.
    all_embs = np.stack(list(embeddings_index.values()))
    emb_mean, emb_std = all_embs.mean(), all_embs.std()
    # random initialization of the matrix
    embedding_matrix = np.random.normal(emb_mean, emb_std, (vocabulary_size, emb_size))
    for word, index in tokenizer.word_index.items():
        if index >= vocabulary_size:
            # Skip instead of stopping, so the result does not depend on
            # word_index being iterated in increasing index order.
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[index] = embedding_vector
    return (embedding_matrix - emb_mean) / emb_std

In [None]:
%%time
# GloVe embedding: parse the vectors file, then build the matrix for the tokenizer vocabulary
glove_emb_index=get_emb_index('../input/embeddings/glove.840B.300d/glove.840B.300d.txt')
glove_emb_matrix= get_emb_matrix(glove_emb_index)

In [None]:
%%time 
# Wiki-news embedding: parse the vectors file, then build the matrix for the tokenizer vocabulary
wiki_emb_index=get_emb_index('../input/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec')
wiki_emb_matrix= get_emb_matrix(wiki_emb_index)

In [None]:
%%time
# Paragram embedding: parse the vectors file, then build the matrix for the tokenizer vocabulary
paragram_emb_index=get_emb_index('../input/embeddings/paragram_300_sl999/paragram_300_sl999.txt')
paragram_emb_matrix= get_emb_matrix(paragram_emb_index)

In [None]:
# Let's compare the coefficients of 'trump' in the three embeddings (one row per embedding after the transpose)
pd.DataFrame({'glove':glove_emb_index.get('trump'),'paragram' : paragram_emb_index.get('trump'),'wikiNews':wiki_emb_index.get('trump')}).T

In [None]:
# Element-wise average of the three embedding matrices
avg_emb_matrix =(glove_emb_matrix+paragram_emb_matrix+wiki_emb_matrix)/3

### GRU model 

In [None]:
%%time
#Our model : Embedding -> Spatialdropout -> CuDNNGRU -> GlobalMaxPool1D-> Dense -> Dropout -> Dense
def get_model_gru(dropout=0.2,gru_units=64,dense_units=16,kernel_initializer='he_normal', embedding='avg') :
    """Build and compile the bidirectional GRU classifier.

    Layers: Embedding -> SpatialDropout1D -> Bidirectional(CuDNNGRU)
    -> GlobalMaxPool1D -> Dense(relu) -> Dropout -> Dense(1, sigmoid).

    Parameters
    ----------
    dropout : float
        Rate used by both the SpatialDropout1D and the Dropout layers.
    gru_units : int
        Units of the CuDNNGRU layer wrapped in Bidirectional.
    dense_units : int
        Units of the hidden Dense layer.
    kernel_initializer : str
        Kernel initializer of both Dense layers.
    embedding : {'avg', 'glove', 'paragram', 'wiki'}
        Which module-level embedding matrix initialises the Embedding layer.

    Returns
    -------
    Model
        Model compiled with binary cross-entropy loss, the adam optimizer
        and the accuracy metric.

    Raises
    ------
    ValueError
        If ``embedding`` is not one of the supported names.
    """
    # Only the selected matrix is looked up, so the other ones need not exist.
    if embedding == 'avg':
        emb_weights = avg_emb_matrix
    elif embedding == 'glove':
        emb_weights = glove_emb_matrix
    elif embedding == 'paragram':
        emb_weights = paragram_emb_matrix
    elif embedding == 'wiki':
        emb_weights = wiki_emb_matrix
    else:
        # Fail with a clear message instead of an unbound-variable error below.
        raise ValueError(
            "embedding must be one of 'avg', 'glove', 'paragram', 'wiki'; got %r" % (embedding,)
        )

    inp = Input(shape=(vec_size,))
    x = Embedding(vocabulary_size, emb_size, weights=[emb_weights])(inp)
    x = SpatialDropout1D(dropout)(x)
    x = Bidirectional(CuDNNGRU(gru_units, return_sequences=True))(x)
    x = GlobalMaxPool1D()(x)
    x = Dense(dense_units,kernel_initializer=kernel_initializer, activation="relu")(x)
    x = Dropout(dropout)(x)
    x = Dense(1, kernel_initializer=kernel_initializer,activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

## Randomized search CV

In [None]:
# Candidate values for Randomized search CV; the keys are the keyword arguments of get_model_gru
param = {
    'dropout': [0.1,0.2,0.5],
    'gru_units': [16, 32, 64],
    'dense_units': [16, 32, 64],
    'kernel_initializer': ['he_normal','uniform'],
    # Alternative kept for reference: restrict the search to a single initializer
    #'kernel_initializer': ['he_normal'],
    'embedding': ['avg','glove','paragram','wiki']
}

In [None]:
# Wrap the model builder for scikit-learn; epochs and batch_size are fixed here rather than searched
model = KerasClassifier(build_fn=get_model_gru, epochs=3, batch_size=2000,verbose=False)
# Randomized search: 10 sampled parameter combinations (n_iter), 3-fold CV, scored with F1
rnn_model=RandomizedSearchCV(model,n_iter=10, param_distributions=param,cv=3,scoring='f1',return_train_score=True)

In [None]:
%%time
# Quick trial on the first 1000 rows only (kept for reference):
#rnn_model.fit(X[:1000],train.target[:1000])
# Run the search on the full training set
rnn_model.fit(X,train.target)

In [None]:
# Let's see the cross-validation results as a table
pd.DataFrame(rnn_model.cv_results_)

In [None]:
# Predict on the training data with the fitted search object
pred_train=rnn_model.predict(X)

In [None]:
# F1 score and confusion matrix at decision threshold `to`
to=0.50
# Threshold once and reuse, so both metrics are computed from the same labels
pred_train_labels=(pred_train>to).astype(int)
print('F1 score :%s \n'% f1_score(train.target,pred_train_labels) )
print('Confusion matrix \n%s'%confusion_matrix(train.target,pred_train_labels))

In [None]:
# Analyzing the best model's errors: training rows whose target differs from the prediction
train[train.target!=pred_train[:,0]]

## Test data transformation

In [None]:
# Encode the test questions with the tokenizer fitted on the training questions
sequences_t = tokenizer.texts_to_sequences(test.question_text)

In [None]:
# Bring the test sequences to the same length (vec_size) as the training sequences
X_t= pad_sequences(sequences_t, maxlen=vec_size)

## Prediction on test data  and submission

In [None]:
# Build the submission file from the thresholded test predictions
result=pd.read_csv("../input/sample_submission.csv",sep=',')
# Flatten to 1-D and cast to int so the CSV holds 0/1 rather than True/False
pred_res=(np.ravel(rnn_model.predict(X_t))>to).astype(int)
result['prediction']=pred_res
result.to_csv('submission.csv',index=False)