In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import tensorflow as tf
import tensorflow_hub as hub #to re use existing models of ML
import keras
import keras.backend as K
from keras.layers import *
from keras.callbacks import *
from keras.optimizers import *
from keras import Model

import pickle    
import os

def save_obj(obj, name ):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
def load_obj(name ):
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)
                
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

/kaggle/input/universalsentenceencoderlarge4/saved_model.pb
/kaggle/input/universalsentenceencoderlarge4/variables/variables.data-00000-of-00001
/kaggle/input/universalsentenceencoderlarge4/variables/variables.index
/kaggle/input/google-quest-challenge/train.csv
/kaggle/input/google-quest-challenge/sample_submission.csv
/kaggle/input/google-quest-challenge/test.csv


Using TensorFlow backend.


In [2]:
train = pd.read_csv('/kaggle/input/google-quest-challenge/train.csv')
test = pd.read_csv('/kaggle/input/google-quest-challenge/test.csv')
submission = pd.read_csv('/kaggle/input/google-quest-challenge/sample_submission.csv')

module_url = "/kaggle/input/universalsentenceencoderlarge4/"
embed = hub.load(module_url)
# For the keras Lambda
def UniversalEmbedding(x):
    results = embed(tf.squeeze(tf.cast(x, tf.string)))["outputs"]
    print(results)
    return keras.backend.concatenate([results])

In [3]:
# setup training data
targets = [
        'question_asker_intent_understanding',
        'question_body_critical',
        'question_conversational',
        'question_expect_short_answer',
        'question_fact_seeking',
        'question_has_commonly_accepted_answer',
        'question_interestingness_others',
        'question_interestingness_self',
        'question_multi_intent',
        'question_not_really_a_question',
        'question_opinion_seeking',
        'question_type_choice',
        'question_type_compare',
        'question_type_consequence',
        'question_type_definition',
        'question_type_entity',
        'question_type_instructions',
        'question_type_procedure',
        'question_type_reason_explanation',
        'question_type_spelling',
        'question_well_written',
        'answer_helpful',
        'answer_level_of_information',
        'answer_plausible',
        'answer_relevance',
        'answer_satisfaction',
        'answer_type_instructions',
        'answer_type_procedure',
        'answer_type_reason_explanation',
        'answer_well_written'    
    ]

input_columns = ['question_title','question_body','answer']

X1 = train[input_columns[0]].values.tolist()
X2 = train[input_columns[1]].values.tolist()
X3 = train[input_columns[2]].values.tolist()
X1 = [x.replace('?','.').replace('!','.') for x in X1]
X2 = [x.replace('?','.').replace('!','.') for x in X2]
X3 = [x.replace('?','.').replace('!','.') for x in X3]

X = [X1,X2,X3]
y = train[targets].values.tolist()

In [4]:
# build network
def swish(x):
    return K.sigmoid(x) * x

embed_size = 512 #must be 512 for univerasl embedding layer

input_text1 = Input(shape=(1,), dtype=tf.string)
embedding1 = Lambda(UniversalEmbedding, output_shape=(embed_size,))(input_text1)
input_text2 = Input(shape=(1,), dtype=tf.string)
embedding2 = Lambda(UniversalEmbedding, output_shape=(embed_size,))(input_text2)
input_text3 = Input(shape=(1,), dtype=tf.string)
embedding3 = Lambda(UniversalEmbedding, output_shape=(embed_size,))(input_text3)

x = Concatenate()([embedding1,embedding2,embedding3])
x = Dense(256, activation=swish)(x)
x = Dropout(0.4)(x)
x = BatchNormalization()(x)
x = Dense(64, activation=swish, kernel_regularizer=keras.regularizers.l2(0.001))(x)
x = Dropout(0.4)(x)
x = BatchNormalization()(x)
output = Dense(len(targets),activation='sigmoid',name='output')(x)

Tensor("lambda_1/StatefulPartitionedCall:0", shape=(None, 512), dtype=float32)
Tensor("lambda_2/StatefulPartitionedCall:0", shape=(None, 512), dtype=float32)
Tensor("lambda_3/StatefulPartitionedCall:0", shape=(None, 512), dtype=float32)


In [5]:
model = Model(inputs=[input_text1,input_text2,input_text3], outputs=[output])
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
lambda_1 (Lambda)               (None, 512)          0           input_1[0][0]                    
____________________________________________________________________________________________

In [6]:
# clean up as much as possible
import gc
print(gc.collect())
# Train the network
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1,
                              patience=2, min_lr=1e-7, verbose=1)
optimizer = Adadelta()

model.compile(optimizer=optimizer, loss='binary_crossentropy')
model.fit(X, [y], epochs=20, validation_split=.1,batch_size=32,callbacks=[reduce_lr])

8
Train on 5471 samples, validate on 608 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20

Epoch 00014: ReduceLROnPlateau reducing learning rate to 0.1.
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

Epoch 00020: ReduceLROnPlateau reducing learning rate to 0.010000000149011612.


<keras.callbacks.callbacks.History at 0x7f55062e2eb8>

In [7]:
# prep test data
X1 = test[input_columns[0]].values.tolist()
X2 = test[input_columns[1]].values.tolist()
X3 = test[input_columns[2]].values.tolist()
X1 = [x.replace('?','.').replace('!','.') for x in X1]
X2 = [x.replace('?','.').replace('!','.') for x in X2]
X3 = [x.replace('?','.').replace('!','.') for x in X3]

pred_X = [X1,X2,X3]
# Make a prediction
pred_y = model.predict(pred_X)
# Check the submission
submission = pd.read_csv('/kaggle/input/google-quest-challenge/sample_submission.csv')
submission[targets] = pred_y
submission.head()

Unnamed: 0,qa_id,question_asker_intent_understanding,question_body_critical,question_conversational,question_expect_short_answer,question_fact_seeking,question_has_commonly_accepted_answer,question_interestingness_others,question_interestingness_self,question_multi_intent,...,question_well_written,answer_helpful,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written
0,39,0.924706,0.71797,0.361762,0.757099,0.46951,0.595545,0.656191,0.611598,0.214955,...,0.878272,0.928102,0.668519,0.967651,0.971278,0.856516,0.023118,0.048309,0.938198,0.930954
1,46,0.872782,0.555062,0.00228,0.633589,0.912243,0.939106,0.555607,0.441975,0.188445,...,0.794167,0.930423,0.622766,0.957904,0.971309,0.844175,0.878718,0.197939,0.163132,0.891977
2,70,0.898625,0.622473,0.018558,0.759433,0.886882,0.905188,0.594162,0.504091,0.371287,...,0.838177,0.924725,0.637207,0.965682,0.967866,0.843614,0.205841,0.085576,0.729787,0.91501
3,132,0.853233,0.409977,0.005684,0.709513,0.863309,0.925419,0.556805,0.453179,0.362997,...,0.70109,0.947137,0.688902,0.971202,0.980154,0.8817,0.6831,0.177826,0.75338,0.908707
4,200,0.917019,0.60192,0.01763,0.830541,0.82595,0.917814,0.598242,0.532566,0.308121,...,0.829304,0.94807,0.678869,0.972736,0.975923,0.886673,0.435941,0.145375,0.47865,0.917339


In [8]:
# Save the result
submission.to_csv("submission.csv", index = False)