## Dependencies

In [1]:
import warnings
import pandas as pd
import seaborn as sns
import tensorflow_hub as hub
import matplotlib.pyplot as plt
from tensorflow.keras import Model, optimizers
from tensorflow.keras.layers import Lambda, Input, Dense, Dropout, Concatenate, BatchNormalization, Activation
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from googleqa_utilityscript import *


SEED = 0
seed_everything(SEED)
warnings.filterwarnings("ignore")

## Load data

In [2]:
hold_out = pd.read_csv('/kaggle/input/googleqa-dataset/hold-out.csv')
train = hold_out[hold_out['set'] == 'train']
validation = hold_out[hold_out['set'] == 'validation']
test = pd.read_csv('/kaggle/input/google-quest-challenge/test.csv')

print('Train samples: %s' % len(train))
print('Validation samples: %s' % len(validation))
print('Test samples: %s' % len(test))
display(train.head())

Train samples: 4863
Validation samples: 1216
Test samples: 476


Unnamed: 0,qa_id,question_title,question_body,question_user_name,question_user_page,answer,answer_user_name,answer_user_page,url,category,...,answer_helpful,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written,set
0,6148,Important non-technical course for programmers?,What kind of non-technical training course do ...,Louis Rhys,https://programmers.stackexchange.com/users/8486,Business\n\nThe biggest problem I've seen with...,Ryan Hayes,https://programmers.stackexchange.com/users/1521,http://programmers.stackexchange.com/questions...,TECHNOLOGY,...,1.0,0.666667,1.0,1.0,0.933333,0.0,0.0,1.0,0.888889,train
1,3971,Water : Aquatic :: Sand : xxx?,Just as aquatic is to water and aerial is to a...,coleopterist,https://english.stackexchange.com/users/23608,"Well, fancy words for ‘sandy’ are arenarious a...",tchrist,https://english.stackexchange.com/users/2085,http://english.stackexchange.com/questions/938...,CULTURE,...,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.888889,train
2,4367,"Learning the musical concepts in the book ""Göd...","I am reading the book ""Gödel, Escher, Bach"", i...",Otavio Macedo,https://music.stackexchange.com/users/12666,I agree with luser droog's answer.\n\nMy under...,luser droog,https://music.stackexchange.com/users/1344,http://music.stackexchange.com/questions/22216...,LIFE_ARTS,...,1.0,0.666667,1.0,1.0,0.9,0.333333,0.333333,1.0,0.777778,train
3,6605,HowTo: Add Class to Sidebar Widget List-Items,The newest version of Bootstrap (v3.0) adds a ...,sleeper,https://wordpress.stackexchange.com/users/12116,Update: I fixed the debug warning and everythi...,sleeper,https://wordpress.stackexchange.com/users/12116,http://wordpress.stackexchange.com/questions/1...,TECHNOLOGY,...,0.777778,0.555556,1.0,1.0,0.733333,0.5,0.0,0.5,0.888889,train
4,6522,MVC4 jQuery UI does not work,I cannot get jQuery UI working in my ASP.NET M...,Petr,https://stackoverflow.com/users/958557,"In the layout.cshtml view, move \n\n@Scripts.R...",Yuri Morales,https://stackoverflow.com/users/1459163,http://stackoverflow.com/questions/13112519/mv...,STACKOVERFLOW,...,1.0,0.666667,1.0,1.0,1.0,0.0,1.0,0.0,1.0,train


In [3]:
question_target_cols = ['question_asker_intent_understanding','question_body_critical', 'question_conversational', 
                        'question_expect_short_answer', 'question_fact_seeking', 'question_has_commonly_accepted_answer',
                        'question_interestingness_others', 'question_interestingness_self', 'question_multi_intent', 
                        'question_not_really_a_question', 'question_opinion_seeking', 'question_type_choice',
                        'question_type_compare', 'question_type_consequence', 'question_type_definition', 
                        'question_type_entity', 'question_type_instructions', 'question_type_procedure',
                        'question_type_reason_explanation', 'question_type_spelling', 'question_well_written']
answer_target_cols = ['answer_helpful', 'answer_level_of_information', 'answer_plausible', 'answer_relevance',
                      'answer_satisfaction', 'answer_type_instructions', 'answer_type_procedure', 
                      'answer_type_reason_explanation', 'answer_well_written']
target_cols = question_target_cols + answer_target_cols

## Train/validation sets

In [4]:
# Train features
X_train_title = train['question_title']
X_train_body = train['question_body']
X_train_answer = train['answer']

X_train = [X_train_title, X_train_body, X_train_answer]
Y_train = train[target_cols].values

# Validation features
X_valid_title = validation['question_title']
X_valid_body = validation['question_body']
X_valid_answer = validation['answer']

X_valid = [X_valid_title, X_valid_body, X_valid_answer]
Y_valid = validation[target_cols].values

print('Train samples: %d' % len(Y_train))
print('Validation samples: %d' % len(Y_valid))

Train samples: 4863
Validation samples: 1216


# Model parameters

In [5]:
EPOCHS = 30
BATCH_SIZE = 64
LEARNING_RATE = 3e-4
EMBEDDDING_SIZE = 512
N_CLASS = len(target_cols)
ES_PATIENCE = 3
RLROP_PATIENCE = 2
DECAY_DROP = 0.5
module_url = '../input/universalsentenceencodermodels/universal-sentence-encoder-models/use-large'
model_path = '../working/use_baseline.h5'

# Model

In [6]:
es = EarlyStopping(monitor='val_loss', mode='min', patience=ES_PATIENCE, restore_best_weights=True, verbose=1)
rlrop = ReduceLROnPlateau(monitor='val_loss', mode='min', patience=RLROP_PATIENCE, factor=DECAY_DROP, min_lr=1e-6, verbose=1)
use_embed = hub.load(module_url)

def USEEmbedding(x):
    return use_embed(tf.squeeze(tf.cast(x, tf.string)))

In [7]:
input_title = Input(shape=(1,), dtype=tf.string, name='input_title')
embedding_title = Lambda(USEEmbedding, output_shape=(EMBEDDDING_SIZE,))(input_title)

input_body = Input(shape=(1,), dtype=tf.string, name='input_body')
embedding_body = Lambda(USEEmbedding, output_shape=(EMBEDDDING_SIZE,))(input_body)

input_answer = Input(shape=(1,), dtype=tf.string, name='input_answer')
embedding_answer = Lambda(USEEmbedding, output_shape=(EMBEDDDING_SIZE,))(input_answer)

x = Concatenate()([embedding_title, embedding_body, embedding_answer])
x = Dropout(0.5)(x)
x = Dense(512, activation='relu')(x)
x = Dropout(0.5)(x)
output = Dense(N_CLASS, activation='sigmoid', name='output')(x)
model = Model(inputs=[input_title, input_body, input_answer], outputs=[output])

model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_title (InputLayer)        [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_body (InputLayer)         [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_answer (InputLayer)       [(None, 1)]          0                                            
__________________________________________________________________________________________________
lambda (Lambda)                 (None, 512)          0           input_title[0][0]                
______________________________________________________________________________________________

# Train model

In [8]:
optimizer = optimizers.Adam(LEARNING_RATE)
callback_list = [es, rlrop, SpearmanRhoCallback(training_data=(X_train, Y_train), validation_data=(X_valid, Y_valid))]
model.compile(optimizer=optimizer, loss='binary_crossentropy')

history = model.fit(X_train, Y_train, 
                    validation_data=(X_valid, Y_valid), 
                    callbacks=callback_list, 
                    epochs=EPOCHS, 
                    verbose=2).history

Train on 4863 samples, validate on 1216 samples
Epoch 1/30
Train spearman-rho: 0.2249 Validation spearman-rho: 0.2163
4863/4863 - 289s - loss: 0.5033 - val_loss: 0.4114
Epoch 2/30
Train spearman-rho: 0.3069 Validation spearman-rho: 0.2917
4863/4863 - 129s - loss: 0.4068 - val_loss: 0.3926
Epoch 3/30
Train spearman-rho: 0.3487 Validation spearman-rho: 0.3272
4863/4863 - 128s - loss: 0.3947 - val_loss: 0.3843
Epoch 4/30
Train spearman-rho: 0.3718 Validation spearman-rho: 0.3435
4863/4863 - 127s - loss: 0.3882 - val_loss: 0.3797
Epoch 5/30
Train spearman-rho: 0.3886 Validation spearman-rho: 0.3537
4863/4863 - 127s - loss: 0.3834 - val_loss: 0.3768
Epoch 6/30
Train spearman-rho: 0.4003 Validation spearman-rho: 0.3603
4863/4863 - 127s - loss: 0.3804 - val_loss: 0.3749
Epoch 7/30
Train spearman-rho: 0.4113 Validation spearman-rho: 0.3651
4863/4863 - 125s - loss: 0.3781 - val_loss: 0.3737
Epoch 8/30
Train spearman-rho: 0.4201 Validation spearman-rho: 0.3692
4863/4863 - 126s - loss: 0.3758 - v

#### Save model

In [9]:
model.save_weights(model_path)

# Evaluation

In [10]:
preds_train = model.predict(X_train)
preds_val = model.predict(X_valid)

rho_train = np.mean([spearmanr(Y_train[:, ind], preds_train[:, ind] + np.random.normal(0, 1e-7, preds_train.shape[0])).correlation for ind in range(preds_train.shape[1])])
rho_val = np.mean([spearmanr(Y_valid[:, ind], preds_val[:, ind] + np.random.normal(0, 1e-7, preds_val.shape[0])).correlation for ind in range(preds_val.shape[1])])

print('Train spearman-rho: %.3f' % rho_train)
print('Validation spearman-rho: %.3f' % rho_val)

Train spearman-rho: 0.501
Validation spearman-rho: 0.393


# Make predictions on test

In [11]:
# Test features
X_test_title = test['question_title']
X_test_body = test['question_body']
X_test_answer = test['answer']

X_test = [X_test_title, X_test_body, X_test_answer]
Y_test = model.predict(X_test)

In [12]:
submission = pd.read_csv('/kaggle/input/google-quest-challenge/sample_submission.csv')
submission[target_cols] = Y_test
submission.to_csv("submission.csv", index=False)
display(submission.head())

Unnamed: 0,qa_id,question_asker_intent_understanding,question_body_critical,question_conversational,question_expect_short_answer,question_fact_seeking,question_has_commonly_accepted_answer,question_interestingness_others,question_interestingness_self,question_multi_intent,...,question_well_written,answer_helpful,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written
0,39,0.898401,0.701306,0.195216,0.77575,0.657974,0.790829,0.592663,0.528118,0.189943,...,0.890848,0.907991,0.62895,0.955336,0.965077,0.849752,0.051199,0.074912,0.860011,0.918877
1,46,0.868948,0.560108,0.00161,0.604412,0.84555,0.923662,0.575871,0.468556,0.112189,...,0.744621,0.930638,0.624883,0.96026,0.970378,0.859254,0.945876,0.148701,0.08564,0.884112
2,70,0.888613,0.617331,0.011006,0.746879,0.869945,0.943018,0.586665,0.461787,0.341399,...,0.836117,0.922925,0.615265,0.964007,0.964906,0.830993,0.148751,0.101253,0.835232,0.9017
3,132,0.813217,0.391429,0.011527,0.688667,0.805134,0.863633,0.541603,0.439496,0.411479,...,0.684726,0.936774,0.67681,0.960521,0.974875,0.875578,0.76669,0.225465,0.652762,0.875146
4,200,0.934949,0.51731,0.010834,0.848519,0.827655,0.937916,0.601781,0.572593,0.319013,...,0.768523,0.948583,0.665946,0.976426,0.97759,0.86693,0.436373,0.232336,0.615588,0.908965
