## Dependencies

In [1]:
import glob
import warnings
from tensorflow_hub import KerasLayer
from tensorflow.keras import Model
from tensorflow.keras.layers import Input, Dense, Dropout, GlobalAveragePooling1D, SpatialDropout1D, Concatenate
from googleqa_utilityscript import *
from googleqa_map_utilityscript import *
import bert_tokenization as tokenization


# Global seed for the run; handed to the project's seeding helper so repeated
# executions start from the same random state.
# NOTE(review): seed_everything comes from one of the wildcard utility-script
# imports -- which RNGs it seeds is not visible here; confirm.
SEED = 0
seed_everything(SEED)

# Suppress every warning category for the rest of the notebook.
warnings.filterwarnings(action="ignore")

## Load data

In [2]:
# Location of the BERT model directory and of the vocabulary file shipped inside it.
BERT_PATH = '/kaggle/input/tf-hub-bert-base/bert_base_uncased'
VOCAB_PATH = BERT_PATH + '/assets/vocab.txt'

# Every .h5 weight file in the checkpoint dataset, in sorted (deterministic) order.
model_path_list = sorted(glob.glob('/kaggle/input/135bert-base-last/' + '*.h5'))
print('Models to predict:')
print("\n".join(model_path_list))

# Competition test set: one row per question/answer pair to score.
test = pd.read_csv('/kaggle/input/google-quest-challenge/test.csv')

print('Test samples: %s' % len(test))
display(test.head())

Models to predict:
/kaggle/input/135bert-base-last/135-BERT_base_uncased_model_fold_1_last_epoch.h5
/kaggle/input/135bert-base-last/135-BERT_base_uncased_model_fold_2_last_epoch.h5
/kaggle/input/135bert-base-last/135-BERT_base_uncased_model_fold_3_last_epoch.h5
/kaggle/input/135bert-base-last/135-BERT_base_uncased_model_fold_4_last_epoch.h5
/kaggle/input/135bert-base-last/135-BERT_base_uncased_model_fold_5_last_epoch.h5
Test samples: 476


Unnamed: 0,qa_id,question_title,question_body,question_user_name,question_user_page,answer,answer_user_name,answer_user_page,url,category,host
0,39,Will leaving corpses lying around upset my pri...,I see questions/information online about how t...,Dylan,https://gaming.stackexchange.com/users/64471,There is no consequence for leaving corpses an...,Nelson868,https://gaming.stackexchange.com/users/97324,http://gaming.stackexchange.com/questions/1979...,CULTURE,gaming.stackexchange.com
1,46,Url link to feature image in the portfolio,I am new to Wordpress. i have issue with Featu...,Anu,https://wordpress.stackexchange.com/users/72927,I think it is possible with custom fields.\n\n...,Irina,https://wordpress.stackexchange.com/users/27233,http://wordpress.stackexchange.com/questions/1...,TECHNOLOGY,wordpress.stackexchange.com
2,70,"Is accuracy, recoil or bullet spread affected ...","To experiment I started a bot game, toggled in...",Konsta,https://gaming.stackexchange.com/users/37545,You do not have armour in the screenshots. Thi...,Damon Smithies,https://gaming.stackexchange.com/users/70641,http://gaming.stackexchange.com/questions/2154...,CULTURE,gaming.stackexchange.com
3,132,Suddenly got an I/O error from my external HDD,I have used my Raspberry Pi as a torrent-serve...,robbannn,https://raspberrypi.stackexchange.com/users/17341,Your Western Digital hard drive is disappearin...,HeatfanJohn,https://raspberrypi.stackexchange.com/users/1311,http://raspberrypi.stackexchange.com/questions...,TECHNOLOGY,raspberrypi.stackexchange.com
4,200,Passenger Name - Flight Booking Passenger only...,I have bought Delhi-London return flights for ...,Amit,https://travel.stackexchange.com/users/29089,I called two persons who work for Saudia (tick...,Nean Der Thal,https://travel.stackexchange.com/users/10051,http://travel.stackexchange.com/questions/4704...,CULTURE,travel.stackexchange.com


In [3]:
# Target columns that describe the question.
question_target_cols = [
    'question_asker_intent_understanding',
    'question_body_critical',
    'question_conversational',
    'question_expect_short_answer',
    'question_fact_seeking',
    'question_has_commonly_accepted_answer',
    'question_interestingness_others',
    'question_interestingness_self',
    'question_multi_intent',
    'question_not_really_a_question',
    'question_opinion_seeking',
    'question_type_choice',
    'question_type_compare',
    'question_type_consequence',
    'question_type_definition',
    'question_type_entity',
    'question_type_instructions',
    'question_type_procedure',
    'question_type_reason_explanation',
    'question_type_spelling',
    'question_well_written',
]

# Target columns that describe the answer.
answer_target_cols = [
    'answer_helpful',
    'answer_level_of_information',
    'answer_plausible',
    'answer_relevance',
    'answer_satisfaction',
    'answer_type_instructions',
    'answer_type_procedure',
    'answer_type_reason_explanation',
    'answer_well_written',
]

# Full list of targets: question columns first, then answer columns.
# This order defines the column order of the prediction matrix.
target_cols = [*question_target_cols, *answer_target_cols]

## Pre-process data

In [4]:
# Free-text columns of the test frame that are fed to the tokenizer.
text_features = [
    'question_title',
    'question_body',
    'answer',
]

# NOTE: text normalisation is intentionally disabled in this notebook.
# The previously used per-feature pipeline, kept here for reference, was:
#   1. lower-case the text            (x.lower())
#   2. map misspellings               (map_misspellings(x))
#   3. map contractions               (map_contraction(x))
#   4. trim surrounding whitespace    (x.strip())
# The raw text of each column in `text_features` is therefore used as-is.

## Model parameters

In [5]:
# Fixed length of every encoded input sequence; used both when building the
# test features and as the shape of the model's input layers.
MAX_SEQUENCE_LENGTH = 512

# Width of the output layer: one unit per target column.
N_CLASS = len(target_cols)

## Test set

In [6]:
# Tokenizer built from the vocabulary file under BERT_PATH, with lower-casing enabled.
tokenizer = tokenization.FullTokenizer(VOCAB_PATH, do_lower_case=True)

# Test features: encode the text columns of `test` up to MAX_SEQUENCE_LENGTH.
# NOTE(review): compute_input_arays is a project helper (wildcard import); its
# return value is passed directly to model.predict, so it presumably yields the
# word-id / mask / segment-id arrays in the model's input order -- confirm.
X_test = compute_input_arays(test, text_features, tokenizer, MAX_SEQUENCE_LENGTH)

## Model

In [7]:
def model_fn():
    """Build the BERT-based model whose weights are loaded for inference.

    The network takes three int32 inputs of length MAX_SEQUENCE_LENGTH
    (word ids, masks, segment ids), runs them through the frozen BERT hub
    layer at BERT_PATH, average-pools the sequence output, applies dropout
    and ends in a sigmoid Dense layer with N_CLASS units.

    Returns:
        An uncompiled ``Model`` with freshly initialised head weights; the
        caller is expected to load trained weights into it.
    """
    # Input names are kept exactly as in the original definition.
    input_names = ('input_word_ids', 'input_masks', 'segment_ids')
    bert_inputs = [
        Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32, name=input_name)
        for input_name in input_names
    ]

    # The hub layer yields (pooled_output, sequence_output); only the
    # sequence output is consumed below, so the pooled one is discarded.
    encoder = KerasLayer(BERT_PATH, trainable=False)
    _, token_states = encoder(bert_inputs)

    # Collapse the sequence axis, regularise, then project to the targets.
    pooled = GlobalAveragePooling1D()(token_states)
    pooled = Dropout(0.2)(pooled)
    scores = Dense(N_CLASS, activation="sigmoid", name="output")(pooled)

    return Model(inputs=bert_inputs, outputs=scores)

## Make predictions

In [8]:
# Accumulator for the blended predictions: one row per test sample,
# one column per target.
Y_test = np.zeros((len(test), N_CLASS))

# One blending weight per checkpoint, in the same (sorted) order as
# model_path_list.
weights = [.3, .2, .1, .1, .3]

# Fail fast, before any expensive inference: the weights are paired with the
# globbed checkpoints by position, so a different number of files would either
# crash midway (too many) or silently give a mis-scaled / all-zero blend
# (too few / none).
if len(weights) != len(model_path_list):
    raise ValueError('Expected %d model checkpoints (one per weight), found %d'
                     % (len(weights), len(model_path_list)))

# The weights must sum to 1 so the blend stays a weighted average of the
# models' sigmoid outputs, i.e. within [0, 1].
if not np.isclose(sum(weights), 1.0):
    raise ValueError('Blending weights must sum to 1, got %s' % sum(weights))

for model_path, weight in zip(model_path_list, weights):
    # Rebuild the architecture, restore this checkpoint's weights and add its
    # weighted predictions to the running blend.
    model = model_fn()
    model.load_weights(model_path)
    Y_test += model.predict(X_test) * weight

In [9]:
# Start from the competition's sample submission so the output file has the
# expected columns.
submission = pd.read_csv('/kaggle/input/google-quest-challenge/sample_submission.csv')

# Predictions are written by position, so the submission rows must line up
# one-to-one with the rows of `test` that produced Y_test. Check this
# explicitly rather than risk attaching scores to the wrong qa_id.
if not np.array_equal(submission['qa_id'].values, test['qa_id'].values):
    raise ValueError('sample_submission.csv rows are not aligned with test.csv (qa_id mismatch)')

submission[target_cols] = Y_test
submission.to_csv("submission.csv", index=False)

# Quick visual sanity check: first rows and per-column summary statistics.
display(submission.head(), submission.describe())

Unnamed: 0,qa_id,question_asker_intent_understanding,question_body_critical,question_conversational,question_expect_short_answer,question_fact_seeking,question_has_commonly_accepted_answer,question_interestingness_others,question_interestingness_self,question_multi_intent,...,question_well_written,answer_helpful,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written
0,39,0.895536,0.555735,0.322475,0.380108,0.635984,0.471719,0.656469,0.625424,0.72195,...,0.894331,0.85704,0.519965,0.932292,0.919175,0.733459,0.074201,0.059778,0.763274,0.877482
1,46,0.901822,0.566569,0.003445,0.84024,0.820373,0.957518,0.596275,0.506407,0.03369,...,0.732552,0.957699,0.669121,0.975998,0.984432,0.880605,0.926036,0.070481,0.045227,0.894294
2,70,0.918739,0.661961,0.018679,0.756351,0.92514,0.957413,0.585662,0.522341,0.227406,...,0.853483,0.940579,0.607729,0.97337,0.97247,0.849647,0.110504,0.09452,0.87885,0.898942
3,132,0.898074,0.423668,0.004068,0.712666,0.820665,0.911964,0.51698,0.424555,0.088527,...,0.75376,0.945499,0.66548,0.975789,0.982685,0.895975,0.854,0.196019,0.406849,0.89919
4,200,0.903802,0.390666,0.039749,0.817033,0.77824,0.858819,0.631552,0.608585,0.093188,...,0.698909,0.902946,0.615498,0.951958,0.943384,0.798314,0.175539,0.125746,0.53728,0.885585


Unnamed: 0,qa_id,question_asker_intent_understanding,question_body_critical,question_conversational,question_expect_short_answer,question_fact_seeking,question_has_commonly_accepted_answer,question_interestingness_others,question_interestingness_self,question_multi_intent,...,question_well_written,answer_helpful,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written
count,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,...,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0
mean,5029.186975,0.891994,0.589385,0.03911,0.701933,0.835598,0.845021,0.578338,0.500195,0.232175,...,0.822656,0.924092,0.651918,0.961975,0.965686,0.85294,0.505005,0.143444,0.489232,0.900773
std,2812.67006,0.046781,0.1297,0.077553,0.123516,0.101595,0.138607,0.053288,0.091335,0.197455,...,0.07673,0.031847,0.047906,0.014756,0.017109,0.047625,0.325125,0.076047,0.283907,0.023034
min,39.0,0.753068,0.35145,0.001195,0.201682,0.372178,0.184568,0.44637,0.338075,0.001847,...,0.572062,0.807332,0.519965,0.908791,0.873343,0.703558,0.003,0.003318,0.018122,0.802288
25%,2572.0,0.8593,0.4731,0.006324,0.629407,0.789672,0.813827,0.53973,0.432949,0.07529,...,0.762441,0.906778,0.618854,0.952332,0.956982,0.825065,0.161955,0.09007,0.242369,0.886035
50%,5093.0,0.892093,0.584871,0.010805,0.701824,0.849253,0.897441,0.570311,0.478114,0.16138,...,0.824134,0.930322,0.65257,0.964221,0.969579,0.858558,0.56322,0.139412,0.466014,0.902515
75%,7482.0,0.928578,0.68563,0.028497,0.785195,0.90895,0.932091,0.612673,0.545454,0.350386,...,0.88858,0.947306,0.683774,0.972454,0.977805,0.886692,0.807932,0.192743,0.733659,0.916019
max,9640.0,0.98484,0.927453,0.58952,0.979659,0.986447,0.982007,0.733354,0.798037,0.87131,...,0.967778,0.98683,0.824299,0.991248,0.993644,0.973432,0.969983,0.389829,0.992685,0.956264
