## Dependencies

In [1]:
import glob
import warnings

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow_hub import KerasLayer
from tensorflow.keras import Model
from tensorflow.keras.layers import Input, Dense, Dropout, GlobalAveragePooling1D, SpatialDropout1D, Concatenate
from googleqa_utilityscript import *
from googleqa_map_utilityscript import *
import bert_tokenization as tokenization
from transformers import BertConfig, BertTokenizer, TFBertModel


# Seed value handed to seed_everything (defined in the imported utility script;
# presumably it seeds the Python/NumPy/TensorFlow generators — TODO confirm).
SEED = 0
seed_everything(SEED)

# Suppress every warning category for the remainder of the notebook.
warnings.simplefilter("ignore")

## Load data

In [2]:
# Locations of the pretrained BERT weights and its vocabulary file.
BERT_PATH = '/kaggle/input/bert-base-uncased-huggingface-transformer/bert-base-uncased-tf_model.h5'
VOCAB_PATH = '/kaggle/input/bert-base-uncased-huggingface-transformer/bert-base-uncased-vocab.txt'

# Collect every *.h5 checkpoint of the training run, in lexicographic order.
model_path_list = sorted(
    glob.glob('/kaggle/input/140-googleq-a-train-3fold-bert-base-unc-raw-huface/' + '*.h5')
)
print('Models to predict:')
print("\n".join(model_path_list))

test = pd.read_csv('/kaggle/input/google-quest-challenge/test.csv')

# Report the size of the test set and preview its first rows.
print('Test samples: %s' % test.shape[0])
display(test.head())

Models to predict:
/kaggle/input/140-googleq-a-train-3fold-bert-base-unc-raw-huface/model_fold_1.h5
/kaggle/input/140-googleq-a-train-3fold-bert-base-unc-raw-huface/model_fold_1_last_epoch.h5
/kaggle/input/140-googleq-a-train-3fold-bert-base-unc-raw-huface/model_fold_2.h5
/kaggle/input/140-googleq-a-train-3fold-bert-base-unc-raw-huface/model_fold_2_last_epoch.h5
/kaggle/input/140-googleq-a-train-3fold-bert-base-unc-raw-huface/model_fold_3.h5
/kaggle/input/140-googleq-a-train-3fold-bert-base-unc-raw-huface/model_fold_3_last_epoch.h5
Test samples: 476


Unnamed: 0,qa_id,question_title,question_body,question_user_name,question_user_page,answer,answer_user_name,answer_user_page,url,category,host
0,39,Will leaving corpses lying around upset my pri...,I see questions/information online about how t...,Dylan,https://gaming.stackexchange.com/users/64471,There is no consequence for leaving corpses an...,Nelson868,https://gaming.stackexchange.com/users/97324,http://gaming.stackexchange.com/questions/1979...,CULTURE,gaming.stackexchange.com
1,46,Url link to feature image in the portfolio,I am new to Wordpress. i have issue with Featu...,Anu,https://wordpress.stackexchange.com/users/72927,I think it is possible with custom fields.\n\n...,Irina,https://wordpress.stackexchange.com/users/27233,http://wordpress.stackexchange.com/questions/1...,TECHNOLOGY,wordpress.stackexchange.com
2,70,"Is accuracy, recoil or bullet spread affected ...","To experiment I started a bot game, toggled in...",Konsta,https://gaming.stackexchange.com/users/37545,You do not have armour in the screenshots. Thi...,Damon Smithies,https://gaming.stackexchange.com/users/70641,http://gaming.stackexchange.com/questions/2154...,CULTURE,gaming.stackexchange.com
3,132,Suddenly got an I/O error from my external HDD,I have used my Raspberry Pi as a torrent-serve...,robbannn,https://raspberrypi.stackexchange.com/users/17341,Your Western Digital hard drive is disappearin...,HeatfanJohn,https://raspberrypi.stackexchange.com/users/1311,http://raspberrypi.stackexchange.com/questions...,TECHNOLOGY,raspberrypi.stackexchange.com
4,200,Passenger Name - Flight Booking Passenger only...,I have bought Delhi-London return flights for ...,Amit,https://travel.stackexchange.com/users/29089,I called two persons who work for Saudia (tick...,Nean Der Thal,https://travel.stackexchange.com/users/10051,http://travel.stackexchange.com/questions/4704...,CULTURE,travel.stackexchange.com


In [3]:
# Labels that describe the question (21 columns).
question_target_cols = [
    'question_asker_intent_understanding',
    'question_body_critical',
    'question_conversational',
    'question_expect_short_answer',
    'question_fact_seeking',
    'question_has_commonly_accepted_answer',
    'question_interestingness_others',
    'question_interestingness_self',
    'question_multi_intent',
    'question_not_really_a_question',
    'question_opinion_seeking',
    'question_type_choice',
    'question_type_compare',
    'question_type_consequence',
    'question_type_definition',
    'question_type_entity',
    'question_type_instructions',
    'question_type_procedure',
    'question_type_reason_explanation',
    'question_type_spelling',
    'question_well_written',
]

# Labels that describe the answer (9 columns).
answer_target_cols = [
    'answer_helpful',
    'answer_level_of_information',
    'answer_plausible',
    'answer_relevance',
    'answer_satisfaction',
    'answer_type_instructions',
    'answer_type_procedure',
    'answer_type_reason_explanation',
    'answer_well_written',
]

# Complete label list: question labels first, answer labels after.
target_cols = [*question_target_cols, *answer_target_cols]

## Pre-process data

In [4]:
# Free-text columns of the test frame that are turned into model inputs.
text_features = [
    'question_title',
    'question_body',
    'answer',
]

# NOTE(review): the text-cleaning steps below are disabled, so the raw text is used as-is
# (presumably to match how the "raw" checkpoints were trained — confirm before re-enabling).
# for feature in text_features:
#     # Lower
#     test[feature] = test[feature].apply(lambda x: x.lower())
#     # Map misspellings
#     test[feature] = test[feature].apply(lambda x: map_misspellings(x))
#     # Map contractions
#     test[feature] = test[feature].apply(lambda x: map_contraction(x))
#     # Trim text
#     test[feature] = test[feature].apply(lambda x: x.strip())

## Model parameters

In [5]:
# Fixed length of each input sequence given to the model.
MAX_SEQUENCE_LENGTH = 512
# One model output per target column.
N_CLASS = len(target_cols)

## Test set

In [6]:
# Build the BERT tokenizer from the vocabulary file; do_lower_case=True lower-cases
# the text before it is tokenized.
tokenizer = BertTokenizer.from_pretrained(VOCAB_PATH, do_lower_case=True)

# Test features
# compute_input_arays is provided by the imported utility scripts (not shown here);
# presumably it returns the word ids, masks and segment ids in the order the model
# inputs expect — TODO confirm against the helper.
X_test = compute_input_arays(test, text_features, tokenizer, MAX_SEQUENCE_LENGTH)

## Model

In [7]:
# Default BertConfig with the per-layer hidden-state outputs switched off.
bert_config = BertConfig(output_hidden_states=False)

def model_fn():
    """Build the BERT-based multi-label model used for inference.

    Three int32 inputs of length MAX_SEQUENCE_LENGTH (word ids, masks, segment ids)
    are passed to a TFBertModel loaded from BERT_PATH. Its sequence output is
    averaged with GlobalAveragePooling1D, passed through dropout and projected
    to N_CLASS sigmoid outputs.

    Returns:
        An uncompiled Keras ``Model``; the caller is expected to load the
        fine-tuned weights into it.
    """
    input_word_ids = Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32, name='input_word_ids')
    input_masks = Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32, name='input_masks')
    segment_ids = Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32, name='segment_ids')

    bert_model = TFBertModel.from_pretrained(BERT_PATH, config=bert_config)
    # Take the first output by index instead of tuple-unpacking: the pooled output
    # was never used, and indexing also works when the model returns a dict-like
    # output object rather than a plain tuple.
    sequence_output = bert_model([input_word_ids, input_masks, segment_ids])[0]

    x = GlobalAveragePooling1D()(sequence_output)
    x = Dropout(0.2)(x)
    output = Dense(N_CLASS, activation="sigmoid", name="output")(x)

    model = Model(inputs=[input_word_ids, input_masks, segment_ids], outputs=output)

    return model

## Make predictions

In [8]:
# Equal-weight average of the predictions of every checkpoint in model_path_list.
if not model_path_list:
    # Without this guard an empty list would silently produce all-zero predictions.
    raise FileNotFoundError('No model checkpoints (*.h5) were found to predict with.')

n_models = len(model_path_list)
Y_test = np.zeros((len(test), N_CLASS))

for model_path in model_path_list:
    model = model_fn()
    model.load_weights(model_path)
    Y_test += model.predict(X_test) / n_models

    # Each iteration builds a complete BERT model; drop it and reset the Keras
    # global state so memory does not accumulate across checkpoints.
    del model
    tf.keras.backend.clear_session()

In [9]:
# Start from the competition's submission template.
submission = pd.read_csv('/kaggle/input/google-quest-challenge/sample_submission.csv')

# Predictions are written positionally, so the template rows must be in the same
# order as the rows of `test`; fail loudly instead of writing misaligned scores.
if not np.array_equal(submission['qa_id'].values, test['qa_id'].values):
    raise ValueError('sample_submission.csv rows are not aligned with test.csv (qa_id mismatch).')

submission[target_cols] = Y_test
submission.to_csv("submission.csv", index=False)

# Preview the first rows and the per-column summary statistics.
display(submission.head())
display(submission.describe())

Unnamed: 0,qa_id,question_asker_intent_understanding,question_body_critical,question_conversational,question_expect_short_answer,question_fact_seeking,question_has_commonly_accepted_answer,question_interestingness_others,question_interestingness_self,question_multi_intent,...,question_well_written,answer_helpful,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written
0,39,0.943763,0.637455,0.294886,0.479707,0.5307,0.46917,0.692218,0.67912,0.674268,...,0.915582,0.903388,0.571028,0.961831,0.960375,0.774385,0.04298,0.048798,0.836394,0.906649
1,46,0.881283,0.528797,0.005412,0.779087,0.82543,0.942839,0.543622,0.455638,0.064143,...,0.735873,0.943388,0.632242,0.966054,0.976433,0.85672,0.899604,0.10891,0.109833,0.883231
2,70,0.919925,0.665407,0.033948,0.747615,0.881721,0.913133,0.603162,0.516996,0.168392,...,0.871049,0.92532,0.608065,0.960952,0.960716,0.847049,0.062768,0.048551,0.838409,0.904208
3,132,0.899867,0.429542,0.007107,0.742958,0.764963,0.922831,0.532483,0.399694,0.084767,...,0.710349,0.955482,0.680324,0.968839,0.982715,0.902767,0.851296,0.138599,0.388503,0.905
4,200,0.929088,0.45026,0.038712,0.864192,0.754539,0.853496,0.630181,0.567182,0.121507,...,0.683187,0.901983,0.624438,0.9667,0.962901,0.831365,0.160806,0.076208,0.81176,0.904644


Unnamed: 0,qa_id,question_asker_intent_understanding,question_body_critical,question_conversational,question_expect_short_answer,question_fact_seeking,question_has_commonly_accepted_answer,question_interestingness_others,question_interestingness_self,question_multi_intent,...,question_well_written,answer_helpful,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written
count,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,...,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0
mean,5029.186975,0.900391,0.593133,0.044885,0.742103,0.804074,0.850655,0.574271,0.487764,0.220211,...,0.796659,0.932643,0.65963,0.962308,0.969565,0.860003,0.486605,0.121101,0.539474,0.902123
std,2812.67006,0.03581,0.119826,0.08071,0.094122,0.101121,0.128804,0.049697,0.088864,0.188117,...,0.076335,0.022241,0.046663,0.010719,0.011321,0.038524,0.315795,0.056417,0.271438,0.021371
min,39.0,0.782171,0.362191,0.003953,0.289915,0.244518,0.232204,0.487795,0.340841,0.011189,...,0.624508,0.834655,0.548159,0.91409,0.908252,0.719322,0.005291,0.008357,0.032528,0.824021
25%,2572.0,0.875991,0.488643,0.008556,0.696352,0.763958,0.833056,0.535817,0.423379,0.078654,...,0.736439,0.920035,0.626639,0.955209,0.963429,0.836667,0.151375,0.079049,0.313673,0.888086
50%,5093.0,0.899609,0.585631,0.014244,0.746952,0.814968,0.90091,0.563196,0.46144,0.145827,...,0.799856,0.935288,0.657499,0.964229,0.971434,0.866338,0.557246,0.122886,0.535116,0.9045
75%,7482.0,0.926758,0.688823,0.035897,0.799833,0.867415,0.927952,0.604044,0.528176,0.323015,...,0.862736,0.949319,0.690665,0.970035,0.978249,0.887732,0.773442,0.160665,0.782128,0.91696
max,9640.0,0.977837,0.867819,0.633421,0.966489,0.97393,0.97514,0.725532,0.775717,0.857777,...,0.944728,0.97852,0.802991,0.985078,0.990153,0.953803,0.956558,0.280685,0.989548,0.957647
