In [36]:
# Shell escape: print the directory the notebook kernel is running in.
!pwd

/Users/emiljoswin/study/Kaggle/Google Quest


In [1]:
# Notebook display tweaks, injected as raw CSS:
#   * widen `.container` to 95% of the window,
#   * force elements with class `run_this_cell` to render as blocks.
# `display` / `HTML` are imported from the public `IPython.display` module
# (importing `display` from `IPython.core.display` is deprecated).
from IPython.display import display, HTML

display(HTML("<style>.container { width:95% !important; }</style>"))
# The closing tag used to be written `</style` (missing `>`), leaving the
# injected markup malformed.
display(HTML("<style>.run_this_cell {display: block !important;} </style>"))

In [2]:
import re
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, concatenate, LSTM, Bidirectional, Embedding, add, Dense
from tensorflow.keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D
from tensorflow.keras.models import Model

from tensorflow.keras import models, layers, optimizers


from datetime import datetime
from typing import List, Dict, Tuple, Any

from collections import Counter
from tqdm.notebook import trange, tqdm
from gensim.models import KeyedVectors

In [3]:
import os

# Export KMP_DUPLICATE_LIB_OK=True into this process's environment.
# NOTE(review): presumably a workaround for a duplicate-OpenMP-runtime abort;
# tensorflow is imported in an earlier cell — confirm that setting the
# variable here is early enough for it to take effect.
os.environ["KMP_DUPLICATE_LIB_OK"] = "True"

### Utility functions

In [7]:
def clean_text(x):
    """Insert a space in front of punctuation / symbol characters.

    The input is coerced with ``str()`` first, so non-string values (numbers,
    ``None``, NaN) are accepted and returned as their padded string form.

    Two passes are applied one after the other. A character that appears in
    both passes (or twice within the second pass, as the backslash and ``_``
    do) therefore receives one leading space per occurrence, e.g.
    ``"a,b" -> "a  ,b"``. Only leading spaces are added; nothing is removed.

    Args:
        x: Any value; converted with ``str(x)``.

    Returns:
        str: The text with symbols space-separated from the preceding token.
    """
    # Pass 1: ASCII punctuation plus curly quotes.
    first_pass = '?!.,"#$%\'()*+-/:;<=>@[\\]^`{|}~' + '“”’'
    # Pass 2: wider symbol set (also tab / newline and assorted unicode).
    # The backslash in front of '×' is now written as an explicit '\\': the
    # original '\×' was an invalid escape sequence that only worked because
    # Python falls back to a literal backslash (and warns about it).
    second_pass = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n“”’\'∞θ÷α•à−β∅³π‘₹´°£€\\×™√²—_'

    x = str(x)
    for symbols in (first_pass, second_pass):
        for punct in symbols:
            x = x.replace(punct, f' {punct}')

    return x

In [8]:
def _get_mispell(mispell_dict):
    mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
    return mispell_dict, mispell_re

In [9]:
# Literal text -> replacement text, consumed by `_get_mispell`.
# Mostly run-together identifiers spelled out as separate words; the final
# entry turns every underscore into a space.
mispell_dict = {
    "usepackage": "use package",
    "instrumentsettingsid": "instrumental settings id",
    "RippleShaderProgram": "ripple shader program",
    "ShaderProgramConstants": "shader program constants",
    "storedElements": "stored elements",
    "stackSize": "stack size",
    "_": " ",
}

In [10]:
# Build the replacement table and its compiled regex once, at module level;
# `replace_typical_misspell` reads both names as globals.
mispellings, mispellings_re = _get_mispell(mispell_dict)

In [11]:
def replace_typical_misspell(text):
    """Rewrite every match of ``mispellings_re`` in ``text``.

    Each matched substring is looked up in the module-level ``mispellings``
    mapping and replaced by the associated value.

    Args:
        text: String to process.

    Returns:
        The string with all matches substituted.
    """
    return mispellings_re.sub(lambda hit: mispellings[hit.group(0)], text)

### Loading Data

In [12]:
# Locate the competition CSVs. Inside a Kaggle kernel the data is mounted
# under /kaggle/input; everywhere else fall back to the copy that sits next
# to the notebook. This replaces switching environments by commenting the
# path assignments in and out.
KAGGLE_DATA_DIR = '/kaggle/input/google-quest-challenge'
LOCAL_DATA_DIR = 'google-quest-challenge'
data_dir = KAGGLE_DATA_DIR if os.path.isdir(KAGGLE_DATA_DIR) else LOCAL_DATA_DIR

train_loc = f'{data_dir}/train.csv'
test_loc = f'{data_dir}/test.csv'

train_df = pd.read_csv(train_loc)
test_df = pd.read_csv(test_loc)


### Preprocessing

In [13]:
# Run the same two-step clean-up (punctuation padding, then the replacement
# table) over every text column of both frames. The columns are overwritten
# in place, train frame first, in the same column order as before.
for frame in (train_df, test_df):
    for column in ("question_body", "question_title", "category", "answer"):
        frame[column] = (
            frame[column]
            .apply(clean_text)
            .apply(replace_typical_misspell)
        )

In [14]:
# Pull the four text columns out of each frame under short names.
question_body, answer, question_title, category = (
    train_df[name]
    for name in ("question_body", "answer", "question_title", "category")
)

question_body_test, answer_test, question_title_test, category_test = (
    test_df[name]
    for name in ("question_body", "answer", "question_title", "category")
)

In [15]:
# Labels are selected positionally: the last 30 columns of train_df.
# NOTE(review): assumes the CSV keeps every label column (and nothing else)
# at the end of the frame — confirm against the column list in train.csv.
target = train_df[train_df.columns[-30:]]

### Creating Tokenizers

In [16]:
# One long Series with every text field from both frames, used to fit the
# tokenizer. Order is: body + answer (train, then test), followed by
# title + category (train, then test).
all_text = pd.concat([
    frame[column]
    for column_pair in (("question_body", "answer"), ("question_title", "category"))
    for frame in (train_df, test_df)
    for column in column_pair
])

In [17]:
# Sanity check: all_text stacks four text columns from each frame, so its
# length should equal 4 * (rows in train + rows in test). The expected value
# is derived from the frames instead of hard-coded row counts.
len(train_df), len(test_df), 4 * (len(train_df) + len(test_df)), len(all_text)

(6079, 476, 26220, 26220)

In [18]:
# Word-level tokenizer fitted on every text field of train and test.
# `lower=False` and `filters=''` are passed so that case and the characters
# spaced out by clean_text are kept as tokens.
tokenizer = Tokenizer(
    num_words=1000000,
    lower=False,
    filters='',
)
tokenizer.fit_on_texts(all_text)

In [19]:
# Inspect the fitted vocabulary: number of distinct tokens and the integer
# ids assigned to two sample tokens.
len(tokenizer.word_counts), tokenizer.word_index['the'], tokenizer.word_index['ABC']

(100533, 4, 8784)

In [22]:
# Encode each text field as sequences of token ids.
# The names are rebound from raw text to encoded sequences, so this cell is
# meant to run exactly once after the column-extraction cell.
question_body, answer, question_title, category = (
    tokenizer.texts_to_sequences(texts)
    for texts in (question_body, answer, question_title, category)
)

question_body_test, answer_test, question_title_test, category_test = (
    tokenizer.texts_to_sequences(texts)
    for texts in (question_body_test, answer_test, question_title_test, category_test)
)

In [23]:
# Number of encoded training question bodies and the token count of the first one.
len(question_body), len(question_body[0])

(6079, 174)

In [24]:
# Number of word ids plus one.
# NOTE(review): the +1 presumably reserves id 0 for padding — confirm.
vocab_size = len(tokenizer.word_index) + 1
# Length passed as `maxlen` to pad_sequences for every text field.
maxlen = 245

In [25]:
# Bring every encoded field to `maxlen` tokens, padding at the end
# (padding='post'). Names are rebound to the padded results.
question_body, answer, question_title, category = (
    pad_sequences(sequences, padding='post', maxlen=maxlen)
    for sequences in (question_body, answer, question_title, category)
)

question_body_test, answer_test, question_title_test, category_test = (
    pad_sequences(sequences, padding='post', maxlen=maxlen)
    for sequences in (question_body_test, answer_test, question_title_test, category_test)
)

### Build embedding matrix for the full dataset

Gensim word vectors can be found here: https://www.kaggle.com/iezepov/gensim-embeddings-dataset

In [26]:
def build_matrix(word_index, path):
    """Build an embedding matrix whose rows line up with ``word_index`` ids.

    Args:
        word_index: Mapping of token -> integer row index.
        path: Location of a saved gensim ``KeyedVectors`` file; it is opened
            with ``mmap='r'``.

    Returns:
        numpy.ndarray of shape ``(len(word_index) + 1, dim)``. Row ``i`` holds
        the vector for the token mapped to ``i``; the exact token is tried
        first, then its lower-cased form. Rows with no match (and row 0, if
        no token maps to it) stay all-zero.
    """
    embedding_index = KeyedVectors.load(path, mmap='r')
    # Take the width from the loaded vectors rather than assuming 300, so
    # files of another dimensionality work; 300 remains the fallback if the
    # loaded object does not expose `vector_size`.
    dim = getattr(embedding_index, 'vector_size', 300)
    embedding_matrix = np.zeros((len(word_index) + 1, dim))
    for word, i in word_index.items():
        for candidate in (word, word.lower()):
            if candidate in embedding_index:
                embedding_matrix[i] = embedding_index[candidate]
                break
    return embedding_matrix

In [27]:
# Directory holding the two gensim embedding files. Defaults to the original
# location; set the EMBEDDING_DIR environment variable to point elsewhere
# without editing the notebook.
EMBEDDING_DIR = os.environ.get(
    'EMBEDDING_DIR',
    '/Users/emiljoswin/study/playground/jupyter_notebooks/NLP',
)
EMBEDDING_FILES = [f'{EMBEDDING_DIR}/crawl-300d-2M.gensim',
                   f'{EMBEDDING_DIR}/glove.840B.300d.gensim']

# Only one load is kept: the earlier load of EMBEDDING_FILES[0] was
# overwritten immediately by this one and never used.
embedding_index = KeyedVectors.load(EMBEDDING_FILES[1], mmap='r')

In [28]:
# One embedding matrix per file, both indexed by the tokenizer's word ids.
m1 = build_matrix(path=EMBEDDING_FILES[0], word_index=tokenizer.word_index)
m2 = build_matrix(path=EMBEDDING_FILES[1], word_index=tokenizer.word_index)
(m1.shape, m2.shape)

((100534, 300), (100534, 300))

In [29]:
# Put the two matrices side by side, so each row carries both vectors for
# the same token id.
embedding_matrix = np.concatenate((m1, m2), axis=1)
embedding_matrix.shape

(100534, 600)

### Model

In [30]:
# Model / training hyper-parameters.
LSTM_UNITS = 64                        # units per LSTM direction
DENSE_HIDDEN_UNITS = LSTM_UNITS * 4    # width of the dense layers
BATCH_SIZE = 128
EPOCHS = 10

In [31]:
# Four variable-length integer inputs, created in order, then joined along
# the last axis into a single token sequence.
inp1, inp2, inp3, inp4 = (Input(shape=(None,)) for _ in range(4))
words = concatenate([inp1, inp2, inp3, inp4])

In [32]:
# Embedding layer initialised from `embedding_matrix` and kept frozen
# (trainable=False); its input and output sizes come from the matrix shape.
x = Embedding(*embedding_matrix.shape, weights=[embedding_matrix], trainable=False)(words)
# Two stacked bidirectional LSTMs, both returning the full sequence.
x = Bidirectional(LSTM(LSTM_UNITS, return_sequences=True))(x)
x = Bidirectional(LSTM(LSTM_UNITS, return_sequences=True))(x)

# Summarise the sequence with max pooling and average pooling, concatenated.
hidden = concatenate([
        GlobalMaxPooling1D()(x),
        GlobalAveragePooling1D()(x),
    ])

# Two blocks that add a ReLU dense projection of `hidden` back onto `hidden`.
# NOTE(review): `add` needs matching shapes; DENSE_HIDDEN_UNITS = 4 * LSTM_UNITS
# presumably equals 2 poolings x 2 directions x LSTM_UNITS — confirm if
# either constant or the pooling set changes.
hidden = add([hidden, Dense(DENSE_HIDDEN_UNITS, activation='relu')(hidden)])
hidden = add([hidden, Dense(DENSE_HIDDEN_UNITS, activation='relu')(hidden)])

In [33]:
# Output layer: 30 sigmoid units, matching the 30 columns selected as `target`.
result = Dense(30, activation='sigmoid')(hidden)

In [34]:
# Assemble the four-input / single-output model and configure training:
# mean squared error loss, RMSprop optimizer, mean absolute error reported.
model = Model(
    inputs=[inp1, inp2, inp3, inp4],
    outputs=[result],
)
model.compile(
    loss='mean_squared_error',
    optimizer='rmsprop',
    metrics=['mae'],
)

In [35]:
# Print the layer-by-layer summary of the assembled model.
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, None)]       0                                            
______________________________________________________________________________________________

In [None]:
# Show the Python types of the training inputs and of the labels.
[type(i) for i in [question_body, question_title, category, answer, target]]

In [None]:
# Convert the label frame to a NumPy array. The guard makes the cell safe to
# re-run: after the first run `target` is already an ndarray, which has no
# `to_numpy` method.
if hasattr(target, 'to_numpy'):
    target = target.to_numpy()

In [None]:
# Train on the four padded text inputs. Batch size and epoch count come from
# the BATCH_SIZE / EPOCHS constants in the hyper-parameter cell (same values
# as the literals used before), so editing that cell takes effect here.
model.fit(
    [question_body, question_title, category, answer],
    [target],
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    verbose=1,
)

In [None]:
# Predict on the test inputs, passed in the same order as in model.fit
# (body, title, category, answer).
predictions = model.predict([question_body_test, question_title_test, category_test, answer_test])

In [None]:
# Submission column names; position i receives column i of `predictions`.
target_cols = [
    # question-level labels
    'question_asker_intent_understanding',
    'question_body_critical',
    'question_conversational',
    'question_expect_short_answer',
    'question_fact_seeking',
    'question_has_commonly_accepted_answer',
    'question_interestingness_others',
    'question_interestingness_self',
    'question_multi_intent',
    'question_not_really_a_question',
    'question_opinion_seeking',
    'question_type_choice',
    'question_type_compare',
    'question_type_consequence',
    'question_type_definition',
    'question_type_entity',
    'question_type_instructions',
    'question_type_procedure',
    'question_type_reason_explanation',
    'question_type_spelling',
    'question_well_written',
    # answer-level labels
    'answer_helpful',
    'answer_level_of_information',
    'answer_plausible',
    'answer_relevance',
    'answer_satisfaction',
    'answer_type_instructions',
    'answer_type_procedure',
    'answer_type_reason_explanation',
    'answer_well_written',
]

In [None]:
# Start from the sample submission file and overwrite each listed column
# with the matching column of `predictions` (column order follows target_cols).
sub = pd.read_csv("google-quest-challenge/sample_submission.csv")

for col_index, col in enumerate(target_cols):
    sub[col] = predictions[:, col_index]

In [None]:
# Timestamp used to tag the submission file name.
now = datetime.now()

# Formatted as day-month-year, a '::' separator, then hour:minute:second.
s = now.strftime("%d-%m-%Y::%H:%M:%S")

In [None]:
# Inspect the whitespace split of the timestamp. The strftime format contains
# no whitespace, so this is a single-element list holding `s` itself.
s.split()

In [None]:
# Build a timestamped file name and write the submission.
# `s` has the form 'dd-mm-YYYY::HH:MM:SS' and contains no whitespace, so the
# previous `s.split()` returned the whole stamp and the ':' characters ended
# up in the file name (not valid on every filesystem). Break the stamp at
# '::' into date and time, and use '-' in place of the remaining ':'.
stamp_parts = s.replace('::', ' ').replace(':', '-').split()
name_items = ['submission', 'kernel_1', *stamp_parts]
name = '_'.join(name_items) + '.csv'

sub.to_csv(name, index=False)